1R"( 2 3 4 5 6#ifndef ARM_COMPUTE_HELPER_H 7#define ARM_COMPUTE_HELPER_H 8 9 10 11 12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13 VSTORE(N0) \ 14 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15 16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18 VSTORE(N0) \ 19 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20 21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23 VSTORE(N0) \ 24 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25 26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28 VSTORE(N0) \ 29 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30 31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33 VSTORE(N0) \ 34 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35 36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38 VSTORE(N0) \ 39 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40 41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43 VSTORE(N0) \ 44 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45 46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48 VSTORE(N0) \ 49 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50 51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53 VSTORE(N0) \ 54 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55 56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58 VSTORE(N0) \ 59 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60 61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63 VSTORE(N0) \ 64 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65 66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68 VSTORE(N0) \ 69 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 70 71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73 VSTORE(N0) \ 74 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75 76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78 VSTORE(N0) \ 79 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80 81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83 VSTORE(N0) \ 84 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85 86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88 VSTORE(N0) \ 89 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90 91 92 93#define 
CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94 VSTORE(N0) \ 95 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96 97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99 VSTORE(N0) \ 100 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 101 102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104 VSTORE(N0) \ 105 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106 107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109 VSTORE(N0) \ 110 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111 112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 VSTORE(N0) \ 115 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116 117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 VSTORE(N0) \ 120 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121 122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 VSTORE(N0) \ 125 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126 127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 128 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 VSTORE(N0) \ 130 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 131 132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 133 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 VSTORE(N0) \ 135 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 136 137#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 138 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 VSTORE(N0) \ 140 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 141 142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 143 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 144 VSTORE(N0) \ 145 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 146 147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 148 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 149 VSTORE(N0) \ 150 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 151 152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 153 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 154 VSTORE(N0) \ 155 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, 
(__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 156 157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 158 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 159 VSTORE(N0) \ 160 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 161 162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 163 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 164 VSTORE(N0) \ 165 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 166 167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 168 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 169 VSTORE(N0) \ 170 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 171 172 173 174 175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 177 178 179 180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 182 183 184 185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 186 VSTORE_PARTIAL(N0, STORE_N0) \ 187 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 188 189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 190 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 191 VSTORE_PARTIAL(N0, STORE_N0) \ 192 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 193 194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 195 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 196 VSTORE_PARTIAL(N0, STORE_N0) \ 197 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 198 199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 200 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 201 VSTORE_PARTIAL(N0, STORE_N0) \ 202 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 203 204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 206 VSTORE_PARTIAL(N0, STORE_N0) \ 207 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 208 209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 210 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 211 VSTORE_PARTIAL(N0, STORE_N0) \ 212 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 213 214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 216 VSTORE_PARTIAL(N0, STORE_N0) \ 217 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 218 219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 221 
VSTORE_PARTIAL(N0, STORE_N0) \ 222 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 223 224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 226 VSTORE_PARTIAL(N0, STORE_N0) \ 227 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 228 229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 231 VSTORE_PARTIAL(N0, STORE_N0) \ 232 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 233 234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 235 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 236 VSTORE_PARTIAL(N0, STORE_N0) \ 237 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 238 239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 240 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 241 VSTORE_PARTIAL(N0, STORE_N0) \ 242 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 243 244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 245 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 246 VSTORE_PARTIAL(N0, STORE_N0) \ 247 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 248 249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 250 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 251 VSTORE_PARTIAL(N0, STORE_N0) \ 252 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 253 254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 255 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 256 VSTORE_PARTIAL(N0, STORE_N0) \ 257 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 258 259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 260 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 261 VSTORE_PARTIAL(N0, STORE_N0) \ 262 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 263 264 265 266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 268 269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 270 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 271 { \ 272 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 273 } \ 274 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 275 { \ 276 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 277 } \ 278 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 279 { \ 280 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 281 } \ 282 else \ 283 { \ 284 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 285 } 286 287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, 
Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 288 if(!(PARTIAL_COND_X)) \ 289 { \ 290 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 291 } \ 292 else \ 293 { \ 294 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 295 } 296 297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 298 if(!(PARTIAL_COND_Y)) \ 299 { \ 300 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 301 } \ 302 else \ 303 { \ 304 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 305 } 306 307 308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 309 310 311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 312 313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 314 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 315 316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 317 318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 319 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 320 321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 322 323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 324 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 325 326#else 327 328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 329 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 330 331#endif 332 333#endif 334 335 336#if defined(PARTIAL_STORE_M0) 337 338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 339 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 340#else 341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 342 ((uint)(y * M0)) 343#endif 344 345 346 347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 348 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 349 350 351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 352#pragma OPENCL EXTENSION cl_khr_fp16 : enable 353#endif 354 355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 357#endif 358 359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 361#endif 362 363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 364#pragma OPENCL EXTENSION cl_arm_printf : enable 365#endif 366 367#define GPU_ARCH_MIDGARD 0x100 368#define GPU_ARCH_BIFROST 0x200 369#define GPU_ARCH_VALHALL 0x300 370 371 372#define CONCAT(a, b) a##b 373 374 375#define EXPAND(x) x 376 377 378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 379 380 381#define REV1(x) ((x)) 382#define REV2(x) ((x).s10) 383#define REV3(x) ((x).s210) 384#define REV4(x) ((x).s3210) 385#define REV8(x) ((x).s76543210) 386#define REV16(x) 
((x).sFEDCBA9876543210) 387 388 389 390#define REVERSE_STR(x, s) REV##s((x)) 391#define REVERSE(x, s) REVERSE_STR(x, s) 392 393 394 395#define ROT1_0(x) ((x)) 396#define ROT1_1(x) ((x)) 397 398#define ROT2_0(x) ((x)) 399#define ROT2_1(x) ((x).s10) 400#define ROT2_2(x) ((x)) 401 402#define ROT3_0(x) ((x)) 403#define ROT3_1(x) ((x).s201) 404#define ROT3_2(x) ((x).s120) 405#define ROT3_3(x) ((x)) 406 407#define ROT4_0(x) ((x)) 408#define ROT4_1(x) ((x).s3012) 409#define ROT4_2(x) ((x).s2301) 410#define ROT4_3(x) ((x).s1230) 411#define ROT4_4(x) ((x)) 412 413#define ROT8_0(x) ((x)) 414#define ROT8_1(x) ((x).s70123456) 415#define ROT8_2(x) ((x).s67012345) 416#define ROT8_3(x) ((x).s56701234) 417#define ROT8_4(x) ((x).s45670123) 418#define ROT8_5(x) ((x).s34567012) 419#define ROT8_6(x) ((x).s23456701) 420#define ROT8_7(x) ((x).s12345670) 421#define ROT8_8(x) ((x)) 422 423#define ROT16_0(x) ((x)) 424#define ROT16_1(x) ((x).sF0123456789ABCDE) 425#define ROT16_2(x) ((x).sEF0123456789ABCD) 426#define ROT16_3(x) ((x).sDEF0123456789ABC) 427#define ROT16_4(x) ((x).sCDEF0123456789AB) 428#define ROT16_5(x) ((x).sBCDEF0123456789A) 429#define ROT16_6(x) ((x).sABCDEF0123456789) 430#define ROT16_7(x) ((x).s9ABCDEF012345678) 431#define ROT16_8(x) ((x).s89ABCDEF01234567) 432#define ROT16_9(x) ((x).s789ABCDEF0123456) 433#define ROT16_10(x) ((x).s6789ABCDEF012345) 434#define ROT16_11(x) ((x).s56789ABCDEF01234) 435#define ROT16_12(x) ((x).s456789ABCDEF0123) 436#define ROT16_13(x) ((x).s3456789ABCDEF012) 437#define ROT16_14(x) ((x).s23456789ABCDEF01) 438#define ROT16_15(x) ((x).s123456789ABCDEF0) 439#define ROT16_16(x) ((x)) 440 441 442 443#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 444#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 445 446 447 448#define V_OFFS1(dt) (dt##1)(0) 449#define V_OFFS2(dt) (dt##2)(0, 1) 450#define V_OFFS3(dt) (dt##3)(0, 1, 2) 451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 454 455 456 457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 459 460 461#define VLOAD_STR(size) vload##size 462#define VLOAD(size) VLOAD_STR(size) 463 464 465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 467 468#define NO_LOAD(data, offs, ptr) \ 469 { \ 470 } 471 472 473#define vload_partial_1_0 NO_LOAD 474#define vload_partial_1_1 vload1 475#define vload_partial_1_2 NO_LOAD 476#define vload_partial_1_3 NO_LOAD 477#define vload_partial_1_4 NO_LOAD 478#define vload_partial_1_5 NO_LOAD 479#define vload_partial_1_6 NO_LOAD 480#define vload_partial_1_7 NO_LOAD 481#define vload_partial_1_8 NO_LOAD 482#define vload_partial_1_9 NO_LOAD 483#define vload_partial_1_10 NO_LOAD 484#define vload_partial_1_11 NO_LOAD 485#define vload_partial_1_12 NO_LOAD 486#define vload_partial_1_13 NO_LOAD 487#define vload_partial_1_14 NO_LOAD 488#define vload_partial_1_15 NO_LOAD 489#define vload_partial_1_16 NO_LOAD 490 491#define vload_partial_2_0 NO_LOAD 492#define vload_partial_2_1 vload_partial_1 493#define vload_partial_2_2 vload_partial_2 494#define vload_partial_2_3 NO_LOAD 495#define vload_partial_2_4 NO_LOAD 496#define vload_partial_2_5 NO_LOAD 497#define vload_partial_2_6 NO_LOAD 498#define vload_partial_2_7 NO_LOAD 499#define vload_partial_2_8 NO_LOAD 500#define vload_partial_2_9 NO_LOAD 501#define vload_partial_2_10 NO_LOAD 502#define vload_partial_2_11 
NO_LOAD 503#define vload_partial_2_12 NO_LOAD 504#define vload_partial_2_13 NO_LOAD 505#define vload_partial_2_14 NO_LOAD 506#define vload_partial_2_15 NO_LOAD 507#define vload_partial_2_16 NO_LOAD 508 509#define vload_partial_3_0 NO_LOAD 510#define vload_partial_3_1 vload_partial_1 511#define vload_partial_3_2 vload_partial_2 512#define vload_partial_3_3 vload_partial_3 513#define vload_partial_3_4 NO_LOAD 514#define vload_partial_3_5 NO_LOAD 515#define vload_partial_3_6 NO_LOAD 516#define vload_partial_3_7 NO_LOAD 517#define vload_partial_3_8 NO_LOAD 518#define vload_partial_3_9 NO_LOAD 519#define vload_partial_3_10 NO_LOAD 520#define vload_partial_3_11 NO_LOAD 521#define vload_partial_3_12 NO_LOAD 522#define vload_partial_3_13 NO_LOAD 523#define vload_partial_3_14 NO_LOAD 524#define vload_partial_3_15 NO_LOAD 525#define vload_partial_3_16 NO_LOAD 526 527#define vload_partial_4_0 NO_LOAD 528#define vload_partial_4_1 vload_partial_1 529#define vload_partial_4_2 vload_partial_2 530#define vload_partial_4_3 vload_partial_3 531#define vload_partial_4_4 vload_partial_4 532#define vload_partial_4_5 NO_LOAD 533#define vload_partial_4_6 NO_LOAD 534#define vload_partial_4_7 NO_LOAD 535#define vload_partial_4_8 NO_LOAD 536#define vload_partial_4_9 NO_LOAD 537#define vload_partial_4_10 NO_LOAD 538#define vload_partial_4_11 NO_LOAD 539#define vload_partial_4_12 NO_LOAD 540#define vload_partial_4_13 NO_LOAD 541#define vload_partial_4_14 NO_LOAD 542#define vload_partial_4_15 NO_LOAD 543#define vload_partial_4_16 NO_LOAD 544 545#define vload_partial_8_0 NO_LOAD 546#define vload_partial_8_1 vload_partial_1 547#define vload_partial_8_2 vload_partial_2 548#define vload_partial_8_3 vload_partial_3 549#define vload_partial_8_4 vload_partial_4 550#define vload_partial_8_5 vload_partial_5 551#define vload_partial_8_6 vload_partial_6 552#define vload_partial_8_7 vload_partial_7 553#define vload_partial_8_8 vload_partial_8 554#define vload_partial_8_9 NO_LOAD 555#define vload_partial_8_10 NO_LOAD 556#define vload_partial_8_11 NO_LOAD 557#define vload_partial_8_12 NO_LOAD 558#define vload_partial_8_13 NO_LOAD 559#define vload_partial_8_14 NO_LOAD 560#define vload_partial_8_15 NO_LOAD 561#define vload_partial_8_16 NO_LOAD 562 563#define vload_partial_16_0 NO_LOAD 564#define vload_partial_16_1 vload_partial_1 565#define vload_partial_16_2 vload_partial_2 566#define vload_partial_16_3 vload_partial_3 567#define vload_partial_16_4 vload_partial_4 568#define vload_partial_16_5 vload_partial_5 569#define vload_partial_16_6 vload_partial_6 570#define vload_partial_16_7 vload_partial_7 571#define vload_partial_16_8 vload_partial_8 572#define vload_partial_16_9 vload_partial_9 573#define vload_partial_16_10 vload_partial_10 574#define vload_partial_16_11 vload_partial_11 575#define vload_partial_16_12 vload_partial_12 576#define vload_partial_16_13 vload_partial_13 577#define vload_partial_16_14 vload_partial_14 578#define vload_partial_16_15 vload_partial_15 579#define vload_partial_16_16 vload_partial_16 580 581 582#define vload_partial_1(DATA, OFFSET, PTR) \ 583 DATA.s0 = vload1(OFFSET, PTR); 584 585#define vload_partial_2(DATA, OFFSET, PTR) \ 586 DATA.s01 = vload2(OFFSET, PTR); 587 588#define vload_partial_3(DATA, OFFSET, PTR) \ 589 DATA.s012 = vload3(OFFSET, PTR); 590 591#define vload_partial_4(DATA, OFFSET, PTR) \ 592 DATA.s0123 = vload4(OFFSET, PTR); 593 594#define vload_partial_5(DATA, OFFSET, PTR) \ 595 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 596 DATA.s4 = vload1(OFFSET, PTR + 4); 597 598#define 
vload_partial_6(DATA, OFFSET, PTR) \ 599 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 600 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 601 602#define vload_partial_7(DATA, OFFSET, PTR) \ 603 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 604 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 605 606#define vload_partial_8(DATA, OFFSET, PTR) \ 607 DATA.s01234567 = vload8(OFFSET, PTR); 608 609#define vload_partial_9(DATA, OFFSET, PTR) \ 610 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 611 DATA.s8 = vload1(OFFSET, PTR + 8); 612 613#define vload_partial_10(DATA, OFFSET, PTR) \ 614 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 615 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 616 617#define vload_partial_11(DATA, OFFSET, PTR) \ 618 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 619 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 620 621#define vload_partial_12(DATA, OFFSET, PTR) \ 622 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 623 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 624 625#define vload_partial_13(DATA, OFFSET, PTR) \ 626 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 627 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 628 629#define vload_partial_14(DATA, OFFSET, PTR) \ 630 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 631 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 632 633#define vload_partial_15(DATA, OFFSET, PTR) \ 634 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 635 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 636 637#define vload_partial_16(DATA, OFFSET, PTR) \ 638 DATA = vload16(OFFSET, PTR); 639 640 641 642#define PIXEL_UNIT4 1 643#define PIXEL_UNIT8 2 644#define PIXEL_UNIT16 4 645 646 647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 649 650 651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 654 655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 659#endif 660 661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 662#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, 
y_coord), values.sCDEF)); 664 665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 669#endif 670 671 672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 674 675 676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 678 679#define VSTORE_STR(size) vstore##size 680#define VSTORE(size) VSTORE_STR(size) 681 682#define float1 float 683#define half1 half 684#define char1 char 685#define uchar1 uchar 686#define short1 short 687#define ushort1 ushort 688#define int1 int 689#define uint1 uint 690#define long1 long 691#define ulong1 ulong 692#define double1 double 693 694#define vload1(OFFSET, PTR) *(OFFSET + PTR) 695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 696 697 698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 700 701#define NO_STORE(data, offs, ptr) \ 702 { \ 703 } 704 705 706#define vstore_partial_1_0 NO_STORE 707#define vstore_partial_1_1 vstore1 708#define vstore_partial_1_2 NO_STORE 709#define vstore_partial_1_3 NO_STORE 710#define vstore_partial_1_4 NO_STORE 711#define vstore_partial_1_5 NO_STORE 712#define vstore_partial_1_6 NO_STORE 713#define vstore_partial_1_7 NO_STORE 714#define vstore_partial_1_8 NO_STORE 715#define vstore_partial_1_9 NO_STORE 716#define vstore_partial_1_10 NO_STORE 717#define vstore_partial_1_11 NO_STORE 718#define vstore_partial_1_12 NO_STORE 719#define vstore_partial_1_13 NO_STORE 720#define vstore_partial_1_14 NO_STORE 721#define vstore_partial_1_15 NO_STORE 722#define vstore_partial_1_16 NO_STORE 723 724#define vstore_partial_2_0 NO_STORE 725#define vstore_partial_2_1 vstore_partial_1 726#define vstore_partial_2_2 vstore_partial_2 727#define vstore_partial_2_3 NO_STORE 728#define vstore_partial_2_4 NO_STORE 729#define vstore_partial_2_5 NO_STORE 730#define vstore_partial_2_6 NO_STORE 731#define vstore_partial_2_7 NO_STORE 732#define vstore_partial_2_8 NO_STORE 733#define vstore_partial_2_9 NO_STORE 734#define vstore_partial_2_10 NO_STORE 735#define vstore_partial_2_11 NO_STORE 736#define vstore_partial_2_12 NO_STORE 737#define vstore_partial_2_13 NO_STORE 738#define vstore_partial_2_14 NO_STORE 739#define vstore_partial_2_15 NO_STORE 740#define vstore_partial_2_16 NO_STORE 741 742#define vstore_partial_3_0 NO_STORE 743#define vstore_partial_3_1 vstore_partial_1 744#define vstore_partial_3_2 vstore_partial_2 745#define vstore_partial_3_3 vstore_partial_3 746#define vstore_partial_3_4 NO_STORE 747#define 
vstore_partial_3_5 NO_STORE 748#define vstore_partial_3_6 NO_STORE 749#define vstore_partial_3_7 NO_STORE 750#define vstore_partial_3_8 NO_STORE 751#define vstore_partial_3_9 NO_STORE 752#define vstore_partial_3_10 NO_STORE 753#define vstore_partial_3_11 NO_STORE 754#define vstore_partial_3_12 NO_STORE 755#define vstore_partial_3_13 NO_STORE 756#define vstore_partial_3_14 NO_STORE 757#define vstore_partial_3_15 NO_STORE 758#define vstore_partial_3_16 NO_STORE 759 760#define vstore_partial_4_0 NO_STORE 761#define vstore_partial_4_1 vstore_partial_1 762#define vstore_partial_4_2 vstore_partial_2 763#define vstore_partial_4_3 vstore_partial_3 764#define vstore_partial_4_4 vstore_partial_4 765#define vstore_partial_4_5 NO_STORE 766#define vstore_partial_4_6 NO_STORE 767#define vstore_partial_4_7 NO_STORE 768#define vstore_partial_4_8 NO_STORE 769#define vstore_partial_4_9 NO_STORE 770#define vstore_partial_4_10 NO_STORE 771#define vstore_partial_4_11 NO_STORE 772#define vstore_partial_4_12 NO_STORE 773#define vstore_partial_4_13 NO_STORE 774#define vstore_partial_4_14 NO_STORE 775#define vstore_partial_4_15 NO_STORE 776#define vstore_partial_4_16 NO_STORE 777 778#define vstore_partial_8_0 NO_STORE 779#define vstore_partial_8_1 vstore_partial_1 780#define vstore_partial_8_2 vstore_partial_2 781#define vstore_partial_8_3 vstore_partial_3 782#define vstore_partial_8_4 vstore_partial_4 783#define vstore_partial_8_5 vstore_partial_5 784#define vstore_partial_8_6 vstore_partial_6 785#define vstore_partial_8_7 vstore_partial_7 786#define vstore_partial_8_8 vstore_partial_8 787#define vstore_partial_8_9 NO_STORE 788#define vstore_partial_8_10 NO_STORE 789#define vstore_partial_8_11 NO_STORE 790#define vstore_partial_8_12 NO_STORE 791#define vstore_partial_8_13 NO_STORE 792#define vstore_partial_8_14 NO_STORE 793#define vstore_partial_8_15 NO_STORE 794#define vstore_partial_8_16 NO_STORE 795 796#define vstore_partial_16_0 NO_STORE 797#define vstore_partial_16_1 vstore_partial_1 798#define vstore_partial_16_2 vstore_partial_2 799#define vstore_partial_16_3 vstore_partial_3 800#define vstore_partial_16_4 vstore_partial_4 801#define vstore_partial_16_5 vstore_partial_5 802#define vstore_partial_16_6 vstore_partial_6 803#define vstore_partial_16_7 vstore_partial_7 804#define vstore_partial_16_8 vstore_partial_8 805#define vstore_partial_16_9 vstore_partial_9 806#define vstore_partial_16_10 vstore_partial_10 807#define vstore_partial_16_11 vstore_partial_11 808#define vstore_partial_16_12 vstore_partial_12 809#define vstore_partial_16_13 vstore_partial_13 810#define vstore_partial_16_14 vstore_partial_14 811#define vstore_partial_16_15 vstore_partial_15 812#define vstore_partial_16_16 vstore_partial_16 813 814 815#define vstore_partial_1(DATA, OFFSET, PTR) \ 816 vstore1(DATA.s0, OFFSET, PTR); 817 818#define vstore_partial_2(DATA, OFFSET, PTR) \ 819 vstore2(DATA.s01, OFFSET, PTR); 820 821#define vstore_partial_3(DATA, OFFSET, PTR) \ 822 vstore3(DATA.s012, OFFSET, PTR); 823 824#define vstore_partial_4(DATA, OFFSET, PTR) \ 825 vstore4(DATA.s0123, OFFSET, PTR); 826 827#define vstore_partial_5(DATA, OFFSET, PTR) \ 828 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 829 vstore1(DATA.s4, OFFSET, PTR + 4); 830 831#define vstore_partial_6(DATA, OFFSET, PTR) \ 832 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 833 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 834 835#define vstore_partial_7(DATA, OFFSET, PTR) \ 836 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 837 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 838 839#define 
vstore_partial_8(DATA, OFFSET, PTR) \ 840 vstore8(DATA.s01234567, OFFSET, PTR); 841 842#define vstore_partial_9(DATA, OFFSET, PTR) \ 843 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 844 vstore1(DATA.s8, OFFSET, PTR + 8); 845 846#define vstore_partial_10(DATA, OFFSET, PTR) \ 847 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 848 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 849 850#define vstore_partial_11(DATA, OFFSET, PTR) \ 851 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 852 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 853 854#define vstore_partial_12(DATA, OFFSET, PTR) \ 855 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 856 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 857 858#define vstore_partial_13(DATA, OFFSET, PTR) \ 859 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 860 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 861 862#define vstore_partial_14(DATA, OFFSET, PTR) \ 863 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 864 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 865 866#define vstore_partial_15(DATA, OFFSET, PTR) \ 867 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 868 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 869 870#define vstore_partial_16(DATA, OFFSET, PTR) \ 871 vstore16(DATA, OFFSET, PTR); 872 873 874 875 876 877#define convert_float_sat convert_float 878#define convert_float1_sat convert_float 879#define convert_float2_sat convert_float2 880#define convert_float3_sat convert_float3 881#define convert_float4_sat convert_float4 882#define convert_float8_sat convert_float8 883#define convert_float16_sat convert_float16 884#define convert_half_sat convert_float 885#define convert_half1_sat convert_half 886#define convert_half2_sat convert_half2 887#define convert_half3_sat convert_half3 888#define convert_half4_sat convert_half4 889#define convert_half8_sat convert_half8 890#define convert_half16_sat convert_half16 891 892#define convert_float1 convert_float 893#define convert_half1 convert_half 894#define convert_char1 convert_char 895#define convert_uchar1 convert_uchar 896#define convert_short1 convert_short 897#define convert_ushort1 convert_ushort 898#define convert_int1 convert_int 899#define convert_uint1 convert_uint 900#define convert_long1 convert_long 901#define convert_ulong1 convert_ulong 902#define convert_double1 convert_double 903 904#define convert_char1_sat convert_char_sat 905#define convert_uchar1_sat convert_uchar_sat 906#define convert_uchar2_sat convert_uchar2_sat 907#define convert_uchar3_sat convert_uchar3_sat 908#define convert_uchar4_sat convert_uchar4_sat 909#define convert_uchar8_sat convert_uchar8_sat 910#define convert_uchar16_sat convert_uchar16_sat 911#define convert_short1_sat convert_short_sat 912#define convert_ushort1_sat convert_ushort_sat 913#define convert_int1_sat convert_int_sat 914#define convert_uint1_sat convert_uint_sat 915#define convert_long1_sat convert_long_sat 916#define convert_ulong1_sat convert_ulong_sat 917#define convert_double1_sat convert_double_sat 918 919#define VEC_DATA_TYPE_STR(type, size) type##size 920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 921 922#define CONVERT_STR(x, type) (convert_##type((x))) 923#define CONVERT(x, type) CONVERT_STR(x, type) 924 925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 927 928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 930 
931#define select_vec_dt_uchar(size) uchar##size 932#define select_vec_dt_char(size) char##size 933#define select_vec_dt_ushort(size) ushort##size 934#define select_vec_dt_short(size) short##size 935#define select_vec_dt_half(size) short##size 936#define select_vec_dt_uint(size) uint##size 937#define select_vec_dt_int(size) int##size 938#define select_vec_dt_float(size) int##size 939#define select_vec_dt_ulong(size) ulong##size 940#define select_vec_dt_long(size) long##size 941 942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 945 946#define signed_int_vec_dt_uchar(size) char##size 947#define signed_int_vec_dt_char(size) char##size 948#define signed_int_vec_dt_ushort(size) short##size 949#define signed_int_vec_dt_short(size) short##size 950#define signed_int_vec_dt_half(size) short##size 951#define signed_int_vec_dt_uint(size) int##size 952#define signed_int_vec_dt_int(size) int##size 953#define signed_int_vec_dt_float(size) int##size 954#define signed_int_vec_dt_ulong(size) long##size 955#define signed_int_vec_dt_long(size) long##size 956 957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 960 961#define sum_reduce_1(x) (x) 962#define sum_reduce_2(x) ((x).s0) + ((x).s1) 963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 967 968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 970 971#define prod_reduce_1(x) (x) 972#define prod_reduce_2(x) ((x).s0) * ((x).s1) 973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 977 978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 980 981#define max_reduce_1(x) (x) 982#define max_reduce_2(x) max(((x).s0), ((x).s1)) 983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 986#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 987 988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990 991#define VECTOR_DECLARATION(name) \ 992 __global uchar *name##_ptr, \ 993 uint name##_stride_x, \ 994 uint name##_step_x, \ 995 uint name##_offset_first_element_in_bytes 996 997#define IMAGE_DECLARATION(name) \ 998 __global uchar *name##_ptr, \ 999 uint name##_stride_x, \ 1000 uint name##_step_x, \ 1001 uint name##_stride_y, \ 1002 uint name##_step_y, \ 1003 uint name##_offset_first_element_in_bytes 1004 1005#define TENSOR3D_DECLARATION(name) \ 1006 __global uchar *name##_ptr, \ 1007 uint name##_stride_x, \ 1008 uint name##_step_x, \ 1009 uint 
name##_stride_y, \ 1010 uint name##_step_y, \ 1011 uint name##_stride_z, \ 1012 uint name##_step_z, \ 1013 uint name##_offset_first_element_in_bytes 1014 1015#define TENSOR4D_DECLARATION(name) \ 1016 __global uchar *name##_ptr, \ 1017 uint name##_stride_x, \ 1018 uint name##_step_x, \ 1019 uint name##_stride_y, \ 1020 uint name##_step_y, \ 1021 uint name##_stride_z, \ 1022 uint name##_step_z, \ 1023 uint name##_stride_w, \ 1024 uint name##_step_w, \ 1025 uint name##_offset_first_element_in_bytes 1026 1027#define TENSOR5D_DECLARATION(name) \ 1028 __global uchar *name##_ptr, \ 1029 uint name##_stride_x, \ 1030 uint name##_step_x, \ 1031 uint name##_stride_y, \ 1032 uint name##_step_y, \ 1033 uint name##_stride_z, \ 1034 uint name##_step_z, \ 1035 uint name##_stride_w, \ 1036 uint name##_step_w, \ 1037 uint name##_stride_v, \ 1038 uint name##_step_v, \ 1039 uint name##_offset_first_element_in_bytes 1040 1041#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043 1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046 1047#define CONVERT_TO_IMAGE_STRUCT(name) \ 1048 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049 1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052 1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055 1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058 1059#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061 1062#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064 name##_stride_z, name##_step_z) 1065 1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068 1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072 1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075 1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078 name##_stride_z, 
name##_step_z) 1079 1080 1081typedef struct Vector 1082{ 1083 __global uchar *ptr; 1084 int offset_first_element_in_bytes; 1085 int stride_x; 1086} Vector; 1087 1088 1089typedef struct Image 1090{ 1091 __global uchar *ptr; 1092 int offset_first_element_in_bytes; 1093 int stride_x; 1094 int stride_y; 1095} Image; 1096 1097 1098typedef struct Tensor3D 1099{ 1100 __global uchar *ptr; 1101 int offset_first_element_in_bytes; 1102 int stride_x; 1103 int stride_y; 1104 int stride_z; 1105} Tensor3D; 1106 1107 1108typedef struct Tensor4D 1109{ 1110 __global uchar *ptr; 1111 int offset_first_element_in_bytes; 1112 int stride_x; 1113 int stride_y; 1114 int stride_z; 1115 int stride_w; 1116} Tensor4D; 1117 1118 1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120{ 1121 Vector vector = 1122 { 1123 .ptr = ptr, 1124 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125 .stride_x = stride_x, 1126 }; 1127 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128 return vector; 1129} 1130 1131 1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133{ 1134 Image img = 1135 { 1136 .ptr = ptr, 1137 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138 .stride_x = stride_x, 1139 .stride_y = stride_y 1140 }; 1141 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142 return img; 1143} 1144 1145 1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147{ 1148 Image img = 1149 { 1150 .ptr = ptr, 1151 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152 .stride_x = stride_x, 1153 .stride_y = stride_y 1154 }; 1155 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156 return img; 1157} 1158 1159 1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161{ 1162 Tensor3D tensor = 1163 { 1164 .ptr = ptr, 1165 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166 .stride_x = stride_x, 1167 .stride_y = stride_y, 1168 .stride_z = stride_z 1169 }; 1170 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171 return tensor; 1172} 1173 1174 1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176{ 1177 Tensor3D tensor = 1178 { 1179 .ptr = ptr, 1180 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181 .stride_x = stride_x, 1182 .stride_y = stride_y, 1183 .stride_z = stride_z 1184 }; 1185 return tensor; 1186} 1187 1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189 uint step_w, 1190 uint mod_size) 1191{ 1192 Tensor4D tensor = 1193 { 1194 .ptr = ptr, 1195 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196 .stride_x = stride_x, 1197 .stride_y = stride_y, 
1198 .stride_z = stride_z, 1199 .stride_w = stride_w 1200 }; 1201 1202 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203 return tensor; 1204} 1205 1206 1207inline __global const uchar *vector_offset(const Vector *vec, int x) 1208{ 1209 return vec->ptr + x * vec->stride_x; 1210} 1211 1212 1213inline __global uchar *offset(const Image *img, int x, int y) 1214{ 1215 return img->ptr + x * img->stride_x + y * img->stride_y; 1216} 1217 1218 1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220{ 1221 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222} 1223 1224 1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226{ 1227 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228} 1229 1230 1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232{ 1233 uint num_elements = width * height; 1234 1235 const uint z = index / num_elements; 1236 1237 index %= num_elements; 1238 1239 const uint y = index / width; 1240 1241 index %= width; 1242 1243 const uint x = index; 1244 1245 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246} 1247 1248#endif 1249 1250#if GPU_ARCH == GPU_ARCH_BIFROST 1251#define MLA(a, b, c) (fma(c, b, a)) 1252#else 1253#define MLA(a, b, c) ((b) * (c) + (a)) 1254#endif 1255 1256 1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 1258 1259 1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 1261 1262 1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) 1264 1265 1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 1267 1268 1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 1270 1271 1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 1273 1274 1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 1276 1277 1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 1279 1280 1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 1282 1283 1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 1285 1286 1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 1288 1289 1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 1291 1292 1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 1294 1295 1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) 1297 1298 1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 1300 1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 
1302 1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1304 1305#ifndef ARM_COMPUTE_HELPER_H 1306#define ARM_COMPUTE_HELPER_H 1307 1308 1309 1310 1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1312 VSTORE(N0) \ 1313 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1314 1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1316 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1317 VSTORE(N0) \ 1318 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1319 1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1321 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1322 VSTORE(N0) \ 1323 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1324 1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1326 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1327 VSTORE(N0) \ 1328 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1329 1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1331 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1332 VSTORE(N0) \ 1333 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1334 1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1336 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1337 VSTORE(N0) \ 1338 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1339 1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1341 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1342 VSTORE(N0) \ 1343 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1344 1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1346 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1347 VSTORE(N0) \ 1348 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1349 1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1351 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1352 VSTORE(N0) \ 1353 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1354 1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1356 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1357 VSTORE(N0) \ 1358 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1359 1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1361 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1362 VSTORE(N0) \ 1363 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1364 1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1366 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1367 VSTORE(N0) \ 1368 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1369 1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1371 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1372 VSTORE(N0) \ 1373 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1374 1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1376 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1377 VSTORE(N0) \ 1378 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1379 1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1381 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1382 VSTORE(N0) \ 1383 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 
14 * STRIDE_Y + Z##E)); 1384 1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1386 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387 VSTORE(N0) \ 1388 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1389 1390 1391 1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1393 VSTORE(N0) \ 1394 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1395 1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1398 VSTORE(N0) \ 1399 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1400 1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1402 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1403 VSTORE(N0) \ 1404 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1405 1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1408 VSTORE(N0) \ 1409 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1410 1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1413 VSTORE(N0) \ 1414 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1415 1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1418 VSTORE(N0) \ 1419 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1420 1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1422 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1423 VSTORE(N0) \ 1424 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1425 1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1427 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1428 VSTORE(N0) \ 1429 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1430 1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1432 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1433 VSTORE(N0) \ 1434 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1435 1436#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1437 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1438 VSTORE(N0) \ 1439 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1440 1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1442 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443 VSTORE(N0) \ 1444 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1445 1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1447 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448 VSTORE(N0) \ 1449 
(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1450 1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1452 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1453 VSTORE(N0) \ 1454 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1455 1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1457 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1458 VSTORE(N0) \ 1459 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1460 1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1462 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1463 VSTORE(N0) \ 1464 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1465 1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1467 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1468 VSTORE(N0) \ 1469 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1470 1471 1472 1473 1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1476 1477 1478 1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1481 1482 1483 1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1485 VSTORE_PARTIAL(N0, STORE_N0) \ 1486 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1487 1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1489 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1490 VSTORE_PARTIAL(N0, STORE_N0) \ 1491 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1492 1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1494 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1495 VSTORE_PARTIAL(N0, STORE_N0) \ 1496 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1497 1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1499 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1500 VSTORE_PARTIAL(N0, STORE_N0) \ 1501 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1502 1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1504 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1505 VSTORE_PARTIAL(N0, STORE_N0) \ 1506 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1507 1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1509 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1510 VSTORE_PARTIAL(N0, STORE_N0) \ 1511 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1512 1513#define STORE_ROW_PARTIAL_7(N0, 
STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1514 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1515 VSTORE_PARTIAL(N0, STORE_N0) \ 1516 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1517 1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1519 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1520 VSTORE_PARTIAL(N0, STORE_N0) \ 1521 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1522 1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1524 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1525 VSTORE_PARTIAL(N0, STORE_N0) \ 1526 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1527 1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1529 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1530 VSTORE_PARTIAL(N0, STORE_N0) \ 1531 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1532 1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1534 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1535 VSTORE_PARTIAL(N0, STORE_N0) \ 1536 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1537 1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1539 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1540 VSTORE_PARTIAL(N0, STORE_N0) \ 1541 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1542 1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1544 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1545 VSTORE_PARTIAL(N0, STORE_N0) \ 1546 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1547 1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1549 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1550 VSTORE_PARTIAL(N0, STORE_N0) \ 1551 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1552 1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1554 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1555 VSTORE_PARTIAL(N0, STORE_N0) \ 1556 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1557 1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1559 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1560 VSTORE_PARTIAL(N0, STORE_N0) \ 1561 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1562 1563 1564 1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1567 1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1569 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1570 { \ 1571 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1572 } \ 1573 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1574 { \ 
1575 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1576 } \ 1577 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1578 { \ 1579 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1580 } \ 1581 else \ 1582 { \ 1583 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1584 } 1585 1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1587 if(!(PARTIAL_COND_X)) \ 1588 { \ 1589 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1590 } \ 1591 else \ 1592 { \ 1593 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1594 } 1595 1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1597 if(!(PARTIAL_COND_Y)) \ 1598 { \ 1599 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1600 } \ 1601 else \ 1602 { \ 1603 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1604 } 1605 1606 1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1608 1609 1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1611 1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1613 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1614 1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1616 1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1618 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 1619 1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1621 1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1623 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1624 1625#else 1626 1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1628 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1629 1630#endif 1631 1632#endif 1633 1634 1635#if defined(PARTIAL_STORE_M0) 1636 1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1638 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1639#else 1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1641 ((uint)(y * M0)) 1642#endif 1643 1644 1645 1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1647 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1648 1649 1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable 1652#endif 1653 1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1656#endif 1657 1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1660#endif 1661 1662#if 
defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1663#pragma OPENCL EXTENSION cl_arm_printf : enable 1664#endif 1665 1666#define GPU_ARCH_MIDGARD 0x100 1667#define GPU_ARCH_BIFROST 0x200 1668#define GPU_ARCH_VALHALL 0x300 1669 1670 1671#define CONCAT(a, b) a##b 1672 1673 1674#define EXPAND(x) x 1675 1676 1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1678 1679 1680#define REV1(x) ((x)) 1681#define REV2(x) ((x).s10) 1682#define REV3(x) ((x).s210) 1683#define REV4(x) ((x).s3210) 1684#define REV8(x) ((x).s76543210) 1685#define REV16(x) ((x).sFEDCBA9876543210) 1686 1687 1688 1689#define REVERSE_STR(x, s) REV##s((x)) 1690#define REVERSE(x, s) REVERSE_STR(x, s) 1691 1692 1693 1694#define ROT1_0(x) ((x)) 1695#define ROT1_1(x) ((x)) 1696 1697#define ROT2_0(x) ((x)) 1698#define ROT2_1(x) ((x).s10) 1699#define ROT2_2(x) ((x)) 1700 1701#define ROT3_0(x) ((x)) 1702#define ROT3_1(x) ((x).s201) 1703#define ROT3_2(x) ((x).s120) 1704#define ROT3_3(x) ((x)) 1705 1706#define ROT4_0(x) ((x)) 1707#define ROT4_1(x) ((x).s3012) 1708#define ROT4_2(x) ((x).s2301) 1709#define ROT4_3(x) ((x).s1230) 1710#define ROT4_4(x) ((x)) 1711 1712#define ROT8_0(x) ((x)) 1713#define ROT8_1(x) ((x).s70123456) 1714#define ROT8_2(x) ((x).s67012345) 1715#define ROT8_3(x) ((x).s56701234) 1716#define ROT8_4(x) ((x).s45670123) 1717#define ROT8_5(x) ((x).s34567012) 1718#define ROT8_6(x) ((x).s23456701) 1719#define ROT8_7(x) ((x).s12345670) 1720#define ROT8_8(x) ((x)) 1721 1722#define ROT16_0(x) ((x)) 1723#define ROT16_1(x) ((x).sF0123456789ABCDE) 1724#define ROT16_2(x) ((x).sEF0123456789ABCD) 1725#define ROT16_3(x) ((x).sDEF0123456789ABC) 1726#define ROT16_4(x) ((x).sCDEF0123456789AB) 1727#define ROT16_5(x) ((x).sBCDEF0123456789A) 1728#define ROT16_6(x) ((x).sABCDEF0123456789) 1729#define ROT16_7(x) ((x).s9ABCDEF012345678) 1730#define ROT16_8(x) ((x).s89ABCDEF01234567) 1731#define ROT16_9(x) ((x).s789ABCDEF0123456) 1732#define ROT16_10(x) ((x).s6789ABCDEF012345) 1733#define ROT16_11(x) ((x).s56789ABCDEF01234) 1734#define ROT16_12(x) ((x).s456789ABCDEF0123) 1735#define ROT16_13(x) ((x).s3456789ABCDEF012) 1736#define ROT16_14(x) ((x).s23456789ABCDEF01) 1737#define ROT16_15(x) ((x).s123456789ABCDEF0) 1738#define ROT16_16(x) ((x)) 1739 1740 1741 1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1744 1745 1746 1747#define V_OFFS1(dt) (dt##1)(0) 1748#define V_OFFS2(dt) (dt##2)(0, 1) 1749#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1753 1754 1755 1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1758 1759 1760#define VLOAD_STR(size) vload##size 1761#define VLOAD(size) VLOAD_STR(size) 1762 1763 1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1766 1767#define NO_LOAD(data, offs, ptr) \ 1768 { \ 1769 } 1770 1771 1772#define vload_partial_1_0 NO_LOAD 1773#define vload_partial_1_1 vload1 1774#define vload_partial_1_2 NO_LOAD 1775#define vload_partial_1_3 NO_LOAD 1776#define vload_partial_1_4 NO_LOAD 1777#define vload_partial_1_5 NO_LOAD 1778#define vload_partial_1_6 NO_LOAD 1779#define vload_partial_1_7 NO_LOAD 1780#define vload_partial_1_8 NO_LOAD 1781#define vload_partial_1_9 NO_LOAD 1782#define vload_partial_1_10 NO_LOAD 1783#define 
vload_partial_1_11 NO_LOAD 1784#define vload_partial_1_12 NO_LOAD 1785#define vload_partial_1_13 NO_LOAD 1786#define vload_partial_1_14 NO_LOAD 1787#define vload_partial_1_15 NO_LOAD 1788#define vload_partial_1_16 NO_LOAD 1789 1790#define vload_partial_2_0 NO_LOAD 1791#define vload_partial_2_1 vload_partial_1 1792#define vload_partial_2_2 vload_partial_2 1793#define vload_partial_2_3 NO_LOAD 1794#define vload_partial_2_4 NO_LOAD 1795#define vload_partial_2_5 NO_LOAD 1796#define vload_partial_2_6 NO_LOAD 1797#define vload_partial_2_7 NO_LOAD 1798#define vload_partial_2_8 NO_LOAD 1799#define vload_partial_2_9 NO_LOAD 1800#define vload_partial_2_10 NO_LOAD 1801#define vload_partial_2_11 NO_LOAD 1802#define vload_partial_2_12 NO_LOAD 1803#define vload_partial_2_13 NO_LOAD 1804#define vload_partial_2_14 NO_LOAD 1805#define vload_partial_2_15 NO_LOAD 1806#define vload_partial_2_16 NO_LOAD 1807 1808#define vload_partial_3_0 NO_LOAD 1809#define vload_partial_3_1 vload_partial_1 1810#define vload_partial_3_2 vload_partial_2 1811#define vload_partial_3_3 vload_partial_3 1812#define vload_partial_3_4 NO_LOAD 1813#define vload_partial_3_5 NO_LOAD 1814#define vload_partial_3_6 NO_LOAD 1815#define vload_partial_3_7 NO_LOAD 1816#define vload_partial_3_8 NO_LOAD 1817#define vload_partial_3_9 NO_LOAD 1818#define vload_partial_3_10 NO_LOAD 1819#define vload_partial_3_11 NO_LOAD 1820#define vload_partial_3_12 NO_LOAD 1821#define vload_partial_3_13 NO_LOAD 1822#define vload_partial_3_14 NO_LOAD 1823#define vload_partial_3_15 NO_LOAD 1824#define vload_partial_3_16 NO_LOAD 1825 1826#define vload_partial_4_0 NO_LOAD 1827#define vload_partial_4_1 vload_partial_1 1828#define vload_partial_4_2 vload_partial_2 1829#define vload_partial_4_3 vload_partial_3 1830#define vload_partial_4_4 vload_partial_4 1831#define vload_partial_4_5 NO_LOAD 1832#define vload_partial_4_6 NO_LOAD 1833#define vload_partial_4_7 NO_LOAD 1834#define vload_partial_4_8 NO_LOAD 1835#define vload_partial_4_9 NO_LOAD 1836#define vload_partial_4_10 NO_LOAD 1837#define vload_partial_4_11 NO_LOAD 1838#define vload_partial_4_12 NO_LOAD 1839#define vload_partial_4_13 NO_LOAD 1840#define vload_partial_4_14 NO_LOAD 1841#define vload_partial_4_15 NO_LOAD 1842#define vload_partial_4_16 NO_LOAD 1843 1844#define vload_partial_8_0 NO_LOAD 1845#define vload_partial_8_1 vload_partial_1 1846#define vload_partial_8_2 vload_partial_2 1847#define vload_partial_8_3 vload_partial_3 1848#define vload_partial_8_4 vload_partial_4 1849#define vload_partial_8_5 vload_partial_5 1850#define vload_partial_8_6 vload_partial_6 1851#define vload_partial_8_7 vload_partial_7 1852#define vload_partial_8_8 vload_partial_8 1853#define vload_partial_8_9 NO_LOAD 1854#define vload_partial_8_10 NO_LOAD 1855#define vload_partial_8_11 NO_LOAD 1856#define vload_partial_8_12 NO_LOAD 1857#define vload_partial_8_13 NO_LOAD 1858#define vload_partial_8_14 NO_LOAD 1859#define vload_partial_8_15 NO_LOAD 1860#define vload_partial_8_16 NO_LOAD 1861 1862#define vload_partial_16_0 NO_LOAD 1863#define vload_partial_16_1 vload_partial_1 1864#define vload_partial_16_2 vload_partial_2 1865#define vload_partial_16_3 vload_partial_3 1866#define vload_partial_16_4 vload_partial_4 1867#define vload_partial_16_5 vload_partial_5 1868#define vload_partial_16_6 vload_partial_6 1869#define vload_partial_16_7 vload_partial_7 1870#define vload_partial_16_8 vload_partial_8 1871#define vload_partial_16_9 vload_partial_9 1872#define vload_partial_16_10 vload_partial_10 1873#define vload_partial_16_11 vload_partial_11 
1874#define vload_partial_16_12 vload_partial_12 1875#define vload_partial_16_13 vload_partial_13 1876#define vload_partial_16_14 vload_partial_14 1877#define vload_partial_16_15 vload_partial_15 1878#define vload_partial_16_16 vload_partial_16 1879 1880 1881#define vload_partial_1(DATA, OFFSET, PTR) \ 1882 DATA.s0 = vload1(OFFSET, PTR); 1883 1884#define vload_partial_2(DATA, OFFSET, PTR) \ 1885 DATA.s01 = vload2(OFFSET, PTR); 1886 1887#define vload_partial_3(DATA, OFFSET, PTR) \ 1888 DATA.s012 = vload3(OFFSET, PTR); 1889 1890#define vload_partial_4(DATA, OFFSET, PTR) \ 1891 DATA.s0123 = vload4(OFFSET, PTR); 1892 1893#define vload_partial_5(DATA, OFFSET, PTR) \ 1894 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1895 DATA.s4 = vload1(OFFSET, PTR + 4); 1896 1897#define vload_partial_6(DATA, OFFSET, PTR) \ 1898 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1899 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 1900 1901#define vload_partial_7(DATA, OFFSET, PTR) \ 1902 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1903 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 1904 1905#define vload_partial_8(DATA, OFFSET, PTR) \ 1906 DATA.s01234567 = vload8(OFFSET, PTR); 1907 1908#define vload_partial_9(DATA, OFFSET, PTR) \ 1909 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1910 DATA.s8 = vload1(OFFSET, PTR + 8); 1911 1912#define vload_partial_10(DATA, OFFSET, PTR) \ 1913 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1914 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 1915 1916#define vload_partial_11(DATA, OFFSET, PTR) \ 1917 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1918 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 1919 1920#define vload_partial_12(DATA, OFFSET, PTR) \ 1921 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1922 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 1923 1924#define vload_partial_13(DATA, OFFSET, PTR) \ 1925 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1926 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 1927 1928#define vload_partial_14(DATA, OFFSET, PTR) \ 1929 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1930 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 1931 1932#define vload_partial_15(DATA, OFFSET, PTR) \ 1933 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1934 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 1935 1936#define vload_partial_16(DATA, OFFSET, PTR) \ 1937 DATA = vload16(OFFSET, PTR); 1938 1939 1940 1941#define PIXEL_UNIT4 1 1942#define PIXEL_UNIT8 2 1943#define PIXEL_UNIT16 4 1944 1945 1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 1948 1949 1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 1953 1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 1957#define 
read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 1958#endif 1959 1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1963 1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1968#endif 1969 1970 1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 1973 1974 1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 1977 1978#define VSTORE_STR(size) vstore##size 1979#define VSTORE(size) VSTORE_STR(size) 1980 1981#define float1 float 1982#define half1 half 1983#define char1 char 1984#define uchar1 uchar 1985#define short1 short 1986#define ushort1 ushort 1987#define int1 int 1988#define uint1 uint 1989#define long1 long 1990#define ulong1 ulong 1991#define double1 double 1992 1993#define vload1(OFFSET, PTR) *(OFFSET + PTR) 1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 1995 1996 1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 1999 2000#define NO_STORE(data, offs, ptr) \ 2001 { \ 2002 } 2003 2004 2005#define vstore_partial_1_0 NO_STORE 2006#define vstore_partial_1_1 vstore1 2007#define vstore_partial_1_2 NO_STORE 2008#define vstore_partial_1_3 NO_STORE 2009#define vstore_partial_1_4 NO_STORE 2010#define vstore_partial_1_5 NO_STORE 2011#define vstore_partial_1_6 NO_STORE 2012#define vstore_partial_1_7 NO_STORE 2013#define vstore_partial_1_8 NO_STORE 2014#define vstore_partial_1_9 NO_STORE 2015#define vstore_partial_1_10 NO_STORE 2016#define vstore_partial_1_11 NO_STORE 2017#define vstore_partial_1_12 NO_STORE 2018#define vstore_partial_1_13 NO_STORE 2019#define vstore_partial_1_14 NO_STORE 2020#define vstore_partial_1_15 NO_STORE 2021#define vstore_partial_1_16 NO_STORE 2022 2023#define 
vstore_partial_2_0 NO_STORE 2024#define vstore_partial_2_1 vstore_partial_1 2025#define vstore_partial_2_2 vstore_partial_2 2026#define vstore_partial_2_3 NO_STORE 2027#define vstore_partial_2_4 NO_STORE 2028#define vstore_partial_2_5 NO_STORE 2029#define vstore_partial_2_6 NO_STORE 2030#define vstore_partial_2_7 NO_STORE 2031#define vstore_partial_2_8 NO_STORE 2032#define vstore_partial_2_9 NO_STORE 2033#define vstore_partial_2_10 NO_STORE 2034#define vstore_partial_2_11 NO_STORE 2035#define vstore_partial_2_12 NO_STORE 2036#define vstore_partial_2_13 NO_STORE 2037#define vstore_partial_2_14 NO_STORE 2038#define vstore_partial_2_15 NO_STORE 2039#define vstore_partial_2_16 NO_STORE 2040 2041#define vstore_partial_3_0 NO_STORE 2042#define vstore_partial_3_1 vstore_partial_1 2043#define vstore_partial_3_2 vstore_partial_2 2044#define vstore_partial_3_3 vstore_partial_3 2045#define vstore_partial_3_4 NO_STORE 2046#define vstore_partial_3_5 NO_STORE 2047#define vstore_partial_3_6 NO_STORE 2048#define vstore_partial_3_7 NO_STORE 2049#define vstore_partial_3_8 NO_STORE 2050#define vstore_partial_3_9 NO_STORE 2051#define vstore_partial_3_10 NO_STORE 2052#define vstore_partial_3_11 NO_STORE 2053#define vstore_partial_3_12 NO_STORE 2054#define vstore_partial_3_13 NO_STORE 2055#define vstore_partial_3_14 NO_STORE 2056#define vstore_partial_3_15 NO_STORE 2057#define vstore_partial_3_16 NO_STORE 2058 2059#define vstore_partial_4_0 NO_STORE 2060#define vstore_partial_4_1 vstore_partial_1 2061#define vstore_partial_4_2 vstore_partial_2 2062#define vstore_partial_4_3 vstore_partial_3 2063#define vstore_partial_4_4 vstore_partial_4 2064#define vstore_partial_4_5 NO_STORE 2065#define vstore_partial_4_6 NO_STORE 2066#define vstore_partial_4_7 NO_STORE 2067#define vstore_partial_4_8 NO_STORE 2068#define vstore_partial_4_9 NO_STORE 2069#define vstore_partial_4_10 NO_STORE 2070#define vstore_partial_4_11 NO_STORE 2071#define vstore_partial_4_12 NO_STORE 2072#define vstore_partial_4_13 NO_STORE 2073#define vstore_partial_4_14 NO_STORE 2074#define vstore_partial_4_15 NO_STORE 2075#define vstore_partial_4_16 NO_STORE 2076 2077#define vstore_partial_8_0 NO_STORE 2078#define vstore_partial_8_1 vstore_partial_1 2079#define vstore_partial_8_2 vstore_partial_2 2080#define vstore_partial_8_3 vstore_partial_3 2081#define vstore_partial_8_4 vstore_partial_4 2082#define vstore_partial_8_5 vstore_partial_5 2083#define vstore_partial_8_6 vstore_partial_6 2084#define vstore_partial_8_7 vstore_partial_7 2085#define vstore_partial_8_8 vstore_partial_8 2086#define vstore_partial_8_9 NO_STORE 2087#define vstore_partial_8_10 NO_STORE 2088#define vstore_partial_8_11 NO_STORE 2089#define vstore_partial_8_12 NO_STORE 2090#define vstore_partial_8_13 NO_STORE 2091#define vstore_partial_8_14 NO_STORE 2092#define vstore_partial_8_15 NO_STORE 2093#define vstore_partial_8_16 NO_STORE 2094 2095#define vstore_partial_16_0 NO_STORE 2096#define vstore_partial_16_1 vstore_partial_1 2097#define vstore_partial_16_2 vstore_partial_2 2098#define vstore_partial_16_3 vstore_partial_3 2099#define vstore_partial_16_4 vstore_partial_4 2100#define vstore_partial_16_5 vstore_partial_5 2101#define vstore_partial_16_6 vstore_partial_6 2102#define vstore_partial_16_7 vstore_partial_7 2103#define vstore_partial_16_8 vstore_partial_8 2104#define vstore_partial_16_9 vstore_partial_9 2105#define vstore_partial_16_10 vstore_partial_10 2106#define vstore_partial_16_11 vstore_partial_11 2107#define vstore_partial_16_12 vstore_partial_12 2108#define 
vstore_partial_16_13 vstore_partial_13 2109#define vstore_partial_16_14 vstore_partial_14 2110#define vstore_partial_16_15 vstore_partial_15 2111#define vstore_partial_16_16 vstore_partial_16 2112 2113 2114#define vstore_partial_1(DATA, OFFSET, PTR) \ 2115 vstore1(DATA.s0, OFFSET, PTR); 2116 2117#define vstore_partial_2(DATA, OFFSET, PTR) \ 2118 vstore2(DATA.s01, OFFSET, PTR); 2119 2120#define vstore_partial_3(DATA, OFFSET, PTR) \ 2121 vstore3(DATA.s012, OFFSET, PTR); 2122 2123#define vstore_partial_4(DATA, OFFSET, PTR) \ 2124 vstore4(DATA.s0123, OFFSET, PTR); 2125 2126#define vstore_partial_5(DATA, OFFSET, PTR) \ 2127 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2128 vstore1(DATA.s4, OFFSET, PTR + 4); 2129 2130#define vstore_partial_6(DATA, OFFSET, PTR) \ 2131 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2132 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 2133 2134#define vstore_partial_7(DATA, OFFSET, PTR) \ 2135 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2136 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 2137 2138#define vstore_partial_8(DATA, OFFSET, PTR) \ 2139 vstore8(DATA.s01234567, OFFSET, PTR); 2140 2141#define vstore_partial_9(DATA, OFFSET, PTR) \ 2142 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2143 vstore1(DATA.s8, OFFSET, PTR + 8); 2144 2145#define vstore_partial_10(DATA, OFFSET, PTR) \ 2146 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2147 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 2148 2149#define vstore_partial_11(DATA, OFFSET, PTR) \ 2150 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2151 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 2152 2153#define vstore_partial_12(DATA, OFFSET, PTR) \ 2154 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2155 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 2156 2157#define vstore_partial_13(DATA, OFFSET, PTR) \ 2158 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2159 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 2160 2161#define vstore_partial_14(DATA, OFFSET, PTR) \ 2162 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2163 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 2164 2165#define vstore_partial_15(DATA, OFFSET, PTR) \ 2166 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2167 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 2168 2169#define vstore_partial_16(DATA, OFFSET, PTR) \ 2170 vstore16(DATA, OFFSET, PTR); 2171 2172 2173 2174 2175 2176#define convert_float_sat convert_float 2177#define convert_float1_sat convert_float 2178#define convert_float2_sat convert_float2 2179#define convert_float3_sat convert_float3 2180#define convert_float4_sat convert_float4 2181#define convert_float8_sat convert_float8 2182#define convert_float16_sat convert_float16 2183#define convert_half_sat convert_float 2184#define convert_half1_sat convert_half 2185#define convert_half2_sat convert_half2 2186#define convert_half3_sat convert_half3 2187#define convert_half4_sat convert_half4 2188#define convert_half8_sat convert_half8 2189#define convert_half16_sat convert_half16 2190 2191#define convert_float1 convert_float 2192#define convert_half1 convert_half 2193#define convert_char1 convert_char 2194#define convert_uchar1 convert_uchar 2195#define convert_short1 convert_short 2196#define convert_ushort1 convert_ushort 2197#define convert_int1 convert_int 2198#define convert_uint1 convert_uint 2199#define convert_long1 convert_long 2200#define convert_ulong1 convert_ulong 2201#define convert_double1 convert_double 2202 2203#define convert_char1_sat convert_char_sat 2204#define convert_uchar1_sat convert_uchar_sat 
2205#define convert_uchar2_sat convert_uchar2_sat 2206#define convert_uchar3_sat convert_uchar3_sat 2207#define convert_uchar4_sat convert_uchar4_sat 2208#define convert_uchar8_sat convert_uchar8_sat 2209#define convert_uchar16_sat convert_uchar16_sat 2210#define convert_short1_sat convert_short_sat 2211#define convert_ushort1_sat convert_ushort_sat 2212#define convert_int1_sat convert_int_sat 2213#define convert_uint1_sat convert_uint_sat 2214#define convert_long1_sat convert_long_sat 2215#define convert_ulong1_sat convert_ulong_sat 2216#define convert_double1_sat convert_double_sat 2217 2218#define VEC_DATA_TYPE_STR(type, size) type##size 2219#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 2220 2221#define CONVERT_STR(x, type) (convert_##type((x))) 2222#define CONVERT(x, type) CONVERT_STR(x, type) 2223 2224#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 2225#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 2226 2227#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 2228#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 2229 2230#define select_vec_dt_uchar(size) uchar##size 2231#define select_vec_dt_char(size) char##size 2232#define select_vec_dt_ushort(size) ushort##size 2233#define select_vec_dt_short(size) short##size 2234#define select_vec_dt_half(size) short##size 2235#define select_vec_dt_uint(size) uint##size 2236#define select_vec_dt_int(size) int##size 2237#define select_vec_dt_float(size) int##size 2238#define select_vec_dt_ulong(size) ulong##size 2239#define select_vec_dt_long(size) long##size 2240 2241#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 2242#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 2243#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 2244 2245#define signed_int_vec_dt_uchar(size) char##size 2246#define signed_int_vec_dt_char(size) char##size 2247#define signed_int_vec_dt_ushort(size) short##size 2248#define signed_int_vec_dt_short(size) short##size 2249#define signed_int_vec_dt_half(size) short##size 2250#define signed_int_vec_dt_uint(size) int##size 2251#define signed_int_vec_dt_int(size) int##size 2252#define signed_int_vec_dt_float(size) int##size 2253#define signed_int_vec_dt_ulong(size) long##size 2254#define signed_int_vec_dt_long(size) long##size 2255 2256#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 2257#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 2258#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 2259 2260#define sum_reduce_1(x) (x) 2261#define sum_reduce_2(x) ((x).s0) + ((x).s1) 2262#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 2263#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 2264#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 2265#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 2266 2267#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 2268#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 2269 2270#define prod_reduce_1(x) (x) 2271#define prod_reduce_2(x) ((x).s0) * ((x).s1) 2272#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 2273#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 2274#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 2275#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 2276 
2277#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 2278#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 2279 2280#define max_reduce_1(x) (x) 2281#define max_reduce_2(x) max(((x).s0), ((x).s1)) 2282#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 2283#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 2284#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 2285#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 2286 2287#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 2288#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 2289 2290#define VECTOR_DECLARATION(name) \ 2291 __global uchar *name##_ptr, \ 2292 uint name##_stride_x, \ 2293 uint name##_step_x, \ 2294 uint name##_offset_first_element_in_bytes 2295 2296#define IMAGE_DECLARATION(name) \ 2297 __global uchar *name##_ptr, \ 2298 uint name##_stride_x, \ 2299 uint name##_step_x, \ 2300 uint name##_stride_y, \ 2301 uint name##_step_y, \ 2302 uint name##_offset_first_element_in_bytes 2303 2304#define TENSOR3D_DECLARATION(name) \ 2305 __global uchar *name##_ptr, \ 2306 uint name##_stride_x, \ 2307 uint name##_step_x, \ 2308 uint name##_stride_y, \ 2309 uint name##_step_y, \ 2310 uint name##_stride_z, \ 2311 uint name##_step_z, \ 2312 uint name##_offset_first_element_in_bytes 2313 2314#define TENSOR4D_DECLARATION(name) \ 2315 __global uchar *name##_ptr, \ 2316 uint name##_stride_x, \ 2317 uint name##_step_x, \ 2318 uint name##_stride_y, \ 2319 uint name##_step_y, \ 2320 uint name##_stride_z, \ 2321 uint name##_step_z, \ 2322 uint name##_stride_w, \ 2323 uint name##_step_w, \ 2324 uint name##_offset_first_element_in_bytes 2325 2326#define TENSOR5D_DECLARATION(name) \ 2327 __global uchar *name##_ptr, \ 2328 uint name##_stride_x, \ 2329 uint name##_step_x, \ 2330 uint name##_stride_y, \ 2331 uint name##_step_y, \ 2332 uint name##_stride_z, \ 2333 uint name##_step_z, \ 2334 uint name##_stride_w, \ 2335 uint name##_step_w, \ 2336 uint name##_stride_v, \ 2337 uint name##_step_v, \ 2338 uint name##_offset_first_element_in_bytes 2339 2340#define CONVERT_TO_VECTOR_STRUCT(name) \ 2341 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 2342 2343#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 2344 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 2345 2346#define CONVERT_TO_IMAGE_STRUCT(name) \ 2347 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 2348 2349#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 2350 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 2351 2352#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2353 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2354 2355#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 2356 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 2357 2358#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2359 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2360 
2361#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 2362 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2363 name##_stride_z, name##_step_z) 2364 2365#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 2366 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 2367 2368#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 2369 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2370 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 2371 2372#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 2373 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 2374 2375#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 2376 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2377 name##_stride_z, name##_step_z) 2378 2379 2380typedef struct Vector 2381{ 2382 __global uchar *ptr; 2383 int offset_first_element_in_bytes; 2384 int stride_x; 2385} Vector; 2386 2387 2388typedef struct Image 2389{ 2390 __global uchar *ptr; 2391 int offset_first_element_in_bytes; 2392 int stride_x; 2393 int stride_y; 2394} Image; 2395 2396 2397typedef struct Tensor3D 2398{ 2399 __global uchar *ptr; 2400 int offset_first_element_in_bytes; 2401 int stride_x; 2402 int stride_y; 2403 int stride_z; 2404} Tensor3D; 2405 2406 2407typedef struct Tensor4D 2408{ 2409 __global uchar *ptr; 2410 int offset_first_element_in_bytes; 2411 int stride_x; 2412 int stride_y; 2413 int stride_z; 2414 int stride_w; 2415} Tensor4D; 2416 2417 2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 2419{ 2420 Vector vector = 2421 { 2422 .ptr = ptr, 2423 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2424 .stride_x = stride_x, 2425 }; 2426 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 2427 return vector; 2428} 2429 2430 2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 2432{ 2433 Image img = 2434 { 2435 .ptr = ptr, 2436 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2437 .stride_x = stride_x, 2438 .stride_y = stride_y 2439 }; 2440 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 2441 return img; 2442} 2443 2444 2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2446{ 2447 Image img = 2448 { 2449 .ptr = ptr, 2450 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2451 .stride_x = stride_x, 2452 .stride_y = stride_y 2453 }; 2454 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2455 return img; 2456} 2457 2458 2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2460{ 2461 Tensor3D tensor = 
2462 { 2463 .ptr = ptr, 2464 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2465 .stride_x = stride_x, 2466 .stride_y = stride_y, 2467 .stride_z = stride_z 2468 }; 2469 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2470 return tensor; 2471} 2472 2473 2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2475{ 2476 Tensor3D tensor = 2477 { 2478 .ptr = ptr, 2479 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2480 .stride_x = stride_x, 2481 .stride_y = stride_y, 2482 .stride_z = stride_z 2483 }; 2484 return tensor; 2485} 2486 2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 2488 uint step_w, 2489 uint mod_size) 2490{ 2491 Tensor4D tensor = 2492 { 2493 .ptr = ptr, 2494 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2495 .stride_x = stride_x, 2496 .stride_y = stride_y, 2497 .stride_z = stride_z, 2498 .stride_w = stride_w 2499 }; 2500 2501 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 2502 return tensor; 2503} 2504 2505 2506inline __global const uchar *vector_offset(const Vector *vec, int x) 2507{ 2508 return vec->ptr + x * vec->stride_x; 2509} 2510 2511 2512inline __global uchar *offset(const Image *img, int x, int y) 2513{ 2514 return img->ptr + x * img->stride_x + y * img->stride_y; 2515} 2516 2517 2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 2519{ 2520 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 2521} 2522 2523 2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 2525{ 2526 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 2527} 2528 2529 2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 2531{ 2532 uint num_elements = width * height; 2533 2534 const uint z = index / num_elements; 2535 2536 index %= num_elements; 2537 2538 const uint y = index / width; 2539 2540 index %= width; 2541 2542 const uint x = index; 2543 2544 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 2545} 2546 2547#endif 2548 2549 2550#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) 2551#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) 2552 2553 2554#define scalar_access_0_1(x) ((x).s0) 2555#define scalar_access_0_2(x) ((x).s01) 2556#define scalar_access_0_3(x) ((x).s012) 2557#define scalar_access_0_4(x) ((x).s0123) 2558#define scalar_access_0_8(x) ((x).s01234567) 2559#define scalar_access_0_16(x) ((x).s0123456789ABCDEF) 2560 2561 2562#define scalar_access_1_1(x) ((x).s1) 2563#define scalar_access_1_2(x) ((x).s12) 2564#define scalar_access_1_3(x) ((x).s123) 2565#define scalar_access_1_4(x) ((x).s1234) 2566#define scalar_access_1_8(x) ((x).s12345678) 2567 2568 2569#define scalar_access_2_1(x) ((x).s2) 2570#define scalar_access_2_2(x) ((x).s23) 2571#define 
scalar_access_2_3(x) ((x).s234) 2572#define scalar_access_2_4(x) ((x).s2345) 2573#define scalar_access_2_8(x) ((x).s23456789) 2574 2575 2576#define scalar_access_3_1(x) ((x).s3) 2577#define scalar_access_3_2(x) ((x).s34) 2578#define scalar_access_3_3(x) ((x).s345) 2579#define scalar_access_3_4(x) ((x).s3456) 2580#define scalar_access_3_8(x) ((x).s3456789A) 2581 2582 2583#define scalar_access_4_1(x) ((x).s4) 2584#define scalar_access_4_2(x) ((x).s45) 2585#define scalar_access_4_3(x) ((x).s456) 2586#define scalar_access_4_4(x) ((x).s4567) 2587#define scalar_access_4_8(x) ((x).s456789AB) 2588 2589 2590#define scalar_access_8_1(x) ((x).s8) 2591#define scalar_access_8_2(x) ((x).s89) 2592#define scalar_access_8_3(x) ((x).s89A) 2593#define scalar_access_8_4(x) ((x).s89AB) 2594#define scalar_access_8_8(x) ((x).s89ABCDEF) 2595 2596 2597#define scalar_access_12_1(x) ((x).sC) 2598#define scalar_access_12_2(x) ((x).sCD) 2599#define scalar_access_12_3(x) ((x).sCDE) 2600#define scalar_access_12_4(x) ((x).sCDEF) 2601 2602 2603#define scalar_access_16_1(x) ((x).sF) 2604 2605 2606#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2607 ({}) 2608 2609#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2610 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2611 2612#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2613 LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2614 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2615 2616#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2617 LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2618 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2619 2620#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2621 LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2622 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2623 2624#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2625 LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2626 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2627 2628#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2629 LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2630 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2631 2632#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2633 LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2634 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2635 2636#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2637 LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2638 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2639 2640#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2641 LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, 
STRIDE_Y, Z) \ 2642 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2643 2644#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2645 LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2646 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2647 2648#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2649 LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2650 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2651 2652#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2653 LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2654 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2655 2656#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2657 LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2658 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2659 2660#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2661 LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2662 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2663 2664#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2665 LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2666 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2667 2668#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2669 LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2670 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2671 2672 2673 2674#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 2675#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 2676 2677 2678 2679#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2680 ({}) 2681 2682#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2683 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2684 2685#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2686 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2687 2688#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2689 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2690 2691#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2692 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2693 2694#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2695 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2696 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 2697 2698#define LOAD_TENSOR_M0X6(M0, N0, 
DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);


// LOAD_TENSOR_M0XN0 dispatches on N0: it resolves to the matching LOAD_TENSOR_M0X<N0> variant above.
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)


#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME,
PTR, OFFSET, STRIDE_Y, Z) \ 2755 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2756 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2757 2758#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2759 LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2760 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2761 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2762 2763#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2764 LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2765 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2766 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2767 2768#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2769 LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2770 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2771 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2772 2773#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2774 LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2775 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2776 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2777 2778#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2779 LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2780 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2781 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 2782 2783#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2784 LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2785 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2786 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2787 2788#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2789 LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2790 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2791 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2792 2793#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2794 LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2795 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2796 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2797 2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2799 LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2800 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2801 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2802 2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2804 LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2805 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2806 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2807 2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2809 LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2810 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2811 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2812 2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2814 LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2815 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2816 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2817 2818#define LOAD_ROW_15(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2819 LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2820 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2821 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2822 2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2824 LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2825 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2826 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2827 2828 2829 2830 2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2833 2834 2835 2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2837 VLOAD_PARTIAL(N0, LOAD_N0) \ 2838 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 2839 2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2841 LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2842 VLOAD_PARTIAL(N0, LOAD_N0) \ 2843 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2844 2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2846 LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2847 VLOAD_PARTIAL(N0, LOAD_N0) \ 2848 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2849 2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2851 LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2852 VLOAD_PARTIAL(N0, LOAD_N0) \ 2853 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2854 2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2856 LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2857 VLOAD_PARTIAL(N0, LOAD_N0) \ 2858 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2859 2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2861 LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2862 VLOAD_PARTIAL(N0, LOAD_N0) \ 2863 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2864 2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2866 LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2867 VLOAD_PARTIAL(N0, LOAD_N0) \ 2868 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 2869 2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2871 LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2872 VLOAD_PARTIAL(N0, LOAD_N0) \ 2873 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2874 2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2876 LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2877 VLOAD_PARTIAL(N0, LOAD_N0) \ 2878 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2879 2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, 
OFFSET, STRIDE_Y, Z) \ 2881 LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2882 VLOAD_PARTIAL(N0, LOAD_N0) \ 2883 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2884 2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2886 LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2887 VLOAD_PARTIAL(N0, LOAD_N0) \ 2888 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2889 2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2891 LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2892 VLOAD_PARTIAL(N0, LOAD_N0) \ 2893 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2894 2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2896 LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2897 VLOAD_PARTIAL(N0, LOAD_N0) \ 2898 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2899 2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2901 LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2902 VLOAD_PARTIAL(N0, LOAD_N0) \ 2903 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2904 2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2906 LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2907 VLOAD_PARTIAL(N0, LOAD_N0) \ 2908 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2909 2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2911 LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2912 VLOAD_PARTIAL(N0, LOAD_N0) \ 2913 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2914 2915 2916 2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2919 2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2921 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 2922 { \ 2923 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2924 } \ 2925 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 2926 { \ 2927 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2928 } \ 2929 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 2930 { \ 2931 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2932 } \ 2933 else \ 2934 { \ 2935 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2936 } 2937 2938#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 2939 if(!(PARTIAL_COND_X)) \ 2940 { \ 2941 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2942 } \ 2943 else \ 2944 { \ 2945 
LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2946 } 2947 2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 2949 if(!(PARTIAL_COND_Y)) \ 2950 { \ 2951 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2952 } \ 2953 else \ 2954 { \ 2955 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2956 } 2957 2958 2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2960 2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2962 LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2963 2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 2965 2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2967 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2968 LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 2969 2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 2971 2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2973 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2974 LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 2975 2976#else 2977 2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2979 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2980 LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 2981 2982#endif 2983 2984 2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2986 BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) 2987 2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2989 LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2990 BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) 2991 2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2993 LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2994 BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) 2995 2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2997 LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2998 BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) 2999 3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3001 LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3002 BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * 
X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) 3003 3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3005 LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3006 BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW)) 3007 3008#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3009 LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3010 BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW)) 3011 3012#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3013 LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3014 BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW)) 3015 3016#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3017 LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3018 BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW)) 3019 3020#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3021 LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3022 BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW)) 3023 3024#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3025 LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3026 BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW)) 3027 3028#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3029 LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3030 BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW)) 3031 3032#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3033 LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3034 BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW)) 3035 3036#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3037 LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3038 BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW)) 3039 3040#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3041 LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3042 BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW)) 3043 3044#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3045 LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3046 BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * 
X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW)) 3047 3048 3049 3050#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 3051#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 3052 3053 3054 3055#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3056 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3057 BASENAME##0; \ 3058 if(Y_MASK##0 != 0) \ 3059 BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ 3060 else \ 3061 BASENAME##0 = 0; 3062 3063#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3064 LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3065 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3066 BASENAME##1; \ 3067 if(Y_MASK##1 != 0) \ 3068 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ 3069 else \ 3070 BASENAME##1 = 0; 3071 3072#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3073 LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3074 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3075 BASENAME##2; \ 3076 if(Y_MASK##2 != 0) \ 3077 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ 3078 else \ 3079 BASENAME##2 = 0; 3080 3081#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3082 LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3083 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3084 BASENAME##3; \ 3085 if(Y_MASK##3 != 0) \ 3086 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ 3087 else \ 3088 BASENAME##3 = 0; 3089 3090#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3091 LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3092 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3093 BASENAME##4; \ 3094 if(Y_MASK##4 != 0) \ 3095 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ 3096 else \ 3097 BASENAME##4 = 0; 3098 3099#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3100 LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3101 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3102 BASENAME##5; \ 3103 if(Y_MASK##5 != 0) \ 3104 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ 3105 else \ 3106 BASENAME##5 = 0; 3107 3108#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3109 LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3110 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3111 BASENAME##6; \ 3112 if(Y_MASK##6 != 0) \ 3113 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ 3114 else \ 3115 BASENAME##6 = 0; 3116 3117#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3118 LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3119 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3120 BASENAME##7; \ 3121 if(Y_MASK##7 != 0) \ 3122 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ 3123 else \ 3124 BASENAME##7 = 0; 3125 3126#define LOAD_ROW_INDIRECT_9(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3127 LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3128 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3129 BASENAME##8; \ 3130 if(Y_MASK##8 != 0) \ 3131 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ 3132 else \ 3133 BASENAME##8 = 0; 3134 3135#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3136 LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3137 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3138 BASENAME##9; \ 3139 if(Y_MASK##9 != 0) \ 3140 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ 3141 else \ 3142 BASENAME##9 = 0; 3143 3144#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3145 LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3146 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3147 BASENAME##A; \ 3148 if(Y_MASK##A != 0) \ 3149 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ 3150 else \ 3151 BASENAME##A = 0; 3152 3153#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3154 LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3155 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3156 BASENAME##B; \ 3157 if(Y_MASK##B != 0) \ 3158 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ 3159 else \ 3160 BASENAME##B = 0; 3161 3162#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3163 LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3164 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3165 BASENAME##C; \ 3166 if(Y_MASK##C != 0) \ 3167 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ 3168 else \ 3169 BASENAME##C = 0; 3170 3171#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3172 LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3173 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3174 BASENAME##D; \ 3175 if(Y_MASK##D != 0) \ 3176 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ 3177 else \ 3178 BASENAME##D = 0; 3179 3180#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3181 LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3182 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3183 BASENAME##E; \ 3184 if(Y_MASK##E != 0) \ 3185 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ 3186 else \ 3187 BASENAME##E = 0; 3188 3189#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3190 LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3191 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3192 BASENAME##F; \ 3193 if(Y_MASK##F != 0) \ 3194 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ 3195 else \ 3196 BASENAME##F = 0; 3197 3198 3199#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 3200#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 3201 3202 3203#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y) \ 3204 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3205 BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); 3206 3207#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3208 LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3209 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3210 BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); 3211 3212#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3213 LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3214 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3215 BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); 3216 3217#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3218 LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3219 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3220 BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 3221 3222#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3223 LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3224 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3225 BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 3226 3227#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3228 LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3229 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3230 BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 3231 3232#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3233 LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3234 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3235 BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 3236 3237#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3238 LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3239 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3240 BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 3241 3242#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3243 LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3244 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3245 BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 3246 3247#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3248 LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3249 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3250 BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 3251 3252#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3253 LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3254 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3255 BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 3256 3257#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3258 LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3259 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3260 BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 3261 3262#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3263 LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3264 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3265 BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 3266 3267#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3268 LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3269 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3270 BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); 
3271 3272#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3273 LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3274 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3275 BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 3276 3277#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3278 LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3279 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3280 BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 3281 3282 3283 3284 3285#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3286#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3287 3288 3289 3290#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3291 Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3292 Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 3293 Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 3294 3295#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3296 CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3297 Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3298 Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 3299 Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 3300 3301#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3302 CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3303 Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3304 Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 3305 Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 3306 3307#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3308 CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3309 Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3310 Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 3311 Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 3312 3313#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3314 CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3315 Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3316 Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 3317 Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 3318 3319#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3320 CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3321 Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3322 Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 3323 Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 3324 3325#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3326 CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3327 Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3328 Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 3329 Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 3330 3331#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, STRIDE_Y) \ 3332 CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3333 Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3334 Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 3335 Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 3336 3337 3338 3339 3340#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3341#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3342 3343 3344 3345#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3346 BASENAME##0 *= (DATA_TYPE)SCALE; 3347 3348#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3349 SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3350 BASENAME##1 *= (DATA_TYPE)SCALE; 3351 3352#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3353 SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3354 BASENAME##2 *= (DATA_TYPE)SCALE; 3355 3356#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3357 SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3358 BASENAME##3 *= (DATA_TYPE)SCALE; 3359 3360#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3361 SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3362 BASENAME##4 *= (DATA_TYPE)SCALE; 3363 3364#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3365 SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3366 BASENAME##5 *= (DATA_TYPE)SCALE; 3367 3368#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3369 SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3370 BASENAME##6 *= (DATA_TYPE)SCALE; 3371 3372#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3373 SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3374 BASENAME##7 *= (DATA_TYPE)SCALE; 3375 3376#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3377 SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3378 BASENAME##8 *= (DATA_TYPE)SCALE; 3379 3380#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3381 SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3382 BASENAME##9 *= (DATA_TYPE)SCALE; 3383 3384#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3385 SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3386 BASENAME##A *= (DATA_TYPE)SCALE; 3387 3388#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3389 SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3390 BASENAME##B *= (DATA_TYPE)SCALE; 3391 3392#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3393 SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3394 BASENAME##C *= (DATA_TYPE)SCALE; 3395 3396#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3397 SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3398 BASENAME##D *= (DATA_TYPE)SCALE; 3399 3400#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3401 SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3402 BASENAME##E *= (DATA_TYPE)SCALE; 3403 3404#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 3405 SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3406 BASENAME##F *= (DATA_TYPE)SCALE; 3407 3408 3409 3410#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 3411#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 3412 3413 3414 3415#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 3416 TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 3417#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 3418 VEC_DATA_TYPE(TYPE, 2) \ 3419 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 3420#define 
COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 3421 VEC_DATA_TYPE(TYPE, 3) \ 3422 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 3423#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 3424 VEC_DATA_TYPE(TYPE, 4) \ 3425 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 3426#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 3427 VEC_DATA_TYPE(TYPE, 8) \ 3428 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 3429#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 3430 VEC_DATA_TYPE(TYPE, 16) \ 3431 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 3432 3433 3434 3435#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 3436 TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 3437#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 3438 VEC_DATA_TYPE(TYPE, 2) \ 3439 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 3440#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 3441 VEC_DATA_TYPE(TYPE, 3) \ 3442 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 3443#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 3444 VEC_DATA_TYPE(TYPE, 4) \ 3445 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 3446#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 3447 VEC_DATA_TYPE(TYPE, 8) \ 3448 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 3449#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 3450 VEC_DATA_TYPE(TYPE, 16) \ 3451 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 3452 3453 3454 3455#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 3456 COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 3457#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 3458 COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 3459 COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 3460#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 3461 TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 3462 COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 3463#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 3464 TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 3465 COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 3466#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 3467 TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 3468 COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 3469 COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 3470 COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \ 3471 COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 3472#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \ 3473 TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \ 3474 COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \ 3475 COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \ 3476 COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \ 3477 COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \ 3478 COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \ 3479 COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \ 3480 COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \ 
3481 COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE); 3482 3483 3484 3485 3486#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \ 3487 CONCAT(COLUMN_VECTOR, K0) \ 3488 (IDX_COL, BASENAME, BS, TYPE); 3489 3490 3491#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \ 3492 CONCAT(COLUMN_VECTOR_SCALAR, K0) \ 3493 (IDX_COL, BASENAME, BS, TYPE); 3494 3495 3496#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \ 3497 CONCAT(TRANSPOSE_K0X, N0) \ 3498 (K0, BASENAME, BS, TYPE); 3499 3500 3501#define ADD_ROW_1(BASENAME, BIAS) \ 3502 BASENAME##0 += BIAS##0; 3503 3504#define ADD_ROW_2(BASENAME, BIAS) \ 3505 ADD_ROW_1(BASENAME, BIAS) \ 3506 BASENAME##1 += BIAS##1; 3507 3508#define ADD_ROW_3(BASENAME, BIAS) \ 3509 ADD_ROW_2(BASENAME, BIAS) \ 3510 BASENAME##2 += BIAS##2; 3511 3512#define ADD_ROW_4(BASENAME, BIAS) \ 3513 ADD_ROW_3(BASENAME, BIAS) \ 3514 BASENAME##3 += BIAS##3; 3515 3516#define ADD_ROW_5(BASENAME, BIAS) \ 3517 ADD_ROW_4(BASENAME, BIAS) \ 3518 BASENAME##4 += BIAS##4; 3519 3520#define ADD_ROW_6(BASENAME, BIAS) \ 3521 ADD_ROW_5(BASENAME, BIAS) \ 3522 BASENAME##5 += BIAS##5; 3523 3524#define ADD_ROW_7(BASENAME, BIAS) \ 3525 ADD_ROW_6(BASENAME, BIAS) \ 3526 BASENAME##6 += BIAS##6; 3527 3528#define ADD_ROW_8(BASENAME, BIAS) \ 3529 ADD_ROW_7(BASENAME, BIAS) \ 3530 BASENAME##7 += BIAS##7; 3531 3532#define ADD_ROW_9(BASENAME, BIAS) \ 3533 ADD_ROW_8(BASENAME, BIAS) \ 3534 BASENAME##8 += BIAS##8; 3535 3536#define ADD_ROW_10(BASENAME, BIAS) \ 3537 ADD_ROW_9(BASENAME, BIAS) \ 3538 BASENAME##9 += BIAS##9; 3539 3540#define ADD_ROW_11(BASENAME, BIAS) \ 3541 ADD_ROW_10(BASENAME, BIAS) \ 3542 BASENAME##A += BIAS##A; 3543 3544#define ADD_ROW_12(BASENAME, BIAS) \ 3545 ADD_ROW_11(BASENAME, BIAS) \ 3546 BASENAME##B += BIAS##B; 3547 3548#define ADD_ROW_13(BASENAME, BIAS) \ 3549 ADD_ROW_12(BASENAME, BIAS) \ 3550 BASENAME##C += BIAS##C; 3551 3552#define ADD_ROW_14(BASENAME, BIAS) \ 3553 ADD_ROW_13(BASENAME, BIAS) \ 3554 BASENAME##D += BIAS##D; 3555 3556#define ADD_ROW_15(BASENAME, BIAS) \ 3557 ADD_ROW_14(BASENAME, BIAS) \ 3558 BASENAME##E += BIAS##E; 3559 3560#define ADD_ROW_16(BASENAME, BIAS) \ 3561 ADD_ROW_15(BASENAME, BIAS) \ 3562 BASENAME##F += BIAS##F; 3563 3564 3565 3566 3567#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) 3568#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) 3569 3570 3571 3572#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 3573 BASENAME##0 += BIAS; 3574 3575#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 3576 ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 3577 BASENAME##1 += BIAS; 3578 3579#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 3580 ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 3581 BASENAME##2 += BIAS; 3582 3583#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 3584 ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 3585 BASENAME##3 += BIAS; 3586 3587#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 3588 ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 3589 BASENAME##4 += BIAS; 3590 3591#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3592 ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 3593 BASENAME##5 += BIAS; 3594 3595#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3596 ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3597 BASENAME##6 += BIAS; 3598 3599#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3600 ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3601 BASENAME##7 += BIAS; 3602 3603#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3604 ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3605 BASENAME##8 += BIAS; 3606 3607#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3608 ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3609 BASENAME##9 += BIAS; 
3610 3611#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3612 ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3613 BASENAME##A += BIAS; 3614 3615#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3616 ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3617 BASENAME##B += BIAS; 3618 3619#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3620 ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3621 BASENAME##C += BIAS; 3622 3623#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3624 ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3625 BASENAME##D += BIAS; 3626 3627#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3628 ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3629 BASENAME##E += BIAS; 3630 3631#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 3632 ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3633 BASENAME##F += BIAS; 3634 3635 3636#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) 3637#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) 3638 3639 3640 3641#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3642 BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL); 3643 3644#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3645 ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3646 BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL); 3647 3648#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3649 ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3650 BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL); 3651 3652#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3653 ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3654 BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL); 3655 3656#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3657 ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3658 BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL); 3659 3660#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3661 ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3662 BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL); 3663 3664#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3665 ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3666 BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL); 3667 3668#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3669 ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3670 BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL); 3671 3672#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3673 ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3674 BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL); 3675 3676#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3677 ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, 
VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3678 BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL); 3679 3680#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3681 ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3682 BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL); 3683 3684#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3685 ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3686 BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL); 3687 3688#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3689 ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3690 BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL); 3691 3692#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3693 ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3694 BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL); 3695 3696#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3697 ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3698 BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL); 3699 3700#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3701 ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3702 BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL); 3703 3704 3705 3706#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 3707#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 3708 3709 3710 3711#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3712 VEC_DATA_TYPE(DATA_TYPE, N) \ 3713 BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); 3714 3715#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3716 CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3717 VEC_DATA_TYPE(DATA_TYPE, N) \ 3718 BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); 3719 3720#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3721 CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3722 VEC_DATA_TYPE(DATA_TYPE, N) \ 3723 BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); 3724 3725#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3726 CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3727 VEC_DATA_TYPE(DATA_TYPE, N) \ 3728 BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); 3729 3730#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3731 CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3732 VEC_DATA_TYPE(DATA_TYPE, N) \ 3733 BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); 3734 3735#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3736 CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3737 
VEC_DATA_TYPE(DATA_TYPE, N) \ 3738 BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); 3739 3740#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3741 CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3742 VEC_DATA_TYPE(DATA_TYPE, N) \ 3743 BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); 3744 3745#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3746 CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3747 VEC_DATA_TYPE(DATA_TYPE, N) \ 3748 BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); 3749 3750#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3751 CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3752 VEC_DATA_TYPE(DATA_TYPE, N) \ 3753 BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); 3754 3755#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3756 CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3757 VEC_DATA_TYPE(DATA_TYPE, N) \ 3758 BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); 3759 3760#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3761 CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3762 VEC_DATA_TYPE(DATA_TYPE, N) \ 3763 BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); 3764 3765#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3766 CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3767 VEC_DATA_TYPE(DATA_TYPE, N) \ 3768 BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); 3769 3770#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3771 CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3772 VEC_DATA_TYPE(DATA_TYPE, N) \ 3773 BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); 3774 3775#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3776 CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3777 VEC_DATA_TYPE(DATA_TYPE, N) \ 3778 BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); 3779 3780#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3781 CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3782 VEC_DATA_TYPE(DATA_TYPE, N) \ 3783 BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); 3784 3785#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3786 CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3787 VEC_DATA_TYPE(DATA_TYPE, N) \ 3788 BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); 3789 3790 3791 3792#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3793#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3794 3795 3796#ifndef ARM_COMPUTE_HELPER_H 3797#define ARM_COMPUTE_HELPER_H 3798 3799 3800 3801 3802#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3803 VSTORE(N0) \ 3804 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3805 3806#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3807 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3808 VSTORE(N0) \ 3809 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3810 3811#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3812 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3813 VSTORE(N0) \ 
3814 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3815 3816#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3817 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3818 VSTORE(N0) \ 3819 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3820 3821#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3822 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3823 VSTORE(N0) \ 3824 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3825 3826#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3827 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3828 VSTORE(N0) \ 3829 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3830 3831#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3832 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3833 VSTORE(N0) \ 3834 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3835 3836#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3837 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3838 VSTORE(N0) \ 3839 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3840 3841#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3842 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3843 VSTORE(N0) \ 3844 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3845 3846#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3847 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3848 VSTORE(N0) \ 3849 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3850 3851#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3852 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3853 VSTORE(N0) \ 3854 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3855 3856#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3857 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3858 VSTORE(N0) \ 3859 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3860 3861#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3862 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3863 VSTORE(N0) \ 3864 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3865 3866#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3867 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3868 VSTORE(N0) \ 3869 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3870 3871#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3872 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3873 VSTORE(N0) \ 3874 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3875 3876#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3877 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3878 VSTORE(N0) \ 3879 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3880 3881 3882 3883#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3884 VSTORE(N0) \ 3885 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3886 3887#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3888 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3889 VSTORE(N0) \ 3890 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 
* STRIDE_Y + Z##1)); 3891 3892#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3893 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3894 VSTORE(N0) \ 3895 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3896 3897#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3898 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3899 VSTORE(N0) \ 3900 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3901 3902#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3903 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3904 VSTORE(N0) \ 3905 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3906 3907#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3908 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3909 VSTORE(N0) \ 3910 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3911 3912#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3913 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3914 VSTORE(N0) \ 3915 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3916 3917#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3918 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3919 VSTORE(N0) \ 3920 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3921 3922#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3923 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3924 VSTORE(N0) \ 3925 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3926 3927#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 3928 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3929 VSTORE(N0) \ 3930 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3931 3932#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3933 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3934 VSTORE(N0) \ 3935 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3936 3937#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3938 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3939 VSTORE(N0) \ 3940 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3941 3942#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3943 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3944 VSTORE(N0) \ 3945 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3946 3947#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3948 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3949 VSTORE(N0) \ 3950 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3951 3952#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) \ 3953 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3954 VSTORE(N0) \ 3955 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3956 3957#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3958 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3959 VSTORE(N0) \ 3960 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3961 3962 3963 3964 3965#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3966#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3967 3968 3969 3970#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3971#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3972 3973 3974 3975#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3976 VSTORE_PARTIAL(N0, STORE_N0) \ 3977 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3978 3979#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3980 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3981 VSTORE_PARTIAL(N0, STORE_N0) \ 3982 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3983 3984#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3985 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3986 VSTORE_PARTIAL(N0, STORE_N0) \ 3987 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3988 3989#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3990 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3991 VSTORE_PARTIAL(N0, STORE_N0) \ 3992 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3993 3994#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3995 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3996 VSTORE_PARTIAL(N0, STORE_N0) \ 3997 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3998 3999#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4000 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4001 VSTORE_PARTIAL(N0, STORE_N0) \ 4002 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 4003 4004#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4005 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4006 VSTORE_PARTIAL(N0, STORE_N0) \ 4007 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 4008 4009#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4010 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4011 VSTORE_PARTIAL(N0, STORE_N0) \ 4012 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 4013 4014#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4015 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4016 VSTORE_PARTIAL(N0, STORE_N0) \ 4017 (BASENAME##8, 0, 
(__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 4018 4019#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4020 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4021 VSTORE_PARTIAL(N0, STORE_N0) \ 4022 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 4023 4024#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4025 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4026 VSTORE_PARTIAL(N0, STORE_N0) \ 4027 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 4028 4029#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4030 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4031 VSTORE_PARTIAL(N0, STORE_N0) \ 4032 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 4033 4034#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4035 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4036 VSTORE_PARTIAL(N0, STORE_N0) \ 4037 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 4038 4039#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4040 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4041 VSTORE_PARTIAL(N0, STORE_N0) \ 4042 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 4043 4044#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4045 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4046 VSTORE_PARTIAL(N0, STORE_N0) \ 4047 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 4048 4049#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4050 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4051 VSTORE_PARTIAL(N0, STORE_N0) \ 4052 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 4053 4054 4055 4056#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4057#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4058 4059#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4060 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 4061 { \ 4062 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4063 } \ 4064 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 4065 { \ 4066 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4067 } \ 4068 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 4069 { \ 4070 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4071 } \ 4072 else \ 4073 { \ 4074 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4075 } 4076 4077#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 4078 if(!(PARTIAL_COND_X)) \ 4079 { \ 4080 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4081 } \ 4082 else \ 4083 { \ 4084 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z); \ 4085 } 4086 4087#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 4088 if(!(PARTIAL_COND_Y)) \ 4089 { \ 4090 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4091 } \ 4092 else \ 4093 { \ 4094 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4095 } 4096 4097 4098#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 4099 4100 4101#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 4102 4103#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4104 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4105 4106#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 4107 4108#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4109 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 4110 4111#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 4112 4113#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4114 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 4115 4116#else 4117 4118#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4119 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 4120 4121#endif 4122 4123#endif 4124 4125 4126#if defined(PARTIAL_STORE_M0) 4127 4128#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4129 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 4130#else 4131#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4132 ((uint)(y * M0)) 4133#endif 4134 4135 4136 4137#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 4138 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 4139 4140 4141#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4142#pragma OPENCL EXTENSION cl_khr_fp16 : enable 4143#endif 4144 4145#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 4146#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 4147#endif 4148 4149#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 4150#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 4151#endif 4152 4153#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 4154#pragma OPENCL EXTENSION cl_arm_printf : enable 4155#endif 4156 4157#define GPU_ARCH_MIDGARD 0x100 4158#define GPU_ARCH_BIFROST 0x200 4159#define GPU_ARCH_VALHALL 0x300 4160 4161 4162#define CONCAT(a, b) a##b 4163 4164 4165#define EXPAND(x) x 4166 4167 4168#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 4169 4170 4171#define REV1(x) ((x)) 4172#define REV2(x) ((x).s10) 4173#define REV3(x) ((x).s210) 4174#define REV4(x) ((x).s3210) 4175#define REV8(x) ((x).s76543210) 4176#define REV16(x) ((x).sFEDCBA9876543210) 4177 4178 4179 4180#define REVERSE_STR(x, s) REV##s((x)) 4181#define REVERSE(x, s) REVERSE_STR(x, s) 4182 4183 4184 4185#define ROT1_0(x) ((x)) 
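// NOTE (editorial comment, not part of the original header): the ROT<s>_<n> swizzles in this block
// implement a right rotation of an <s>-component vector by <n> elements, and ROTATE(x, s, n)
// token-pastes to the matching ROT<s>_<n>(x). Illustrative expansion, assuming a float4 value v:
//     ROTATE(v, 4, 1) -> ROT4_1(v) -> ((v).s3012)   // i.e. (v3, v0, v1, v2)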
4186#define ROT1_1(x) ((x)) 4187 4188#define ROT2_0(x) ((x)) 4189#define ROT2_1(x) ((x).s10) 4190#define ROT2_2(x) ((x)) 4191 4192#define ROT3_0(x) ((x)) 4193#define ROT3_1(x) ((x).s201) 4194#define ROT3_2(x) ((x).s120) 4195#define ROT3_3(x) ((x)) 4196 4197#define ROT4_0(x) ((x)) 4198#define ROT4_1(x) ((x).s3012) 4199#define ROT4_2(x) ((x).s2301) 4200#define ROT4_3(x) ((x).s1230) 4201#define ROT4_4(x) ((x)) 4202 4203#define ROT8_0(x) ((x)) 4204#define ROT8_1(x) ((x).s70123456) 4205#define ROT8_2(x) ((x).s67012345) 4206#define ROT8_3(x) ((x).s56701234) 4207#define ROT8_4(x) ((x).s45670123) 4208#define ROT8_5(x) ((x).s34567012) 4209#define ROT8_6(x) ((x).s23456701) 4210#define ROT8_7(x) ((x).s12345670) 4211#define ROT8_8(x) ((x)) 4212 4213#define ROT16_0(x) ((x)) 4214#define ROT16_1(x) ((x).sF0123456789ABCDE) 4215#define ROT16_2(x) ((x).sEF0123456789ABCD) 4216#define ROT16_3(x) ((x).sDEF0123456789ABC) 4217#define ROT16_4(x) ((x).sCDEF0123456789AB) 4218#define ROT16_5(x) ((x).sBCDEF0123456789A) 4219#define ROT16_6(x) ((x).sABCDEF0123456789) 4220#define ROT16_7(x) ((x).s9ABCDEF012345678) 4221#define ROT16_8(x) ((x).s89ABCDEF01234567) 4222#define ROT16_9(x) ((x).s789ABCDEF0123456) 4223#define ROT16_10(x) ((x).s6789ABCDEF012345) 4224#define ROT16_11(x) ((x).s56789ABCDEF01234) 4225#define ROT16_12(x) ((x).s456789ABCDEF0123) 4226#define ROT16_13(x) ((x).s3456789ABCDEF012) 4227#define ROT16_14(x) ((x).s23456789ABCDEF01) 4228#define ROT16_15(x) ((x).s123456789ABCDEF0) 4229#define ROT16_16(x) ((x)) 4230 4231 4232 4233#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 4234#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 4235 4236 4237 4238#define V_OFFS1(dt) (dt##1)(0) 4239#define V_OFFS2(dt) (dt##2)(0, 1) 4240#define V_OFFS3(dt) (dt##3)(0, 1, 2) 4241#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 4242#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 4243#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 4244 4245 4246 4247#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 4248#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 4249 4250 4251#define VLOAD_STR(size) vload##size 4252#define VLOAD(size) VLOAD_STR(size) 4253 4254 4255#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 4256#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 4257 4258#define NO_LOAD(data, offs, ptr) \ 4259 { \ 4260 } 4261 4262 4263#define vload_partial_1_0 NO_LOAD 4264#define vload_partial_1_1 vload1 4265#define vload_partial_1_2 NO_LOAD 4266#define vload_partial_1_3 NO_LOAD 4267#define vload_partial_1_4 NO_LOAD 4268#define vload_partial_1_5 NO_LOAD 4269#define vload_partial_1_6 NO_LOAD 4270#define vload_partial_1_7 NO_LOAD 4271#define vload_partial_1_8 NO_LOAD 4272#define vload_partial_1_9 NO_LOAD 4273#define vload_partial_1_10 NO_LOAD 4274#define vload_partial_1_11 NO_LOAD 4275#define vload_partial_1_12 NO_LOAD 4276#define vload_partial_1_13 NO_LOAD 4277#define vload_partial_1_14 NO_LOAD 4278#define vload_partial_1_15 NO_LOAD 4279#define vload_partial_1_16 NO_LOAD 4280 4281#define vload_partial_2_0 NO_LOAD 4282#define vload_partial_2_1 vload_partial_1 4283#define vload_partial_2_2 vload_partial_2 4284#define vload_partial_2_3 NO_LOAD 4285#define vload_partial_2_4 NO_LOAD 4286#define vload_partial_2_5 NO_LOAD 4287#define vload_partial_2_6 NO_LOAD 4288#define vload_partial_2_7 NO_LOAD 4289#define vload_partial_2_8 NO_LOAD 4290#define vload_partial_2_9 NO_LOAD 4291#define vload_partial_2_10 NO_LOAD 4292#define vload_partial_2_11 NO_LOAD 4293#define vload_partial_2_12 NO_LOAD 
4294#define vload_partial_2_13 NO_LOAD 4295#define vload_partial_2_14 NO_LOAD 4296#define vload_partial_2_15 NO_LOAD 4297#define vload_partial_2_16 NO_LOAD 4298 4299#define vload_partial_3_0 NO_LOAD 4300#define vload_partial_3_1 vload_partial_1 4301#define vload_partial_3_2 vload_partial_2 4302#define vload_partial_3_3 vload_partial_3 4303#define vload_partial_3_4 NO_LOAD 4304#define vload_partial_3_5 NO_LOAD 4305#define vload_partial_3_6 NO_LOAD 4306#define vload_partial_3_7 NO_LOAD 4307#define vload_partial_3_8 NO_LOAD 4308#define vload_partial_3_9 NO_LOAD 4309#define vload_partial_3_10 NO_LOAD 4310#define vload_partial_3_11 NO_LOAD 4311#define vload_partial_3_12 NO_LOAD 4312#define vload_partial_3_13 NO_LOAD 4313#define vload_partial_3_14 NO_LOAD 4314#define vload_partial_3_15 NO_LOAD 4315#define vload_partial_3_16 NO_LOAD 4316 4317#define vload_partial_4_0 NO_LOAD 4318#define vload_partial_4_1 vload_partial_1 4319#define vload_partial_4_2 vload_partial_2 4320#define vload_partial_4_3 vload_partial_3 4321#define vload_partial_4_4 vload_partial_4 4322#define vload_partial_4_5 NO_LOAD 4323#define vload_partial_4_6 NO_LOAD 4324#define vload_partial_4_7 NO_LOAD 4325#define vload_partial_4_8 NO_LOAD 4326#define vload_partial_4_9 NO_LOAD 4327#define vload_partial_4_10 NO_LOAD 4328#define vload_partial_4_11 NO_LOAD 4329#define vload_partial_4_12 NO_LOAD 4330#define vload_partial_4_13 NO_LOAD 4331#define vload_partial_4_14 NO_LOAD 4332#define vload_partial_4_15 NO_LOAD 4333#define vload_partial_4_16 NO_LOAD 4334 4335#define vload_partial_8_0 NO_LOAD 4336#define vload_partial_8_1 vload_partial_1 4337#define vload_partial_8_2 vload_partial_2 4338#define vload_partial_8_3 vload_partial_3 4339#define vload_partial_8_4 vload_partial_4 4340#define vload_partial_8_5 vload_partial_5 4341#define vload_partial_8_6 vload_partial_6 4342#define vload_partial_8_7 vload_partial_7 4343#define vload_partial_8_8 vload_partial_8 4344#define vload_partial_8_9 NO_LOAD 4345#define vload_partial_8_10 NO_LOAD 4346#define vload_partial_8_11 NO_LOAD 4347#define vload_partial_8_12 NO_LOAD 4348#define vload_partial_8_13 NO_LOAD 4349#define vload_partial_8_14 NO_LOAD 4350#define vload_partial_8_15 NO_LOAD 4351#define vload_partial_8_16 NO_LOAD 4352 4353#define vload_partial_16_0 NO_LOAD 4354#define vload_partial_16_1 vload_partial_1 4355#define vload_partial_16_2 vload_partial_2 4356#define vload_partial_16_3 vload_partial_3 4357#define vload_partial_16_4 vload_partial_4 4358#define vload_partial_16_5 vload_partial_5 4359#define vload_partial_16_6 vload_partial_6 4360#define vload_partial_16_7 vload_partial_7 4361#define vload_partial_16_8 vload_partial_8 4362#define vload_partial_16_9 vload_partial_9 4363#define vload_partial_16_10 vload_partial_10 4364#define vload_partial_16_11 vload_partial_11 4365#define vload_partial_16_12 vload_partial_12 4366#define vload_partial_16_13 vload_partial_13 4367#define vload_partial_16_14 vload_partial_14 4368#define vload_partial_16_15 vload_partial_15 4369#define vload_partial_16_16 vload_partial_16 4370 4371 4372#define vload_partial_1(DATA, OFFSET, PTR) \ 4373 DATA.s0 = vload1(OFFSET, PTR); 4374 4375#define vload_partial_2(DATA, OFFSET, PTR) \ 4376 DATA.s01 = vload2(OFFSET, PTR); 4377 4378#define vload_partial_3(DATA, OFFSET, PTR) \ 4379 DATA.s012 = vload3(OFFSET, PTR); 4380 4381#define vload_partial_4(DATA, OFFSET, PTR) \ 4382 DATA.s0123 = vload4(OFFSET, PTR); 4383 4384#define vload_partial_5(DATA, OFFSET, PTR) \ 4385 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4386 DATA.s4 = 
vload1(OFFSET, PTR + 4); 4387 4388#define vload_partial_6(DATA, OFFSET, PTR) \ 4389 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4390 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 4391 4392#define vload_partial_7(DATA, OFFSET, PTR) \ 4393 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4394 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 4395 4396#define vload_partial_8(DATA, OFFSET, PTR) \ 4397 DATA.s01234567 = vload8(OFFSET, PTR); 4398 4399#define vload_partial_9(DATA, OFFSET, PTR) \ 4400 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4401 DATA.s8 = vload1(OFFSET, PTR + 8); 4402 4403#define vload_partial_10(DATA, OFFSET, PTR) \ 4404 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4405 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 4406 4407#define vload_partial_11(DATA, OFFSET, PTR) \ 4408 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4409 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 4410 4411#define vload_partial_12(DATA, OFFSET, PTR) \ 4412 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4413 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 4414 4415#define vload_partial_13(DATA, OFFSET, PTR) \ 4416 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4417 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 4418 4419#define vload_partial_14(DATA, OFFSET, PTR) \ 4420 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4421 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 4422 4423#define vload_partial_15(DATA, OFFSET, PTR) \ 4424 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4425 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 4426 4427#define vload_partial_16(DATA, OFFSET, PTR) \ 4428 DATA = vload16(OFFSET, PTR); 4429 4430 4431 4432#define PIXEL_UNIT4 1 4433#define PIXEL_UNIT8 2 4434#define PIXEL_UNIT16 4 4435 4436 4437#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 4438#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 4439 4440 4441#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 4442#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 4443#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 4444 4445#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4446#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 4447#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 4448#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 4449#endif 4450 4451#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 4452#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4453#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), 
values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4454 4455#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4456#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 4457#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4458#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4459#endif 4460 4461 4462#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 4463#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 4464 4465 4466#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 4467#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 4468 4469#define VSTORE_STR(size) vstore##size 4470#define VSTORE(size) VSTORE_STR(size) 4471 4472#define float1 float 4473#define half1 half 4474#define char1 char 4475#define uchar1 uchar 4476#define short1 short 4477#define ushort1 ushort 4478#define int1 int 4479#define uint1 uint 4480#define long1 long 4481#define ulong1 ulong 4482#define double1 double 4483 4484#define vload1(OFFSET, PTR) *(OFFSET + PTR) 4485#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 4486 4487 4488#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 4489#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 4490 4491#define NO_STORE(data, offs, ptr) \ 4492 { \ 4493 } 4494 4495 4496#define vstore_partial_1_0 NO_STORE 4497#define vstore_partial_1_1 vstore1 4498#define vstore_partial_1_2 NO_STORE 4499#define vstore_partial_1_3 NO_STORE 4500#define vstore_partial_1_4 NO_STORE 4501#define vstore_partial_1_5 NO_STORE 4502#define vstore_partial_1_6 NO_STORE 4503#define vstore_partial_1_7 NO_STORE 4504#define vstore_partial_1_8 NO_STORE 4505#define vstore_partial_1_9 NO_STORE 4506#define vstore_partial_1_10 NO_STORE 4507#define vstore_partial_1_11 NO_STORE 4508#define vstore_partial_1_12 NO_STORE 4509#define vstore_partial_1_13 NO_STORE 4510#define vstore_partial_1_14 NO_STORE 4511#define vstore_partial_1_15 NO_STORE 4512#define vstore_partial_1_16 NO_STORE 4513 4514#define vstore_partial_2_0 NO_STORE 4515#define vstore_partial_2_1 vstore_partial_1 4516#define vstore_partial_2_2 vstore_partial_2 4517#define vstore_partial_2_3 NO_STORE 4518#define vstore_partial_2_4 NO_STORE 4519#define vstore_partial_2_5 NO_STORE 4520#define vstore_partial_2_6 NO_STORE 4521#define vstore_partial_2_7 NO_STORE 4522#define vstore_partial_2_8 NO_STORE 4523#define vstore_partial_2_9 NO_STORE 4524#define vstore_partial_2_10 NO_STORE 4525#define vstore_partial_2_11 NO_STORE 4526#define vstore_partial_2_12 NO_STORE 4527#define vstore_partial_2_13 NO_STORE 4528#define vstore_partial_2_14 NO_STORE 4529#define vstore_partial_2_15 NO_STORE 4530#define vstore_partial_2_16 NO_STORE 4531 4532#define vstore_partial_3_0 NO_STORE 
4533#define vstore_partial_3_1 vstore_partial_1 4534#define vstore_partial_3_2 vstore_partial_2 4535#define vstore_partial_3_3 vstore_partial_3 4536#define vstore_partial_3_4 NO_STORE 4537#define vstore_partial_3_5 NO_STORE 4538#define vstore_partial_3_6 NO_STORE 4539#define vstore_partial_3_7 NO_STORE 4540#define vstore_partial_3_8 NO_STORE 4541#define vstore_partial_3_9 NO_STORE 4542#define vstore_partial_3_10 NO_STORE 4543#define vstore_partial_3_11 NO_STORE 4544#define vstore_partial_3_12 NO_STORE 4545#define vstore_partial_3_13 NO_STORE 4546#define vstore_partial_3_14 NO_STORE 4547#define vstore_partial_3_15 NO_STORE 4548#define vstore_partial_3_16 NO_STORE 4549 4550#define vstore_partial_4_0 NO_STORE 4551#define vstore_partial_4_1 vstore_partial_1 4552#define vstore_partial_4_2 vstore_partial_2 4553#define vstore_partial_4_3 vstore_partial_3 4554#define vstore_partial_4_4 vstore_partial_4 4555#define vstore_partial_4_5 NO_STORE 4556#define vstore_partial_4_6 NO_STORE 4557#define vstore_partial_4_7 NO_STORE 4558#define vstore_partial_4_8 NO_STORE 4559#define vstore_partial_4_9 NO_STORE 4560#define vstore_partial_4_10 NO_STORE 4561#define vstore_partial_4_11 NO_STORE 4562#define vstore_partial_4_12 NO_STORE 4563#define vstore_partial_4_13 NO_STORE 4564#define vstore_partial_4_14 NO_STORE 4565#define vstore_partial_4_15 NO_STORE 4566#define vstore_partial_4_16 NO_STORE 4567 4568#define vstore_partial_8_0 NO_STORE 4569#define vstore_partial_8_1 vstore_partial_1 4570#define vstore_partial_8_2 vstore_partial_2 4571#define vstore_partial_8_3 vstore_partial_3 4572#define vstore_partial_8_4 vstore_partial_4 4573#define vstore_partial_8_5 vstore_partial_5 4574#define vstore_partial_8_6 vstore_partial_6 4575#define vstore_partial_8_7 vstore_partial_7 4576#define vstore_partial_8_8 vstore_partial_8 4577#define vstore_partial_8_9 NO_STORE 4578#define vstore_partial_8_10 NO_STORE 4579#define vstore_partial_8_11 NO_STORE 4580#define vstore_partial_8_12 NO_STORE 4581#define vstore_partial_8_13 NO_STORE 4582#define vstore_partial_8_14 NO_STORE 4583#define vstore_partial_8_15 NO_STORE 4584#define vstore_partial_8_16 NO_STORE 4585 4586#define vstore_partial_16_0 NO_STORE 4587#define vstore_partial_16_1 vstore_partial_1 4588#define vstore_partial_16_2 vstore_partial_2 4589#define vstore_partial_16_3 vstore_partial_3 4590#define vstore_partial_16_4 vstore_partial_4 4591#define vstore_partial_16_5 vstore_partial_5 4592#define vstore_partial_16_6 vstore_partial_6 4593#define vstore_partial_16_7 vstore_partial_7 4594#define vstore_partial_16_8 vstore_partial_8 4595#define vstore_partial_16_9 vstore_partial_9 4596#define vstore_partial_16_10 vstore_partial_10 4597#define vstore_partial_16_11 vstore_partial_11 4598#define vstore_partial_16_12 vstore_partial_12 4599#define vstore_partial_16_13 vstore_partial_13 4600#define vstore_partial_16_14 vstore_partial_14 4601#define vstore_partial_16_15 vstore_partial_15 4602#define vstore_partial_16_16 vstore_partial_16 4603 4604 4605#define vstore_partial_1(DATA, OFFSET, PTR) \ 4606 vstore1(DATA.s0, OFFSET, PTR); 4607 4608#define vstore_partial_2(DATA, OFFSET, PTR) \ 4609 vstore2(DATA.s01, OFFSET, PTR); 4610 4611#define vstore_partial_3(DATA, OFFSET, PTR) \ 4612 vstore3(DATA.s012, OFFSET, PTR); 4613 4614#define vstore_partial_4(DATA, OFFSET, PTR) \ 4615 vstore4(DATA.s0123, OFFSET, PTR); 4616 4617#define vstore_partial_5(DATA, OFFSET, PTR) \ 4618 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4619 vstore1(DATA.s4, OFFSET, PTR + 4); 4620 4621#define vstore_partial_6(DATA, 
OFFSET, PTR) \ 4622 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4623 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 4624 4625#define vstore_partial_7(DATA, OFFSET, PTR) \ 4626 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4627 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 4628 4629#define vstore_partial_8(DATA, OFFSET, PTR) \ 4630 vstore8(DATA.s01234567, OFFSET, PTR); 4631 4632#define vstore_partial_9(DATA, OFFSET, PTR) \ 4633 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4634 vstore1(DATA.s8, OFFSET, PTR + 8); 4635 4636#define vstore_partial_10(DATA, OFFSET, PTR) \ 4637 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4638 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 4639 4640#define vstore_partial_11(DATA, OFFSET, PTR) \ 4641 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4642 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 4643 4644#define vstore_partial_12(DATA, OFFSET, PTR) \ 4645 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4646 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 4647 4648#define vstore_partial_13(DATA, OFFSET, PTR) \ 4649 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4650 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 4651 4652#define vstore_partial_14(DATA, OFFSET, PTR) \ 4653 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4654 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 4655 4656#define vstore_partial_15(DATA, OFFSET, PTR) \ 4657 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4658 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 4659 4660#define vstore_partial_16(DATA, OFFSET, PTR) \ 4661 vstore16(DATA, OFFSET, PTR); 4662 4663 4664 4665 4666 4667#define convert_float_sat convert_float 4668#define convert_float1_sat convert_float 4669#define convert_float2_sat convert_float2 4670#define convert_float3_sat convert_float3 4671#define convert_float4_sat convert_float4 4672#define convert_float8_sat convert_float8 4673#define convert_float16_sat convert_float16 4674#define convert_half_sat convert_float 4675#define convert_half1_sat convert_half 4676#define convert_half2_sat convert_half2 4677#define convert_half3_sat convert_half3 4678#define convert_half4_sat convert_half4 4679#define convert_half8_sat convert_half8 4680#define convert_half16_sat convert_half16 4681 4682#define convert_float1 convert_float 4683#define convert_half1 convert_half 4684#define convert_char1 convert_char 4685#define convert_uchar1 convert_uchar 4686#define convert_short1 convert_short 4687#define convert_ushort1 convert_ushort 4688#define convert_int1 convert_int 4689#define convert_uint1 convert_uint 4690#define convert_long1 convert_long 4691#define convert_ulong1 convert_ulong 4692#define convert_double1 convert_double 4693 4694#define convert_char1_sat convert_char_sat 4695#define convert_uchar1_sat convert_uchar_sat 4696#define convert_uchar2_sat convert_uchar2_sat 4697#define convert_uchar3_sat convert_uchar3_sat 4698#define convert_uchar4_sat convert_uchar4_sat 4699#define convert_uchar8_sat convert_uchar8_sat 4700#define convert_uchar16_sat convert_uchar16_sat 4701#define convert_short1_sat convert_short_sat 4702#define convert_ushort1_sat convert_ushort_sat 4703#define convert_int1_sat convert_int_sat 4704#define convert_uint1_sat convert_uint_sat 4705#define convert_long1_sat convert_long_sat 4706#define convert_ulong1_sat convert_ulong_sat 4707#define convert_double1_sat convert_double_sat 4708 4709#define VEC_DATA_TYPE_STR(type, size) type##size 4710#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 4711 4712#define CONVERT_STR(x, type) 
(convert_##type((x))) 4713#define CONVERT(x, type) CONVERT_STR(x, type) 4714 4715#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 4716#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 4717 4718#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 4719#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 4720 4721#define select_vec_dt_uchar(size) uchar##size 4722#define select_vec_dt_char(size) char##size 4723#define select_vec_dt_ushort(size) ushort##size 4724#define select_vec_dt_short(size) short##size 4725#define select_vec_dt_half(size) short##size 4726#define select_vec_dt_uint(size) uint##size 4727#define select_vec_dt_int(size) int##size 4728#define select_vec_dt_float(size) int##size 4729#define select_vec_dt_ulong(size) ulong##size 4730#define select_vec_dt_long(size) long##size 4731 4732#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 4733#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 4734#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 4735 4736#define signed_int_vec_dt_uchar(size) char##size 4737#define signed_int_vec_dt_char(size) char##size 4738#define signed_int_vec_dt_ushort(size) short##size 4739#define signed_int_vec_dt_short(size) short##size 4740#define signed_int_vec_dt_half(size) short##size 4741#define signed_int_vec_dt_uint(size) int##size 4742#define signed_int_vec_dt_int(size) int##size 4743#define signed_int_vec_dt_float(size) int##size 4744#define signed_int_vec_dt_ulong(size) long##size 4745#define signed_int_vec_dt_long(size) long##size 4746 4747#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 4748#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 4749#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 4750 4751#define sum_reduce_1(x) (x) 4752#define sum_reduce_2(x) ((x).s0) + ((x).s1) 4753#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 4754#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 4755#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 4756#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 4757 4758#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 4759#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 4760 4761#define prod_reduce_1(x) (x) 4762#define prod_reduce_2(x) ((x).s0) * ((x).s1) 4763#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 4764#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 4765#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 4766#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 4767 4768#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 4769#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 4770 4771#define max_reduce_1(x) (x) 4772#define max_reduce_2(x) max(((x).s0), ((x).s1)) 4773#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 4774#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 4775#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 4776#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 4777 4778#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 4779#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 4780 4781#define VECTOR_DECLARATION(name) \ 4782 __global uchar *name##_ptr, \ 4783 uint name##_stride_x, \ 4784 uint 
name##_step_x, \ 4785 uint name##_offset_first_element_in_bytes 4786 4787#define IMAGE_DECLARATION(name) \ 4788 __global uchar *name##_ptr, \ 4789 uint name##_stride_x, \ 4790 uint name##_step_x, \ 4791 uint name##_stride_y, \ 4792 uint name##_step_y, \ 4793 uint name##_offset_first_element_in_bytes 4794 4795#define TENSOR3D_DECLARATION(name) \ 4796 __global uchar *name##_ptr, \ 4797 uint name##_stride_x, \ 4798 uint name##_step_x, \ 4799 uint name##_stride_y, \ 4800 uint name##_step_y, \ 4801 uint name##_stride_z, \ 4802 uint name##_step_z, \ 4803 uint name##_offset_first_element_in_bytes 4804 4805#define TENSOR4D_DECLARATION(name) \ 4806 __global uchar *name##_ptr, \ 4807 uint name##_stride_x, \ 4808 uint name##_step_x, \ 4809 uint name##_stride_y, \ 4810 uint name##_step_y, \ 4811 uint name##_stride_z, \ 4812 uint name##_step_z, \ 4813 uint name##_stride_w, \ 4814 uint name##_step_w, \ 4815 uint name##_offset_first_element_in_bytes 4816 4817#define TENSOR5D_DECLARATION(name) \ 4818 __global uchar *name##_ptr, \ 4819 uint name##_stride_x, \ 4820 uint name##_step_x, \ 4821 uint name##_stride_y, \ 4822 uint name##_step_y, \ 4823 uint name##_stride_z, \ 4824 uint name##_step_z, \ 4825 uint name##_stride_w, \ 4826 uint name##_step_w, \ 4827 uint name##_stride_v, \ 4828 uint name##_step_v, \ 4829 uint name##_offset_first_element_in_bytes 4830 4831#define CONVERT_TO_VECTOR_STRUCT(name) \ 4832 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 4833 4834#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 4835 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 4836 4837#define CONVERT_TO_IMAGE_STRUCT(name) \ 4838 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 4839 4840#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 4841 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 4842 4843#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4844 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4845 4846#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 4847 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 4848 4849#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4850 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4851 4852#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 4853 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4854 name##_stride_z, name##_step_z) 4855 4856#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 4857 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 4858 4859#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 4860 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4861 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 4862 4863#define 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 4864 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 4865 4866#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 4867 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4868 name##_stride_z, name##_step_z) 4869 4870 4871typedef struct Vector 4872{ 4873 __global uchar *ptr; 4874 int offset_first_element_in_bytes; 4875 int stride_x; 4876} Vector; 4877 4878 4879typedef struct Image 4880{ 4881 __global uchar *ptr; 4882 int offset_first_element_in_bytes; 4883 int stride_x; 4884 int stride_y; 4885} Image; 4886 4887 4888typedef struct Tensor3D 4889{ 4890 __global uchar *ptr; 4891 int offset_first_element_in_bytes; 4892 int stride_x; 4893 int stride_y; 4894 int stride_z; 4895} Tensor3D; 4896 4897 4898typedef struct Tensor4D 4899{ 4900 __global uchar *ptr; 4901 int offset_first_element_in_bytes; 4902 int stride_x; 4903 int stride_y; 4904 int stride_z; 4905 int stride_w; 4906} Tensor4D; 4907 4908 4909inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 4910{ 4911 Vector vector = 4912 { 4913 .ptr = ptr, 4914 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4915 .stride_x = stride_x, 4916 }; 4917 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 4918 return vector; 4919} 4920 4921 4922inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 4923{ 4924 Image img = 4925 { 4926 .ptr = ptr, 4927 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4928 .stride_x = stride_x, 4929 .stride_y = stride_y 4930 }; 4931 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 4932 return img; 4933} 4934 4935 4936inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4937{ 4938 Image img = 4939 { 4940 .ptr = ptr, 4941 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4942 .stride_x = stride_x, 4943 .stride_y = stride_y 4944 }; 4945 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4946 return img; 4947} 4948 4949 4950inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4951{ 4952 Tensor3D tensor = 4953 { 4954 .ptr = ptr, 4955 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4956 .stride_x = stride_x, 4957 .stride_y = stride_y, 4958 .stride_z = stride_z 4959 }; 4960 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4961 return tensor; 4962} 4963 4964 4965inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4966{ 4967 Tensor3D tensor = 4968 { 4969 .ptr = ptr, 4970 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4971 .stride_x = stride_x, 4972 .stride_y = stride_y, 4973 .stride_z = stride_z 
4974 }; 4975 return tensor; 4976} 4977 4978inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 4979 uint step_w, 4980 uint mod_size) 4981{ 4982 Tensor4D tensor = 4983 { 4984 .ptr = ptr, 4985 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4986 .stride_x = stride_x, 4987 .stride_y = stride_y, 4988 .stride_z = stride_z, 4989 .stride_w = stride_w 4990 }; 4991 4992 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 4993 return tensor; 4994} 4995 4996 4997inline __global const uchar *vector_offset(const Vector *vec, int x) 4998{ 4999 return vec->ptr + x * vec->stride_x; 5000} 5001 5002 5003inline __global uchar *offset(const Image *img, int x, int y) 5004{ 5005 return img->ptr + x * img->stride_x + y * img->stride_y; 5006} 5007 5008 5009inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 5010{ 5011 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 5012} 5013 5014 5015inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 5016{ 5017 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 5018} 5019 5020 5021inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 5022{ 5023 uint num_elements = width * height; 5024 5025 const uint z = index / num_elements; 5026 5027 index %= num_elements; 5028 5029 const uint y = index / width; 5030 5031 index %= width; 5032 5033 const uint x = index; 5034 5035 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 5036} 5037 5038#endif 5039 5040#ifndef ARM_COMPUTE_REPEAT_H 5041#define ARM_COMPUTE_REPEAT_H 5042 5043 5044#ifndef ARM_COMPUTE_HELPER_H 5045#define ARM_COMPUTE_HELPER_H 5046 5047 5048 5049 5050#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5051 VSTORE(N0) \ 5052 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5053 5054#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5055 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5056 VSTORE(N0) \ 5057 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5058 5059#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5060 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5061 VSTORE(N0) \ 5062 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5063 5064#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5065 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5066 VSTORE(N0) \ 5067 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5068 5069#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5070 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5071 VSTORE(N0) \ 5072 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5073 5074#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5075 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5076 VSTORE(N0) \ 5077 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5078 5079#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5080 
STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5081 VSTORE(N0) \ 5082 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5083 5084#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5085 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5086 VSTORE(N0) \ 5087 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5088 5089#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5090 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5091 VSTORE(N0) \ 5092 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5093 5094#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5095 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5096 VSTORE(N0) \ 5097 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5098 5099#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5100 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5101 VSTORE(N0) \ 5102 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5103 5104#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5105 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5106 VSTORE(N0) \ 5107 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5108 5109#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5110 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5111 VSTORE(N0) \ 5112 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5113 5114#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5115 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5116 VSTORE(N0) \ 5117 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5118 5119#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5120 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5121 VSTORE(N0) \ 5122 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5123 5124#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5125 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5126 VSTORE(N0) \ 5127 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5128 5129 5130 5131#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5132 VSTORE(N0) \ 5133 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5134 5135#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5136 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5137 VSTORE(N0) \ 5138 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5139 5140#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5141 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5142 VSTORE(N0) \ 5143 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5144 5145#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5146 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5147 VSTORE(N0) \ 5148 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5149 5150#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5151 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5152 VSTORE(N0) \ 5153 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, 
(__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

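// NOTE (editorial comment, not part of the original header): STORE_BLOCK(M0, N0, ...) dispatches
// to STORE_ROW_<M0>, which emits one VSTORE(N0) per row at consecutive STRIDE_Y offsets. A rough,
// hypothetical expansion (the names c, dst_addr, dst_stride_y and zout0/zout1 are placeholders the
// calling kernel would define), assuming float4 accumulators c0 and c1:
//     STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout)
//     // -> vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
//     //    vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));
// CONVERT_STORE_BLOCK behaves the same, but saturate-converts each row to the target type first.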
5218#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5219#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5220 5221 5222 5223#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5224 VSTORE_PARTIAL(N0, STORE_N0) \ 5225 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5226 5227#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5228 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5229 VSTORE_PARTIAL(N0, STORE_N0) \ 5230 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5231 5232#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5233 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5234 VSTORE_PARTIAL(N0, STORE_N0) \ 5235 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5236 5237#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5238 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5239 VSTORE_PARTIAL(N0, STORE_N0) \ 5240 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5241 5242#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5243 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5244 VSTORE_PARTIAL(N0, STORE_N0) \ 5245 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5246 5247#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5248 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5249 VSTORE_PARTIAL(N0, STORE_N0) \ 5250 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5251 5252#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5253 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5254 VSTORE_PARTIAL(N0, STORE_N0) \ 5255 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5256 5257#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5258 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5259 VSTORE_PARTIAL(N0, STORE_N0) \ 5260 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5261 5262#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5263 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5264 VSTORE_PARTIAL(N0, STORE_N0) \ 5265 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5266 5267#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5268 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5269 VSTORE_PARTIAL(N0, STORE_N0) \ 5270 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5271 5272#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5273 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5274 VSTORE_PARTIAL(N0, STORE_N0) \ 5275 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5276 5277#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5278 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5279 VSTORE_PARTIAL(N0, STORE_N0) \ 5280 
(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5281 5282#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5283 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5284 VSTORE_PARTIAL(N0, STORE_N0) \ 5285 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5286 5287#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5288 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5289 VSTORE_PARTIAL(N0, STORE_N0) \ 5290 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5291 5292#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5293 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5294 VSTORE_PARTIAL(N0, STORE_N0) \ 5295 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5296 5297#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5298 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5299 VSTORE_PARTIAL(N0, STORE_N0) \ 5300 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5301 5302 5303 5304#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5305#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5306 5307#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5308 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 5309 { \ 5310 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5311 } \ 5312 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 5313 { \ 5314 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5315 } \ 5316 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 5317 { \ 5318 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5319 } \ 5320 else \ 5321 { \ 5322 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5323 } 5324 5325#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 5326 if(!(PARTIAL_COND_X)) \ 5327 { \ 5328 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5329 } \ 5330 else \ 5331 { \ 5332 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5333 } 5334 5335#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 5336 if(!(PARTIAL_COND_Y)) \ 5337 { \ 5338 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5339 } \ 5340 else \ 5341 { \ 5342 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5343 } 5344 5345 5346#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 5347 5348 5349#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 5350 5351#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5352 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5353 5354#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 5355 5356#define 
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5357 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 5358 5359#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 5360 5361#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5362 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 5363 5364#else 5365 5366#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5367 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 5368 5369#endif 5370 5371#endif 5372 5373 5374#if defined(PARTIAL_STORE_M0) 5375 5376#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 5377 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 5378#else 5379#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 5380 ((uint)(y * M0)) 5381#endif 5382 5383 5384 5385#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 5386 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 5387 5388 5389#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 5390#pragma OPENCL EXTENSION cl_khr_fp16 : enable 5391#endif 5392 5393#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 5394#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 5395#endif 5396 5397#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 5398#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 5399#endif 5400 5401#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 5402#pragma OPENCL EXTENSION cl_arm_printf : enable 5403#endif 5404 5405#define GPU_ARCH_MIDGARD 0x100 5406#define GPU_ARCH_BIFROST 0x200 5407#define GPU_ARCH_VALHALL 0x300 5408 5409 5410#define CONCAT(a, b) a##b 5411 5412 5413#define EXPAND(x) x 5414 5415 5416#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 5417 5418 5419#define REV1(x) ((x)) 5420#define REV2(x) ((x).s10) 5421#define REV3(x) ((x).s210) 5422#define REV4(x) ((x).s3210) 5423#define REV8(x) ((x).s76543210) 5424#define REV16(x) ((x).sFEDCBA9876543210) 5425 5426 5427 5428#define REVERSE_STR(x, s) REV##s((x)) 5429#define REVERSE(x, s) REVERSE_STR(x, s) 5430 5431 5432 5433#define ROT1_0(x) ((x)) 5434#define ROT1_1(x) ((x)) 5435 5436#define ROT2_0(x) ((x)) 5437#define ROT2_1(x) ((x).s10) 5438#define ROT2_2(x) ((x)) 5439 5440#define ROT3_0(x) ((x)) 5441#define ROT3_1(x) ((x).s201) 5442#define ROT3_2(x) ((x).s120) 5443#define ROT3_3(x) ((x)) 5444 5445#define ROT4_0(x) ((x)) 5446#define ROT4_1(x) ((x).s3012) 5447#define ROT4_2(x) ((x).s2301) 5448#define ROT4_3(x) ((x).s1230) 5449#define ROT4_4(x) ((x)) 5450 5451#define ROT8_0(x) ((x)) 5452#define ROT8_1(x) ((x).s70123456) 5453#define ROT8_2(x) ((x).s67012345) 5454#define ROT8_3(x) ((x).s56701234) 5455#define ROT8_4(x) ((x).s45670123) 5456#define ROT8_5(x) ((x).s34567012) 5457#define ROT8_6(x) ((x).s23456701) 5458#define ROT8_7(x) ((x).s12345670) 5459#define ROT8_8(x) ((x)) 5460 5461#define ROT16_0(x) ((x)) 5462#define ROT16_1(x) ((x).sF0123456789ABCDE) 5463#define ROT16_2(x) 
((x).sEF0123456789ABCD) 5464#define ROT16_3(x) ((x).sDEF0123456789ABC) 5465#define ROT16_4(x) ((x).sCDEF0123456789AB) 5466#define ROT16_5(x) ((x).sBCDEF0123456789A) 5467#define ROT16_6(x) ((x).sABCDEF0123456789) 5468#define ROT16_7(x) ((x).s9ABCDEF012345678) 5469#define ROT16_8(x) ((x).s89ABCDEF01234567) 5470#define ROT16_9(x) ((x).s789ABCDEF0123456) 5471#define ROT16_10(x) ((x).s6789ABCDEF012345) 5472#define ROT16_11(x) ((x).s56789ABCDEF01234) 5473#define ROT16_12(x) ((x).s456789ABCDEF0123) 5474#define ROT16_13(x) ((x).s3456789ABCDEF012) 5475#define ROT16_14(x) ((x).s23456789ABCDEF01) 5476#define ROT16_15(x) ((x).s123456789ABCDEF0) 5477#define ROT16_16(x) ((x)) 5478 5479 5480 5481#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 5482#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 5483 5484 5485 5486#define V_OFFS1(dt) (dt##1)(0) 5487#define V_OFFS2(dt) (dt##2)(0, 1) 5488#define V_OFFS3(dt) (dt##3)(0, 1, 2) 5489#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 5490#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 5491#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 5492 5493 5494 5495#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 5496#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 5497 5498 5499#define VLOAD_STR(size) vload##size 5500#define VLOAD(size) VLOAD_STR(size) 5501 5502 5503#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 5504#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 5505 5506#define NO_LOAD(data, offs, ptr) \ 5507 { \ 5508 } 5509 5510 5511#define vload_partial_1_0 NO_LOAD 5512#define vload_partial_1_1 vload1 5513#define vload_partial_1_2 NO_LOAD 5514#define vload_partial_1_3 NO_LOAD 5515#define vload_partial_1_4 NO_LOAD 5516#define vload_partial_1_5 NO_LOAD 5517#define vload_partial_1_6 NO_LOAD 5518#define vload_partial_1_7 NO_LOAD 5519#define vload_partial_1_8 NO_LOAD 5520#define vload_partial_1_9 NO_LOAD 5521#define vload_partial_1_10 NO_LOAD 5522#define vload_partial_1_11 NO_LOAD 5523#define vload_partial_1_12 NO_LOAD 5524#define vload_partial_1_13 NO_LOAD 5525#define vload_partial_1_14 NO_LOAD 5526#define vload_partial_1_15 NO_LOAD 5527#define vload_partial_1_16 NO_LOAD 5528 5529#define vload_partial_2_0 NO_LOAD 5530#define vload_partial_2_1 vload_partial_1 5531#define vload_partial_2_2 vload_partial_2 5532#define vload_partial_2_3 NO_LOAD 5533#define vload_partial_2_4 NO_LOAD 5534#define vload_partial_2_5 NO_LOAD 5535#define vload_partial_2_6 NO_LOAD 5536#define vload_partial_2_7 NO_LOAD 5537#define vload_partial_2_8 NO_LOAD 5538#define vload_partial_2_9 NO_LOAD 5539#define vload_partial_2_10 NO_LOAD 5540#define vload_partial_2_11 NO_LOAD 5541#define vload_partial_2_12 NO_LOAD 5542#define vload_partial_2_13 NO_LOAD 5543#define vload_partial_2_14 NO_LOAD 5544#define vload_partial_2_15 NO_LOAD 5545#define vload_partial_2_16 NO_LOAD 5546 5547#define vload_partial_3_0 NO_LOAD 5548#define vload_partial_3_1 vload_partial_1 5549#define vload_partial_3_2 vload_partial_2 5550#define vload_partial_3_3 vload_partial_3 5551#define vload_partial_3_4 NO_LOAD 5552#define vload_partial_3_5 NO_LOAD 5553#define vload_partial_3_6 NO_LOAD 5554#define vload_partial_3_7 NO_LOAD 5555#define vload_partial_3_8 NO_LOAD 5556#define vload_partial_3_9 NO_LOAD 5557#define vload_partial_3_10 NO_LOAD 5558#define vload_partial_3_11 NO_LOAD 5559#define vload_partial_3_12 NO_LOAD 5560#define vload_partial_3_13 NO_LOAD 5561#define vload_partial_3_14 NO_LOAD 5562#define vload_partial_3_15 NO_LOAD 5563#define vload_partial_3_16 NO_LOAD 5564 
5565#define vload_partial_4_0 NO_LOAD 5566#define vload_partial_4_1 vload_partial_1 5567#define vload_partial_4_2 vload_partial_2 5568#define vload_partial_4_3 vload_partial_3 5569#define vload_partial_4_4 vload_partial_4 5570#define vload_partial_4_5 NO_LOAD 5571#define vload_partial_4_6 NO_LOAD 5572#define vload_partial_4_7 NO_LOAD 5573#define vload_partial_4_8 NO_LOAD 5574#define vload_partial_4_9 NO_LOAD 5575#define vload_partial_4_10 NO_LOAD 5576#define vload_partial_4_11 NO_LOAD 5577#define vload_partial_4_12 NO_LOAD 5578#define vload_partial_4_13 NO_LOAD 5579#define vload_partial_4_14 NO_LOAD 5580#define vload_partial_4_15 NO_LOAD 5581#define vload_partial_4_16 NO_LOAD 5582 5583#define vload_partial_8_0 NO_LOAD 5584#define vload_partial_8_1 vload_partial_1 5585#define vload_partial_8_2 vload_partial_2 5586#define vload_partial_8_3 vload_partial_3 5587#define vload_partial_8_4 vload_partial_4 5588#define vload_partial_8_5 vload_partial_5 5589#define vload_partial_8_6 vload_partial_6 5590#define vload_partial_8_7 vload_partial_7 5591#define vload_partial_8_8 vload_partial_8 5592#define vload_partial_8_9 NO_LOAD 5593#define vload_partial_8_10 NO_LOAD 5594#define vload_partial_8_11 NO_LOAD 5595#define vload_partial_8_12 NO_LOAD 5596#define vload_partial_8_13 NO_LOAD 5597#define vload_partial_8_14 NO_LOAD 5598#define vload_partial_8_15 NO_LOAD 5599#define vload_partial_8_16 NO_LOAD 5600 5601#define vload_partial_16_0 NO_LOAD 5602#define vload_partial_16_1 vload_partial_1 5603#define vload_partial_16_2 vload_partial_2 5604#define vload_partial_16_3 vload_partial_3 5605#define vload_partial_16_4 vload_partial_4 5606#define vload_partial_16_5 vload_partial_5 5607#define vload_partial_16_6 vload_partial_6 5608#define vload_partial_16_7 vload_partial_7 5609#define vload_partial_16_8 vload_partial_8 5610#define vload_partial_16_9 vload_partial_9 5611#define vload_partial_16_10 vload_partial_10 5612#define vload_partial_16_11 vload_partial_11 5613#define vload_partial_16_12 vload_partial_12 5614#define vload_partial_16_13 vload_partial_13 5615#define vload_partial_16_14 vload_partial_14 5616#define vload_partial_16_15 vload_partial_15 5617#define vload_partial_16_16 vload_partial_16 5618 5619 5620#define vload_partial_1(DATA, OFFSET, PTR) \ 5621 DATA.s0 = vload1(OFFSET, PTR); 5622 5623#define vload_partial_2(DATA, OFFSET, PTR) \ 5624 DATA.s01 = vload2(OFFSET, PTR); 5625 5626#define vload_partial_3(DATA, OFFSET, PTR) \ 5627 DATA.s012 = vload3(OFFSET, PTR); 5628 5629#define vload_partial_4(DATA, OFFSET, PTR) \ 5630 DATA.s0123 = vload4(OFFSET, PTR); 5631 5632#define vload_partial_5(DATA, OFFSET, PTR) \ 5633 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 5634 DATA.s4 = vload1(OFFSET, PTR + 4); 5635 5636#define vload_partial_6(DATA, OFFSET, PTR) \ 5637 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 5638 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 5639 5640#define vload_partial_7(DATA, OFFSET, PTR) \ 5641 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 5642 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 5643 5644#define vload_partial_8(DATA, OFFSET, PTR) \ 5645 DATA.s01234567 = vload8(OFFSET, PTR); 5646 5647#define vload_partial_9(DATA, OFFSET, PTR) \ 5648 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5649 DATA.s8 = vload1(OFFSET, PTR + 8); 5650 5651#define vload_partial_10(DATA, OFFSET, PTR) \ 5652 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5653 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 5654 5655#define vload_partial_11(DATA, OFFSET, PTR) \ 5656 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5657 
vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 5658 5659#define vload_partial_12(DATA, OFFSET, PTR) \ 5660 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5661 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 5662 5663#define vload_partial_13(DATA, OFFSET, PTR) \ 5664 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5665 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 5666 5667#define vload_partial_14(DATA, OFFSET, PTR) \ 5668 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5669 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 5670 5671#define vload_partial_15(DATA, OFFSET, PTR) \ 5672 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 5673 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 5674 5675#define vload_partial_16(DATA, OFFSET, PTR) \ 5676 DATA = vload16(OFFSET, PTR); 5677 5678 5679 5680#define PIXEL_UNIT4 1 5681#define PIXEL_UNIT8 2 5682#define PIXEL_UNIT16 4 5683 5684 5685#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 5686#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 5687 5688 5689#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 5690#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 5691#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 5692 5693#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 5694#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 5695#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 5696#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 5697#endif 5698 5699#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 5700#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 5701#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 5702 5703#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 5704#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 5705#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 5706#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 5707#endif 5708 
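// The READ_IMAGE2D / WRITE_IMAGE2D macros defined below dispatch, by data type and pixel-unit
// count n0 (1, 2 or 4), to the read_image2d_<type>x<n0> / write_image2d_<type>x<n0> helpers
// above, which issue one read_imagef/read_imageh (or write_imagef/write_imageh) per pixel unit
// starting at (x_coord, y_coord). Illustrative expansion (the image handle `img` and the
// coordinates are placeholders, not part of this header):
//   READ_IMAGE2D(float, 2, img, x, y) -> read_image2d_floatx2(img, x, y)
//   which evaluates to a float8 assembled from read_imagef() at (x, y) and (x + 1, y).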
5709 5710#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 5711#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 5712 5713 5714#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 5715#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 5716 5717#define VSTORE_STR(size) vstore##size 5718#define VSTORE(size) VSTORE_STR(size) 5719 5720#define float1 float 5721#define half1 half 5722#define char1 char 5723#define uchar1 uchar 5724#define short1 short 5725#define ushort1 ushort 5726#define int1 int 5727#define uint1 uint 5728#define long1 long 5729#define ulong1 ulong 5730#define double1 double 5731 5732#define vload1(OFFSET, PTR) *(OFFSET + PTR) 5733#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 5734 5735 5736#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 5737#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 5738 5739#define NO_STORE(data, offs, ptr) \ 5740 { \ 5741 } 5742 5743 5744#define vstore_partial_1_0 NO_STORE 5745#define vstore_partial_1_1 vstore1 5746#define vstore_partial_1_2 NO_STORE 5747#define vstore_partial_1_3 NO_STORE 5748#define vstore_partial_1_4 NO_STORE 5749#define vstore_partial_1_5 NO_STORE 5750#define vstore_partial_1_6 NO_STORE 5751#define vstore_partial_1_7 NO_STORE 5752#define vstore_partial_1_8 NO_STORE 5753#define vstore_partial_1_9 NO_STORE 5754#define vstore_partial_1_10 NO_STORE 5755#define vstore_partial_1_11 NO_STORE 5756#define vstore_partial_1_12 NO_STORE 5757#define vstore_partial_1_13 NO_STORE 5758#define vstore_partial_1_14 NO_STORE 5759#define vstore_partial_1_15 NO_STORE 5760#define vstore_partial_1_16 NO_STORE 5761 5762#define vstore_partial_2_0 NO_STORE 5763#define vstore_partial_2_1 vstore_partial_1 5764#define vstore_partial_2_2 vstore_partial_2 5765#define vstore_partial_2_3 NO_STORE 5766#define vstore_partial_2_4 NO_STORE 5767#define vstore_partial_2_5 NO_STORE 5768#define vstore_partial_2_6 NO_STORE 5769#define vstore_partial_2_7 NO_STORE 5770#define vstore_partial_2_8 NO_STORE 5771#define vstore_partial_2_9 NO_STORE 5772#define vstore_partial_2_10 NO_STORE 5773#define vstore_partial_2_11 NO_STORE 5774#define vstore_partial_2_12 NO_STORE 5775#define vstore_partial_2_13 NO_STORE 5776#define vstore_partial_2_14 NO_STORE 5777#define vstore_partial_2_15 NO_STORE 5778#define vstore_partial_2_16 NO_STORE 5779 5780#define vstore_partial_3_0 NO_STORE 5781#define vstore_partial_3_1 vstore_partial_1 5782#define vstore_partial_3_2 vstore_partial_2 5783#define vstore_partial_3_3 vstore_partial_3 5784#define vstore_partial_3_4 NO_STORE 5785#define vstore_partial_3_5 NO_STORE 5786#define vstore_partial_3_6 NO_STORE 5787#define vstore_partial_3_7 NO_STORE 5788#define vstore_partial_3_8 NO_STORE 5789#define vstore_partial_3_9 NO_STORE 5790#define vstore_partial_3_10 NO_STORE 5791#define vstore_partial_3_11 NO_STORE 5792#define vstore_partial_3_12 NO_STORE 5793#define vstore_partial_3_13 NO_STORE 5794#define vstore_partial_3_14 NO_STORE 5795#define vstore_partial_3_15 NO_STORE 5796#define vstore_partial_3_16 NO_STORE 5797 5798#define vstore_partial_4_0 NO_STORE 5799#define vstore_partial_4_1 vstore_partial_1 5800#define vstore_partial_4_2 vstore_partial_2 5801#define vstore_partial_4_3 vstore_partial_3 
5802#define vstore_partial_4_4 vstore_partial_4 5803#define vstore_partial_4_5 NO_STORE 5804#define vstore_partial_4_6 NO_STORE 5805#define vstore_partial_4_7 NO_STORE 5806#define vstore_partial_4_8 NO_STORE 5807#define vstore_partial_4_9 NO_STORE 5808#define vstore_partial_4_10 NO_STORE 5809#define vstore_partial_4_11 NO_STORE 5810#define vstore_partial_4_12 NO_STORE 5811#define vstore_partial_4_13 NO_STORE 5812#define vstore_partial_4_14 NO_STORE 5813#define vstore_partial_4_15 NO_STORE 5814#define vstore_partial_4_16 NO_STORE 5815 5816#define vstore_partial_8_0 NO_STORE 5817#define vstore_partial_8_1 vstore_partial_1 5818#define vstore_partial_8_2 vstore_partial_2 5819#define vstore_partial_8_3 vstore_partial_3 5820#define vstore_partial_8_4 vstore_partial_4 5821#define vstore_partial_8_5 vstore_partial_5 5822#define vstore_partial_8_6 vstore_partial_6 5823#define vstore_partial_8_7 vstore_partial_7 5824#define vstore_partial_8_8 vstore_partial_8 5825#define vstore_partial_8_9 NO_STORE 5826#define vstore_partial_8_10 NO_STORE 5827#define vstore_partial_8_11 NO_STORE 5828#define vstore_partial_8_12 NO_STORE 5829#define vstore_partial_8_13 NO_STORE 5830#define vstore_partial_8_14 NO_STORE 5831#define vstore_partial_8_15 NO_STORE 5832#define vstore_partial_8_16 NO_STORE 5833 5834#define vstore_partial_16_0 NO_STORE 5835#define vstore_partial_16_1 vstore_partial_1 5836#define vstore_partial_16_2 vstore_partial_2 5837#define vstore_partial_16_3 vstore_partial_3 5838#define vstore_partial_16_4 vstore_partial_4 5839#define vstore_partial_16_5 vstore_partial_5 5840#define vstore_partial_16_6 vstore_partial_6 5841#define vstore_partial_16_7 vstore_partial_7 5842#define vstore_partial_16_8 vstore_partial_8 5843#define vstore_partial_16_9 vstore_partial_9 5844#define vstore_partial_16_10 vstore_partial_10 5845#define vstore_partial_16_11 vstore_partial_11 5846#define vstore_partial_16_12 vstore_partial_12 5847#define vstore_partial_16_13 vstore_partial_13 5848#define vstore_partial_16_14 vstore_partial_14 5849#define vstore_partial_16_15 vstore_partial_15 5850#define vstore_partial_16_16 vstore_partial_16 5851 5852 5853#define vstore_partial_1(DATA, OFFSET, PTR) \ 5854 vstore1(DATA.s0, OFFSET, PTR); 5855 5856#define vstore_partial_2(DATA, OFFSET, PTR) \ 5857 vstore2(DATA.s01, OFFSET, PTR); 5858 5859#define vstore_partial_3(DATA, OFFSET, PTR) \ 5860 vstore3(DATA.s012, OFFSET, PTR); 5861 5862#define vstore_partial_4(DATA, OFFSET, PTR) \ 5863 vstore4(DATA.s0123, OFFSET, PTR); 5864 5865#define vstore_partial_5(DATA, OFFSET, PTR) \ 5866 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 5867 vstore1(DATA.s4, OFFSET, PTR + 4); 5868 5869#define vstore_partial_6(DATA, OFFSET, PTR) \ 5870 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 5871 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 5872 5873#define vstore_partial_7(DATA, OFFSET, PTR) \ 5874 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 5875 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 5876 5877#define vstore_partial_8(DATA, OFFSET, PTR) \ 5878 vstore8(DATA.s01234567, OFFSET, PTR); 5879 5880#define vstore_partial_9(DATA, OFFSET, PTR) \ 5881 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5882 vstore1(DATA.s8, OFFSET, PTR + 8); 5883 5884#define vstore_partial_10(DATA, OFFSET, PTR) \ 5885 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5886 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 5887 5888#define vstore_partial_11(DATA, OFFSET, PTR) \ 5889 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5890 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 5891 5892#define 
vstore_partial_12(DATA, OFFSET, PTR) \ 5893 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5894 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 5895 5896#define vstore_partial_13(DATA, OFFSET, PTR) \ 5897 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5898 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 5899 5900#define vstore_partial_14(DATA, OFFSET, PTR) \ 5901 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5902 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 5903 5904#define vstore_partial_15(DATA, OFFSET, PTR) \ 5905 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 5906 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 5907 5908#define vstore_partial_16(DATA, OFFSET, PTR) \ 5909 vstore16(DATA, OFFSET, PTR); 5910 5911 5912 5913 5914 5915#define convert_float_sat convert_float 5916#define convert_float1_sat convert_float 5917#define convert_float2_sat convert_float2 5918#define convert_float3_sat convert_float3 5919#define convert_float4_sat convert_float4 5920#define convert_float8_sat convert_float8 5921#define convert_float16_sat convert_float16 5922#define convert_half_sat convert_float 5923#define convert_half1_sat convert_half 5924#define convert_half2_sat convert_half2 5925#define convert_half3_sat convert_half3 5926#define convert_half4_sat convert_half4 5927#define convert_half8_sat convert_half8 5928#define convert_half16_sat convert_half16 5929 5930#define convert_float1 convert_float 5931#define convert_half1 convert_half 5932#define convert_char1 convert_char 5933#define convert_uchar1 convert_uchar 5934#define convert_short1 convert_short 5935#define convert_ushort1 convert_ushort 5936#define convert_int1 convert_int 5937#define convert_uint1 convert_uint 5938#define convert_long1 convert_long 5939#define convert_ulong1 convert_ulong 5940#define convert_double1 convert_double 5941 5942#define convert_char1_sat convert_char_sat 5943#define convert_uchar1_sat convert_uchar_sat 5944#define convert_uchar2_sat convert_uchar2_sat 5945#define convert_uchar3_sat convert_uchar3_sat 5946#define convert_uchar4_sat convert_uchar4_sat 5947#define convert_uchar8_sat convert_uchar8_sat 5948#define convert_uchar16_sat convert_uchar16_sat 5949#define convert_short1_sat convert_short_sat 5950#define convert_ushort1_sat convert_ushort_sat 5951#define convert_int1_sat convert_int_sat 5952#define convert_uint1_sat convert_uint_sat 5953#define convert_long1_sat convert_long_sat 5954#define convert_ulong1_sat convert_ulong_sat 5955#define convert_double1_sat convert_double_sat 5956 5957#define VEC_DATA_TYPE_STR(type, size) type##size 5958#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 5959 5960#define CONVERT_STR(x, type) (convert_##type((x))) 5961#define CONVERT(x, type) CONVERT_STR(x, type) 5962 5963#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 5964#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 5965 5966#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 5967#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 5968 5969#define select_vec_dt_uchar(size) uchar##size 5970#define select_vec_dt_char(size) char##size 5971#define select_vec_dt_ushort(size) ushort##size 5972#define select_vec_dt_short(size) short##size 5973#define select_vec_dt_half(size) short##size 5974#define select_vec_dt_uint(size) uint##size 5975#define select_vec_dt_int(size) int##size 5976#define select_vec_dt_float(size) int##size 5977#define select_vec_dt_ulong(size) ulong##size 5978#define select_vec_dt_long(size) 
long##size 5979 5980#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 5981#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 5982#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 5983 5984#define signed_int_vec_dt_uchar(size) char##size 5985#define signed_int_vec_dt_char(size) char##size 5986#define signed_int_vec_dt_ushort(size) short##size 5987#define signed_int_vec_dt_short(size) short##size 5988#define signed_int_vec_dt_half(size) short##size 5989#define signed_int_vec_dt_uint(size) int##size 5990#define signed_int_vec_dt_int(size) int##size 5991#define signed_int_vec_dt_float(size) int##size 5992#define signed_int_vec_dt_ulong(size) long##size 5993#define signed_int_vec_dt_long(size) long##size 5994 5995#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 5996#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 5997#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 5998 5999#define sum_reduce_1(x) (x) 6000#define sum_reduce_2(x) ((x).s0) + ((x).s1) 6001#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 6002#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 6003#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 6004#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 6005 6006#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 6007#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 6008 6009#define prod_reduce_1(x) (x) 6010#define prod_reduce_2(x) ((x).s0) * ((x).s1) 6011#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 6012#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 6013#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 6014#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 6015 6016#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 6017#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 6018 6019#define max_reduce_1(x) (x) 6020#define max_reduce_2(x) max(((x).s0), ((x).s1)) 6021#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 6022#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 6023#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 6024#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 6025 6026#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 6027#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 6028 6029#define VECTOR_DECLARATION(name) \ 6030 __global uchar *name##_ptr, \ 6031 uint name##_stride_x, \ 6032 uint name##_step_x, \ 6033 uint name##_offset_first_element_in_bytes 6034 6035#define IMAGE_DECLARATION(name) \ 6036 __global uchar *name##_ptr, \ 6037 uint name##_stride_x, \ 6038 uint name##_step_x, \ 6039 uint name##_stride_y, \ 6040 uint name##_step_y, \ 6041 uint name##_offset_first_element_in_bytes 6042 6043#define TENSOR3D_DECLARATION(name) \ 6044 __global uchar *name##_ptr, \ 6045 uint name##_stride_x, \ 6046 uint name##_step_x, \ 6047 uint name##_stride_y, \ 6048 uint name##_step_y, \ 6049 uint name##_stride_z, \ 6050 uint name##_step_z, \ 6051 uint name##_offset_first_element_in_bytes 6052 6053#define TENSOR4D_DECLARATION(name) \ 6054 __global uchar *name##_ptr, \ 6055 uint name##_stride_x, \ 6056 uint name##_step_x, \ 6057 uint name##_stride_y, \ 6058 uint name##_step_y, \ 6059 uint name##_stride_z, \ 6060 uint name##_step_z, \ 6061 uint 
name##_stride_w, \ 6062 uint name##_step_w, \ 6063 uint name##_offset_first_element_in_bytes 6064 6065#define TENSOR5D_DECLARATION(name) \ 6066 __global uchar *name##_ptr, \ 6067 uint name##_stride_x, \ 6068 uint name##_step_x, \ 6069 uint name##_stride_y, \ 6070 uint name##_step_y, \ 6071 uint name##_stride_z, \ 6072 uint name##_step_z, \ 6073 uint name##_stride_w, \ 6074 uint name##_step_w, \ 6075 uint name##_stride_v, \ 6076 uint name##_step_v, \ 6077 uint name##_offset_first_element_in_bytes 6078 6079#define CONVERT_TO_VECTOR_STRUCT(name) \ 6080 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 6081 6082#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 6083 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 6084 6085#define CONVERT_TO_IMAGE_STRUCT(name) \ 6086 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 6087 6088#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 6089 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 6090 6091#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6092 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6093 6094#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 6095 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 6096 6097#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6098 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6099 6100#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 6101 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6102 name##_stride_z, name##_step_z) 6103 6104#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 6105 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 6106 6107#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 6108 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6109 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 6110 6111#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 6112 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 6113 6114#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 6115 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6116 name##_stride_z, name##_step_z) 6117 6118 6119typedef struct Vector 6120{ 6121 __global uchar *ptr; 6122 int offset_first_element_in_bytes; 6123 int stride_x; 6124} Vector; 6125 6126 6127typedef struct Image 6128{ 6129 __global uchar *ptr; 6130 int offset_first_element_in_bytes; 6131 int stride_x; 6132 int stride_y; 6133} Image; 6134 6135 6136typedef struct Tensor3D 6137{ 6138 __global uchar *ptr; 6139 int 
offset_first_element_in_bytes; 6140 int stride_x; 6141 int stride_y; 6142 int stride_z; 6143} Tensor3D; 6144 6145 6146typedef struct Tensor4D 6147{ 6148 __global uchar *ptr; 6149 int offset_first_element_in_bytes; 6150 int stride_x; 6151 int stride_y; 6152 int stride_z; 6153 int stride_w; 6154} Tensor4D; 6155 6156 6157inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 6158{ 6159 Vector vector = 6160 { 6161 .ptr = ptr, 6162 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6163 .stride_x = stride_x, 6164 }; 6165 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 6166 return vector; 6167} 6168 6169 6170inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 6171{ 6172 Image img = 6173 { 6174 .ptr = ptr, 6175 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6176 .stride_x = stride_x, 6177 .stride_y = stride_y 6178 }; 6179 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 6180 return img; 6181} 6182 6183 6184inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6185{ 6186 Image img = 6187 { 6188 .ptr = ptr, 6189 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6190 .stride_x = stride_x, 6191 .stride_y = stride_y 6192 }; 6193 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6194 return img; 6195} 6196 6197 6198inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6199{ 6200 Tensor3D tensor = 6201 { 6202 .ptr = ptr, 6203 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6204 .stride_x = stride_x, 6205 .stride_y = stride_y, 6206 .stride_z = stride_z 6207 }; 6208 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6209 return tensor; 6210} 6211 6212 6213inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6214{ 6215 Tensor3D tensor = 6216 { 6217 .ptr = ptr, 6218 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6219 .stride_x = stride_x, 6220 .stride_y = stride_y, 6221 .stride_z = stride_z 6222 }; 6223 return tensor; 6224} 6225 6226inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 6227 uint step_w, 6228 uint mod_size) 6229{ 6230 Tensor4D tensor = 6231 { 6232 .ptr = ptr, 6233 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6234 .stride_x = stride_x, 6235 .stride_y = stride_y, 6236 .stride_z = stride_z, 6237 .stride_w = stride_w 6238 }; 6239 6240 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 6241 return tensor; 6242} 6243 6244 6245inline __global const uchar *vector_offset(const Vector *vec, int x) 6246{ 6247 return 
vec->ptr + x * vec->stride_x; 6248} 6249 6250 6251inline __global uchar *offset(const Image *img, int x, int y) 6252{ 6253 return img->ptr + x * img->stride_x + y * img->stride_y; 6254} 6255 6256 6257inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 6258{ 6259 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 6260} 6261 6262 6263inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 6264{ 6265 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 6266} 6267 6268 6269inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 6270{ 6271 uint num_elements = width * height; 6272 6273 const uint z = index / num_elements; 6274 6275 index %= num_elements; 6276 6277 const uint y = index / width; 6278 6279 index %= width; 6280 6281 const uint x = index; 6282 6283 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 6284} 6285 6286#endif 6287 6288 6289 6290#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) 6291#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ 6292 P_X##_DEF(1, P_A, P_B, P_C); \ 6293 REPEAT_3_1(P_X, P_A, P_B, P_C) 6294#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ 6295 P_X##_DEF(2, P_A, P_B, P_C); \ 6296 REPEAT_3_2(P_X, P_A, P_B, P_C) 6297#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ 6298 P_X##_DEF(3, P_A, P_B, P_C); \ 6299 REPEAT_3_3(P_X, P_A, P_B, P_C) 6300#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ 6301 P_X##_DEF(4, P_A, P_B, P_C); \ 6302 REPEAT_3_4(P_X, P_A, P_B, P_C) 6303#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ 6304 P_X##_DEF(5, P_A, P_B, P_C); \ 6305 REPEAT_3_5(P_X, P_A, P_B, P_C) 6306#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ 6307 P_X##_DEF(6, P_A, P_B, P_C); \ 6308 REPEAT_3_6(P_X, P_A, P_B, P_C) 6309#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ 6310 P_X##_DEF(7, P_A, P_B, P_C); \ 6311 REPEAT_3_7(P_X, P_A, P_B, P_C) 6312#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ 6313 P_X##_DEF(8, P_A, P_B, P_C); \ 6314 REPEAT_3_8(P_X, P_A, P_B, P_C) 6315#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 6316 P_X##_DEF(9, P_A, P_B, P_C); \ 6317 REPEAT_3_9(P_X, P_A, P_B, P_C) 6318#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 6319 P_X##_DEF(A, P_A, P_B, P_C); \ 6320 REPEAT_3_10(P_X, P_A, P_B, P_C) 6321#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 6322 P_X##_DEF(B, P_A, P_B, P_C); \ 6323 REPEAT_3_11(P_X, P_A, P_B, P_C) 6324#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 6325 P_X##_DEF(C, P_A, P_B, P_C); \ 6326 REPEAT_3_12(P_X, P_A, P_B, P_C) 6327#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 6328 P_X##_DEF(D, P_A, P_B, P_C); \ 6329 REPEAT_3_13(P_X, P_A, P_B, P_C) 6330#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 6331 P_X##_DEF(E, P_A, P_B, P_C); \ 6332 REPEAT_3_14(P_X, P_A, P_B, P_C) 6333#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 6334 P_X##_DEF(F, P_A, P_B, P_C); \ 6335 REPEAT_3_15(P_X, P_A, P_B, P_C) 6336 6337#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 6338#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 6339 6340 6341#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 6342#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 6343 P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 6344 REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 6345#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 6346 P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 6347 REPEAT_4_2(P_X, P_A, P_B, P_C, 
P_D) 6348#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 6349 P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 6350 REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 6351#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ 6352 P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 6353 REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 6354#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ 6355 P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 6356 REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 6357#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 6358 P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 6359 REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 6360#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 6361 P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 6362 REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 6363#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 6364 P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 6365 REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 6366#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 6367 P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 6368 REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 6369#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 6370 P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 6371 REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 6372#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 6373 P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 6374 REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 6375#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 6376 P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 6377 REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 6378#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 6379 P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 6380 REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 6381#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 6382 P_X##_DEF(E, P_A, P_B, P_C, P_D); \ 6383 REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 6384#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 6385 P_X##_DEF(F, P_A, P_B, P_C, P_D); \ 6386 REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 6387 6388#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 6389#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 6390 6391 6392#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 6393#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 6394 6395 6396#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 6397#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 6398 6399 6400#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 6401#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 6402 6403 6404#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 6405#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 6406 6407 6408#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 6409#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 6410 6411 6412#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC 6413#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 6414 6415 6416#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 6417#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 6418 6419 6420#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 6421#define REPEAT_MAX_CONST_VAR(N, 
TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 6422 6423 6424#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 6425#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 6426 6427 6428#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6429#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6430 6431 6432#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6433#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6434 6435 6436#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ 6437 ({ \ 6438 VEC_DATA_TYPE(int, N0) \ 6439 VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6440 VEC_DATA_TYPE(int, N0) \ 6441 VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6442 VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 6443 }) 6444#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 6445 6446#endif 6447 6448#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6449#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6450 6451 6452 6453 6454#define TILE_VECTOR_SIZE1 1 6455#define TILE_VECTOR_SIZE2 2 6456#define TILE_VECTOR_SIZE3 3 6457#define TILE_VECTOR_SIZE4 4 6458#define TILE_VECTOR_SIZE5 8 6459#define TILE_VECTOR_SIZE6 8 6460#define TILE_VECTOR_SIZE7 8 6461#define TILE_VECTOR_SIZE8 8 6462#define TILE_VECTOR_SIZE9 16 6463#define TILE_VECTOR_SIZE10 16 6464#define TILE_VECTOR_SIZE11 16 6465#define TILE_VECTOR_SIZE12 16 6466#define TILE_VECTOR_SIZE13 16 6467#define TILE_VECTOR_SIZE14 16 6468#define TILE_VECTOR_SIZE15 16 6469#define TILE_VECTOR_SIZE16 16 6470 6471#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1 6472#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2 6473#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3 6474#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4 6475#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8 6476#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8 6477#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8 6478#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8 6479#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16 6480#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16 6481#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16 6482#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16 6483#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16 6484#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16 6485#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16 6486#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16 6487 6488 6489#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME) 6490#define TILE_STR(DATA_TYPE, H, W, BASENAME) \ 6491 union { \ 6492 DATA_TYPE s[TILE_VECTOR_SIZE##W]; \ 6493 TILE_VECTOR_TYPE##W(DATA_TYPE) v; \ 6494 } BASENAME[H] 6495 6496#define TENSOR4D_IMAGE(name) \ 6497 
__read_only image2d_t name##_img, \ 6498 __global uchar *name##_ptr, \ 6499 uint name##_stride_x, \ 6500 uint name##_step_x, \ 6501 uint name##_stride_y, \ 6502 uint name##_step_y, \ 6503 uint name##_stride_z, \ 6504 uint name##_step_z, \ 6505 uint name##_stride_w, \ 6506 uint name##_step_w, \ 6507 uint name##_offset_first_element_in_bytes 6508 6509#define TENSOR4D_BUFFER(name) \ 6510 __global uchar *name##_ptr, \ 6511 uint name##_stride_x, \ 6512 uint name##_step_x, \ 6513 uint name##_stride_y, \ 6514 uint name##_step_y, \ 6515 uint name##_stride_z, \ 6516 uint name##_step_z, \ 6517 uint name##_stride_w, \ 6518 uint name##_step_w, \ 6519 uint name##_offset_first_element_in_bytes 6520 6521#define TENSOR4D_STR(name, type) TENSOR4D_##type(name) 6522#define TENSOR4D(name, type) TENSOR4D_STR(name, type) 6523 6524#define TENSOR4D_T_IMAGE(name) \ 6525 __read_only image2d_t name##_img, \ 6526 __global uchar *name##_ptr, \ 6527 uint name##_stride_y, \ 6528 uint name##_stride_z, \ 6529 uint name##_stride_w, \ 6530 uint name##_c, \ 6531 uint name##_w, \ 6532 uint name##_h, \ 6533 uint name##_n, \ 6534 uint name##_offset_first_element_in_bytes 6535 6536#define TENSOR4D_T_BUFFER(name) \ 6537 __global uchar *name##_ptr, \ 6538 uint name##_stride_y, \ 6539 uint name##_stride_z, \ 6540 uint name##_stride_w, \ 6541 uint name##_c, \ 6542 uint name##_w, \ 6543 uint name##_h, \ 6544 uint name##_n, \ 6545 uint name##_offset_first_element_in_bytes 6546 6547#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) 6548 6549 6550#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) 6551 6552#define TENSOR4D_RO_T_IMAGE(name) \ 6553 __read_only image2d_t name##_img, \ 6554 TENSOR4D_T_BUFFER(name) 6555 6556#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 6557 6558#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) 6559 6560 6561#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) 6562 6563#define TENSOR4D_WO_T_IMAGE(name) \ 6564 __write_only image2d_t name##_img, \ 6565 TENSOR4D_T_BUFFER(name) 6566 6567#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 6568 6569#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) 6570 6571 6572#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) 6573 6574#define TENSOR3D_T_IMAGE(name) \ 6575 __read_only image2d_t name##_img, \ 6576 __global uchar *name##_ptr, \ 6577 uint name##_stride_y, \ 6578 uint name##_stride_z, \ 6579 uint name##_w, \ 6580 uint name##_h, \ 6581 uint name##_n, \ 6582 uint name##_offset_first_element_in_bytes 6583 6584#define TENSOR3D_T_BUFFER(name) \ 6585 __global uchar *name##_ptr, \ 6586 uint name##_stride_y, \ 6587 uint name##_stride_z, \ 6588 uint name##_w, \ 6589 uint name##_h, \ 6590 uint name##_n, \ 6591 uint name##_offset_first_element_in_bytes 6592 6593#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name) 6594#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type) 6595 6596#if !defined(UNROLL_WITH_PRAGMA) 6597#define UNROLL_INCR(idx, step, macro) idx += (step); (macro) 6598 6599#define LOOP_UNROLLING_1(idx, step, macro) (macro) 6600#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro) 6601#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro) 6602#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro) 6603#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro) 
6604#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro) 6605#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro) 6606#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro) 6607#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro) 6608#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro) 6609#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro) 6610#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro) 6611#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro) 6612#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro) 6613#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro) 6614#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro) 6615#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro) 6616#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro) 6617#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro) 6618#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro) 6619#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro) 6620#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro) 6621#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro) 6622#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro) 6623#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro) 6624#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro) 6625#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro) 6626#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro) 6627#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro) 6628#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro) 6629#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro) 6630#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro) 6631#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro) 6632#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro) 6633#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro) 6634#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, 
macro) 6635#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro) 6636#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro) 6637#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro) 6638#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro) 6639#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro) 6640#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro) 6641#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro) 6642#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro) 6643#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro) 6644#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro) 6645#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro) 6646#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro) 6647#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro) 6648#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro) 6649#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro) 6650#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro) 6651#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro) 6652#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro) 6653#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro) 6654#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro) 6655#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro) 6656#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro) 6657#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro) 6658#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro) 6659#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro) 6660#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro) 6661#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro) 6662#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro) 6663#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro) 6664#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro) 6665#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); 
UNROLL_INCR(idx, step, macro) 6666#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro) 6667#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro) 6668#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro) 6669#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro) 6670#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro) 6671#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro) 6672#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro) 6673#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro) 6674#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro) 6675#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro) 6676#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro) 6677#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro) 6678#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro) 6679#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro) 6680#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro) 6681#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro) 6682#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro) 6683#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro) 6684#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro) 6685#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro) 6686#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro) 6687#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro) 6688#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro) 6689#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro) 6690#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro) 6691#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro) 6692#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro) 6693#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro) 6694#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro) 6695#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro) 6696#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, 
step, macro); UNROLL_INCR(idx, step, macro) 6697#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro) 6698#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro) 6699#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro) 6700#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro) 6701#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro) 6702#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro) 6703#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro) 6704#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro) 6705#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro) 6706#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro) 6707#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro) 6708#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro) 6709#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro) 6710#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro) 6711#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro) 6712#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro) 6713#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro) 6714#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro) 6715#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro) 6716#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro) 6717#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro) 6718#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro) 6719#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro) 6720#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro) 6721#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro) 6722#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro) 6723#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro) 6724#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro) 6725#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro) 6726#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro) 
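// LOOP_UNROLLING(type, idx, start, step, num, macro), defined below, repeats `macro` num times
// at preprocessing time. Without UNROLL_WITH_PRAGMA it expands through the LOOP_UNROLLING_<num>
// ladder above, where each level pastes one more copy of `macro` and UNROLL_INCR advances `idx`
// by `step`; with UNROLL_WITH_PRAGMA it emits a _Pragma("unroll") for-loop instead.
// Illustrative expansion (the array `a` is a placeholder, not part of this header):
//   LOOP_UNROLLING(int, i, 0, 1, 3, a[i] = 0)
//   roughly becomes { int i = 0; (a[i] = 0); i += (1); (a[i] = 0); i += (1); (a[i] = 0); }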

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        type idx = start; \
        LOOP_UNROLLING_##num(idx, step, macro); \
    }
#else
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        _Pragma("unroll") \
        for(type idx = start; idx < (num * step); idx += step) \
        { \
            (macro); \
        } \
    }
#endif
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)


#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))


#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
    })
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \
    ({ \
        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
    })
#endif
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
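// Illustrative usage: with a char4 'a', char4 'b' and an int accumulator 'acc', the call
//   DOT_PRODUCT_INTEGER8(char, char, int, 4, a, b, acc);
// resolves to DOT_PRODUCT4_INTEGER8, which maps to dot()/arm_dot()/arm_dot_acc() when the
// corresponding integer dot-product extension is enabled, or to the scalar multiply-accumulate
// fallback above otherwise. Widths that are not a multiple of four are split into 4-wide and
// remainder parts by the DOT_PRODUCT5..16 wrappers.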

#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)


#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH) \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))


#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH) \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)


#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        }) \
    })


#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })


#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            }) \
        } \
    })
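// Illustrative usage of the tile loaders above: to fill an M0 x N0 tile from a BUFFER tensor
// 'src' starting at element column 'x' and row 'y' (one vector load per row),
//   TILE(DATA_TYPE, M0, N0, in);
//   T_LOAD(DATA_TYPE, M0, N0, BUFFER, src, x, y, 1, src_stride_y, in);
// The TENSOR_TYPE argument selects V_LOAD_BUFFER or V_LOAD_IMAGE; the IMAGE path assumes the
// tensor is also bound as a 'TENSOR##_img' image2d addressed in 4-element pixel units.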
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })


#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int  _src_y = (X) + _xk * (DILATION_X); \
                int  _src_z = ((Y) + _yk * (DILATION_Y)); \
                int  _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })


#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })


#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })

#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })


#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
    })


#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                }) \
            }) \
        }) \
    })
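// The T_QUANTIZE8_* helpers below requantize a 32-bit accumulator approximately as
//   dst = convert_sat(round(src * DST_MULTIPLIER * 2^-31 * 2^-DST_SHIFT) + DST_OFFSET)
// using a rounding-doubling high multiply ((a * b + nudge) >> 31 with nudge = +/-2^30) followed
// by a rounding right shift when the shift is non-negative; negative shifts are applied up front
// as a left shift on 'src'. See the macro bodies for the exact arithmetic.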

#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)


#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })


#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })


#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })


#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })


#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))


#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })


#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        }) \
    })


#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
        }) \
    })

#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)


#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
        }) \
    })


#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })


#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)


#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })


#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })
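// Illustrative usage: T_MMUL accumulates an M0 x N0 tile as dst += lhs * rhs^T, where lhs is an
// M0 x K0 tile and rhs is stored as an N0 x K0 tile. For example, with float tiles:
//   TILE(float, M0, K0, lhs);
//   TILE(float, N0, K0, rhs);
//   TILE(float, M0, N0, dst);
//   T_MMUL(float, float, float, M0, N0, K0, NT, T, lhs, rhs, dst);
// resolves to T_MMUL_NT_T_FLOAT, the fma-based triple loop defined below; the int8 type triples
// route to T_MMUL_NT_T_INTEGER8, which reuses DOT_PRODUCT_INTEGER8.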

#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    { \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                LOOP_UNROLLING(int, _k, 0, 1, K0, \
                { \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                }) \
            }) \
        }) \
    }

#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            }) \
        }) \
    })

#endif

#if defined(RESHAPE_LHS_NT)

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
                                         TENSOR3D_T(dst, BUFFER),
                                         const int M,
                                         const int V0)
{

#define BLOCK_SIZE ((M0) * (K0))

#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif

#if defined(INTERLEAVE)
#define OUTPUT_STEP_X (K0) * (V0)
#else
#define OUTPUT_STEP_X (K0)
#endif

    const int x = GET_SPATIAL_IDX(0, 1, 0);
    const int y = GET_SPATIAL_IDX(1, 1, 0);
    const int z = GET_SPATIAL_IDX(2, 1, 0);

    const int xi = x * K0;
    const int yi = y * M0;

    const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
    const int yo = (y / V0);

    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

    TILE(DATA_TYPE, M0, K0, in);

    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        in[_i].v = 0;
    });

    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);

    TILE(uint, M0, 1, in_indirect_y);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        in_indirect_y[_i].v = _i;
    });
#if PARTIAL_M0 != 0
    if(y_cond)
    {
        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
    }
    else
#endif
    {
        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
    }

    TILE(uint, M0, 1, dst_indirect_y);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        dst_indirect_y[_i].v = _i;
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
#endif
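// The kernel below is the transposed variant of gemm_reshape_lhs_matrix_nt above: it loads the
// same M0 x K0 block (M0, K0, PARTIAL_M0, PARTIAL_K0 and DATA_TYPE are compile-time defines,
// V0 is a kernel argument), transposes it into a K0 x M0 block and stores it, interleaving V0
// blocks per output row when INTERLEAVE is defined.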
#if defined(RESHAPE_LHS_T)

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
                                        TENSOR3D_T(dst, BUFFER),
                                        const int M,
                                        const int V0)
{

#define BLOCK_SIZE ((M0) * (K0))

#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#else
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif

#if defined(INTERLEAVE)
#define OUTPUT_STEP_X (M0) * (V0)
#else
#define OUTPUT_STEP_X (M0)
#endif

    const int x = GET_SPATIAL_IDX(0, 1, 0);
    const int y = GET_SPATIAL_IDX(1, 1, 0);
    const int z = GET_SPATIAL_IDX(2, 1, 0);

    const int xi = x * K0;
    const int yi = y * M0;

    const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
    const int yo = (y / V0);

    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

    TILE(DATA_TYPE, M0, K0, in);
    TILE(DATA_TYPE, K0, M0, in_tr);

    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        in[_i].v = 0;
    });

    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);

    TILE(uint, M0, 1, in_indirect_y);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        in_indirect_y[_i].v = _i;
    });
#if PARTIAL_M0 != 0
    if(y_cond)
    {
        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
    }
    else
#endif
    {
        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
    }

    LOOP_UNROLLING(int, m0, 0, 1, M0,
    {
        LOOP_UNROLLING(int, k0, 0, 1, K0,
        {
            in_tr[k0].s[m0] = in[m0].s[k0];
        })
    });

    TILE(uint, K0, 1, dst_indirect_y);
    LOOP_UNROLLING(int, _i, 0, 1, K0,
    {
        dst_indirect_y[_i].v = _i;
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
#endif

#if defined(RESHAPE_RHS_NT)

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
                                         TENSOR3D_T(dst, BUFFER),
                                         const int H0)
{

#define BLOCK_SIZE ((K0) * (N0))

#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#else
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif

#if defined(INTERLEAVE)
#define OUTPUT_STEP_X (N0) * (H0)
#else
#define OUTPUT_STEP_X (N0)
#endif

    const int x = GET_SPATIAL_IDX(0, 1, 0);
    const int y = GET_SPATIAL_IDX(1, 1, 0);
    const int z = GET_SPATIAL_IDX(2, 1, 0);

    const int xi = x * N0;
    const int yi = y * K0;

    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
    const int yo = (x / H0);

    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

    TILE(DATA_TYPE, K0, N0, in);

    for(int i = 0; i < K0; ++i)
    {
        in[i].v = 0;
    }

    for(int i = 0; i < K0; ++i)
    {
        if(yi + i < src_h)
        {
            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
        }
    }

    TILE(uint, K0, 1, dst_indirect_y);
    for(int i = 0; i < K0; ++i)
    {
        dst_indirect_y[i].v = i;
    }
    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
#endif

#if defined(RESHAPE_RHS_T)

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
                                        TENSOR3D_T(dst, BUFFER),
                                        const int H0)
{

#define BLOCK_SIZE ((K0) * (N0))

#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif

#if defined(INTERLEAVE)
#define OUTPUT_STEP_X (K0) * (H0)
#else
#define OUTPUT_STEP_X (K0)
#endif

    const int x = GET_SPATIAL_IDX(0, 1, 0);
    const int y = GET_SPATIAL_IDX(1, 1, 0);
    const int z = GET_SPATIAL_IDX(2, 1, 0);

    const int xi = x * N0;
    const int yi = y * K0;

    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
    const int yo = (x / H0);

    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

    TILE(DATA_TYPE, K0, N0, in);
    TILE(DATA_TYPE, N0, K0, in_tr);

    for(int i = 0; i < K0; ++i)
    {
        in[i].v = 0;
    }

    for(int i = 0; i < K0; ++i)
    {
        if(yi + i < src_h)
        {
            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
        }
    }

    for(int k0 = 0; k0 < K0; ++k0)
    {
        for(int n0 = 0; n0 < N0; ++n0)
        {
            in_tr[n0].s[k0] = in[k0].s[n0];
        }
    }

    TILE(uint, N0, 1, dst_indirect_y);
    for(int i = 0; i < N0; ++i)
    {
        dst_indirect_y[i].v = i;
    }

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

#endif )"