1 2 3 4#ifndef ARM_COMPUTE_HELPER_H 5#define ARM_COMPUTE_HELPER_H 6 7 8 9 10#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 11 VSTORE(N0) \ 12 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 13 14#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 15 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 16 VSTORE(N0) \ 17 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 18 19#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 20 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 21 VSTORE(N0) \ 22 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 23 24#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 25 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 26 VSTORE(N0) \ 27 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 28 29#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 30 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 31 VSTORE(N0) \ 32 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 33 34#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 35 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 36 VSTORE(N0) \ 37 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 38 39#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 40 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 41 VSTORE(N0) \ 42 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 43 44#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 45 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 46 VSTORE(N0) \ 47 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 48 49#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 50 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 51 VSTORE(N0) \ 52 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 53 54#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 55 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 56 VSTORE(N0) \ 57 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 58 59#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 60 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 61 VSTORE(N0) \ 62 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 63 64#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 65 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 66 VSTORE(N0) \ 67 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 68 69#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 70 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 71 VSTORE(N0) \ 72 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 73 74#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 75 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 76 VSTORE(N0) \ 77 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 78 79#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 80 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 81 VSTORE(N0) \ 82 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 83 84#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 85 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 86 VSTORE(N0) \ 87 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 88 89 90
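/*
 * Usage sketch (illustrative addition, not from the original header; assumes a
 * kernel in which per-row z offsets zout0 and zout1 are in scope): STORE_ROW_n
 * emits one VSTORE(N0) per row for the n row variables BASENAME##0 ..
 * BASENAME##(n-1), and STORE_BLOCK(M0, ...) further below dispatches to
 * STORE_ROW_##M0. For a hypothetical 2x4 float tile held in c0 and c1,
 *
 *     STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout);
 *
 * expands to
 *
 *     vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
 *     vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));
 */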
91#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 92 VSTORE(N0) \ 93 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 94 95#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 96 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 97 VSTORE(N0) \ 98 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 99 100#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 101 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 102 VSTORE(N0) \ 103 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 104 105#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 106 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 107 VSTORE(N0) \ 108 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 109 110#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 111 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 112 VSTORE(N0) \ 113 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 114 115#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 116 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 117 VSTORE(N0) \ 118 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 119 120#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 121 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 122 VSTORE(N0) \ 123 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 124 125#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 126 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 127 VSTORE(N0) \ 128 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 129 130#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 131 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 132 VSTORE(N0) \ 133 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 134 135#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 136 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 137 VSTORE(N0) \ 138 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 139 140#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 141 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 142 VSTORE(N0) \ 143 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 144 145#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 146 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 147 VSTORE(N0) \ 148 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 149 150#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 151 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 152 VSTORE(N0) \ 153 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
(__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 154 155#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 156 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 157 VSTORE(N0) \ 158 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 159 160#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 161 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 162 VSTORE(N0) \ 163 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 164 165#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 166 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 167 VSTORE(N0) \ 168 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 169 170 171 172 173#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 174#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 175 176 177 178#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 179#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 180 181 182 183#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 184 VSTORE_PARTIAL(N0, STORE_N0) \ 185 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 186 187#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 188 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 189 VSTORE_PARTIAL(N0, STORE_N0) \ 190 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 191 192#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 193 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 194 VSTORE_PARTIAL(N0, STORE_N0) \ 195 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 196 197#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 198 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 199 VSTORE_PARTIAL(N0, STORE_N0) \ 200 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 201 202#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 203 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 204 VSTORE_PARTIAL(N0, STORE_N0) \ 205 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 206 207#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 208 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 209 VSTORE_PARTIAL(N0, STORE_N0) \ 210 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 211 212#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 213 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 214 VSTORE_PARTIAL(N0, STORE_N0) \ 215 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 216 217#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 218 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 219 
VSTORE_PARTIAL(N0, STORE_N0) \ 220 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 221 222#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 223 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 224 VSTORE_PARTIAL(N0, STORE_N0) \ 225 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 226 227#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 228 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 229 VSTORE_PARTIAL(N0, STORE_N0) \ 230 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 231 232#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 233 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 234 VSTORE_PARTIAL(N0, STORE_N0) \ 235 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 236 237#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 238 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 239 VSTORE_PARTIAL(N0, STORE_N0) \ 240 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 241 242#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 243 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 244 VSTORE_PARTIAL(N0, STORE_N0) \ 245 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 246 247#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 248 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 249 VSTORE_PARTIAL(N0, STORE_N0) \ 250 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 251 252#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 253 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 254 VSTORE_PARTIAL(N0, STORE_N0) \ 255 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 256 257#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 258 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 259 VSTORE_PARTIAL(N0, STORE_N0) \ 260 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 261 262 263 264#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 265#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 266 267#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 268 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 269 { \ 270 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 271 } \ 272 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 273 { \ 274 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 275 } \ 276 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 277 { \ 278 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 279 } \ 280 else \ 281 { \ 282 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 283 } 284 285#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, 
Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 286 if(!(PARTIAL_COND_X)) \ 287 { \ 288 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 289 } \ 290 else \ 291 { \ 292 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 293 } 294 295#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 296 if(!(PARTIAL_COND_Y)) \ 297 { \ 298 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 299 } \ 300 else \ 301 { \ 302 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 303 } 304 305 306#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 307 308 309#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 310 311#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 312 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 313 314#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 315 316#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 317 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 318 319#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 320 321#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 322 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 323 324#else 325 326#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 327 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 328 329#endif 330 331#endif 332 333 334#if defined(PARTIAL_STORE_M0) 335 336#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 337 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 338#else 339#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 340 ((uint)(y * M0)) 341#endif 342 343 344 345#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 346 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 347 348 349#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 350#pragma OPENCL EXTENSION cl_khr_fp16 : enable 351#endif 352 353#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 354#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 355#endif 356 357#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 358#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 359#endif 360 361#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 362#pragma OPENCL EXTENSION cl_arm_printf : enable 363#endif 364 365#define GPU_ARCH_MIDGARD 0x100 366#define GPU_ARCH_BIFROST 0x200 367#define GPU_ARCH_VALHALL 0x300 368 369 370#define CONCAT(a, b) a##b 371 372 373#define EXPAND(x) x 374 375 376#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 377 378 379#define REV1(x) ((x)) 380#define REV2(x) ((x).s10) 381#define REV3(x) ((x).s210) 382#define REV4(x) ((x).s3210) 383#define REV8(x) ((x).s76543210) 384#define REV16(x) 
((x).sFEDCBA9876543210) 385 386 387 388#define REVERSE_STR(x, s) REV##s((x)) 389#define REVERSE(x, s) REVERSE_STR(x, s) 390 391 392 393#define ROT1_0(x) ((x)) 394#define ROT1_1(x) ((x)) 395 396#define ROT2_0(x) ((x)) 397#define ROT2_1(x) ((x).s10) 398#define ROT2_2(x) ((x)) 399 400#define ROT3_0(x) ((x)) 401#define ROT3_1(x) ((x).s201) 402#define ROT3_2(x) ((x).s120) 403#define ROT3_3(x) ((x)) 404 405#define ROT4_0(x) ((x)) 406#define ROT4_1(x) ((x).s3012) 407#define ROT4_2(x) ((x).s2301) 408#define ROT4_3(x) ((x).s1230) 409#define ROT4_4(x) ((x)) 410 411#define ROT8_0(x) ((x)) 412#define ROT8_1(x) ((x).s70123456) 413#define ROT8_2(x) ((x).s67012345) 414#define ROT8_3(x) ((x).s56701234) 415#define ROT8_4(x) ((x).s45670123) 416#define ROT8_5(x) ((x).s34567012) 417#define ROT8_6(x) ((x).s23456701) 418#define ROT8_7(x) ((x).s12345670) 419#define ROT8_8(x) ((x)) 420 421#define ROT16_0(x) ((x)) 422#define ROT16_1(x) ((x).sF0123456789ABCDE) 423#define ROT16_2(x) ((x).sEF0123456789ABCD) 424#define ROT16_3(x) ((x).sDEF0123456789ABC) 425#define ROT16_4(x) ((x).sCDEF0123456789AB) 426#define ROT16_5(x) ((x).sBCDEF0123456789A) 427#define ROT16_6(x) ((x).sABCDEF0123456789) 428#define ROT16_7(x) ((x).s9ABCDEF012345678) 429#define ROT16_8(x) ((x).s89ABCDEF01234567) 430#define ROT16_9(x) ((x).s789ABCDEF0123456) 431#define ROT16_10(x) ((x).s6789ABCDEF012345) 432#define ROT16_11(x) ((x).s56789ABCDEF01234) 433#define ROT16_12(x) ((x).s456789ABCDEF0123) 434#define ROT16_13(x) ((x).s3456789ABCDEF012) 435#define ROT16_14(x) ((x).s23456789ABCDEF01) 436#define ROT16_15(x) ((x).s123456789ABCDEF0) 437#define ROT16_16(x) ((x)) 438 439 440 441#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 442#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 443 444 445 446#define V_OFFS1(dt) (dt##1)(0) 447#define V_OFFS2(dt) (dt##2)(0, 1) 448#define V_OFFS3(dt) (dt##3)(0, 1, 2) 449#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 450#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 451#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 452 453 454 455#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 456#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 457 458 459#define VLOAD_STR(size) vload##size 460#define VLOAD(size) VLOAD_STR(size) 461 462 463#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 464#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 465 466#define NO_LOAD(data, offs, ptr) \ 467 { \ 468 } 469 470 471#define vload_partial_1_0 NO_LOAD 472#define vload_partial_1_1 vload1 473#define vload_partial_1_2 NO_LOAD 474#define vload_partial_1_3 NO_LOAD 475#define vload_partial_1_4 NO_LOAD 476#define vload_partial_1_5 NO_LOAD 477#define vload_partial_1_6 NO_LOAD 478#define vload_partial_1_7 NO_LOAD 479#define vload_partial_1_8 NO_LOAD 480#define vload_partial_1_9 NO_LOAD 481#define vload_partial_1_10 NO_LOAD 482#define vload_partial_1_11 NO_LOAD 483#define vload_partial_1_12 NO_LOAD 484#define vload_partial_1_13 NO_LOAD 485#define vload_partial_1_14 NO_LOAD 486#define vload_partial_1_15 NO_LOAD 487#define vload_partial_1_16 NO_LOAD 488 489#define vload_partial_2_0 NO_LOAD 490#define vload_partial_2_1 vload_partial_1 491#define vload_partial_2_2 vload_partial_2 492#define vload_partial_2_3 NO_LOAD 493#define vload_partial_2_4 NO_LOAD 494#define vload_partial_2_5 NO_LOAD 495#define vload_partial_2_6 NO_LOAD 496#define vload_partial_2_7 NO_LOAD 497#define vload_partial_2_8 NO_LOAD 498#define vload_partial_2_9 NO_LOAD 499#define vload_partial_2_10 NO_LOAD 500#define vload_partial_2_11 
NO_LOAD 501#define vload_partial_2_12 NO_LOAD 502#define vload_partial_2_13 NO_LOAD 503#define vload_partial_2_14 NO_LOAD 504#define vload_partial_2_15 NO_LOAD 505#define vload_partial_2_16 NO_LOAD 506 507#define vload_partial_3_0 NO_LOAD 508#define vload_partial_3_1 vload_partial_1 509#define vload_partial_3_2 vload_partial_2 510#define vload_partial_3_3 vload_partial_3 511#define vload_partial_3_4 NO_LOAD 512#define vload_partial_3_5 NO_LOAD 513#define vload_partial_3_6 NO_LOAD 514#define vload_partial_3_7 NO_LOAD 515#define vload_partial_3_8 NO_LOAD 516#define vload_partial_3_9 NO_LOAD 517#define vload_partial_3_10 NO_LOAD 518#define vload_partial_3_11 NO_LOAD 519#define vload_partial_3_12 NO_LOAD 520#define vload_partial_3_13 NO_LOAD 521#define vload_partial_3_14 NO_LOAD 522#define vload_partial_3_15 NO_LOAD 523#define vload_partial_3_16 NO_LOAD 524 525#define vload_partial_4_0 NO_LOAD 526#define vload_partial_4_1 vload_partial_1 527#define vload_partial_4_2 vload_partial_2 528#define vload_partial_4_3 vload_partial_3 529#define vload_partial_4_4 vload_partial_4 530#define vload_partial_4_5 NO_LOAD 531#define vload_partial_4_6 NO_LOAD 532#define vload_partial_4_7 NO_LOAD 533#define vload_partial_4_8 NO_LOAD 534#define vload_partial_4_9 NO_LOAD 535#define vload_partial_4_10 NO_LOAD 536#define vload_partial_4_11 NO_LOAD 537#define vload_partial_4_12 NO_LOAD 538#define vload_partial_4_13 NO_LOAD 539#define vload_partial_4_14 NO_LOAD 540#define vload_partial_4_15 NO_LOAD 541#define vload_partial_4_16 NO_LOAD 542 543#define vload_partial_8_0 NO_LOAD 544#define vload_partial_8_1 vload_partial_1 545#define vload_partial_8_2 vload_partial_2 546#define vload_partial_8_3 vload_partial_3 547#define vload_partial_8_4 vload_partial_4 548#define vload_partial_8_5 vload_partial_5 549#define vload_partial_8_6 vload_partial_6 550#define vload_partial_8_7 vload_partial_7 551#define vload_partial_8_8 vload_partial_8 552#define vload_partial_8_9 NO_LOAD 553#define vload_partial_8_10 NO_LOAD 554#define vload_partial_8_11 NO_LOAD 555#define vload_partial_8_12 NO_LOAD 556#define vload_partial_8_13 NO_LOAD 557#define vload_partial_8_14 NO_LOAD 558#define vload_partial_8_15 NO_LOAD 559#define vload_partial_8_16 NO_LOAD 560 561#define vload_partial_16_0 NO_LOAD 562#define vload_partial_16_1 vload_partial_1 563#define vload_partial_16_2 vload_partial_2 564#define vload_partial_16_3 vload_partial_3 565#define vload_partial_16_4 vload_partial_4 566#define vload_partial_16_5 vload_partial_5 567#define vload_partial_16_6 vload_partial_6 568#define vload_partial_16_7 vload_partial_7 569#define vload_partial_16_8 vload_partial_8 570#define vload_partial_16_9 vload_partial_9 571#define vload_partial_16_10 vload_partial_10 572#define vload_partial_16_11 vload_partial_11 573#define vload_partial_16_12 vload_partial_12 574#define vload_partial_16_13 vload_partial_13 575#define vload_partial_16_14 vload_partial_14 576#define vload_partial_16_15 vload_partial_15 577#define vload_partial_16_16 vload_partial_16 578 579 580#define vload_partial_1(DATA, OFFSET, PTR) \ 581 DATA.s0 = vload1(OFFSET, PTR); 582 583#define vload_partial_2(DATA, OFFSET, PTR) \ 584 DATA.s01 = vload2(OFFSET, PTR); 585 586#define vload_partial_3(DATA, OFFSET, PTR) \ 587 DATA.s012 = vload3(OFFSET, PTR); 588 589#define vload_partial_4(DATA, OFFSET, PTR) \ 590 DATA.s0123 = vload4(OFFSET, PTR); 591 592#define vload_partial_5(DATA, OFFSET, PTR) \ 593 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 594 DATA.s4 = vload1(OFFSET, PTR + 4); 595 596#define 
vload_partial_6(DATA, OFFSET, PTR) \ 597 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 598 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 599 600#define vload_partial_7(DATA, OFFSET, PTR) \ 601 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 602 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 603 604#define vload_partial_8(DATA, OFFSET, PTR) \ 605 DATA.s01234567 = vload8(OFFSET, PTR); 606 607#define vload_partial_9(DATA, OFFSET, PTR) \ 608 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 609 DATA.s8 = vload1(OFFSET, PTR + 8); 610 611#define vload_partial_10(DATA, OFFSET, PTR) \ 612 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 613 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 614 615#define vload_partial_11(DATA, OFFSET, PTR) \ 616 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 617 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 618 619#define vload_partial_12(DATA, OFFSET, PTR) \ 620 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 621 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 622 623#define vload_partial_13(DATA, OFFSET, PTR) \ 624 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 625 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 626 627#define vload_partial_14(DATA, OFFSET, PTR) \ 628 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 629 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 630 631#define vload_partial_15(DATA, OFFSET, PTR) \ 632 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 633 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 634 635#define vload_partial_16(DATA, OFFSET, PTR) \ 636 DATA = vload16(OFFSET, PTR); 637 638 639 640#define PIXEL_UNIT4 1 641#define PIXEL_UNIT8 2 642#define PIXEL_UNIT16 4 643 644 645#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 646#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 647 648 649#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 650#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 651#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 652 653#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 654#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 655#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 656#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 657#endif 658 659#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 660#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 661#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, 
y_coord), values.sCDEF)); 662 663#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 664#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 665#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 666#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 667#endif 668 669 670#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 671#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 672 673 674#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 675#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 676 677#define VSTORE_STR(size) vstore##size 678#define VSTORE(size) VSTORE_STR(size) 679 680#define float1 float 681#define half1 half 682#define char1 char 683#define uchar1 uchar 684#define short1 short 685#define ushort1 ushort 686#define int1 int 687#define uint1 uint 688#define long1 long 689#define ulong1 ulong 690#define double1 double 691 692#define vload1(OFFSET, PTR) *(OFFSET + PTR) 693#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 694 695 696#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 697#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 698 699#define NO_STORE(data, offs, ptr) \ 700 { \ 701 } 702 703 704#define vstore_partial_1_0 NO_STORE 705#define vstore_partial_1_1 vstore1 706#define vstore_partial_1_2 NO_STORE 707#define vstore_partial_1_3 NO_STORE 708#define vstore_partial_1_4 NO_STORE 709#define vstore_partial_1_5 NO_STORE 710#define vstore_partial_1_6 NO_STORE 711#define vstore_partial_1_7 NO_STORE 712#define vstore_partial_1_8 NO_STORE 713#define vstore_partial_1_9 NO_STORE 714#define vstore_partial_1_10 NO_STORE 715#define vstore_partial_1_11 NO_STORE 716#define vstore_partial_1_12 NO_STORE 717#define vstore_partial_1_13 NO_STORE 718#define vstore_partial_1_14 NO_STORE 719#define vstore_partial_1_15 NO_STORE 720#define vstore_partial_1_16 NO_STORE 721 722#define vstore_partial_2_0 NO_STORE 723#define vstore_partial_2_1 vstore_partial_1 724#define vstore_partial_2_2 vstore_partial_2 725#define vstore_partial_2_3 NO_STORE 726#define vstore_partial_2_4 NO_STORE 727#define vstore_partial_2_5 NO_STORE 728#define vstore_partial_2_6 NO_STORE 729#define vstore_partial_2_7 NO_STORE 730#define vstore_partial_2_8 NO_STORE 731#define vstore_partial_2_9 NO_STORE 732#define vstore_partial_2_10 NO_STORE 733#define vstore_partial_2_11 NO_STORE 734#define vstore_partial_2_12 NO_STORE 735#define vstore_partial_2_13 NO_STORE 736#define vstore_partial_2_14 NO_STORE 737#define vstore_partial_2_15 NO_STORE 738#define vstore_partial_2_16 NO_STORE 739 740#define vstore_partial_3_0 NO_STORE 741#define vstore_partial_3_1 vstore_partial_1 742#define vstore_partial_3_2 vstore_partial_2 743#define vstore_partial_3_3 vstore_partial_3 744#define vstore_partial_3_4 NO_STORE 745#define 
vstore_partial_3_5 NO_STORE 746#define vstore_partial_3_6 NO_STORE 747#define vstore_partial_3_7 NO_STORE 748#define vstore_partial_3_8 NO_STORE 749#define vstore_partial_3_9 NO_STORE 750#define vstore_partial_3_10 NO_STORE 751#define vstore_partial_3_11 NO_STORE 752#define vstore_partial_3_12 NO_STORE 753#define vstore_partial_3_13 NO_STORE 754#define vstore_partial_3_14 NO_STORE 755#define vstore_partial_3_15 NO_STORE 756#define vstore_partial_3_16 NO_STORE 757 758#define vstore_partial_4_0 NO_STORE 759#define vstore_partial_4_1 vstore_partial_1 760#define vstore_partial_4_2 vstore_partial_2 761#define vstore_partial_4_3 vstore_partial_3 762#define vstore_partial_4_4 vstore_partial_4 763#define vstore_partial_4_5 NO_STORE 764#define vstore_partial_4_6 NO_STORE 765#define vstore_partial_4_7 NO_STORE 766#define vstore_partial_4_8 NO_STORE 767#define vstore_partial_4_9 NO_STORE 768#define vstore_partial_4_10 NO_STORE 769#define vstore_partial_4_11 NO_STORE 770#define vstore_partial_4_12 NO_STORE 771#define vstore_partial_4_13 NO_STORE 772#define vstore_partial_4_14 NO_STORE 773#define vstore_partial_4_15 NO_STORE 774#define vstore_partial_4_16 NO_STORE 775 776#define vstore_partial_8_0 NO_STORE 777#define vstore_partial_8_1 vstore_partial_1 778#define vstore_partial_8_2 vstore_partial_2 779#define vstore_partial_8_3 vstore_partial_3 780#define vstore_partial_8_4 vstore_partial_4 781#define vstore_partial_8_5 vstore_partial_5 782#define vstore_partial_8_6 vstore_partial_6 783#define vstore_partial_8_7 vstore_partial_7 784#define vstore_partial_8_8 vstore_partial_8 785#define vstore_partial_8_9 NO_STORE 786#define vstore_partial_8_10 NO_STORE 787#define vstore_partial_8_11 NO_STORE 788#define vstore_partial_8_12 NO_STORE 789#define vstore_partial_8_13 NO_STORE 790#define vstore_partial_8_14 NO_STORE 791#define vstore_partial_8_15 NO_STORE 792#define vstore_partial_8_16 NO_STORE 793 794#define vstore_partial_16_0 NO_STORE 795#define vstore_partial_16_1 vstore_partial_1 796#define vstore_partial_16_2 vstore_partial_2 797#define vstore_partial_16_3 vstore_partial_3 798#define vstore_partial_16_4 vstore_partial_4 799#define vstore_partial_16_5 vstore_partial_5 800#define vstore_partial_16_6 vstore_partial_6 801#define vstore_partial_16_7 vstore_partial_7 802#define vstore_partial_16_8 vstore_partial_8 803#define vstore_partial_16_9 vstore_partial_9 804#define vstore_partial_16_10 vstore_partial_10 805#define vstore_partial_16_11 vstore_partial_11 806#define vstore_partial_16_12 vstore_partial_12 807#define vstore_partial_16_13 vstore_partial_13 808#define vstore_partial_16_14 vstore_partial_14 809#define vstore_partial_16_15 vstore_partial_15 810#define vstore_partial_16_16 vstore_partial_16 811 812 813#define vstore_partial_1(DATA, OFFSET, PTR) \ 814 vstore1(DATA.s0, OFFSET, PTR); 815 816#define vstore_partial_2(DATA, OFFSET, PTR) \ 817 vstore2(DATA.s01, OFFSET, PTR); 818 819#define vstore_partial_3(DATA, OFFSET, PTR) \ 820 vstore3(DATA.s012, OFFSET, PTR); 821 822#define vstore_partial_4(DATA, OFFSET, PTR) \ 823 vstore4(DATA.s0123, OFFSET, PTR); 824 825#define vstore_partial_5(DATA, OFFSET, PTR) \ 826 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 827 vstore1(DATA.s4, OFFSET, PTR + 4); 828 829#define vstore_partial_6(DATA, OFFSET, PTR) \ 830 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 831 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 832 833#define vstore_partial_7(DATA, OFFSET, PTR) \ 834 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 835 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 836 837#define 
vstore_partial_8(DATA, OFFSET, PTR) \ 838 vstore8(DATA.s01234567, OFFSET, PTR); 839 840#define vstore_partial_9(DATA, OFFSET, PTR) \ 841 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 842 vstore1(DATA.s8, OFFSET, PTR + 8); 843 844#define vstore_partial_10(DATA, OFFSET, PTR) \ 845 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 846 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 847 848#define vstore_partial_11(DATA, OFFSET, PTR) \ 849 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 850 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 851 852#define vstore_partial_12(DATA, OFFSET, PTR) \ 853 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 854 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 855 856#define vstore_partial_13(DATA, OFFSET, PTR) \ 857 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 858 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 859 860#define vstore_partial_14(DATA, OFFSET, PTR) \ 861 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 862 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 863 864#define vstore_partial_15(DATA, OFFSET, PTR) \ 865 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 866 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 867 868#define vstore_partial_16(DATA, OFFSET, PTR) \ 869 vstore16(DATA, OFFSET, PTR); 870 871 872 873 874 875#define convert_float_sat convert_float 876#define convert_float1_sat convert_float 877#define convert_float2_sat convert_float2 878#define convert_float3_sat convert_float3 879#define convert_float4_sat convert_float4 880#define convert_float8_sat convert_float8 881#define convert_float16_sat convert_float16 882#define convert_half_sat convert_half 883#define convert_half1_sat convert_half 884#define convert_half2_sat convert_half2 885#define convert_half3_sat convert_half3 886#define convert_half4_sat convert_half4 887#define convert_half8_sat convert_half8 888#define convert_half16_sat convert_half16 889 890#define convert_float1 convert_float 891#define convert_half1 convert_half 892#define convert_char1 convert_char 893#define convert_uchar1 convert_uchar 894#define convert_short1 convert_short 895#define convert_ushort1 convert_ushort 896#define convert_int1 convert_int 897#define convert_uint1 convert_uint 898#define convert_long1 convert_long 899#define convert_ulong1 convert_ulong 900#define convert_double1 convert_double 901 902#define convert_char1_sat convert_char_sat 903#define convert_uchar1_sat convert_uchar_sat 904#define convert_uchar2_sat convert_uchar2_sat 905#define convert_uchar3_sat convert_uchar3_sat 906#define convert_uchar4_sat convert_uchar4_sat 907#define convert_uchar8_sat convert_uchar8_sat 908#define convert_uchar16_sat convert_uchar16_sat 909#define convert_short1_sat convert_short_sat 910#define convert_ushort1_sat convert_ushort_sat 911#define convert_int1_sat convert_int_sat 912#define convert_uint1_sat convert_uint_sat 913#define convert_long1_sat convert_long_sat 914#define convert_ulong1_sat convert_ulong_sat 915#define convert_double1_sat convert_double_sat 916 917#define VEC_DATA_TYPE_STR(type, size) type##size 918#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 919 920#define CONVERT_STR(x, type) (convert_##type((x))) 921#define CONVERT(x, type) CONVERT_STR(x, type) 922 923#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 924#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 925 926#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 927#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 928
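/*
 * Usage sketch (illustrative addition, not from the original header; assumes a
 * kernel built with -DDATA_TYPE=uchar): CONVERT_SAT builds a saturating
 * conversion from the destination type token, and the two-level _STR
 * indirection forces its arguments to be fully expanded before token pasting.
 * Given a hypothetical float accumulator,
 *
 *     float4 acc0 = ...;
 *     VEC_DATA_TYPE(DATA_TYPE, 4)
 *     out0 = CONVERT_SAT(acc0, VEC_DATA_TYPE(DATA_TYPE, 4));
 *
 * expands to out0 = convert_uchar4_sat(acc0), clamping each lane to the
 * [0, 255] range of uchar before the narrowing conversion.
 */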
929#define select_vec_dt_uchar(size) uchar##size 930#define select_vec_dt_char(size) char##size 931#define select_vec_dt_ushort(size) ushort##size 932#define select_vec_dt_short(size) short##size 933#define select_vec_dt_half(size) short##size 934#define select_vec_dt_uint(size) uint##size 935#define select_vec_dt_int(size) int##size 936#define select_vec_dt_float(size) int##size 937#define select_vec_dt_ulong(size) ulong##size 938#define select_vec_dt_long(size) long##size 939 940#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 941#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 942#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 943 944#define signed_int_vec_dt_uchar(size) char##size 945#define signed_int_vec_dt_char(size) char##size 946#define signed_int_vec_dt_ushort(size) short##size 947#define signed_int_vec_dt_short(size) short##size 948#define signed_int_vec_dt_half(size) short##size 949#define signed_int_vec_dt_uint(size) int##size 950#define signed_int_vec_dt_int(size) int##size 951#define signed_int_vec_dt_float(size) int##size 952#define signed_int_vec_dt_ulong(size) long##size 953#define signed_int_vec_dt_long(size) long##size 954 955#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 956#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 957#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 958 959#define sum_reduce_1(x) (x) 960#define sum_reduce_2(x) ((x).s0) + ((x).s1) 961#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 962#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 963#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 964#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 965 966#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 967#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 968 969#define prod_reduce_1(x) (x) 970#define prod_reduce_2(x) ((x).s0) * ((x).s1) 971#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 972#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 973#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 974#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 975 976#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 977#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 978 979#define max_reduce_1(x) (x) 980#define max_reduce_2(x) max(((x).s0), ((x).s1)) 981#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 982#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 983#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 984#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 985 986#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 987#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 988 989#define VECTOR_DECLARATION(name) \ 990 __global uchar *name##_ptr, \ 991 uint name##_stride_x, \ 992 uint name##_step_x, \ 993 uint name##_offset_first_element_in_bytes 994 995#define IMAGE_DECLARATION(name) \ 996 __global uchar *name##_ptr, \ 997 uint name##_stride_x, \ 998 uint name##_step_x, \ 999 uint name##_stride_y, \ 1000 uint name##_step_y, \ 1001 uint name##_offset_first_element_in_bytes 1002 1003#define TENSOR3D_DECLARATION(name) \ 1004 __global uchar *name##_ptr, \ 1005 uint name##_stride_x, \ 1006 uint name##_step_x, \ 1007 uint 
name##_stride_y, \ 1008 uint name##_step_y, \ 1009 uint name##_stride_z, \ 1010 uint name##_step_z, \ 1011 uint name##_offset_first_element_in_bytes 1012 1013#define TENSOR4D_DECLARATION(name) \ 1014 __global uchar *name##_ptr, \ 1015 uint name##_stride_x, \ 1016 uint name##_step_x, \ 1017 uint name##_stride_y, \ 1018 uint name##_step_y, \ 1019 uint name##_stride_z, \ 1020 uint name##_step_z, \ 1021 uint name##_stride_w, \ 1022 uint name##_step_w, \ 1023 uint name##_offset_first_element_in_bytes 1024 1025#define TENSOR5D_DECLARATION(name) \ 1026 __global uchar *name##_ptr, \ 1027 uint name##_stride_x, \ 1028 uint name##_step_x, \ 1029 uint name##_stride_y, \ 1030 uint name##_step_y, \ 1031 uint name##_stride_z, \ 1032 uint name##_step_z, \ 1033 uint name##_stride_w, \ 1034 uint name##_step_w, \ 1035 uint name##_stride_v, \ 1036 uint name##_step_v, \ 1037 uint name##_offset_first_element_in_bytes 1038 1039#define CONVERT_TO_VECTOR_STRUCT(name) \ 1040 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1041 1042#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1043 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1044 1045#define CONVERT_TO_IMAGE_STRUCT(name) \ 1046 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1047 1048#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1049 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1050 1051#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1052 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1053 1054#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1055 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1056 1059 1060#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1061 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1062 name##_stride_z, name##_step_z) 1063 1064#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1065 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1066 1067#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1068 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1069 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1070 1071#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1072 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1073 1074#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1075 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1076 name##_stride_z,
name##_step_z) 1077 1078 1079typedef struct Vector 1080{ 1081 __global uchar *ptr; 1082 int offset_first_element_in_bytes; 1083 int stride_x; 1084} Vector; 1085 1086 1087typedef struct Image 1088{ 1089 __global uchar *ptr; 1090 int offset_first_element_in_bytes; 1091 int stride_x; 1092 int stride_y; 1093} Image; 1094 1095 1096typedef struct Tensor3D 1097{ 1098 __global uchar *ptr; 1099 int offset_first_element_in_bytes; 1100 int stride_x; 1101 int stride_y; 1102 int stride_z; 1103} Tensor3D; 1104 1105 1106typedef struct Tensor4D 1107{ 1108 __global uchar *ptr; 1109 int offset_first_element_in_bytes; 1110 int stride_x; 1111 int stride_y; 1112 int stride_z; 1113 int stride_w; 1114} Tensor4D; 1115 1116 1117inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1118{ 1119 Vector vector = 1120 { 1121 .ptr = ptr, 1122 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1123 .stride_x = stride_x, 1124 }; 1125 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1126 return vector; 1127} 1128 1129 1130inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1131{ 1132 Image img = 1133 { 1134 .ptr = ptr, 1135 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1136 .stride_x = stride_x, 1137 .stride_y = stride_y 1138 }; 1139 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1140 return img; 1141} 1142 1143 1144inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1145{ 1146 Image img = 1147 { 1148 .ptr = ptr, 1149 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1150 .stride_x = stride_x, 1151 .stride_y = stride_y 1152 }; 1153 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1154 return img; 1155} 1156 1157 1158inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1159{ 1160 Tensor3D tensor = 1161 { 1162 .ptr = ptr, 1163 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1164 .stride_x = stride_x, 1165 .stride_y = stride_y, 1166 .stride_z = stride_z 1167 }; 1168 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1169 return tensor; 1170} 1171 1172 1173inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1174{ 1175 Tensor3D tensor = 1176 { 1177 .ptr = ptr, 1178 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1179 .stride_x = stride_x, 1180 .stride_y = stride_y, 1181 .stride_z = stride_z 1182 }; 1183 return tensor; 1184} 1185 1186inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1187 uint step_w, 1188 uint mod_size) 1189{ 1190 Tensor4D tensor = 1191 { 1192 .ptr = ptr, 1193 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1194 .stride_x = stride_x, 1195 .stride_y = stride_y, 
1196 .stride_z = stride_z, 1197 .stride_w = stride_w 1198 }; 1199 1200 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1201 return tensor; 1202} 1203 1204 1205inline __global const uchar *vector_offset(const Vector *vec, int x) 1206{ 1207 return vec->ptr + x * vec->stride_x; 1208} 1209 1210 1211inline __global uchar *offset(const Image *img, int x, int y) 1212{ 1213 return img->ptr + x * img->stride_x + y * img->stride_y; 1214} 1215 1216 1217inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1218{ 1219 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1220} 1221 1222 1223inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1224{ 1225 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1226} 1227 1228 1229inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1230{ 1231 uint num_elements = width * height; 1232 1233 const uint z = index / num_elements; 1234 1235 index %= num_elements; 1236 1237 const uint y = index / width; 1238 1239 index %= width; 1240 1241 const uint x = index; 1242 1243 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1244} 1245 1246#endif 1247 1248#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 1249#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 1250 1251 1252 1253 1254#define TILE_VECTOR_SIZE1 1 1255#define TILE_VECTOR_SIZE2 2 1256#define TILE_VECTOR_SIZE3 3 1257#define TILE_VECTOR_SIZE4 4 1258#define TILE_VECTOR_SIZE5 8 1259#define TILE_VECTOR_SIZE6 8 1260#define TILE_VECTOR_SIZE7 8 1261#define TILE_VECTOR_SIZE8 8 1262#define TILE_VECTOR_SIZE9 16 1263#define TILE_VECTOR_SIZE10 16 1264#define TILE_VECTOR_SIZE11 16 1265#define TILE_VECTOR_SIZE12 16 1266#define TILE_VECTOR_SIZE13 16 1267#define TILE_VECTOR_SIZE14 16 1268#define TILE_VECTOR_SIZE15 16 1269#define TILE_VECTOR_SIZE16 16 1270 1271#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1 1272#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2 1273#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3 1274#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4 1275#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8 1276#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8 1277#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8 1278#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8 1279#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16 1280#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16 1281#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16 1282#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16 1283#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16 1284#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16 1285#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16 1286#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16 1287 1288 1289#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME) 1290#define TILE_STR(DATA_TYPE, H, W, BASENAME) \ 1291 union { \ 1292 DATA_TYPE s[TILE_VECTOR_SIZE##W]; \ 1293 TILE_VECTOR_TYPE##W(DATA_TYPE) v; \ 1294 } BASENAME[H] 1295 1296#define TENSOR4D_IMAGE(name) \ 1297 __read_only image2d_t name##_img, \ 1298 __global uchar *name##_ptr, \ 1299 uint name##_stride_x, \ 1300 uint name##_step_x, \ 1301 uint name##_stride_y, \ 1302 uint 
name##_step_y, \ 1303 uint name##_stride_z, \ 1304 uint name##_step_z, \ 1305 uint name##_stride_w, \ 1306 uint name##_step_w, \ 1307 uint name##_offset_first_element_in_bytes 1308 1309#define TENSOR4D_BUFFER(name) \ 1310 __global uchar *name##_ptr, \ 1311 uint name##_stride_x, \ 1312 uint name##_step_x, \ 1313 uint name##_stride_y, \ 1314 uint name##_step_y, \ 1315 uint name##_stride_z, \ 1316 uint name##_step_z, \ 1317 uint name##_stride_w, \ 1318 uint name##_step_w, \ 1319 uint name##_offset_first_element_in_bytes 1320 1321#define TENSOR4D_STR(name, type) TENSOR4D_##type(name) 1322#define TENSOR4D(name, type) TENSOR4D_STR(name, type) 1323 1324#define TENSOR4D_T_IMAGE(name) \ 1325 __read_only image2d_t name##_img, \ 1326 __global uchar *name##_ptr, \ 1327 uint name##_stride_y, \ 1328 uint name##_stride_z, \ 1329 uint name##_stride_w, \ 1330 uint name##_c, \ 1331 uint name##_w, \ 1332 uint name##_h, \ 1333 uint name##_n, \ 1334 uint name##_offset_first_element_in_bytes 1335 1336#define TENSOR4D_T_BUFFER(name) \ 1337 __global uchar *name##_ptr, \ 1338 uint name##_stride_y, \ 1339 uint name##_stride_z, \ 1340 uint name##_stride_w, \ 1341 uint name##_c, \ 1342 uint name##_w, \ 1343 uint name##_h, \ 1344 uint name##_n, \ 1345 uint name##_offset_first_element_in_bytes 1346 1347#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) 1348 1349 1350#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) 1351 1352#define TENSOR4D_RO_T_IMAGE(name) \ 1353 __read_only image2d_t name##_img, \ 1354 TENSOR4D_T_BUFFER(name) 1355 1356#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 1357 1358#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) 1359 1360 1361#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) 1362 1363#define TENSOR4D_WO_T_IMAGE(name) \ 1364 __write_only image2d_t name##_img, \ 1365 TENSOR4D_T_BUFFER(name) 1366 1367#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 1368 1369#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) 1370 1371 1372#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) 1373 1374#define TENSOR3D_T_IMAGE(name) \ 1375 __read_only image2d_t name##_img, \ 1376 __global uchar *name##_ptr, \ 1377 uint name##_stride_y, \ 1378 uint name##_stride_z, \ 1379 uint name##_w, \ 1380 uint name##_h, \ 1381 uint name##_n, \ 1382 uint name##_offset_first_element_in_bytes 1383 1384#define TENSOR3D_T_BUFFER(name) \ 1385 __global uchar *name##_ptr, \ 1386 uint name##_stride_y, \ 1387 uint name##_stride_z, \ 1388 uint name##_w, \ 1389 uint name##_h, \ 1390 uint name##_n, \ 1391 uint name##_offset_first_element_in_bytes 1392 1393#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name) 1394#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type) 1395 1396#if !defined(UNROLL_WITH_PRAGMA) 1397#define UNROLL_INCR(idx, step, macro) idx += (step); (macro) 1398 1399#define LOOP_UNROLLING_1(idx, step, macro) (macro) 1400#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro) 1401#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro) 1402#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro) 1403#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro) 1404#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro) 1405#define LOOP_UNROLLING_7(idx, step, macro) 
LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro) 1406#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro) 1407#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro) 1408#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro) 1409#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro) 1410#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro) 1411#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro) 1412#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro) 1413#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro) 1414#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro) 1415#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro) 1416#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro) 1417#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro) 1418#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro) 1419#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro) 1420#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro) 1421#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro) 1422#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro) 1423#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro) 1424#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro) 1425#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro) 1426#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro) 1427#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro) 1428#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro) 1429#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro) 1430#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro) 1431#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro) 1432#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro) 1433#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro) 1434#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro) 1435#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro) 1436#define LOOP_UNROLLING_38(idx, step, 
#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        type idx = start; \
        LOOP_UNROLLING_##num(idx, step, macro); \
    }
#else
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        _Pragma("unroll") \
        for(type idx = start; idx < (num * step); idx += step) \
        { \
            (macro); \
        } \
    }
#endif
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
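
// Usage sketch for LOOP_UNROLLING (illustrative only; "acc" is a hypothetical
// tile): the body is stamped out "num" times at compile time, with idx taking
// the values start, start + step, ... Both implementations above expand the
// same way; the macro ladder is the fallback when _Pragma("unroll") cannot be
// relied on.
//
//   TILE(float, 1, 8, acc);
//   LOOP_UNROLLING(int, k, 0, 1, 8,
//   {
//       acc[0].s[k] = 0.0f; // executed for k = 0, 1, ..., 7
//   })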
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))

#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
    })
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
        c += (C_DATA_TYPE)(a).s3 * (C_DATA_TYPE)(b).s3; \
    })
#endif
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
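
// DOT_PRODUCT_INTEGER8 dispatches on K0 (1..16). Widths 5..16 are decomposed
// into 4- and 8-lane chunks so the accelerated 4-lane path (dot, arm_dot or
// arm_dot_acc, when the matching OpenCL extension is enabled) is reused, with
// a scalar multiply-accumulate fallback otherwise. Illustrative use, assuming
// char8 operands a and b:
//
//   int acc = 0;
//   DOT_PRODUCT_INTEGER8(char, char, int, 8, a, b, acc); // acc += sum_i a[i] * b[i]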
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH) \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH) \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)

#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        }) \
    })

#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })

#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            }) \
        } \
    })

#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })
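
// V_LOAD/V_STORE abstract the storage backend: the BUFFER variants are plain
// VLOAD/VSTORE accesses on global memory, while the IMAGE variants go through
// the texture path (hence the (X) / 4 conversion from element lanes to RGBA
// pixel units). The T_LOAD* tile loaders build on them; a minimal sketch,
// assuming a tensor "src" bound with TENSOR4D_T and hypothetical coordinates
// x and y:
//
//   TILE(float, M0, N0, in);
//   T_LOAD(float, M0, N0, BUFFER, src, x, y, 1, src_stride_y, in);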
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })

#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })
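
// Note: the T_LOAD_NHWC* and T_LOAD*_INDIRECT loaders compute a flattened row
// index (x + y * width + batch * width * height) and only issue the load when
// the coordinate lies inside the tensor, so out-of-bounds taps are skipped
// rather than clamped; callers are expected to pre-fill the destination tile
// (e.g. with ZERO_VALUE) so that skipped points contribute zeros.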
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                            && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
    })

#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                }) \
            }) \
        }) \
    })

#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
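
// T_QUANTIZE8 dispatches on QUANTIZATION_TYPE (PER_TENSOR, PER_CHANNEL or
// ASYMMETRIC below). All variants implement the same fixed-point
// requantization, sketched here in pseudocode:
//
//   _tmp = rounding_doubling_high_mul(src, multiplier)  // the (ab_64 + nudge) / 2^31 step
//   _tmp = rounding_arithmetic_shift_right(_tmp, shift) // taken when shift >= 0
//   dst  = convert_sat(_tmp + offset)                   // negative shifts pre-scale src by 2^-shift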
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })

#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })

#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))

#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })

#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        }) \
    })

#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
        }) \
    })

#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
        }) \
    })

#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    { \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                LOOP_UNROLLING(int, _k, 0, 1, K0, \
                { \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                }) \
            }) \
        }) \
    }

#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            }) \
        }) \
    })

#endif
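
// T_MMUL selects an implementation from the operand layouts and data types:
// float/half tiles map to the fma-based T_MMUL_NT_T_FLOAT, 8-bit integer
// tiles to the dot-product based T_MMUL_NT_T_INTEGER8. A minimal sketch for
// an M0 x K0 lhs tile and an N0 x K0 rhs tile (rhs transposed), as used by
// the kernel below:
//
//   T_MMUL(float, float, float, M0, N0, K0, NT, T, a, b, c); // c += a * b^T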
__kernel void transposed_convolution_nhwc(
    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bia)
#endif
)
{
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _ISRC_CHANNELS SRC_CHANNELS
#define _IDST_WIDTH DST_WIDTH
#define _IDST_HEIGHT DST_HEIGHT
#define _IDST_CHANNELS DST_CHANNELS
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

#if defined(IS_QUANTIZED)
#define _IOUTPUT_TILE cq
#else
#define _IOUTPUT_TILE c
#endif

    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // output channel
    const int mout = GET_SPATIAL_IDX(1, M0, 0);          // flattened width x height
    const int bout = GET_SPATIAL_IDX(2, 1, 0);           // batch

    TILE(int, 1, M0, xi);
    TILE(int, 1, M0, yi);
    TILE(int, 1, M0, xu);
    TILE(int, 1, M0, yu);

    // Map each of the M0 output points back to "upsampled" input coordinates
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xu[0].s[i] = ((mout + i) % _IDST_WIDTH) - PAD_LEFT;
        yu[0].s[i] = ((mout + i) / _IDST_WIDTH) - PAD_TOP;
        xi[0].s[i] = ceil(xu[0].s[i] / (float)STRIDE_X);
        yi[0].s[i] = ceil(yu[0].s[i] / (float)STRIDE_Y);
    })

    // Initialize the accumulators
    TILE(ACC_DATA_TYPE, M0, N0, c);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    // First weight tap that contributes to this output position
    const int x_start = _IWEI_WIDTH - (xi[0].s[0] * STRIDE_X - xu[0].s[0]) - 1;
    const int y_start = _IWEI_HEIGHT - (yi[0].s[0] * STRIDE_Y - yu[0].s[0]) - 1;

    for(int yk = y_start, yi_step = 0; yk >= 0; yk -= STRIDE_Y, ++yi_step)
    {
        for(int xk = x_start, xi_step = 0; xk >= 0; xk -= STRIDE_X, ++xi_step)
        {
            const int weights_y = cout * _IY_MULTIPLIER + yk * _IWEI_WIDTH + xk;

            TILE(int, 1, M0, my);

            // Indirect row indices into the source tensor; out-of-bounds
            // points are flagged with -1 so the loaders skip them
            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                int x_s    = xi[0].s[i] + xi_step;
                int y_s    = yi[0].s[i] + yi_step;
                my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
                my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
                my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
                my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
                my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
                my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
            })
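            // Accumulate over the input channels in chunks of K0; an optional
            // scalar leftover loop (compiled in via LEFTOVER_LOOP) handles
            // _ISRC_CHANNELS not being a multiple of K0.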
            int ck = 0;
            for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
            {
                TILE(SRC_DATA_TYPE, M0, K0, a);
                TILE(WEI_DATA_TYPE, N0, K0, b);

                // Pre-fill the tiles so that skipped (out-of-bounds) loads
                // contribute zeros to the accumulation
                LOOP_UNROLLING(int, i, 0, 1, M0,
                {
                    a[i].v = ZERO_VALUE;
                })

                LOOP_UNROLLING(int, i, 0, 1, N0,
                {
                    b[i].v = ZERO_VALUE;
                })

                // Load tile from the src tensor (TILE A)
                T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

                // Load tile from the weights tensor (TILE B)
                T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);

                // Compute the matrix multiplication between the two tiles
                T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

#if defined(IS_QUANTIZED)
                // Accumulate the offset contributions
                T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
#endif
            }

#if defined(LEFTOVER_LOOP)
            // Left-over loop: same as above, one channel at a time
            for(; ck < _ISRC_CHANNELS; ++ck)
            {
                TILE(SRC_DATA_TYPE, M0, 1, a);
                TILE(WEI_DATA_TYPE, N0, 1, b);

                LOOP_UNROLLING(int, i, 0, 1, M0,
                {
                    a[i].v = ZERO_VALUE;
                })

                T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, BUFFER, src, ck, src_stride_y, my, a);

                T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);

                T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

#if defined(IS_QUANTIZED)
                T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
#endif
            }
#endif
        }
    }

#if defined(IS_QUANTIZED)
    // Number of weight taps that contributed to each output point
    const int total_pixels = floor((1 + y_start / (float)STRIDE_Y)) * floor(1 + x_start / (float)STRIDE_X);

    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (total_pixels * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
#endif

#if defined(HAS_BIAS)
    TILE(BIA_DATA_TYPE, 1, N0, bias0);

    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);

    // c = c + bias (broadcast along the rows)
    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif

#if defined(IS_QUANTIZED)
    TILE(DST_DATA_TYPE, M0, N0, cq);

    // Quantize the accumulated tile
    T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
#endif

    TILE(uint, M0, 1, dst_indirect_y);

    // Destination indirect rows, clamped to the last valid row of the batch
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
        dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
    })

    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;

    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);

#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
}
)"