1R"( 2 3 4 5 6#ifndef ARM_COMPUTE_HELPER_H 7#define ARM_COMPUTE_HELPER_H 8 9 10 11 12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13 VSTORE(N0) \ 14 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15 16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18 VSTORE(N0) \ 19 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20 21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23 VSTORE(N0) \ 24 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25 26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28 VSTORE(N0) \ 29 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30 31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33 VSTORE(N0) \ 34 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35 36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38 VSTORE(N0) \ 39 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40 41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43 VSTORE(N0) \ 44 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45 46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48 VSTORE(N0) \ 49 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50 51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53 VSTORE(N0) \ 54 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55 56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58 VSTORE(N0) \ 59 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60 61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63 VSTORE(N0) \ 64 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65 66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68 VSTORE(N0) \ 69 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 70 71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73 VSTORE(N0) \ 74 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75 76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78 VSTORE(N0) \ 79 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80 81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83 VSTORE(N0) \ 84 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85 86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88 VSTORE(N0) \ 89 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90 91 92 93#define 
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
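// Illustration (hypothetical arguments): STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout)
// resolves through STORE_BLOCK_STR to STORE_ROW_2(4, float, c, ...), i.e. a 2-row x 4-element
// block store; CONVERT_STORE_BLOCK(2, 4, uchar, c, ...) additionally saturates each row with
// CONVERT_SAT to uchar4 before storing.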
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
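// STORE_BLOCK_PARTIAL stores only STORE_M0 rows of STORE_N0 elements from an M0 x N0
// register block. Illustration (hypothetical arguments): STORE_BLOCK_PARTIAL(3, 5, 8, ...)
// emits three VSTORE_PARTIAL(8, 5) stores, each writing the first five lanes of an
// 8-lane row vector (.s0123 via vstore4, then .s4 via vstore1).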
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
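// Worked example: with M0 = 4 and PARTIAL_STORE_M0 = 3 (e.g. 11 output rows),
// COMPUTE_M0_START_ROW(y, 4, 3) = max(0, 4 * y - ((4 - 3) % 4)) = max(0, 4 * y - 1),
// giving start rows 0, 3, 7: the last block ends exactly at the final row and only the
// first block needs a partial store of PARTIAL_STORE_M0 = 3 rows.
// Similarly, STORE_VECTOR_SELECT(acc, float, out_ptr, 8, leftover, cond) stores the
// hypothetical vector acc0: all 8 floats, or only "leftover" of them when "cond" is true.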
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

#define CONCAT(a, b) a##b

#define EXPAND(x) x

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
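// Illustrations (not part of the original header):
//   REVERSE(x, 4)    -> REV4(x)      -> ((x).s3210)
//   ROTATE(x, 8, 2)  -> ROT8_2(x)    -> ((x).s67012345)   (rotate right by two lanes)
//   VEC_OFFS(int, 4) -> V_OFFS4(int) -> (int4)(0, 1, 2, 3)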
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

#define NO_LOAD(data, offs, ptr) \
    { \
    }

#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
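// Illustration (hypothetical variable "data"): VLOAD_PARTIAL(4, 3)(data, 0, ptr)
// resolves to vload_partial_4_3 -> vload_partial_3, i.e. data.s012 = vload3(0, ptr);
// the fourth lane of "data" is left untouched. Invalid size/load_size pairs map to
// NO_LOAD and compile to an empty block.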
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
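// Illustration: READ_IMAGE2D(float, 4, img, x, y) expands to read_image2d_floatx4(img, x, y),
// i.e. four read_imagef calls packed into one float16. CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(16)
// = PIXEL_UNIT16 = 4 is the number of RGBA texels spanned by a 16-element vector.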
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
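// The "1" aliases above let the vector macros degrade to scalars: VSTORE(1) resolves to
// vstore1, which is not an OpenCL builtin, so it is defined here as a plain dereference;
// e.g. vstore1(v, 0, ptr) becomes *(0 + ptr) = v.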
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

#define NO_STORE(data, offs, ptr) \
    { \
    }

#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
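// The tables above form the second level of the VSTORE_PARTIAL(size, store_size) dispatch:
// each valid pair forwards to the matching vstore_partial_<store_size> implementation below,
// while impossible pairs (store_size of 0, or larger than size) collapse to NO_STORE,
// i.e. an empty block.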
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
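// Illustration (hypothetical accumulator "acc"): CONVERT_SAT(acc, VEC_DATA_TYPE(uchar, 16))
// expands to (convert_uchar16_sat((acc))), and CONVERT_SAT_ROUND(x, int4, rte) expands to
// (convert_int4_sat_rte((x))), i.e. saturating conversion with round-to-nearest-even.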
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
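// Illustrations: SUM_REDUCE(x, 4) -> sum_reduce_4(x)
// -> sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
// -> ((x).s0) + ((x).s1) + ((x).s2) + ((x).s3),
// and SELECT_VEC_DATA_TYPE(float, 8) -> int8, the comparison-result
// type that select() expects for float8 operands.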
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
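// Sketch of the intended usage (hypothetical kernel, not part of this header):
//   __kernel void example_fill(IMAGE_DECLARATION(dst))
//   {
//       Image dst = CONVERT_TO_IMAGE_STRUCT(dst); // dst.ptr now points at this work-item's element
//       *((__global float *)dst.ptr) = 1.0f;
//   }
// IMAGE_DECLARATION(dst) expands to the six kernel arguments dst_ptr, dst_stride_x, dst_step_x,
// dst_stride_y, dst_step_y and dst_offset_first_element_in_bytes.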
typedef struct Vector
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
} Vector;

typedef struct Image
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
} Image;

typedef struct Tensor3D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
} Tensor3D;

typedef struct Tensor4D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
    int stride_w;
} Tensor4D;

inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}
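// Each update_*_workitem_ptr helper above advances .ptr by the per-dimension step times
// get_global_id(), so a work-item at (gx, gy) sees
// ptr + offset_first_element_in_bytes + gx * step_x + gy * step_y; the *_NO_STEP macro
// variants pass 0 for the step arguments so the returned pointer stays at the first element.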
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif

#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(a, b, c) (fma(c, b, a))
#else
#define MLA(a, b, c) ((b) * (c) + (a))
#endif

#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))

#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))

#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))

#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))

#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))

#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))

#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)

#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))

#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))

#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
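// Illustration: linear_op computes A_VAL * x + B_VAL through MLA((DATA_TYPE)B_VAL,
// (DATA_TYPE)A_VAL, x), which maps to a single fma() when GPU_ARCH == GPU_ARCH_BIFROST
// and to a plain multiply-add otherwise.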
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
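// Illustration: ACTIVATION(relu, float, 4, x, A_VAL, B_VAL) resolves through ACT_OP to
// relu_op(float, 4, x, A_VAL, B_VAL), i.e. (max((float)0.0, x)); the unused A_VAL/B_VAL
// arguments keep every activation macro call-compatible with this single signature.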
14 * STRIDE_Y + Z##E)); 1384 1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1386 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387 VSTORE(N0) \ 1388 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1389 1390 1391 1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1393 VSTORE(N0) \ 1394 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1395 1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1398 VSTORE(N0) \ 1399 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1400 1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1402 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1403 VSTORE(N0) \ 1404 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1405 1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1408 VSTORE(N0) \ 1409 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1410 1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1413 VSTORE(N0) \ 1414 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1415 1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1418 VSTORE(N0) \ 1419 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1420 1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1422 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1423 VSTORE(N0) \ 1424 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1425 1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1427 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1428 VSTORE(N0) \ 1429 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1430 1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1432 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1433 VSTORE(N0) \ 1434 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1435 1436#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1437 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1438 VSTORE(N0) \ 1439 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1440 1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1442 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443 VSTORE(N0) \ 1444 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1445 1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1447 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448 VSTORE(N0) \ 1449 
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
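/* Illustrative sketch, not part of the original header: STORE_BLOCK pastes M0
 * into STORE_ROW_##M0 (the extra *_STR hop forces M0 to expand first), so the
 * caller must have M0 row vectors named BASENAME0..BASENAME<M0-1> in scope.
 * Assuming a 4x8 float tile held in c0..c3 and row offsets zout0..zout3:
 *
 *     STORE_BLOCK(4, 8, float, c, dst_addr, dst_stride_y, zout);
 *
 * expands to four vstore8 calls at dst_addr + 0..3 * dst_stride_y + zout0..3.
 * CONVERT_STORE_BLOCK does the same but saturating-converts each row first,
 * e.g. int accumulators narrowed to a uchar destination.
 */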
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
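/* Illustrative sketch with hypothetical sizes, not part of the original header.
 * For a 30x20 float destination tiled with M0 = 4, N0 = 8, the leftovers are
 * PARTIAL_STORE_M0 = 30 % 4 = 2 rows and PARTIAL_STORE_N0 = 20 % 8 = 4 columns,
 * and only the last tile in each dimension stores the smaller block:
 *
 *     const bool cond_x = ...; // true only for the right-most (partial) tile
 *     const bool cond_y = ...; // true only for the bottom (partial) tile
 *     STORE_BLOCK_PARTIAL_IN_X_AND_Y(4, 8, float, c, dst_addr, dst_stride_y, zout,
 *                                    2, 4, cond_y, cond_x);
 *
 * COMPUTE_M0_START_ROW shifts the last row tile up instead of letting it run
 * past the tensor: COMPUTE_M0_START_ROW(7, 4, 2) = max(0, 28 - (4 - 2) % 4) = 26,
 * so that tile covers rows 26..29 rather than writing rows 28..31.
 * STORE_VECTOR_SELECT is the single-row convenience wrapper: it stores a full
 * vec_size vector when cond is false and only 'leftover' elements when true.
 */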
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if
defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1663#pragma OPENCL EXTENSION cl_arm_printf : enable 1664#endif 1665 1666#define GPU_ARCH_MIDGARD 0x100 1667#define GPU_ARCH_BIFROST 0x200 1668#define GPU_ARCH_VALHALL 0x300 1669 1670 1671#define CONCAT(a, b) a##b 1672 1673 1674#define EXPAND(x) x 1675 1676 1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1678 1679 1680#define REV1(x) ((x)) 1681#define REV2(x) ((x).s10) 1682#define REV3(x) ((x).s210) 1683#define REV4(x) ((x).s3210) 1684#define REV8(x) ((x).s76543210) 1685#define REV16(x) ((x).sFEDCBA9876543210) 1686 1687 1688 1689#define REVERSE_STR(x, s) REV##s((x)) 1690#define REVERSE(x, s) REVERSE_STR(x, s) 1691 1692 1693 1694#define ROT1_0(x) ((x)) 1695#define ROT1_1(x) ((x)) 1696 1697#define ROT2_0(x) ((x)) 1698#define ROT2_1(x) ((x).s10) 1699#define ROT2_2(x) ((x)) 1700 1701#define ROT3_0(x) ((x)) 1702#define ROT3_1(x) ((x).s201) 1703#define ROT3_2(x) ((x).s120) 1704#define ROT3_3(x) ((x)) 1705 1706#define ROT4_0(x) ((x)) 1707#define ROT4_1(x) ((x).s3012) 1708#define ROT4_2(x) ((x).s2301) 1709#define ROT4_3(x) ((x).s1230) 1710#define ROT4_4(x) ((x)) 1711 1712#define ROT8_0(x) ((x)) 1713#define ROT8_1(x) ((x).s70123456) 1714#define ROT8_2(x) ((x).s67012345) 1715#define ROT8_3(x) ((x).s56701234) 1716#define ROT8_4(x) ((x).s45670123) 1717#define ROT8_5(x) ((x).s34567012) 1718#define ROT8_6(x) ((x).s23456701) 1719#define ROT8_7(x) ((x).s12345670) 1720#define ROT8_8(x) ((x)) 1721 1722#define ROT16_0(x) ((x)) 1723#define ROT16_1(x) ((x).sF0123456789ABCDE) 1724#define ROT16_2(x) ((x).sEF0123456789ABCD) 1725#define ROT16_3(x) ((x).sDEF0123456789ABC) 1726#define ROT16_4(x) ((x).sCDEF0123456789AB) 1727#define ROT16_5(x) ((x).sBCDEF0123456789A) 1728#define ROT16_6(x) ((x).sABCDEF0123456789) 1729#define ROT16_7(x) ((x).s9ABCDEF012345678) 1730#define ROT16_8(x) ((x).s89ABCDEF01234567) 1731#define ROT16_9(x) ((x).s789ABCDEF0123456) 1732#define ROT16_10(x) ((x).s6789ABCDEF012345) 1733#define ROT16_11(x) ((x).s56789ABCDEF01234) 1734#define ROT16_12(x) ((x).s456789ABCDEF0123) 1735#define ROT16_13(x) ((x).s3456789ABCDEF012) 1736#define ROT16_14(x) ((x).s23456789ABCDEF01) 1737#define ROT16_15(x) ((x).s123456789ABCDEF0) 1738#define ROT16_16(x) ((x)) 1739 1740 1741 1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1744 1745 1746 1747#define V_OFFS1(dt) (dt##1)(0) 1748#define V_OFFS2(dt) (dt##2)(0, 1) 1749#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1753 1754 1755 1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1758 1759 1760#define VLOAD_STR(size) vload##size 1761#define VLOAD(size) VLOAD_STR(size) 1762 1763 1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1766 1767#define NO_LOAD(data, offs, ptr) \ 1768 { \ 1769 } 1770 1771 1772#define vload_partial_1_0 NO_LOAD 1773#define vload_partial_1_1 vload1 1774#define vload_partial_1_2 NO_LOAD 1775#define vload_partial_1_3 NO_LOAD 1776#define vload_partial_1_4 NO_LOAD 1777#define vload_partial_1_5 NO_LOAD 1778#define vload_partial_1_6 NO_LOAD 1779#define vload_partial_1_7 NO_LOAD 1780#define vload_partial_1_8 NO_LOAD 1781#define vload_partial_1_9 NO_LOAD 1782#define vload_partial_1_10 NO_LOAD 1783#define 
vload_partial_1_11 NO_LOAD 1784#define vload_partial_1_12 NO_LOAD 1785#define vload_partial_1_13 NO_LOAD 1786#define vload_partial_1_14 NO_LOAD 1787#define vload_partial_1_15 NO_LOAD 1788#define vload_partial_1_16 NO_LOAD 1789 1790#define vload_partial_2_0 NO_LOAD 1791#define vload_partial_2_1 vload_partial_1 1792#define vload_partial_2_2 vload_partial_2 1793#define vload_partial_2_3 NO_LOAD 1794#define vload_partial_2_4 NO_LOAD 1795#define vload_partial_2_5 NO_LOAD 1796#define vload_partial_2_6 NO_LOAD 1797#define vload_partial_2_7 NO_LOAD 1798#define vload_partial_2_8 NO_LOAD 1799#define vload_partial_2_9 NO_LOAD 1800#define vload_partial_2_10 NO_LOAD 1801#define vload_partial_2_11 NO_LOAD 1802#define vload_partial_2_12 NO_LOAD 1803#define vload_partial_2_13 NO_LOAD 1804#define vload_partial_2_14 NO_LOAD 1805#define vload_partial_2_15 NO_LOAD 1806#define vload_partial_2_16 NO_LOAD 1807 1808#define vload_partial_3_0 NO_LOAD 1809#define vload_partial_3_1 vload_partial_1 1810#define vload_partial_3_2 vload_partial_2 1811#define vload_partial_3_3 vload_partial_3 1812#define vload_partial_3_4 NO_LOAD 1813#define vload_partial_3_5 NO_LOAD 1814#define vload_partial_3_6 NO_LOAD 1815#define vload_partial_3_7 NO_LOAD 1816#define vload_partial_3_8 NO_LOAD 1817#define vload_partial_3_9 NO_LOAD 1818#define vload_partial_3_10 NO_LOAD 1819#define vload_partial_3_11 NO_LOAD 1820#define vload_partial_3_12 NO_LOAD 1821#define vload_partial_3_13 NO_LOAD 1822#define vload_partial_3_14 NO_LOAD 1823#define vload_partial_3_15 NO_LOAD 1824#define vload_partial_3_16 NO_LOAD 1825 1826#define vload_partial_4_0 NO_LOAD 1827#define vload_partial_4_1 vload_partial_1 1828#define vload_partial_4_2 vload_partial_2 1829#define vload_partial_4_3 vload_partial_3 1830#define vload_partial_4_4 vload_partial_4 1831#define vload_partial_4_5 NO_LOAD 1832#define vload_partial_4_6 NO_LOAD 1833#define vload_partial_4_7 NO_LOAD 1834#define vload_partial_4_8 NO_LOAD 1835#define vload_partial_4_9 NO_LOAD 1836#define vload_partial_4_10 NO_LOAD 1837#define vload_partial_4_11 NO_LOAD 1838#define vload_partial_4_12 NO_LOAD 1839#define vload_partial_4_13 NO_LOAD 1840#define vload_partial_4_14 NO_LOAD 1841#define vload_partial_4_15 NO_LOAD 1842#define vload_partial_4_16 NO_LOAD 1843 1844#define vload_partial_8_0 NO_LOAD 1845#define vload_partial_8_1 vload_partial_1 1846#define vload_partial_8_2 vload_partial_2 1847#define vload_partial_8_3 vload_partial_3 1848#define vload_partial_8_4 vload_partial_4 1849#define vload_partial_8_5 vload_partial_5 1850#define vload_partial_8_6 vload_partial_6 1851#define vload_partial_8_7 vload_partial_7 1852#define vload_partial_8_8 vload_partial_8 1853#define vload_partial_8_9 NO_LOAD 1854#define vload_partial_8_10 NO_LOAD 1855#define vload_partial_8_11 NO_LOAD 1856#define vload_partial_8_12 NO_LOAD 1857#define vload_partial_8_13 NO_LOAD 1858#define vload_partial_8_14 NO_LOAD 1859#define vload_partial_8_15 NO_LOAD 1860#define vload_partial_8_16 NO_LOAD 1861 1862#define vload_partial_16_0 NO_LOAD 1863#define vload_partial_16_1 vload_partial_1 1864#define vload_partial_16_2 vload_partial_2 1865#define vload_partial_16_3 vload_partial_3 1866#define vload_partial_16_4 vload_partial_4 1867#define vload_partial_16_5 vload_partial_5 1868#define vload_partial_16_6 vload_partial_6 1869#define vload_partial_16_7 vload_partial_7 1870#define vload_partial_16_8 vload_partial_8 1871#define vload_partial_16_9 vload_partial_9 1872#define vload_partial_16_10 vload_partial_10 1873#define vload_partial_16_11 vload_partial_11 
1874#define vload_partial_16_12 vload_partial_12 1875#define vload_partial_16_13 vload_partial_13 1876#define vload_partial_16_14 vload_partial_14 1877#define vload_partial_16_15 vload_partial_15 1878#define vload_partial_16_16 vload_partial_16 1879 1880 1881#define vload_partial_1(DATA, OFFSET, PTR) \ 1882 DATA.s0 = vload1(OFFSET, PTR); 1883 1884#define vload_partial_2(DATA, OFFSET, PTR) \ 1885 DATA.s01 = vload2(OFFSET, PTR); 1886 1887#define vload_partial_3(DATA, OFFSET, PTR) \ 1888 DATA.s012 = vload3(OFFSET, PTR); 1889 1890#define vload_partial_4(DATA, OFFSET, PTR) \ 1891 DATA.s0123 = vload4(OFFSET, PTR); 1892 1893#define vload_partial_5(DATA, OFFSET, PTR) \ 1894 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1895 DATA.s4 = vload1(OFFSET, PTR + 4); 1896 1897#define vload_partial_6(DATA, OFFSET, PTR) \ 1898 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1899 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 1900 1901#define vload_partial_7(DATA, OFFSET, PTR) \ 1902 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1903 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 1904 1905#define vload_partial_8(DATA, OFFSET, PTR) \ 1906 DATA.s01234567 = vload8(OFFSET, PTR); 1907 1908#define vload_partial_9(DATA, OFFSET, PTR) \ 1909 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1910 DATA.s8 = vload1(OFFSET, PTR + 8); 1911 1912#define vload_partial_10(DATA, OFFSET, PTR) \ 1913 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1914 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 1915 1916#define vload_partial_11(DATA, OFFSET, PTR) \ 1917 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1918 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 1919 1920#define vload_partial_12(DATA, OFFSET, PTR) \ 1921 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1922 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 1923 1924#define vload_partial_13(DATA, OFFSET, PTR) \ 1925 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1926 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 1927 1928#define vload_partial_14(DATA, OFFSET, PTR) \ 1929 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1930 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 1931 1932#define vload_partial_15(DATA, OFFSET, PTR) \ 1933 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1934 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 1935 1936#define vload_partial_16(DATA, OFFSET, PTR) \ 1937 DATA = vload16(OFFSET, PTR); 1938 1939 1940 1941#define PIXEL_UNIT4 1 1942#define PIXEL_UNIT8 2 1943#define PIXEL_UNIT16 4 1944 1945 1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 1948 1949 1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 1953 1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 1957#define 
read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 1958#endif 1959 1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1963 1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1968#endif 1969 1970 1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 1973 1974 1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 1977 1978#define VSTORE_STR(size) vstore##size 1979#define VSTORE(size) VSTORE_STR(size) 1980 1981#define float1 float 1982#define half1 half 1983#define char1 char 1984#define uchar1 uchar 1985#define short1 short 1986#define ushort1 ushort 1987#define int1 int 1988#define uint1 uint 1989#define long1 long 1990#define ulong1 ulong 1991#define double1 double 1992 1993#define vload1(OFFSET, PTR) *(OFFSET + PTR) 1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 1995 1996 1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 1999 2000#define NO_STORE(data, offs, ptr) \ 2001 { \ 2002 } 2003 2004 2005#define vstore_partial_1_0 NO_STORE 2006#define vstore_partial_1_1 vstore1 2007#define vstore_partial_1_2 NO_STORE 2008#define vstore_partial_1_3 NO_STORE 2009#define vstore_partial_1_4 NO_STORE 2010#define vstore_partial_1_5 NO_STORE 2011#define vstore_partial_1_6 NO_STORE 2012#define vstore_partial_1_7 NO_STORE 2013#define vstore_partial_1_8 NO_STORE 2014#define vstore_partial_1_9 NO_STORE 2015#define vstore_partial_1_10 NO_STORE 2016#define vstore_partial_1_11 NO_STORE 2017#define vstore_partial_1_12 NO_STORE 2018#define vstore_partial_1_13 NO_STORE 2019#define vstore_partial_1_14 NO_STORE 2020#define vstore_partial_1_15 NO_STORE 2021#define vstore_partial_1_16 NO_STORE 2022 2023#define 
vstore_partial_2_0 NO_STORE 2024#define vstore_partial_2_1 vstore_partial_1 2025#define vstore_partial_2_2 vstore_partial_2 2026#define vstore_partial_2_3 NO_STORE 2027#define vstore_partial_2_4 NO_STORE 2028#define vstore_partial_2_5 NO_STORE 2029#define vstore_partial_2_6 NO_STORE 2030#define vstore_partial_2_7 NO_STORE 2031#define vstore_partial_2_8 NO_STORE 2032#define vstore_partial_2_9 NO_STORE 2033#define vstore_partial_2_10 NO_STORE 2034#define vstore_partial_2_11 NO_STORE 2035#define vstore_partial_2_12 NO_STORE 2036#define vstore_partial_2_13 NO_STORE 2037#define vstore_partial_2_14 NO_STORE 2038#define vstore_partial_2_15 NO_STORE 2039#define vstore_partial_2_16 NO_STORE 2040 2041#define vstore_partial_3_0 NO_STORE 2042#define vstore_partial_3_1 vstore_partial_1 2043#define vstore_partial_3_2 vstore_partial_2 2044#define vstore_partial_3_3 vstore_partial_3 2045#define vstore_partial_3_4 NO_STORE 2046#define vstore_partial_3_5 NO_STORE 2047#define vstore_partial_3_6 NO_STORE 2048#define vstore_partial_3_7 NO_STORE 2049#define vstore_partial_3_8 NO_STORE 2050#define vstore_partial_3_9 NO_STORE 2051#define vstore_partial_3_10 NO_STORE 2052#define vstore_partial_3_11 NO_STORE 2053#define vstore_partial_3_12 NO_STORE 2054#define vstore_partial_3_13 NO_STORE 2055#define vstore_partial_3_14 NO_STORE 2056#define vstore_partial_3_15 NO_STORE 2057#define vstore_partial_3_16 NO_STORE 2058 2059#define vstore_partial_4_0 NO_STORE 2060#define vstore_partial_4_1 vstore_partial_1 2061#define vstore_partial_4_2 vstore_partial_2 2062#define vstore_partial_4_3 vstore_partial_3 2063#define vstore_partial_4_4 vstore_partial_4 2064#define vstore_partial_4_5 NO_STORE 2065#define vstore_partial_4_6 NO_STORE 2066#define vstore_partial_4_7 NO_STORE 2067#define vstore_partial_4_8 NO_STORE 2068#define vstore_partial_4_9 NO_STORE 2069#define vstore_partial_4_10 NO_STORE 2070#define vstore_partial_4_11 NO_STORE 2071#define vstore_partial_4_12 NO_STORE 2072#define vstore_partial_4_13 NO_STORE 2073#define vstore_partial_4_14 NO_STORE 2074#define vstore_partial_4_15 NO_STORE 2075#define vstore_partial_4_16 NO_STORE 2076 2077#define vstore_partial_8_0 NO_STORE 2078#define vstore_partial_8_1 vstore_partial_1 2079#define vstore_partial_8_2 vstore_partial_2 2080#define vstore_partial_8_3 vstore_partial_3 2081#define vstore_partial_8_4 vstore_partial_4 2082#define vstore_partial_8_5 vstore_partial_5 2083#define vstore_partial_8_6 vstore_partial_6 2084#define vstore_partial_8_7 vstore_partial_7 2085#define vstore_partial_8_8 vstore_partial_8 2086#define vstore_partial_8_9 NO_STORE 2087#define vstore_partial_8_10 NO_STORE 2088#define vstore_partial_8_11 NO_STORE 2089#define vstore_partial_8_12 NO_STORE 2090#define vstore_partial_8_13 NO_STORE 2091#define vstore_partial_8_14 NO_STORE 2092#define vstore_partial_8_15 NO_STORE 2093#define vstore_partial_8_16 NO_STORE 2094 2095#define vstore_partial_16_0 NO_STORE 2096#define vstore_partial_16_1 vstore_partial_1 2097#define vstore_partial_16_2 vstore_partial_2 2098#define vstore_partial_16_3 vstore_partial_3 2099#define vstore_partial_16_4 vstore_partial_4 2100#define vstore_partial_16_5 vstore_partial_5 2101#define vstore_partial_16_6 vstore_partial_6 2102#define vstore_partial_16_7 vstore_partial_7 2103#define vstore_partial_16_8 vstore_partial_8 2104#define vstore_partial_16_9 vstore_partial_9 2105#define vstore_partial_16_10 vstore_partial_10 2106#define vstore_partial_16_11 vstore_partial_11 2107#define vstore_partial_16_12 vstore_partial_12 2108#define 
vstore_partial_16_13 vstore_partial_13 2109#define vstore_partial_16_14 vstore_partial_14 2110#define vstore_partial_16_15 vstore_partial_15 2111#define vstore_partial_16_16 vstore_partial_16 2112 2113 2114#define vstore_partial_1(DATA, OFFSET, PTR) \ 2115 vstore1(DATA.s0, OFFSET, PTR); 2116 2117#define vstore_partial_2(DATA, OFFSET, PTR) \ 2118 vstore2(DATA.s01, OFFSET, PTR); 2119 2120#define vstore_partial_3(DATA, OFFSET, PTR) \ 2121 vstore3(DATA.s012, OFFSET, PTR); 2122 2123#define vstore_partial_4(DATA, OFFSET, PTR) \ 2124 vstore4(DATA.s0123, OFFSET, PTR); 2125 2126#define vstore_partial_5(DATA, OFFSET, PTR) \ 2127 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2128 vstore1(DATA.s4, OFFSET, PTR + 4); 2129 2130#define vstore_partial_6(DATA, OFFSET, PTR) \ 2131 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2132 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 2133 2134#define vstore_partial_7(DATA, OFFSET, PTR) \ 2135 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2136 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 2137 2138#define vstore_partial_8(DATA, OFFSET, PTR) \ 2139 vstore8(DATA.s01234567, OFFSET, PTR); 2140 2141#define vstore_partial_9(DATA, OFFSET, PTR) \ 2142 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2143 vstore1(DATA.s8, OFFSET, PTR + 8); 2144 2145#define vstore_partial_10(DATA, OFFSET, PTR) \ 2146 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2147 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 2148 2149#define vstore_partial_11(DATA, OFFSET, PTR) \ 2150 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2151 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 2152 2153#define vstore_partial_12(DATA, OFFSET, PTR) \ 2154 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2155 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 2156 2157#define vstore_partial_13(DATA, OFFSET, PTR) \ 2158 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2159 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 2160 2161#define vstore_partial_14(DATA, OFFSET, PTR) \ 2162 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2163 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 2164 2165#define vstore_partial_15(DATA, OFFSET, PTR) \ 2166 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2167 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 2168 2169#define vstore_partial_16(DATA, OFFSET, PTR) \ 2170 vstore16(DATA, OFFSET, PTR); 2171 2172 2173 2174 2175 2176#define convert_float_sat convert_float 2177#define convert_float1_sat convert_float 2178#define convert_float2_sat convert_float2 2179#define convert_float3_sat convert_float3 2180#define convert_float4_sat convert_float4 2181#define convert_float8_sat convert_float8 2182#define convert_float16_sat convert_float16 2183#define convert_half_sat convert_float 2184#define convert_half1_sat convert_half 2185#define convert_half2_sat convert_half2 2186#define convert_half3_sat convert_half3 2187#define convert_half4_sat convert_half4 2188#define convert_half8_sat convert_half8 2189#define convert_half16_sat convert_half16 2190 2191#define convert_float1 convert_float 2192#define convert_half1 convert_half 2193#define convert_char1 convert_char 2194#define convert_uchar1 convert_uchar 2195#define convert_short1 convert_short 2196#define convert_ushort1 convert_ushort 2197#define convert_int1 convert_int 2198#define convert_uint1 convert_uint 2199#define convert_long1 convert_long 2200#define convert_ulong1 convert_ulong 2201#define convert_double1 convert_double 2202 2203#define convert_char1_sat convert_char_sat 2204#define convert_uchar1_sat convert_uchar_sat 
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
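/* Illustrative sketch, not part of the original header: these helpers expand by
 * token pasting, so the size argument must reduce to a literal. For example:
 *
 *     VEC_DATA_TYPE(float, 4) v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
 *     float s = SUM_REDUCE(v, 4); // sum_reduce_4(v) == 10.0f
 *
 *     int4   vi = (int4)(300, -7, 42, 1000); // hypothetical accumulators
 *     uchar4 q  = CONVERT_SAT(vi, uchar4);   // convert_uchar4_sat -> (255, 0, 42, 255)
 */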
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
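/* Illustrative sketch of a hypothetical kernel, not part of the original header:
 * a *_DECLARATION macro expands to the flat argument list the host passes for
 * one tensor, and CONVERT_TO_IMAGE_STRUCT (via update_image_workitem_ptr,
 * defined below) rebuilds a per-work-item Image whose ptr already points at the
 * element this work-item owns:
 *
 *     __kernel void copy_pixel(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
 *     {
 *         Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
 *         Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);
 *         *(__global float *)dst_img.ptr = *(__global float *)src_img.ptr;
 *     }
 */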
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

typedef struct Vector
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
} Vector;

typedef struct Image
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
} Image;

typedef struct Tensor3D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
} Tensor3D;

typedef struct Tensor4D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
    int stride_w;
} Tensor4D;

inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z
    };
    return tensor;
}

inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z,
        .stride_w = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
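/* Illustrative sketch, not part of the original header: the *_offset helpers do
 * byte arithmetic relative to the already-updated work-item pointer, so with a
 * Tensor3D built by CONVERT_TO_TENSOR3D_STRUCT, the element one row down and
 * one plane deeper is read as:
 *
 *     Tensor3D t = CONVERT_TO_TENSOR3D_STRUCT(src); // hypothetical 'src' tensor
 *     float v = *(__global const float *)tensor3D_offset(&t, 0, 1, 1);
 *
 * tensor3D_index2ptr instead starts from the tensor base (it adds
 * offset_first_element_in_bytes itself) and linearises index into (x, y, z).
 */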
#endif

#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
#define ARM_COMPUTE_HELPERS_ASYMM_H

#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

2573#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2574 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2575 VSTORE(N0) \ 2576 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2577 2578#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2579 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2580 VSTORE(N0) \ 2581 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2582 2583#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2584 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2585 VSTORE(N0) \ 2586 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2587 2588#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2589 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2590 VSTORE(N0) \ 2591 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2592 2593#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2594 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2595 VSTORE(N0) \ 2596 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2597 2598#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2599 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2600 VSTORE(N0) \ 2601 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2602 2603#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2604 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2605 VSTORE(N0) \ 2606 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2607 2608#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2609 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2610 VSTORE(N0) \ 2611 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2612 2613#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2614 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2615 VSTORE(N0) \ 2616 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2617 2618#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2619 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2620 VSTORE(N0) \ 2621 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2622 2623#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2624 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2625 VSTORE(N0) \ 2626 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2627 2628#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2629 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2630 VSTORE(N0) \ 2631 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2632 2633#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2634 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2635 VSTORE(N0) \ 2636 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2637 2638 2639 2640#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2641 VSTORE(N0) \ 2642 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2643 2644#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2645 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2646 VSTORE(N0) \ 2647 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2648 2649#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE,
BASENAME, PTR, STRIDE_Y, Z) \ 2711 VSTORE(N0) \ 2712 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2713 2714#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2715 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2716 VSTORE(N0) \ 2717 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2718 2719 2720 2721 2722#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2723#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2724 2725 2726 2727#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2728#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2729 2730 2731 2732#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2733 VSTORE_PARTIAL(N0, STORE_N0) \ 2734 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2735 2736#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2737 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2738 VSTORE_PARTIAL(N0, STORE_N0) \ 2739 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2740 2741#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2742 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2743 VSTORE_PARTIAL(N0, STORE_N0) \ 2744 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2745 2746#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2747 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2748 VSTORE_PARTIAL(N0, STORE_N0) \ 2749 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2750 2751#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2752 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2753 VSTORE_PARTIAL(N0, STORE_N0) \ 2754 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2755 2756#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2757 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2758 VSTORE_PARTIAL(N0, STORE_N0) \ 2759 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2760 2761#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2762 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2763 VSTORE_PARTIAL(N0, STORE_N0) \ 2764 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2765 2766#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2767 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2768 VSTORE_PARTIAL(N0, STORE_N0) \ 2769 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2770 2771#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2772 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2773 VSTORE_PARTIAL(N0, STORE_N0) \ 2774 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2775 2776#define 
STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2777 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2778 VSTORE_PARTIAL(N0, STORE_N0) \ 2779 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2780 2781#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2782 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2783 VSTORE_PARTIAL(N0, STORE_N0) \ 2784 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2785 2786#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2787 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2788 VSTORE_PARTIAL(N0, STORE_N0) \ 2789 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2790 2791#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2792 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2793 VSTORE_PARTIAL(N0, STORE_N0) \ 2794 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2795 2796#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2797 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2798 VSTORE_PARTIAL(N0, STORE_N0) \ 2799 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2800 2801#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2802 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2803 VSTORE_PARTIAL(N0, STORE_N0) \ 2804 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2805 2806#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2807 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2808 VSTORE_PARTIAL(N0, STORE_N0) \ 2809 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2810 2811 2812 2813#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2814#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2815 2816#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2817 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 2818 { \ 2819 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2820 } \ 2821 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 2822 { \ 2823 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2824 } \ 2825 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 2826 { \ 2827 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2828 } \ 2829 else \ 2830 { \ 2831 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2832 } 2833 2834#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 2835 if(!(PARTIAL_COND_X)) \ 2836 { \ 2837 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2838 } \ 2839 else \ 2840 { \ 2841 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2842 } 2843 2844#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 2845 if(!(PARTIAL_COND_Y)) \ 2846 { \ 2847 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2848 } \ 2849 else \ 2850 { \ 2851 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2852 } 2853 2854 2855#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 2856 2857 2858#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2859 2860#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2861 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2862 2863#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 2864 2865#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2866 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 2867 2868#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 2869 2870#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2871 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 2872 2873#else 2874 2875#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2876 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 2877 2878#endif 2879 2880#endif 2881 2882 2883#if defined(PARTIAL_STORE_M0) 2884 2885#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 2886 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 2887#else 2888#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 2889 ((uint)(y * M0)) 2890#endif 2891 2892 2893 2894#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 2895 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 2896 2897 2898#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 2899#pragma OPENCL EXTENSION cl_khr_fp16 : enable 2900#endif 2901 2902#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 2903#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 2904#endif 2905 2906#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 2907#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 2908#endif 2909 2910#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 2911#pragma OPENCL EXTENSION cl_arm_printf : enable 2912#endif 2913 2914#define GPU_ARCH_MIDGARD 0x100 2915#define GPU_ARCH_BIFROST 0x200 2916#define GPU_ARCH_VALHALL 0x300 2917 2918 2919#define CONCAT(a, b) a##b 2920 2921 2922#define EXPAND(x) x 2923 2924 2925#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 2926 2927 2928#define REV1(x) ((x)) 2929#define REV2(x) ((x).s10) 2930#define REV3(x) ((x).s210) 2931#define REV4(x) ((x).s3210) 2932#define REV8(x) ((x).s76543210) 2933#define REV16(x) ((x).sFEDCBA9876543210) 2934 2935 2936 2937#define REVERSE_STR(x, s) REV##s((x)) 2938#define REVERSE(x, s) REVERSE_STR(x, s) 2939 2940 2941 2942#define ROT1_0(x) ((x)) 2943#define ROT1_1(x) ((x)) 2944 2945#define ROT2_0(x) ((x)) 2946#define ROT2_1(x) 
((x).s10) 2947#define ROT2_2(x) ((x)) 2948 2949#define ROT3_0(x) ((x)) 2950#define ROT3_1(x) ((x).s201) 2951#define ROT3_2(x) ((x).s120) 2952#define ROT3_3(x) ((x)) 2953 2954#define ROT4_0(x) ((x)) 2955#define ROT4_1(x) ((x).s3012) 2956#define ROT4_2(x) ((x).s2301) 2957#define ROT4_3(x) ((x).s1230) 2958#define ROT4_4(x) ((x)) 2959 2960#define ROT8_0(x) ((x)) 2961#define ROT8_1(x) ((x).s70123456) 2962#define ROT8_2(x) ((x).s67012345) 2963#define ROT8_3(x) ((x).s56701234) 2964#define ROT8_4(x) ((x).s45670123) 2965#define ROT8_5(x) ((x).s34567012) 2966#define ROT8_6(x) ((x).s23456701) 2967#define ROT8_7(x) ((x).s12345670) 2968#define ROT8_8(x) ((x)) 2969 2970#define ROT16_0(x) ((x)) 2971#define ROT16_1(x) ((x).sF0123456789ABCDE) 2972#define ROT16_2(x) ((x).sEF0123456789ABCD) 2973#define ROT16_3(x) ((x).sDEF0123456789ABC) 2974#define ROT16_4(x) ((x).sCDEF0123456789AB) 2975#define ROT16_5(x) ((x).sBCDEF0123456789A) 2976#define ROT16_6(x) ((x).sABCDEF0123456789) 2977#define ROT16_7(x) ((x).s9ABCDEF012345678) 2978#define ROT16_8(x) ((x).s89ABCDEF01234567) 2979#define ROT16_9(x) ((x).s789ABCDEF0123456) 2980#define ROT16_10(x) ((x).s6789ABCDEF012345) 2981#define ROT16_11(x) ((x).s56789ABCDEF01234) 2982#define ROT16_12(x) ((x).s456789ABCDEF0123) 2983#define ROT16_13(x) ((x).s3456789ABCDEF012) 2984#define ROT16_14(x) ((x).s23456789ABCDEF01) 2985#define ROT16_15(x) ((x).s123456789ABCDEF0) 2986#define ROT16_16(x) ((x)) 2987 2988 2989 2990#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 2991#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 2992 2993 2994 2995#define V_OFFS1(dt) (dt##1)(0) 2996#define V_OFFS2(dt) (dt##2)(0, 1) 2997#define V_OFFS3(dt) (dt##3)(0, 1, 2) 2998#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 2999#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 3000#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 3001 3002 3003 3004#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 3005#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 3006 3007 3008#define VLOAD_STR(size) vload##size 3009#define VLOAD(size) VLOAD_STR(size) 3010 3011 3012#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 3013#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 3014 3015#define NO_LOAD(data, offs, ptr) \ 3016 { \ 3017 } 3018 3019 3020#define vload_partial_1_0 NO_LOAD 3021#define vload_partial_1_1 vload1 3022#define vload_partial_1_2 NO_LOAD 3023#define vload_partial_1_3 NO_LOAD 3024#define vload_partial_1_4 NO_LOAD 3025#define vload_partial_1_5 NO_LOAD 3026#define vload_partial_1_6 NO_LOAD 3027#define vload_partial_1_7 NO_LOAD 3028#define vload_partial_1_8 NO_LOAD 3029#define vload_partial_1_9 NO_LOAD 3030#define vload_partial_1_10 NO_LOAD 3031#define vload_partial_1_11 NO_LOAD 3032#define vload_partial_1_12 NO_LOAD 3033#define vload_partial_1_13 NO_LOAD 3034#define vload_partial_1_14 NO_LOAD 3035#define vload_partial_1_15 NO_LOAD 3036#define vload_partial_1_16 NO_LOAD 3037 3038#define vload_partial_2_0 NO_LOAD 3039#define vload_partial_2_1 vload_partial_1 3040#define vload_partial_2_2 vload_partial_2 3041#define vload_partial_2_3 NO_LOAD 3042#define vload_partial_2_4 NO_LOAD 3043#define vload_partial_2_5 NO_LOAD 3044#define vload_partial_2_6 NO_LOAD 3045#define vload_partial_2_7 NO_LOAD 3046#define vload_partial_2_8 NO_LOAD 3047#define vload_partial_2_9 NO_LOAD 3048#define vload_partial_2_10 NO_LOAD 3049#define vload_partial_2_11 NO_LOAD 3050#define vload_partial_2_12 NO_LOAD 3051#define vload_partial_2_13 NO_LOAD 3052#define vload_partial_2_14 NO_LOAD 
3053#define vload_partial_2_15 NO_LOAD 3054#define vload_partial_2_16 NO_LOAD 3055 3056#define vload_partial_3_0 NO_LOAD 3057#define vload_partial_3_1 vload_partial_1 3058#define vload_partial_3_2 vload_partial_2 3059#define vload_partial_3_3 vload_partial_3 3060#define vload_partial_3_4 NO_LOAD 3061#define vload_partial_3_5 NO_LOAD 3062#define vload_partial_3_6 NO_LOAD 3063#define vload_partial_3_7 NO_LOAD 3064#define vload_partial_3_8 NO_LOAD 3065#define vload_partial_3_9 NO_LOAD 3066#define vload_partial_3_10 NO_LOAD 3067#define vload_partial_3_11 NO_LOAD 3068#define vload_partial_3_12 NO_LOAD 3069#define vload_partial_3_13 NO_LOAD 3070#define vload_partial_3_14 NO_LOAD 3071#define vload_partial_3_15 NO_LOAD 3072#define vload_partial_3_16 NO_LOAD 3073 3074#define vload_partial_4_0 NO_LOAD 3075#define vload_partial_4_1 vload_partial_1 3076#define vload_partial_4_2 vload_partial_2 3077#define vload_partial_4_3 vload_partial_3 3078#define vload_partial_4_4 vload_partial_4 3079#define vload_partial_4_5 NO_LOAD 3080#define vload_partial_4_6 NO_LOAD 3081#define vload_partial_4_7 NO_LOAD 3082#define vload_partial_4_8 NO_LOAD 3083#define vload_partial_4_9 NO_LOAD 3084#define vload_partial_4_10 NO_LOAD 3085#define vload_partial_4_11 NO_LOAD 3086#define vload_partial_4_12 NO_LOAD 3087#define vload_partial_4_13 NO_LOAD 3088#define vload_partial_4_14 NO_LOAD 3089#define vload_partial_4_15 NO_LOAD 3090#define vload_partial_4_16 NO_LOAD 3091 3092#define vload_partial_8_0 NO_LOAD 3093#define vload_partial_8_1 vload_partial_1 3094#define vload_partial_8_2 vload_partial_2 3095#define vload_partial_8_3 vload_partial_3 3096#define vload_partial_8_4 vload_partial_4 3097#define vload_partial_8_5 vload_partial_5 3098#define vload_partial_8_6 vload_partial_6 3099#define vload_partial_8_7 vload_partial_7 3100#define vload_partial_8_8 vload_partial_8 3101#define vload_partial_8_9 NO_LOAD 3102#define vload_partial_8_10 NO_LOAD 3103#define vload_partial_8_11 NO_LOAD 3104#define vload_partial_8_12 NO_LOAD 3105#define vload_partial_8_13 NO_LOAD 3106#define vload_partial_8_14 NO_LOAD 3107#define vload_partial_8_15 NO_LOAD 3108#define vload_partial_8_16 NO_LOAD 3109 3110#define vload_partial_16_0 NO_LOAD 3111#define vload_partial_16_1 vload_partial_1 3112#define vload_partial_16_2 vload_partial_2 3113#define vload_partial_16_3 vload_partial_3 3114#define vload_partial_16_4 vload_partial_4 3115#define vload_partial_16_5 vload_partial_5 3116#define vload_partial_16_6 vload_partial_6 3117#define vload_partial_16_7 vload_partial_7 3118#define vload_partial_16_8 vload_partial_8 3119#define vload_partial_16_9 vload_partial_9 3120#define vload_partial_16_10 vload_partial_10 3121#define vload_partial_16_11 vload_partial_11 3122#define vload_partial_16_12 vload_partial_12 3123#define vload_partial_16_13 vload_partial_13 3124#define vload_partial_16_14 vload_partial_14 3125#define vload_partial_16_15 vload_partial_15 3126#define vload_partial_16_16 vload_partial_16 3127 3128 3129#define vload_partial_1(DATA, OFFSET, PTR) \ 3130 DATA.s0 = vload1(OFFSET, PTR); 3131 3132#define vload_partial_2(DATA, OFFSET, PTR) \ 3133 DATA.s01 = vload2(OFFSET, PTR); 3134 3135#define vload_partial_3(DATA, OFFSET, PTR) \ 3136 DATA.s012 = vload3(OFFSET, PTR); 3137 3138#define vload_partial_4(DATA, OFFSET, PTR) \ 3139 DATA.s0123 = vload4(OFFSET, PTR); 3140 3141#define vload_partial_5(DATA, OFFSET, PTR) \ 3142 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 3143 DATA.s4 = vload1(OFFSET, PTR + 4); 3144 3145#define vload_partial_6(DATA, OFFSET, PTR) \ 3146 
vload_partial_4(DATA.s0123, OFFSET, PTR); \ 3147 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 3148 3149#define vload_partial_7(DATA, OFFSET, PTR) \ 3150 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 3151 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 3152 3153#define vload_partial_8(DATA, OFFSET, PTR) \ 3154 DATA.s01234567 = vload8(OFFSET, PTR); 3155 3156#define vload_partial_9(DATA, OFFSET, PTR) \ 3157 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3158 DATA.s8 = vload1(OFFSET, PTR + 8); 3159 3160#define vload_partial_10(DATA, OFFSET, PTR) \ 3161 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3162 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 3163 3164#define vload_partial_11(DATA, OFFSET, PTR) \ 3165 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3166 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 3167 3168#define vload_partial_12(DATA, OFFSET, PTR) \ 3169 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3170 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 3171 3172#define vload_partial_13(DATA, OFFSET, PTR) \ 3173 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3174 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 3175 3176#define vload_partial_14(DATA, OFFSET, PTR) \ 3177 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3178 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 3179 3180#define vload_partial_15(DATA, OFFSET, PTR) \ 3181 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 3182 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 3183 3184#define vload_partial_16(DATA, OFFSET, PTR) \ 3185 DATA = vload16(OFFSET, PTR); 3186 3187 3188 3189#define PIXEL_UNIT4 1 3190#define PIXEL_UNIT8 2 3191#define PIXEL_UNIT16 4 3192 3193 3194#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 3195#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 3196 3197 3198#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 3199#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 3200#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 3201 3202#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 3203#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 3204#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 3205#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 3206#endif 3207 3208#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 3209#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 3210#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, 
(int2)(x_coord + 3, y_coord), values.sCDEF)); 3211 3212#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 3213#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 3214#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 3215#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 3216#endif 3217 3218 3219#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 3220#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 3221 3222 3223#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 3224#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 3225 3226#define VSTORE_STR(size) vstore##size 3227#define VSTORE(size) VSTORE_STR(size) 3228 3229#define float1 float 3230#define half1 half 3231#define char1 char 3232#define uchar1 uchar 3233#define short1 short 3234#define ushort1 ushort 3235#define int1 int 3236#define uint1 uint 3237#define long1 long 3238#define ulong1 ulong 3239#define double1 double 3240 3241#define vload1(OFFSET, PTR) *(OFFSET + PTR) 3242#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 3243 3244 3245#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 3246#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 3247 3248#define NO_STORE(data, offs, ptr) \ 3249 { \ 3250 } 3251 3252 3253#define vstore_partial_1_0 NO_STORE 3254#define vstore_partial_1_1 vstore1 3255#define vstore_partial_1_2 NO_STORE 3256#define vstore_partial_1_3 NO_STORE 3257#define vstore_partial_1_4 NO_STORE 3258#define vstore_partial_1_5 NO_STORE 3259#define vstore_partial_1_6 NO_STORE 3260#define vstore_partial_1_7 NO_STORE 3261#define vstore_partial_1_8 NO_STORE 3262#define vstore_partial_1_9 NO_STORE 3263#define vstore_partial_1_10 NO_STORE 3264#define vstore_partial_1_11 NO_STORE 3265#define vstore_partial_1_12 NO_STORE 3266#define vstore_partial_1_13 NO_STORE 3267#define vstore_partial_1_14 NO_STORE 3268#define vstore_partial_1_15 NO_STORE 3269#define vstore_partial_1_16 NO_STORE 3270 3271#define vstore_partial_2_0 NO_STORE 3272#define vstore_partial_2_1 vstore_partial_1 3273#define vstore_partial_2_2 vstore_partial_2 3274#define vstore_partial_2_3 NO_STORE 3275#define vstore_partial_2_4 NO_STORE 3276#define vstore_partial_2_5 NO_STORE 3277#define vstore_partial_2_6 NO_STORE 3278#define vstore_partial_2_7 NO_STORE 3279#define vstore_partial_2_8 NO_STORE 3280#define vstore_partial_2_9 NO_STORE 3281#define vstore_partial_2_10 NO_STORE 3282#define vstore_partial_2_11 NO_STORE 3283#define vstore_partial_2_12 NO_STORE 3284#define vstore_partial_2_13 NO_STORE 3285#define vstore_partial_2_14 NO_STORE 3286#define vstore_partial_2_15 NO_STORE 3287#define vstore_partial_2_16 NO_STORE 3288 3289#define vstore_partial_3_0 NO_STORE 3290#define vstore_partial_3_1 vstore_partial_1 3291#define vstore_partial_3_2 vstore_partial_2 
3292#define vstore_partial_3_3 vstore_partial_3 3293#define vstore_partial_3_4 NO_STORE 3294#define vstore_partial_3_5 NO_STORE 3295#define vstore_partial_3_6 NO_STORE 3296#define vstore_partial_3_7 NO_STORE 3297#define vstore_partial_3_8 NO_STORE 3298#define vstore_partial_3_9 NO_STORE 3299#define vstore_partial_3_10 NO_STORE 3300#define vstore_partial_3_11 NO_STORE 3301#define vstore_partial_3_12 NO_STORE 3302#define vstore_partial_3_13 NO_STORE 3303#define vstore_partial_3_14 NO_STORE 3304#define vstore_partial_3_15 NO_STORE 3305#define vstore_partial_3_16 NO_STORE 3306 3307#define vstore_partial_4_0 NO_STORE 3308#define vstore_partial_4_1 vstore_partial_1 3309#define vstore_partial_4_2 vstore_partial_2 3310#define vstore_partial_4_3 vstore_partial_3 3311#define vstore_partial_4_4 vstore_partial_4 3312#define vstore_partial_4_5 NO_STORE 3313#define vstore_partial_4_6 NO_STORE 3314#define vstore_partial_4_7 NO_STORE 3315#define vstore_partial_4_8 NO_STORE 3316#define vstore_partial_4_9 NO_STORE 3317#define vstore_partial_4_10 NO_STORE 3318#define vstore_partial_4_11 NO_STORE 3319#define vstore_partial_4_12 NO_STORE 3320#define vstore_partial_4_13 NO_STORE 3321#define vstore_partial_4_14 NO_STORE 3322#define vstore_partial_4_15 NO_STORE 3323#define vstore_partial_4_16 NO_STORE 3324 3325#define vstore_partial_8_0 NO_STORE 3326#define vstore_partial_8_1 vstore_partial_1 3327#define vstore_partial_8_2 vstore_partial_2 3328#define vstore_partial_8_3 vstore_partial_3 3329#define vstore_partial_8_4 vstore_partial_4 3330#define vstore_partial_8_5 vstore_partial_5 3331#define vstore_partial_8_6 vstore_partial_6 3332#define vstore_partial_8_7 vstore_partial_7 3333#define vstore_partial_8_8 vstore_partial_8 3334#define vstore_partial_8_9 NO_STORE 3335#define vstore_partial_8_10 NO_STORE 3336#define vstore_partial_8_11 NO_STORE 3337#define vstore_partial_8_12 NO_STORE 3338#define vstore_partial_8_13 NO_STORE 3339#define vstore_partial_8_14 NO_STORE 3340#define vstore_partial_8_15 NO_STORE 3341#define vstore_partial_8_16 NO_STORE 3342 3343#define vstore_partial_16_0 NO_STORE 3344#define vstore_partial_16_1 vstore_partial_1 3345#define vstore_partial_16_2 vstore_partial_2 3346#define vstore_partial_16_3 vstore_partial_3 3347#define vstore_partial_16_4 vstore_partial_4 3348#define vstore_partial_16_5 vstore_partial_5 3349#define vstore_partial_16_6 vstore_partial_6 3350#define vstore_partial_16_7 vstore_partial_7 3351#define vstore_partial_16_8 vstore_partial_8 3352#define vstore_partial_16_9 vstore_partial_9 3353#define vstore_partial_16_10 vstore_partial_10 3354#define vstore_partial_16_11 vstore_partial_11 3355#define vstore_partial_16_12 vstore_partial_12 3356#define vstore_partial_16_13 vstore_partial_13 3357#define vstore_partial_16_14 vstore_partial_14 3358#define vstore_partial_16_15 vstore_partial_15 3359#define vstore_partial_16_16 vstore_partial_16 3360 3361 3362#define vstore_partial_1(DATA, OFFSET, PTR) \ 3363 vstore1(DATA.s0, OFFSET, PTR); 3364 3365#define vstore_partial_2(DATA, OFFSET, PTR) \ 3366 vstore2(DATA.s01, OFFSET, PTR); 3367 3368#define vstore_partial_3(DATA, OFFSET, PTR) \ 3369 vstore3(DATA.s012, OFFSET, PTR); 3370 3371#define vstore_partial_4(DATA, OFFSET, PTR) \ 3372 vstore4(DATA.s0123, OFFSET, PTR); 3373 3374#define vstore_partial_5(DATA, OFFSET, PTR) \ 3375 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3376 vstore1(DATA.s4, OFFSET, PTR + 4); 3377 3378#define vstore_partial_6(DATA, OFFSET, PTR) \ 3379 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3380 
vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 3381 3382#define vstore_partial_7(DATA, OFFSET, PTR) \ 3383 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3384 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 3385 3386#define vstore_partial_8(DATA, OFFSET, PTR) \ 3387 vstore8(DATA.s01234567, OFFSET, PTR); 3388 3389#define vstore_partial_9(DATA, OFFSET, PTR) \ 3390 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3391 vstore1(DATA.s8, OFFSET, PTR + 8); 3392 3393#define vstore_partial_10(DATA, OFFSET, PTR) \ 3394 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3395 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 3396 3397#define vstore_partial_11(DATA, OFFSET, PTR) \ 3398 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3399 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 3400 3401#define vstore_partial_12(DATA, OFFSET, PTR) \ 3402 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3403 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 3404 3405#define vstore_partial_13(DATA, OFFSET, PTR) \ 3406 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3407 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 3408 3409#define vstore_partial_14(DATA, OFFSET, PTR) \ 3410 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3411 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 3412 3413#define vstore_partial_15(DATA, OFFSET, PTR) \ 3414 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3415 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 3416 3417#define vstore_partial_16(DATA, OFFSET, PTR) \ 3418 vstore16(DATA, OFFSET, PTR); 3419 3420 3421 3422 3423 3424#define convert_float_sat convert_float 3425#define convert_float1_sat convert_float 3426#define convert_float2_sat convert_float2 3427#define convert_float3_sat convert_float3 3428#define convert_float4_sat convert_float4 3429#define convert_float8_sat convert_float8 3430#define convert_float16_sat convert_float16 3431#define convert_half_sat convert_float 3432#define convert_half1_sat convert_half 3433#define convert_half2_sat convert_half2 3434#define convert_half3_sat convert_half3 3435#define convert_half4_sat convert_half4 3436#define convert_half8_sat convert_half8 3437#define convert_half16_sat convert_half16 3438 3439#define convert_float1 convert_float 3440#define convert_half1 convert_half 3441#define convert_char1 convert_char 3442#define convert_uchar1 convert_uchar 3443#define convert_short1 convert_short 3444#define convert_ushort1 convert_ushort 3445#define convert_int1 convert_int 3446#define convert_uint1 convert_uint 3447#define convert_long1 convert_long 3448#define convert_ulong1 convert_ulong 3449#define convert_double1 convert_double 3450 3451#define convert_char1_sat convert_char_sat 3452#define convert_uchar1_sat convert_uchar_sat 3453#define convert_uchar2_sat convert_uchar2_sat 3454#define convert_uchar3_sat convert_uchar3_sat 3455#define convert_uchar4_sat convert_uchar4_sat 3456#define convert_uchar8_sat convert_uchar8_sat 3457#define convert_uchar16_sat convert_uchar16_sat 3458#define convert_short1_sat convert_short_sat 3459#define convert_ushort1_sat convert_ushort_sat 3460#define convert_int1_sat convert_int_sat 3461#define convert_uint1_sat convert_uint_sat 3462#define convert_long1_sat convert_long_sat 3463#define convert_ulong1_sat convert_ulong_sat 3464#define convert_double1_sat convert_double_sat 3465 3466#define VEC_DATA_TYPE_STR(type, size) type##size 3467#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 3468 3469#define CONVERT_STR(x, type) (convert_##type((x))) 3470#define CONVERT(x, type) CONVERT_STR(x, type) 3471 
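/* The *_STR indirection used by CONVERT above (and by the saturating and
 * rounding variants that follow) is the usual two-stage expansion trick: the
 * macro arguments are fully expanded before token pasting, so a computed
 * type such as VEC_DATA_TYPE(DATA_TYPE, N0) is a valid target type.
 * Illustrative expansion (a sketch only, not referenced by any kernel):
 *
 *   VEC_DATA_TYPE(uchar, 16)             -> uchar16
 *   CONVERT(x, VEC_DATA_TYPE(uchar, 16)) -> (convert_uchar16((x)))
 */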
3472#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 3473#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 3474 3475#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 3476#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 3477 3478#define select_vec_dt_uchar(size) uchar##size 3479#define select_vec_dt_char(size) char##size 3480#define select_vec_dt_ushort(size) ushort##size 3481#define select_vec_dt_short(size) short##size 3482#define select_vec_dt_half(size) short##size 3483#define select_vec_dt_uint(size) uint##size 3484#define select_vec_dt_int(size) int##size 3485#define select_vec_dt_float(size) int##size 3486#define select_vec_dt_ulong(size) ulong##size 3487#define select_vec_dt_long(size) long##size 3488 3489#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 3490#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 3491#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 3492 3493#define signed_int_vec_dt_uchar(size) char##size 3494#define signed_int_vec_dt_char(size) char##size 3495#define signed_int_vec_dt_ushort(size) short##size 3496#define signed_int_vec_dt_short(size) short##size 3497#define signed_int_vec_dt_half(size) short##size 3498#define signed_int_vec_dt_uint(size) int##size 3499#define signed_int_vec_dt_int(size) int##size 3500#define signed_int_vec_dt_float(size) int##size 3501#define signed_int_vec_dt_ulong(size) long##size 3502#define signed_int_vec_dt_long(size) long##size 3503 3504#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 3505#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 3506#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 3507 3508#define sum_reduce_1(x) (x) 3509#define sum_reduce_2(x) ((x).s0) + ((x).s1) 3510#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 3511#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 3512#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 3513#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 3514 3515#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 3516#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 3517 3518#define prod_reduce_1(x) (x) 3519#define prod_reduce_2(x) ((x).s0) * ((x).s1) 3520#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 3521#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 3522#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 3523#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 3524 3525#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 3526#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 3527 3528#define max_reduce_1(x) (x) 3529#define max_reduce_2(x) max(((x).s0), ((x).s1)) 3530#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 3531#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 3532#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 3533#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 3534 3535#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 3536#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 3537 3538#define VECTOR_DECLARATION(name) \ 3539 __global uchar *name##_ptr, \ 3540 uint name##_stride_x, \ 3541 uint name##_step_x, \ 3542 uint name##_offset_first_element_in_bytes 3543 3544#define 
IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_stride_y,        \
    uint name##_step_y,          \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint name##_stride_x,            \
    uint name##_step_x,              \
    uint name##_stride_y,            \
    uint name##_step_y,              \
    uint name##_stride_z,            \
    uint name##_step_z,              \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint name##_stride_x,            \
    uint name##_step_x,              \
    uint name##_stride_y,            \
    uint name##_step_y,              \
    uint name##_stride_z,            \
    uint name##_step_z,              \
    uint name##_stride_w,            \
    uint name##_step_w,              \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint name##_stride_x,            \
    uint name##_step_x,              \
    uint name##_stride_y,            \
    uint name##_step_y,              \
    uint name##_stride_z,            \
    uint name##_step_z,              \
    uint name##_stride_w,            \
    uint name##_step_w,              \
    uint name##_stride_v,            \
    uint name##_step_v,              \
    uint name##_offset_first_element_in_bytes

#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/* Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /* Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /* Offset of the first element in bytes */
    int             stride_x;                      /* Stride in the X dimension (in bytes) */
} Vector;

/* Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /* Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /* Offset of the first element in bytes */
    int             stride_x;                      /* Stride in the X dimension (in bytes) */
    int             stride_y;                      /* Stride in the Y dimension (in bytes) */
} Image;

/* Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /* Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /* Offset of the first element in bytes */
    int             stride_x;                      /* Stride in the X dimension (in bytes) */
    int             stride_y;                      /* Stride in the Y dimension (in bytes) */
    int             stride_z;                      /* Stride in the Z dimension (in bytes) */
} Tensor3D;

/* Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /* Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /* Offset of the first element in bytes */
    int             stride_x;                      /* Stride in the X dimension (in bytes) */
    int             stride_y;                      /* Stride in the Y dimension (in bytes) */
    int             stride_z;                      /* Stride in the Z dimension (in bytes) */
    int             stride_w;                      /* Stride in the W dimension (in bytes) */
} Tensor4D;

/* Wrap vector information into a Vector structure and advance ptr to the current work-item's first element */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/* Wrap image information into an Image structure and advance ptr to the current work-item's first element */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/* Wrap 3D tensor information into an Image structure, folding the Z work-item offset into the pointer */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

/* Wrap 3D tensor information into a Tensor3D structure and advance ptr to the current work-item's first element */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/* Wrap 3D tensor information into a Tensor3D structure without updating the pointer to the current work-item */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

inline Tensor4D update_tensor4D_workitem_ptr(__global
uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 3736 uint step_w, 3737 uint mod_size) 3738{ 3739 Tensor4D tensor = 3740 { 3741 .ptr = ptr, 3742 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3743 .stride_x = stride_x, 3744 .stride_y = stride_y, 3745 .stride_z = stride_z, 3746 .stride_w = stride_w 3747 }; 3748 3749 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 3750 return tensor; 3751} 3752 3753 3754inline __global const uchar *vector_offset(const Vector *vec, int x) 3755{ 3756 return vec->ptr + x * vec->stride_x; 3757} 3758 3759 3760inline __global uchar *offset(const Image *img, int x, int y) 3761{ 3762 return img->ptr + x * img->stride_x + y * img->stride_y; 3763} 3764 3765 3766inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 3767{ 3768 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 3769} 3770 3771 3772inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 3773{ 3774 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 3775} 3776 3777 3778inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 3779{ 3780 uint num_elements = width * height; 3781 3782 const uint z = index / num_elements; 3783 3784 index %= num_elements; 3785 3786 const uint y = index / width; 3787 3788 index %= width; 3789 3790 const uint x = index; 3791 3792 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 3793} 3794 3795#endif 3796 3797 3798#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) 3799#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) 3800 3801 3802inline uchar quantize_qasymm8(float input, float offset, float scale) 3803{ 3804 float out_f32 = input / scale + offset; 3805 uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); 3806 return res_u8; 3807} 3808 3809 3810inline float dequantize_qasymm8(uchar input, float offset, float scale) 3811{ 3812 return ((float)input - offset) * scale; 3813} 3814 3815 3816inline float dequantize_qasymm8_signed(char input, float offset, float scale) 3817{ 3818 return ((float)input - offset) * scale; 3819} 3820 3821 3822#define QUANTIZE_IMPL(type, size) \ 3823 inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ 3824 { \ 3825 VEC_DATA_TYPE(float, size) \ 3826 out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ 3827 VEC_DATA_TYPE(type, size) \ 3828 res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ 3829 return res; \ 3830 } 3831 3832 3833#define DEQUANTIZE_IMPL(type, size) \ 3834 inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ 3835 { \ 3836 return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ 3837 } 3838 3839 3840#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ 3841 inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ 3842 { \ 3843 const 
VEC_DATA_TYPE(int, size) \ 3844 zero = (VEC_DATA_TYPE(int, size))0; \ 3845 const VEC_DATA_TYPE(int, size) \ 3846 one = (VEC_DATA_TYPE(int, size))1; \ 3847 VEC_DATA_TYPE(int, size) \ 3848 mask = (one << exponent) - one; \ 3849 VEC_DATA_TYPE(int, size) \ 3850 threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ 3851 return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ 3852 } 3853 3854 3855#define ASYMM_MULT_IMPL(size) \ 3856 inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 3857 { \ 3858 VEC_DATA_TYPE(int, size) \ 3859 overflow = a == b && a == INT_MIN; \ 3860 VEC_DATA_TYPE(long, size) \ 3861 a_64 = convert_long##size(a); \ 3862 VEC_DATA_TYPE(long, size) \ 3863 b_64 = convert_long##size(b); \ 3864 VEC_DATA_TYPE(long, size) \ 3865 ab_64 = a_64 * b_64; \ 3866 \ 3867 VEC_DATA_TYPE(long, size) \ 3868 mask1 = 1 << 30; \ 3869 VEC_DATA_TYPE(long, size) \ 3870 mask2 = 1 - (1 << 30); \ 3871 VEC_DATA_TYPE(long, size) \ 3872 is_positive_or_zero = ab_64 >= 0; \ 3873 VEC_DATA_TYPE(long, size) \ 3874 nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ 3875 VEC_DATA_TYPE(long, size) \ 3876 mask = 1ll << 31; \ 3877 VEC_DATA_TYPE(int, size) \ 3878 ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ 3879 return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ 3880 } 3881 3882 3883#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ 3884 inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ 3885 { \ 3886 const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ 3887 const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ 3888 const int k_fractional_bits = 31; \ 3889 VEC_DATA_TYPE(int, size) \ 3890 x = a + (1 << (k_fractional_bits - 3)); \ 3891 VEC_DATA_TYPE(int, size) \ 3892 x2 = ASYMM_MULT(x, x, size); \ 3893 VEC_DATA_TYPE(int, size) \ 3894 x3 = ASYMM_MULT(x2, x, size); \ 3895 VEC_DATA_TYPE(int, size) \ 3896 x4 = ASYMM_MULT(x2, x2, size); \ 3897 VEC_DATA_TYPE(int, size) \ 3898 x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ 3899 VEC_DATA_TYPE(int, size) \ 3900 x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ 3901 VEC_DATA_TYPE(int, size) \ 3902 x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ 3903 return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ 3904 } 3905 3906 3907#define ASYMM_SELECT_USING_MASK_IMPL(size) \ 3908 inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ 3909 { \ 3910 return (if_mask & then_val) ^ (~if_mask & else_val); \ 3911 } 3912 3913 3914#define ASYMM_MASK_IF_ZERO_IMPL(size) \ 3915 inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ 3916 { \ 3917 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 3918 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 3919 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \ 3920 } 3921 3922 3923#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ 3924 inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ 3925 { \ 3926 const VEC_DATA_TYPE(int, size) 
all_zeros = 0; \ 3927 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 3928 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ 3929 } 3930 3931#define EXP_BARREL_SHIFTER_IMPL(size) \ 3932 inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ 3933 { \ 3934 if(k_integer_bits > exponent) \ 3935 { \ 3936 const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ 3937 return ASYMM_SELECT_USING_MASK( \ 3938 ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ 3939 ASYMM_MULT(result, fp_multiplier, size), result, size); \ 3940 } \ 3941 \ 3942 return result; \ 3943 } 3944 3945 3946#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ 3947 inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ 3948 { \ 3949 const int k_fractional_bits = 31 - k_integer_bits; \ 3950 VEC_DATA_TYPE(int, size) \ 3951 k_one_quarter = 1 << (k_fractional_bits - 2); \ 3952 VEC_DATA_TYPE(int, size) \ 3953 mask = k_one_quarter - 1; \ 3954 VEC_DATA_TYPE(int, size) \ 3955 a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ 3956 VEC_DATA_TYPE(int, size) \ 3957 a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ 3958 VEC_DATA_TYPE(int, size) \ 3959 result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ 3960 VEC_DATA_TYPE(int, size) \ 3961 remainder = a_mod_quarter_minus_one_quarter - a; \ 3962 \ 3963 result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ 3964 result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ 3965 result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ 3966 result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ 3967 result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ 3968 result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ 3969 result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ 3970 \ 3971 if(k_integer_bits > 5) \ 3972 { \ 3973 const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ 3974 result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ 3975 } \ 3976 \ 3977 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 3978 return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ 3979 } 3980 3981 3982#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ 3983 inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ 3984 { \ 3985 if(exponent < 0) \ 3986 { \ 3987 return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ 3988 } \ 3989 \ 3990 const VEC_DATA_TYPE(int, size) min = INT_MIN; \ 3991 const VEC_DATA_TYPE(int, size) max = INT_MAX; \ 3992 int threshold = ((1 << (31 - exponent)) - 1); \ 3993 VEC_DATA_TYPE(int, size) \ 3994 positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ 3995 VEC_DATA_TYPE(int, size) \ 3996 negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ 3997 VEC_DATA_TYPE(int, size) \ 
3998 result = x << exponent; \ 3999 result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ 4000 result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ 4001 return result; \ 4002 } 4003 4004 4005#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ 4006 inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 4007 { \ 4008 VEC_DATA_TYPE(long, size) \ 4009 a64 = convert_long##size(a); \ 4010 VEC_DATA_TYPE(long, size) \ 4011 b64 = convert_long##size(b); \ 4012 VEC_DATA_TYPE(long, size) \ 4013 sum = a64 + b64; \ 4014 const VEC_DATA_TYPE(long, size) one = 1; \ 4015 const VEC_DATA_TYPE(long, size) minus_one = -1; \ 4016 VEC_DATA_TYPE(long, size) \ 4017 sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ 4018 return convert_int##size((sum + sign) / 2); \ 4019 } 4020 4021 4022#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ 4023 inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ 4024 { \ 4025 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 4026 const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ 4027 VEC_DATA_TYPE(int, size) \ 4028 half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ 4029 const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ 4030 const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ 4031 VEC_DATA_TYPE(int, size) \ 4032 x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ 4033 for(int i = 0; i < 3; i++) \ 4034 { \ 4035 VEC_DATA_TYPE(int, size) \ 4036 half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ 4037 VEC_DATA_TYPE(int, size) \ 4038 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ 4039 VEC_DATA_TYPE(int, size) \ 4040 tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ 4041 x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ 4042 } \ 4043 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ 4044 } 4045 4046 4047#define ASYMM_RESCALE_IMPL(size) \ 4048 inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ 4049 { \ 4050 int exponent = src_integer_bits - dst_integer_bits; \ 4051 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ 4052 } 4053 4054#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) 4055#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) 4056#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) 4057#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) 4058 4059#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) 4060#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) 4061#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) 4062#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) 4063#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ 4064 ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) 4065#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ 4066 ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), 
right_shift, size) 4067#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) 4068#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) 4069#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) 4070#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) 4071#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) 4072#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) 4073#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 4074#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) 4075#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) 4076#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) 4077#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) 4078#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) 4079#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) 4080 4081#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ 4082 inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ 4083 { \ 4084 const int left_shift = shift > 0 ? shift : 0; \ 4085 const int right_shift = shift > 0 ? 
0 : -shift;                                                                               \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
    }
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)

QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 2)
QUANTIZE_IMPL(char, 2)
QUANTIZE_IMPL(uint, 2)
QUANTIZE_IMPL(int, 2)
QUANTIZE_IMPL(uchar, 3)
QUANTIZE_IMPL(char, 3)
QUANTIZE_IMPL(uint, 3)
QUANTIZE_IMPL(int, 3)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(int, 4)
QUANTIZE_IMPL(uchar, 8)
QUANTIZE_IMPL(char, 8)
QUANTIZE_IMPL(uint, 8)
QUANTIZE_IMPL(int, 8)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)

DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 2)
DEQUANTIZE_IMPL(char, 2)
DEQUANTIZE_IMPL(uint, 2)
DEQUANTIZE_IMPL(int, 2)
DEQUANTIZE_IMPL(uchar, 3)
DEQUANTIZE_IMPL(char, 3)
DEQUANTIZE_IMPL(uint, 3)
DEQUANTIZE_IMPL(int, 3)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(int, 4)
DEQUANTIZE_IMPL(uchar, 8)
DEQUANTIZE_IMPL(char, 8)
DEQUANTIZE_IMPL(uint, 8)
DEQUANTIZE_IMPL(int, 8)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)

ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
4195ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) 4196ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) 4197ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) 4198ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) 4199 4200ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) 4201ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) 4202ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) 4203ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) 4204ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) 4205ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) 4206 4207ASYMM_ROUNDING_HALF_SUM_IMPL(1) 4208ASYMM_ROUNDING_HALF_SUM_IMPL(2) 4209ASYMM_ROUNDING_HALF_SUM_IMPL(3) 4210ASYMM_ROUNDING_HALF_SUM_IMPL(4) 4211ASYMM_ROUNDING_HALF_SUM_IMPL(8) 4212ASYMM_ROUNDING_HALF_SUM_IMPL(16) 4213 4214ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) 4215ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) 4216ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) 4217ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) 4218ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) 4219ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) 4220 4221ASYMM_RESCALE_IMPL(1) 4222ASYMM_RESCALE_IMPL(2) 4223ASYMM_RESCALE_IMPL(3) 4224ASYMM_RESCALE_IMPL(4) 4225ASYMM_RESCALE_IMPL(8) 4226ASYMM_RESCALE_IMPL(16) 4227 4228MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) 4229MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) 4230MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) 4231MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) 4232MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) 4233MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) 4234 4235#endif 4236 4237#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 4238#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 4239 4240 4241 4242 4243#define TILE_VECTOR_SIZE1 1 4244#define TILE_VECTOR_SIZE2 2 4245#define TILE_VECTOR_SIZE3 3 4246#define TILE_VECTOR_SIZE4 4 4247#define TILE_VECTOR_SIZE5 8 4248#define TILE_VECTOR_SIZE6 8 4249#define TILE_VECTOR_SIZE7 8 4250#define TILE_VECTOR_SIZE8 8 4251#define TILE_VECTOR_SIZE9 16 4252#define TILE_VECTOR_SIZE10 16 4253#define TILE_VECTOR_SIZE11 16 4254#define TILE_VECTOR_SIZE12 16 4255#define TILE_VECTOR_SIZE13 16 4256#define TILE_VECTOR_SIZE14 16 4257#define TILE_VECTOR_SIZE15 16 4258#define TILE_VECTOR_SIZE16 16 4259 4260#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1 4261#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2 4262#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3 4263#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4 4264#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8 4265#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8 4266#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8 4267#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8 4268#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16 4269#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16 4270#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16 4271#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16 4272#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16 4273#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16 4274#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16 4275#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16 4276 4277 4278#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME) 4279#define TILE_STR(DATA_TYPE, H, W, BASENAME) \ 4280 union { \ 4281 DATA_TYPE s[TILE_VECTOR_SIZE##W]; \ 4282 TILE_VECTOR_TYPE##W(DATA_TYPE) v; \ 4283 } BASENAME[H] 4284 4285#define TENSOR4D_IMAGE(name) \ 4286 __read_only image2d_t name##_img, \ 4287 __global uchar *name##_ptr, \ 4288 uint name##_stride_x, \ 4289 uint name##_step_x, \ 4290 uint name##_stride_y, \ 4291 uint name##_step_y, \ 4292 uint name##_stride_z, \ 4293 uint name##_step_z, \ 4294 uint 
#define TENSOR4D_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_x,             \
    uint name##_step_x,               \
    uint name##_stride_y,             \
    uint name##_step_y,               \
    uint name##_stride_z,             \
    uint name##_step_z,               \
    uint name##_stride_w,             \
    uint name##_step_w,               \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_BUFFER(name)   \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_stride_z,       \
    uint name##_step_z,         \
    uint name##_stride_w,       \
    uint name##_step_w,         \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

#define TENSOR4D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_y,             \
    uint name##_stride_z,             \
    uint name##_stride_w,             \
    uint name##_c,                    \
    uint name##_w,                    \
    uint name##_h,                    \
    uint name##_n,                    \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y,       \
    uint name##_stride_z,       \
    uint name##_stride_w,       \
    uint name##_c,              \
    uint name##_w,              \
    uint name##_h,              \
    uint name##_n,              \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)

#define TENSOR4D_RO_T_IMAGE(name)     \
    __read_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)

#define TENSOR4D_WO_T_IMAGE(name)      \
    __write_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)

#define TENSOR3D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_y,             \
    uint name##_stride_z,             \
    uint name##_w,                    \
    uint name##_h,                    \
    uint name##_n,                    \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y,       \
    uint name##_stride_z,       \
    uint name##_w,              \
    uint name##_h,              \
    uint name##_n,              \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
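// Illustrative sketch (mechanical expansion of the *_STR indirection above):
// TENSOR4D_T(src, BUFFER) expands to TENSOR4D_T_BUFFER(src), i.e. the kernel
// parameter pack
//
//   __global uchar *src_ptr, uint src_stride_y, uint src_stride_z,
//   uint src_stride_w, uint src_c, uint src_w, uint src_h, uint src_n,
//   uint src_offset_first_element_in_bytes
//
// while TENSOR4D_T(src, IMAGE) additionally prepends __read_only image2d_t src_img.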
#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

#define LOOP_UNROLLING_1(idx, step, macro) (macro)
#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        type idx = start;                                      \
        LOOP_UNROLLING_##num(idx, step, macro);                \
    }
#else
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        _Pragma("unroll")                                      \
        for(type idx = start; idx < (num * step); idx += step) \
        {                                                      \
            (macro);                                           \
        }                                                      \
    }
#endif
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
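// Illustrative sketch: LOOP_UNROLLING(int, i, 0, 1, 4, { c[i].v = 0; }) either pastes
// the body four times with i = 0, 1, 2, 3 (the manual branch above) or, when
// UNROLL_WITH_PRAGMA is defined, becomes
//
//   {
//       _Pragma("unroll")
//       for(int i = 0; i < (4 * 1); i += 1)
//       {
//           ({ c[i].v = 0; });
//       }
//   }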
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
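// Illustrative sketch: with N0 = 4 and PARTIAL_N0 = 3, GET_SPATIAL_IDX(0, 4, 3)
// subtracts (4 - 3) % 4 = 1 from every block start; work-item 0 clamps to 0 via
// max(), and the remaining work-items shift left by one element so the final
// (partial) vector still lies fully inside the tensor.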
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                        \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);                             \
    })
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                        \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0;                       \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1;                       \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)  \
    ({                                                                         \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2;                        \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                        \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0;                       \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1;                       \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2;                       \
        c += (C_DATA_TYPE)(a).s3 * (C_DATA_TYPE)(b).s3;                       \
    })
#endif
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                      \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);       \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                      \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                      \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);   \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                \
    ({                                                                                       \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                              \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);               \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);             \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);           \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);       \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);   \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)               \
    ({                                                                                       \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
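// Illustrative usage sketch (the names acc/lhs_k8/rhs_k8 are hypothetical):
//
//   int acc = 0;
//   DOT_PRODUCT_INTEGER8(char, char, int, 8, lhs_k8, rhs_k8, acc);
//
// accumulates an 8-element signed dot product into acc, taking the fastest path the
// device exposes (integer dot(), arm_dot_acc(), arm_dot(), or the scalar fallback).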
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH)                                                \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH)                                                        \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
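// Illustrative sketch (src is a hypothetical tensor name): assuming VLOAD(4) is the
// usual vload4 wrapper defined earlier in this file,
//
//   V_LOAD(float, 4, BUFFER, src, x0, y0, src_stride_y)
//
// expands to vload4(0, (__global float *)(src_ptr + src_offset_first_element_in_bytes
// + (x0) * sizeof(float) + (y0) * (src_stride_y))), while the IMAGE variant reads
// src_img with the x coordinate converted to 4-element pixel units ((X) / 4).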
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
    ({                                                                                                                 \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
        {                                                                                                              \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        })                                                                                                             \
    })

#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)      \
    ({                                                                                                    \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                             \
        {                                                                                                 \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y);   \
        })                                                                                                \
    })

#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                         \
    ({                                                                                                                                                                                               \
        if(WIDTH1_CONDITION)                                                                                                                                                                         \
        {                                                                                                                                                                                            \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                    \
            {                                                                                                                                                                                        \
                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                        \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));   \
            })                                                                                                                                                                                       \
        }                                                                                                                                                                                            \
        else                                                                                                                                                                                         \
        {                                                                                                                                                                                            \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                    \
            {                                                                                                                                                                                        \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y);                                                               \
            })                                                                                                                                                                                       \
        }                                                                                                                                                                                            \
    })

#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)                                        \
    ({                                                                                                                                                                                     \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                                                        \
        {                                                                                                                                                                                  \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                                                     \
            {                                                                                                                                                                              \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                                                     \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                                                \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT));                                      \
                if(_src_valid_y != 0)                                                                                                                                                      \
                {                                                                                                                                                                          \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                          \
                }                                                                                                                                                                          \
            })                                                                                                                                                                             \
        })                                                                                                                                                                                 \
    })

#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)                                                  \
    ({                                                                                                                                                                                                                                          \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                                                                                                             \
        {                                                                                                                                                                                                                                       \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                                                                                                          \
            {                                                                                                                                                                                                                                   \
                int _src_y = (X) + _xk * (DILATION_X);                                                                                                                                                                                          \
                int _src_z = ((Y) + _yk * (DILATION_Y));                                                                                                                                                                                        \
                int _src_w = (B);                                                                                                                                                                                                               \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT));                        \
                if(!(BOUNDARY_CHECK))                                                                                                                                                                                                           \
                {                                                                                                                                                                                                                               \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                                                                                                                                      \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w)));            \
                }                                                                                                                                                                                                                               \
                else                                                                                                                                                                                                                            \
                {                                                                                                                                                                                                                               \
                    if(_src_valid_y)                                                                                                                                                                                                            \
                    {                                                                                                                                                                                                                           \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                                                                                                                                  \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w)));        \
                    }                                                                                                                                                                                                                           \
                }                                                                                                                                                                                                                               \
            })                                                                                                                                                                                                                                  \
        })                                                                                                                                                                                                                                      \
    })
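// Note on the NHWC loaders above: x/y/batch are flattened into a single row index
// (_src_y = x + y * TENSOR_WIDTH + batch * TENSOR_WIDTH * TENSOR_HEIGHT) so a tile
// load becomes a strided walk over rows; rows that fail the bounds check are simply
// skipped, leaving dst untouched, so callers that need zero padding must
// zero-initialise the destination tile first (as direct_convolution_nhwc below does).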
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                                                  \
    ({                                                                                                                                                                                                  \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                                                        \
        {                                                                                                                                                                                               \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                                                            \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                                                                 \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT));                                   \
            if(_src_valid_y != 0)                                                                                                                                                                       \
            {                                                                                                                                                                                           \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                                                                 \
            }                                                                                                                                                                                           \
        })                                                                                                                                                                                              \
    })

#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)  \
    ({                                                                                                            \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                  \
        {                                                                                                         \
            if(yi[0].s[_i] >= 0)                                                                                  \
            {                                                                                                     \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y);      \
            }                                                                                                     \
        })                                                                                                        \
    })

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({                                                                                                          \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                \
        {                                                                                                       \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y);        \
        })                                                                                                      \
    })

#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst)                            \
    ({                                                                                                                                                                                                  \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                                                        \
        {                                                                                                                                                                                               \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                                                        \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                                                           \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)                                     \
                                && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                                                    \
            if(_src_valid_y != 0)                                                                                                                                                                       \
            {                                                                                                                                                                                           \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                                                                 \
            }                                                                                                                                                                                           \
        })                                                                                                                                                                                              \
    })

#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                                                              \
    ({                                                                                                                                                                                                                                     \
        if(WIDTH1_CONDITION)                                                                                                                                                                                                               \
        {                                                                                                                                                                                                                                  \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                                                          \
            {                                                                                                                                                                                                                              \
                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                                                             \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            })                                                                                                                                                                                                                             \
        }                                                                                                                                                                                                                                  \
        else                                                                                                                                                                                                                               \
        {                                                                                                                                                                                                                                  \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                                                          \
            {                                                                                                                                                                                                                              \
                VSTORE(WIDTH0)                                                                                                                                                                                                             \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            })                                                                                                                                                                                                                             \
        }                                                                                                                                                                                                                                  \
    })
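// Note: T_STORE_INDIRECT_WIDTH_SELECT walks the tile bottom-up (row HEIGHT - 1 - _i)
// and resolves each row's destination through indirect_y; when WIDTH1_CONDITION is
// true (a leftover block in x) it switches to VSTORE_PARTIAL(WIDTH0, WIDTH1) so only
// the first WIDTH1 of the WIDTH0 elements per row are written.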
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)           \
    ({                                                                                                  \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                              \
        {                                                                                               \
            ACC_DATA_TYPE _tm = 0;                                                                      \
            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                          \
            {                                                                                           \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                    \
            })                                                                                          \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                          \
            {                                                                                           \
                dst[_m0].s[_n0] += _tm;                                                                 \
                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                      \
                {                                                                                       \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET);    \
                })                                                                                      \
            })                                                                                          \
        })                                                                                              \
    })

#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                      \
    ({                                                                                                                                                                  \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                              \
        {                                                                                                                                                               \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                                          \
            {                                                                                                                                                           \
                SRC_DATA_TYPE _tmp = 0;                                                                                                                                 \
                SRC_DATA_TYPE _src = src[_m0].s[_n0];                                                                                                                   \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0));                     \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN;                                                                                     \
                long a_64 = (long)(_src);                                                                                                                               \
                long b_64 = (long)(DST_MULTIPLIER);                                                                                                                     \
                long ab_64 = a_64 * b_64;                                                                                                                               \
                long mask1 = 1 << 30;                                                                                                                                   \
                long mask2 = 1 - (1 << 30);                                                                                                                             \
                long is_positive_or_zero = ab_64 >= 0;                                                                                                                  \
                long nudge = select(mask2, mask1, is_positive_or_zero);                                                                                                 \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE);                                                               \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow);                                                                                          \
                if(DST_SHIFT >= 0)                                                                                                                                      \
                {                                                                                                                                                       \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1);                                                                                                    \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0;                                                                           \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT);                                                              \
                }                                                                                                                                                       \
                _tmp += DST_OFFSET;                                                                                                                                     \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                                                     \
            })                                                                                                                                                          \
        })                                                                                                                                                              \
    })

#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                     \
    ({                                                                                                                                                                  \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                              \
        {                                                                                                                                                               \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                                          \
            {                                                                                                                                                           \
                SRC_DATA_TYPE _tmp = 0;                                                                                                                                 \
                SRC_DATA_TYPE _tmp2 = 0;                                                                                                                                \
                SRC_DATA_TYPE _src = src[_m0].s[_n0];                                                                                                                   \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0];                                                                                              \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0];                                                                                                        \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0));                   \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN;                                                                                    \
                long a_64 = (long)(_src);                                                                                                                               \
                long b_64 = (long)(_dst_multiplier);                                                                                                                    \
                long ab_64 = a_64 * b_64;                                                                                                                               \
                long mask1 = 1 << 30;                                                                                                                                   \
                long mask2 = 1 - (1 << 30);                                                                                                                             \
                long is_positive_or_zero = ab_64 >= 0;                                                                                                                  \
                long nudge = select(mask2, mask1, is_positive_or_zero);                                                                                                 \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE);                                                               \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow);                                                                                          \
                long mask = ((((int)1) << _dst_shift) - (int)1);                                                                                                        \
                long threshold = (mask >> 1) + any(_tmp);                                                                                                               \
                _tmp2 = _tmp >> _dst_shift;                                                                                                                             \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold);                                                                                                       \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0);                                                                                                            \
                _tmp += DST_OFFSET;                                                                                                                                     \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                                                     \
            })                                                                                                                                                          \
        })                                                                                                                                                              \
    })

#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                                                   \
    ({                                                                                                                                                                  \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                              \
        {                                                                                                                                                               \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                                          \
            {                                                                                                                                                           \
                SRC_DATA_TYPE _tmp = 0;                                                                                                                                 \
                SRC_DATA_TYPE _src = src[_m0].s[_n0];                                                                                                                   \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0));                     \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN;                                                                                     \
                long a_64 = (long)(_src);                                                                                                                               \
                long b_64 = (long)(DST_MULTIPLIER);                                                                                                                     \
                long ab_64 = a_64 * b_64;                                                                                                                               \
                long mask1 = 1 << 30;                                                                                                                                   \
                long mask2 = 1 - (1 << 30);                                                                                                                             \
                long is_positive_or_zero = ab_64 >= 0;                                                                                                                  \
                long nudge = select(mask2, mask1, is_positive_or_zero);                                                                                                 \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE);                                                               \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow);                                                                                          \
                if(DST_SHIFT >= 0)                                                                                                                                      \
                {                                                                                                                                                       \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1);                                                                                                     \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0;                                                                           \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT);                                                              \
                }                                                                                                                                                       \
                _tmp += DST_OFFSET;                                                                                                                                     \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                                                     \
            })                                                                                                                                                          \
        })                                                                                                                                                              \
    })
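// The three T_QUANTIZE8_* variants above share the gemmlowp-style rounding-doubling
// high multiply: the 64-bit product is nudged by +/-2^30 in the rounding direction
// and divided by 2^31 to retain the upper 32 bits. A scalar sketch of that core step
// (illustrative only, not used by the kernels):
//
//   long ab    = (long)_src * (long)multiplier;
//   long nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
//   int  high  = (int)((ab + nudge) / (long)(1ll << 31));
//
// followed by a rounding right shift by the destination shift and the addition of
// the destination zero-point offset.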
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                                 \
    ({                                                                                                                                                          \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                      \
        {                                                                                                                                                       \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                                  \
            {                                                                                                                                                   \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0));      \
            })                                                                                                                                                  \
        })                                                                                                                                                      \
    })

#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
    ({                                                                                         \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
        {                                                                                      \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        })                                                                                     \
    })

#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
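// Illustrative sketch (zp, a_q and b_q are hypothetical names):
// ACTIVATION_QUANTIZED(brelu, uchar, 4, zp, a_q, b_q, x) expands to
// (min((uchar)a_q, max((uchar)zp, x))), i.e. a bounded ReLU evaluated directly in
// the quantized domain; zp is assumed to be the quantized representation of 0 and
// a_q/b_q the quantized clip limits.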
#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))

#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
    ({                                                                                                               \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                           \
        {                                                                                                            \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        })                                                                                                           \
    })

#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst)  \
    ({                                           \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,       \
        {                                        \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        })                                       \
    })

#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst)  \
    ({                                                             \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                         \
        {                                                          \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;     \
        })                                                         \
    })

#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst)  \
    ({                                                               \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                           \
        {                                                            \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant;       \
        })                                                           \
    })

#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)                                                            \
    ({                                                                                                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                  \
        {                                                                                                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        })                                                                                                                                  \
    })

#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)                                                        \
    ({                                                                                                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                  \
        {                                                                                                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        })                                                                                                                                  \
    })

#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)                                                                        \
    ({                                                                                                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                  \
        {                                                                                                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        })                                                                                                                                  \
    })

#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst)                                     \
    ({                                                                               \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                           \
        {                                                                            \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        })                                                                           \
    })
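// Note: T_ELTWISE_BROADCAST_ADD_X(DT, M0, N0, lhs, rhs, dst) adds the single row
// rhs[0].v to each of the M0 rows of lhs (both converted to DT N0-vectors first);
// direct_convolution_nhwc below uses it to broadcast the bias across the tile.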
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                    \
    {                                                                                                                \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                                            \
        {                                                                                                            \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                                                        \
            {                                                                                                        \
                LOOP_UNROLLING(int, _k, 0, 1, K0,                                                                    \
                {                                                                                                    \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                })                                                                                                   \
            })                                                                                                       \
        })                                                                                                           \
    }

#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                     \
    ({                                                                                                                   \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                                                \
        {                                                                                                                \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                                                            \
            {                                                                                                            \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            })                                                                                                           \
        })                                                                                                               \
    })

#endif
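// Note: T_MMUL(..., M0, N0, K0, NT, T, lhs, rhs, dst) accumulates dst += lhs * rhs^T
// on tiles: lhs is an M0 x K0 tile, rhs an N0 x K0 tile in transposed layout, and
// dst an M0 x N0 tile. The float/half paths expand to per-element fma(); the 8-bit
// integer paths reduce each K0-long row pair with DOT_PRODUCT_INTEGER8.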
__kernel void direct_convolution_nhwc(
    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bia)
#endif
)
{
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _ISRC_CHANNELS SRC_CHANNELS
#define _IDST_WIDTH DST_WIDTH
#define _IDST_HEIGHT DST_HEIGHT
#define _IDST_CHANNELS DST_CHANNELS
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

#if defined(IS_QUANTIZED)
#define _IOUTPUT_TILE cq
#else
#define _IOUTPUT_TILE c
#endif

    // Output block coordinates: cout indexes the N0 output channels, mout the M0
    // flattened spatial positions, bout the batch.
    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0);
    const int mout = GET_SPATIAL_IDX(1, M0, 0);
    const int bout = GET_SPATIAL_IDX(2, 1, 0);

    TILE(int, 1, M0, xi);
    TILE(int, 1, M0, yi);

    // Top-left input coordinate for each of the M0 output positions.
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xi[0].s[i] = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
        yi[0].s[i] = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
        xi[0].s[i] -= PAD_LEFT;
        yi[0].s[i] -= PAD_TOP;
    })

    // Accumulator tile, zero-initialised.
    TILE(ACC_DATA_TYPE, M0, N0, c);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

        // Flattened source row per output position; -1 marks out-of-bounds taps.
        TILE(int, 1, M0, my);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            int x_s = xi[0].s[i] + xk;
            int y_s = yi[0].s[i] + yk;
            my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
            my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
            my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
            my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
        })

        // Accumulate over the input channels in steps of K0.
        int ck = 0;
        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
        {
            TILE(SRC_DATA_TYPE, M0, K0, a);
            TILE(WEI_DATA_TYPE, N0, K0, b);

            // Zero-fill so that skipped (out-of-bounds) loads contribute nothing.
            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = ZERO_VALUE;
            })

            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }

#if defined(LEFTOVER_LOOP)
        // Leftover accumulations for channel counts that are not multiples of K0.
        for(; ck < _ISRC_CHANNELS; ++ck)
        {
            TILE(SRC_DATA_TYPE, M0, 1, a);
            TILE(WEI_DATA_TYPE, N0, 1, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = ZERO_VALUE;
            })

            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }
#endif
    }

    // Constant offset-correction term for asymmetric quantization.
    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);

#if defined(HAS_BIAS)
    TILE(BIA_DATA_TYPE, 1, N0, bias0);

    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);

    // Broadcast the bias row over all M0 rows of the accumulator.
    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
#endif

#if defined(IS_QUANTIZED)
    TILE(DST_DATA_TYPE, M0, N0, cq);

    T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
#endif

    T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);

    // Indirect Y addresses for the store, clamped to the last valid row.
    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
        dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
    })

    // Leftover-x condition: only the first work-item in x writes a partial vector.
    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;

    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);

#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
})"