1R"(
2
3
4
5
6#ifndef ARM_COMPUTE_HELPER_H
7#define ARM_COMPUTE_HELPER_H
8
9
10
11
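// STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z): store the n row vectors
// BASENAME##0..BASENAME##(n-1) (each N0 elements wide) to memory, one vstore per row,
// advancing by STRIDE_Y bytes per row plus the per-row Z offset.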
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

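// CONVERT_STORE_ROW_n: same as STORE_ROW_n, but each row is first converted to
// DATA_TYPE with a saturating conversion (CONVERT_SAT) before being stored.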
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

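// STORE_BLOCK/CONVERT_STORE_BLOCK dispatch to the STORE_ROW_M0/CONVERT_STORE_ROW_M0
// variant selected by the M0 (number of rows) parameter.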
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

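// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but only the first STORE_N0 elements of each
// N0-wide row are written (VSTORE_PARTIAL), for leftover columns at the right edge.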
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

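// STORE_BLOCK_PARTIAL and the *_IN_X/_IN_Y/_IN_X_AND_Y wrappers store a possibly
// partial M0 x N0 block: PARTIAL_COND_X/PARTIAL_COND_Y select at runtime between the
// full store and the boundary (PARTIAL_STORE_M0/PARTIAL_STORE_N0) store, while
// STORE_BLOCK_BOUNDARY_AWARE further below picks the cheapest variant at compile time.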
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }

#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

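// COMPUTE_M0_START_ROW: starting output row for work-item y. When M is not a multiple
// of M0, the last block is shifted up so that it overlaps the previous block instead of
// writing out of bounds.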
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

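// STORE_VECTOR_SELECT: store a single row of vec_size elements, falling back to a
// partial store of 'leftover' elements when 'cond' is true.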
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

#define CONCAT(a, b) a##b

#define EXPAND(x) x

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

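// REVn/REVERSE reverse the components of an n-element vector; ROTs_n/ROTATE rotate an
// s-element vector right by n components (both resolved at preprocessing time).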
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

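// V_OFFSn/VEC_OFFS build a vector literal holding the offsets 0..n-1 of the given type.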
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

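// vload_partial_SIZE_LOADSIZE dispatch table: VLOAD_PARTIAL(size, load_size) resolves to
// the loader that fills the first load_size components of a size-wide vector; unsupported
// combinations map to NO_LOAD.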
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

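// Partial loaders: sizes that are not a native OpenCL vector width are composed from
// smaller vloads, e.g. vload_partial_5 is a vload4 into .s0123 followed by a vload1 into .s4.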
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

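// PIXEL_UNIT maps a vector size to the number of 4-element texels it spans;
// read_image2d_*/write_image2d_* then move 1, 2 or 4 texels through the
// read_imagef/read_imageh and write_imagef/write_imageh builtins.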
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

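// Size-1 "vector" aliases so that the generic VEC_DATA_TYPE/VLOAD/VSTORE macros also
// work for scalars; vload1/vstore1 are plain pointer dereferences.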
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

#define NO_STORE(data, offs, ptr) \
    {                             \
    }

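// vstore_partial_SIZE_STORESIZE dispatch table, mirroring the vload_partial table above;
// unsupported combinations map to NO_STORE.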
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

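// Partial stores are composed like the partial loads, e.g. vstore_partial_5 issues a
// vstore4 of .s0123 followed by a vstore1 of .s4 at PTR + 4.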
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

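// Aliases so that CONVERT/CONVERT_SAT can be used uniformly: conversions to float/half
// fall back to the plain convert builtins, and the *1 names map to the scalar builtins.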
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

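// CONVERT/CONVERT_SAT wrap the convert_<type>/convert_<type>_sat builtins so the target
// type can itself be a macro expansion (e.g. VEC_DATA_TYPE(DATA_TYPE, N0));
// SELECT_VEC_DATA_TYPE and SIGNED_INT_VEC_DATA_TYPE give the matching comparison-result
// and signed integer vector types for a given element type.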
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

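// SUM_REDUCE/PROD_REDUCE/MAX_REDUCE perform a tree reduction over the components of a
// vector of the given size.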
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

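// *_DECLARATION macros expand to the standard kernel-argument list (pointer,
// per-dimension stride and step, and the byte offset of the first element) for a buffer
// of the given rank.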
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes

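// CONVERT_TO_*_STRUCT build a Vector/Image/Tensor3D/Tensor4D view from those kernel
// arguments and advance the pointer to this work-item's element; the *_NO_STEP variants
// keep the base pointer, and *_NO_UPDATE_PTR returns the view without moving it.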
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

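// Lightweight views over linear buffers: a pointer plus byte strides per dimension. The
// update_*_workitem_ptr helpers below initialise a view and advance its pointer to the
// element owned by the current work-item.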
1081typedef struct Vector
1082{
1083    __global uchar *ptr;
1084    int             offset_first_element_in_bytes;
1085    int             stride_x;
1086} Vector;
1087
1088
1089typedef struct Image
1090{
1091    __global uchar *ptr;
1092    int             offset_first_element_in_bytes;
1093    int             stride_x;
1094    int             stride_y;
1095} Image;
1096
1097
1098typedef struct Tensor3D
1099{
1100    __global uchar *ptr;
1101    int             offset_first_element_in_bytes;
1102    int             stride_x;
1103    int             stride_y;
1104    int             stride_z;
1105} Tensor3D;
1106
1107
1108typedef struct Tensor4D
1109{
1110    __global uchar *ptr;
1111    int             offset_first_element_in_bytes;
1112    int             stride_x;
1113    int             stride_y;
1114    int             stride_z;
1115    int             stride_w;
1116} Tensor4D;
1117
1118
1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120{
1121    Vector vector =
1122    {
1123        .ptr                           = ptr,
1124        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125        .stride_x                      = stride_x,
1126    };
1127    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128    return vector;
1129}
1130
1131
1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133{
1134    Image img =
1135    {
1136        .ptr                           = ptr,
1137        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138        .stride_x                      = stride_x,
1139        .stride_y                      = stride_y
1140    };
1141    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142    return img;
1143}
1144
1145
1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147{
1148    Image img =
1149    {
1150        .ptr                           = ptr,
1151        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152        .stride_x                      = stride_x,
1153        .stride_y                      = stride_y
1154    };
1155    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156    return img;
1157}
1158
1159
1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161{
1162    Tensor3D tensor =
1163    {
1164        .ptr                           = ptr,
1165        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166        .stride_x                      = stride_x,
1167        .stride_y                      = stride_y,
1168        .stride_z                      = stride_z
1169    };
1170    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171    return tensor;
1172}
1173
1174
1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176{
1177    Tensor3D tensor =
1178    {
1179        .ptr                           = ptr,
1180        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181        .stride_x                      = stride_x,
1182        .stride_y                      = stride_y,
1183        .stride_z                      = stride_z
1184    };
1185    return tensor;
1186}
1187
1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189                                             uint step_w,
1190                                             uint mod_size)
1191{
1192    Tensor4D tensor =
1193    {
1194        .ptr                           = ptr,
1195        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196        .stride_x                      = stride_x,
1197        .stride_y                      = stride_y,
1198        .stride_z                      = stride_z,
1199        .stride_w                      = stride_w
1200    };
1201
1202    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203    return tensor;
1204}
1205
1206
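// Address helpers: convert element coordinates into a byte address relative to the
// (already updated) work-item pointer, e.g. tensor3D_offset(&t, x, y, z) returns
// t.ptr + x * t.stride_x + y * t.stride_y + z * t.stride_z.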
1207inline __global const uchar *vector_offset(const Vector *vec, int x)
1208{
1209    return vec->ptr + x * vec->stride_x;
1210}
1211
1212
1213inline __global uchar *offset(const Image *img, int x, int y)
1214{
1215    return img->ptr + x * img->stride_x + y * img->stride_y;
1216}
1217
1218
1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220{
1221    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222}
1223
1224
1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226{
1227    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228}
1229
1230
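// tensor3D_index2ptr() decomposes a linear element index (x fastest, then y, then z) back
// into coordinates before applying the strides. Illustrative decomposition for width = 4,
// height = 3, index = 17: z = 17 / 12 = 1, y = (17 % 12) / 4 = 1, x = 17 % 4 = 1.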
1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232{
1233    uint num_elements = width * height;
1234
1235    const uint z = index / num_elements;
1236
1237    index %= num_elements;
1238
1239    const uint y = index / width;
1240
1241    index %= width;
1242
1243    const uint x = index;
1244
1245    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246}
1247
1248#endif
1249
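// MLA(a, b, c) computes a + b * c; on Bifrost it maps to fma() so a fused multiply-add is
// emitted, elsewhere it stays an explicit multiply and add.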
1250#if GPU_ARCH == GPU_ARCH_BIFROST
1251#define MLA(a, b, c) (fma(c, b, a))
1252#else
1253#define MLA(a, b, c) ((b) * (c) + (a))
1254#endif
1255
1256
1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
1258
1259
1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
1261
1262
1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
1264
1265
1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
1267
1268
1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
1270
1271
1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1273
1274
1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
1276
1277
1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
1279
1280
1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
1282
1283
1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
1285
1286
1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
1288
1289
1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
1291
1292
1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
1294
1295
1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
1297
1298
1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
1300
1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1302
1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
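// ACTIVATION() dispatches to one of the *_op macros above by token-pasting the op name.
// Illustrative expansion (assuming an fp16 build): ACTIVATION(relu, half, 8, x, A_VAL, B_VAL)
// -> relu_op(half, 8, x, A_VAL, B_VAL) -> (max((half)0.0, x)).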
1304
1305#ifndef ARM_COMPUTE_HELPER_H
1306#define ARM_COMPUTE_HELPER_H
1307
1308
1309
1310
1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1312    VSTORE(N0)                                                 \
1313    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1314
1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1316    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1317    VSTORE(N0)                                                 \
1318    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1319
1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1321    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1322    VSTORE(N0)                                                 \
1323    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1324
1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1326    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1327    VSTORE(N0)                                                 \
1328    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1329
1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1331    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1332    VSTORE(N0)                                                 \
1333    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1334
1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1336    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1337    VSTORE(N0)                                                 \
1338    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1339
1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1341    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1342    VSTORE(N0)                                                 \
1343    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1344
1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1346    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1347    VSTORE(N0)                                                 \
1348    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1349
1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1351    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1352    VSTORE(N0)                                                 \
1353    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1354
1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1356    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1357    VSTORE(N0)                                                  \
1358    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1359
1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1361    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1362    VSTORE(N0)                                                  \
1363    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1364
1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1366    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1367    VSTORE(N0)                                                  \
1368    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1369
1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1371    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1372    VSTORE(N0)                                                  \
1373    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1374
1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1376    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1377    VSTORE(N0)                                                  \
1378    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379
1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382    VSTORE(N0)                                                  \
1383    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384
1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387    VSTORE(N0)                                                  \
1388    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389
1390
1391
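// CONVERT_STORE_ROW_n: the same row-wise stores as STORE_ROW_n, but each row is first
// saturation-converted to DATA_TYPE with CONVERT_SAT() before being written.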
1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1393    VSTORE(N0)                                                         \
1394    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1395
1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1397    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1398    VSTORE(N0)                                                         \
1399    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1400
1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1402    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1403    VSTORE(N0)                                                         \
1404    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1405
1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1407    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1408    VSTORE(N0)                                                         \
1409    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1410
1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1412    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1413    VSTORE(N0)                                                         \
1414    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1415
1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1417    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1418    VSTORE(N0)                                                         \
1419    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1420
1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1422    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1423    VSTORE(N0)                                                         \
1424    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1425
1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1427    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1428    VSTORE(N0)                                                         \
1429    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430
1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433    VSTORE(N0)                                                         \
1434    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435
1436#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1437    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1438    VSTORE(N0)                                                          \
1439    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440
1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443    VSTORE(N0)                                                          \
1444    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445
1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448    VSTORE(N0)                                                          \
1449    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450
1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453    VSTORE(N0)                                                          \
1454    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455
1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458    VSTORE(N0)                                                          \
1459    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460
1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463    VSTORE(N0)                                                          \
1464    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465
1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468    VSTORE(N0)                                                          \
1469    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470
1471
1472
1473
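// STORE_BLOCK(M0, ...) stores an M0-row block; the extra _STR level forces M0 to be expanded
// before it is pasted onto STORE_ROW_, so M0 may itself be a macro.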
1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476
1477
1478
1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481
1482
1483
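// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but only the first STORE_N0 of the N0 vector lanes
// are written per row, via VSTORE_PARTIAL, to handle leftover columns at the right edge.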
1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1485    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1486    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1487
1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1489    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1490    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1491    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1492
1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1494    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1495    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1496    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1497
1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1499    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1500    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1501    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1502
1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1504    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1505    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1506    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1507
1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1509    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1510    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1511    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1512
1513#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1514    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1515    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1516    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1517
1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1519    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1520    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1521    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1522
1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1524    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1525    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1526    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1527
1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1529    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1530    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1531    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1532
1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1534    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1535    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1536    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1537
1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1539    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1540    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1541    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1542
1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1544    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1545    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1546    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1547
1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1549    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1550    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1551    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1552
1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1554    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1555    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1556    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1557
1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1559    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1560    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1561    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562
1563
1564
1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567
1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1569    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1570    {                                                                                                                                                     \
1571        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1572    }                                                                                                                                                     \
1573    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1574    {                                                                                                                                                     \
1575        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1576    }                                                                                                                                                     \
1577    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1578    {                                                                                                                                                     \
1579        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1580    }                                                                                                                                                     \
1581    else                                                                                                                                                  \
1582    {                                                                                                                                                     \
1583        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1584    }
1585
1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1587    if(!(PARTIAL_COND_X))                                                                                         \
1588    {                                                                                                             \
1589        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1590    }                                                                                                             \
1591    else                                                                                                          \
1592    {                                                                                                             \
1593        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1594    }
1595
1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1597    if(!(PARTIAL_COND_Y))                                                                                         \
1598    {                                                                                                             \
1599        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1600    }                                                                                                             \
1601    else                                                                                                          \
1602    {                                                                                                             \
1603        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1604    }
1605
1606
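// STORE_BLOCK_BOUNDARY_AWARE picks, at compile time, the cheapest of the store paths above:
// a full block, a block partial in Y only, in X only, or in both, based on the
// PARTIAL_STORE_M0 / PARTIAL_STORE_N0 build options and the runtime PARTIAL_COND_* flags.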
1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1608
1609
1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1611
1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1613    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1614
1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1616
1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1618    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1619
1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1621
1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1623    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1624
1625#else
1626
1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1628    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1629
1630#endif
1631
1632#endif
1633
1634
1635#if defined(PARTIAL_STORE_M0)
1636
1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1638    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1639#else
1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1641    ((uint)(y * M0))
1642#endif
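// COMPUTE_M0_START_ROW shifts the row index so the last (partial) block overlaps the previous
// full block instead of running past the tensor. Illustrative: M0 = 4, PARTIAL_STORE_M0 = 3,
// y = 2 -> max(0, 8 - (4 - 3) % 4) = 7, i.e. the last block starts one row early.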
1643
1644
1645
1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1647    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
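// STORE_VECTOR_SELECT stores a single row of vec_size elements, falling back to a partial
// store of 'leftover' elements when 'cond' flags the last block along X.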
1648
1649
1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1652#endif
1653
1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1656#endif
1657
1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1660#endif
1661
1662#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1663#pragma OPENCL EXTENSION cl_arm_printf : enable
1664#endif
1665
1666#define GPU_ARCH_MIDGARD 0x100
1667#define GPU_ARCH_BIFROST 0x200
1668#define GPU_ARCH_VALHALL 0x300
1669
1670
1671#define CONCAT(a, b) a##b
1672
1673
1674#define EXPAND(x) x
1675
1676
1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678
1679
1680#define REV1(x) ((x))
1681#define REV2(x) ((x).s10)
1682#define REV3(x) ((x).s210)
1683#define REV4(x) ((x).s3210)
1684#define REV8(x) ((x).s76543210)
1685#define REV16(x) ((x).sFEDCBA9876543210)
1686
1687
1688
1689#define REVERSE_STR(x, s) REV##s((x))
1690#define REVERSE(x, s) REVERSE_STR(x, s)
1691
1692
1693
1694#define ROT1_0(x) ((x))
1695#define ROT1_1(x) ((x))
1696
1697#define ROT2_0(x) ((x))
1698#define ROT2_1(x) ((x).s10)
1699#define ROT2_2(x) ((x))
1700
1701#define ROT3_0(x) ((x))
1702#define ROT3_1(x) ((x).s201)
1703#define ROT3_2(x) ((x).s120)
1704#define ROT3_3(x) ((x))
1705
1706#define ROT4_0(x) ((x))
1707#define ROT4_1(x) ((x).s3012)
1708#define ROT4_2(x) ((x).s2301)
1709#define ROT4_3(x) ((x).s1230)
1710#define ROT4_4(x) ((x))
1711
1712#define ROT8_0(x) ((x))
1713#define ROT8_1(x) ((x).s70123456)
1714#define ROT8_2(x) ((x).s67012345)
1715#define ROT8_3(x) ((x).s56701234)
1716#define ROT8_4(x) ((x).s45670123)
1717#define ROT8_5(x) ((x).s34567012)
1718#define ROT8_6(x) ((x).s23456701)
1719#define ROT8_7(x) ((x).s12345670)
1720#define ROT8_8(x) ((x))
1721
1722#define ROT16_0(x) ((x))
1723#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728#define ROT16_6(x) ((x).sABCDEF0123456789)
1729#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738#define ROT16_16(x) ((x))
1739
1740
1741
1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744
1745
1746
1747#define V_OFFS1(dt) (dt##1)(0)
1748#define V_OFFS2(dt) (dt##2)(0, 1)
1749#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753
1754
1755
1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
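// VEC_OFFS(dt, s) builds a vector literal of consecutive lane indices,
// e.g. VEC_OFFS(int, 4) -> (int4)(0, 1, 2, 3).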
1758
1759
1760#define VLOAD_STR(size) vload##size
1761#define VLOAD(size) VLOAD_STR(size)
1762
1763
1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766
1767#define NO_LOAD(data, offs, ptr) \
1768    {                            \
1769    }
1770
1771
1772#define vload_partial_1_0 NO_LOAD
1773#define vload_partial_1_1 vload1
1774#define vload_partial_1_2 NO_LOAD
1775#define vload_partial_1_3 NO_LOAD
1776#define vload_partial_1_4 NO_LOAD
1777#define vload_partial_1_5 NO_LOAD
1778#define vload_partial_1_6 NO_LOAD
1779#define vload_partial_1_7 NO_LOAD
1780#define vload_partial_1_8 NO_LOAD
1781#define vload_partial_1_9 NO_LOAD
1782#define vload_partial_1_10 NO_LOAD
1783#define vload_partial_1_11 NO_LOAD
1784#define vload_partial_1_12 NO_LOAD
1785#define vload_partial_1_13 NO_LOAD
1786#define vload_partial_1_14 NO_LOAD
1787#define vload_partial_1_15 NO_LOAD
1788#define vload_partial_1_16 NO_LOAD
1789
1790#define vload_partial_2_0 NO_LOAD
1791#define vload_partial_2_1 vload_partial_1
1792#define vload_partial_2_2 vload_partial_2
1793#define vload_partial_2_3 NO_LOAD
1794#define vload_partial_2_4 NO_LOAD
1795#define vload_partial_2_5 NO_LOAD
1796#define vload_partial_2_6 NO_LOAD
1797#define vload_partial_2_7 NO_LOAD
1798#define vload_partial_2_8 NO_LOAD
1799#define vload_partial_2_9 NO_LOAD
1800#define vload_partial_2_10 NO_LOAD
1801#define vload_partial_2_11 NO_LOAD
1802#define vload_partial_2_12 NO_LOAD
1803#define vload_partial_2_13 NO_LOAD
1804#define vload_partial_2_14 NO_LOAD
1805#define vload_partial_2_15 NO_LOAD
1806#define vload_partial_2_16 NO_LOAD
1807
1808#define vload_partial_3_0 NO_LOAD
1809#define vload_partial_3_1 vload_partial_1
1810#define vload_partial_3_2 vload_partial_2
1811#define vload_partial_3_3 vload_partial_3
1812#define vload_partial_3_4 NO_LOAD
1813#define vload_partial_3_5 NO_LOAD
1814#define vload_partial_3_6 NO_LOAD
1815#define vload_partial_3_7 NO_LOAD
1816#define vload_partial_3_8 NO_LOAD
1817#define vload_partial_3_9 NO_LOAD
1818#define vload_partial_3_10 NO_LOAD
1819#define vload_partial_3_11 NO_LOAD
1820#define vload_partial_3_12 NO_LOAD
1821#define vload_partial_3_13 NO_LOAD
1822#define vload_partial_3_14 NO_LOAD
1823#define vload_partial_3_15 NO_LOAD
1824#define vload_partial_3_16 NO_LOAD
1825
1826#define vload_partial_4_0 NO_LOAD
1827#define vload_partial_4_1 vload_partial_1
1828#define vload_partial_4_2 vload_partial_2
1829#define vload_partial_4_3 vload_partial_3
1830#define vload_partial_4_4 vload_partial_4
1831#define vload_partial_4_5 NO_LOAD
1832#define vload_partial_4_6 NO_LOAD
1833#define vload_partial_4_7 NO_LOAD
1834#define vload_partial_4_8 NO_LOAD
1835#define vload_partial_4_9 NO_LOAD
1836#define vload_partial_4_10 NO_LOAD
1837#define vload_partial_4_11 NO_LOAD
1838#define vload_partial_4_12 NO_LOAD
1839#define vload_partial_4_13 NO_LOAD
1840#define vload_partial_4_14 NO_LOAD
1841#define vload_partial_4_15 NO_LOAD
1842#define vload_partial_4_16 NO_LOAD
1843
1844#define vload_partial_8_0 NO_LOAD
1845#define vload_partial_8_1 vload_partial_1
1846#define vload_partial_8_2 vload_partial_2
1847#define vload_partial_8_3 vload_partial_3
1848#define vload_partial_8_4 vload_partial_4
1849#define vload_partial_8_5 vload_partial_5
1850#define vload_partial_8_6 vload_partial_6
1851#define vload_partial_8_7 vload_partial_7
1852#define vload_partial_8_8 vload_partial_8
1853#define vload_partial_8_9 NO_LOAD
1854#define vload_partial_8_10 NO_LOAD
1855#define vload_partial_8_11 NO_LOAD
1856#define vload_partial_8_12 NO_LOAD
1857#define vload_partial_8_13 NO_LOAD
1858#define vload_partial_8_14 NO_LOAD
1859#define vload_partial_8_15 NO_LOAD
1860#define vload_partial_8_16 NO_LOAD
1861
1862#define vload_partial_16_0 NO_LOAD
1863#define vload_partial_16_1 vload_partial_1
1864#define vload_partial_16_2 vload_partial_2
1865#define vload_partial_16_3 vload_partial_3
1866#define vload_partial_16_4 vload_partial_4
1867#define vload_partial_16_5 vload_partial_5
1868#define vload_partial_16_6 vload_partial_6
1869#define vload_partial_16_7 vload_partial_7
1870#define vload_partial_16_8 vload_partial_8
1871#define vload_partial_16_9 vload_partial_9
1872#define vload_partial_16_10 vload_partial_10
1873#define vload_partial_16_11 vload_partial_11
1874#define vload_partial_16_12 vload_partial_12
1875#define vload_partial_16_13 vload_partial_13
1876#define vload_partial_16_14 vload_partial_14
1877#define vload_partial_16_15 vload_partial_15
1878#define vload_partial_16_16 vload_partial_16
1879
1880
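// vload_partial_n: load exactly n elements into the low lanes of DATA by composing the native
// vload16/8/4/2/1 builtins, e.g. a 13-element load is an 8-lane load followed by a 5-element
// load at PTR + 8.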
1881#define vload_partial_1(DATA, OFFSET, PTR) \
1882    DATA.s0 = vload1(OFFSET, PTR);
1883
1884#define vload_partial_2(DATA, OFFSET, PTR) \
1885    DATA.s01 = vload2(OFFSET, PTR);
1886
1887#define vload_partial_3(DATA, OFFSET, PTR) \
1888    DATA.s012 = vload3(OFFSET, PTR);
1889
1890#define vload_partial_4(DATA, OFFSET, PTR) \
1891    DATA.s0123 = vload4(OFFSET, PTR);
1892
1893#define vload_partial_5(DATA, OFFSET, PTR)    \
1894    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895    DATA.s4 = vload1(OFFSET, PTR + 4);
1896
1897#define vload_partial_6(DATA, OFFSET, PTR)    \
1898    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900
1901#define vload_partial_7(DATA, OFFSET, PTR)    \
1902    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904
1905#define vload_partial_8(DATA, OFFSET, PTR) \
1906    DATA.s01234567 = vload8(OFFSET, PTR);
1907
1908#define vload_partial_9(DATA, OFFSET, PTR)        \
1909    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910    DATA.s8 = vload1(OFFSET, PTR + 8);
1911
1912#define vload_partial_10(DATA, OFFSET, PTR)       \
1913    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915
1916#define vload_partial_11(DATA, OFFSET, PTR)       \
1917    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919
1920#define vload_partial_12(DATA, OFFSET, PTR)       \
1921    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923
1924#define vload_partial_13(DATA, OFFSET, PTR)       \
1925    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927
1928#define vload_partial_14(DATA, OFFSET, PTR)       \
1929    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931
1932#define vload_partial_15(DATA, OFFSET, PTR)       \
1933    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935
1936#define vload_partial_16(DATA, OFFSET, PTR) \
1937    DATA = vload16(OFFSET, PTR);
1938
1939
1940
1941#define PIXEL_UNIT4 1
1942#define PIXEL_UNIT8 2
1943#define PIXEL_UNIT16 4
1944
1945
1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
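// Each RGBA texel of the OpenCL image carries four elements, so vector sizes 4/8/16 translate
// to 1/2/4 pixel units when reading or writing through the image path below.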
1948
1949
1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953
1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958#endif
1959
1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963
1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968#endif
1969
1970
1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973
1974
1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977
1978#define VSTORE_STR(size) vstore##size
1979#define VSTORE(size) VSTORE_STR(size)
1980
1981#define float1 float
1982#define half1 half
1983#define char1 char
1984#define uchar1 uchar
1985#define short1 short
1986#define ushort1 ushort
1987#define int1 int
1988#define uint1 uint
1989#define long1 long
1990#define ulong1 ulong
1991#define double1 double
1992
1993#define vload1(OFFSET, PTR) *(OFFSET + PTR)
1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
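// OpenCL has no vload1/vstore1 builtins, so the single-element forms are emulated with a plain
// pointer dereference and the scalar types are aliased as <type>1 above.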
1995
1996
1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999
2000#define NO_STORE(data, offs, ptr) \
2001    {                             \
2002    }
2003
2004
2005#define vstore_partial_1_0 NO_STORE
2006#define vstore_partial_1_1 vstore1
2007#define vstore_partial_1_2 NO_STORE
2008#define vstore_partial_1_3 NO_STORE
2009#define vstore_partial_1_4 NO_STORE
2010#define vstore_partial_1_5 NO_STORE
2011#define vstore_partial_1_6 NO_STORE
2012#define vstore_partial_1_7 NO_STORE
2013#define vstore_partial_1_8 NO_STORE
2014#define vstore_partial_1_9 NO_STORE
2015#define vstore_partial_1_10 NO_STORE
2016#define vstore_partial_1_11 NO_STORE
2017#define vstore_partial_1_12 NO_STORE
2018#define vstore_partial_1_13 NO_STORE
2019#define vstore_partial_1_14 NO_STORE
2020#define vstore_partial_1_15 NO_STORE
2021#define vstore_partial_1_16 NO_STORE
2022
2023#define vstore_partial_2_0 NO_STORE
2024#define vstore_partial_2_1 vstore_partial_1
2025#define vstore_partial_2_2 vstore_partial_2
2026#define vstore_partial_2_3 NO_STORE
2027#define vstore_partial_2_4 NO_STORE
2028#define vstore_partial_2_5 NO_STORE
2029#define vstore_partial_2_6 NO_STORE
2030#define vstore_partial_2_7 NO_STORE
2031#define vstore_partial_2_8 NO_STORE
2032#define vstore_partial_2_9 NO_STORE
2033#define vstore_partial_2_10 NO_STORE
2034#define vstore_partial_2_11 NO_STORE
2035#define vstore_partial_2_12 NO_STORE
2036#define vstore_partial_2_13 NO_STORE
2037#define vstore_partial_2_14 NO_STORE
2038#define vstore_partial_2_15 NO_STORE
2039#define vstore_partial_2_16 NO_STORE
2040
2041#define vstore_partial_3_0 NO_STORE
2042#define vstore_partial_3_1 vstore_partial_1
2043#define vstore_partial_3_2 vstore_partial_2
2044#define vstore_partial_3_3 vstore_partial_3
2045#define vstore_partial_3_4 NO_STORE
2046#define vstore_partial_3_5 NO_STORE
2047#define vstore_partial_3_6 NO_STORE
2048#define vstore_partial_3_7 NO_STORE
2049#define vstore_partial_3_8 NO_STORE
2050#define vstore_partial_3_9 NO_STORE
2051#define vstore_partial_3_10 NO_STORE
2052#define vstore_partial_3_11 NO_STORE
2053#define vstore_partial_3_12 NO_STORE
2054#define vstore_partial_3_13 NO_STORE
2055#define vstore_partial_3_14 NO_STORE
2056#define vstore_partial_3_15 NO_STORE
2057#define vstore_partial_3_16 NO_STORE
2058
2059#define vstore_partial_4_0 NO_STORE
2060#define vstore_partial_4_1 vstore_partial_1
2061#define vstore_partial_4_2 vstore_partial_2
2062#define vstore_partial_4_3 vstore_partial_3
2063#define vstore_partial_4_4 vstore_partial_4
2064#define vstore_partial_4_5 NO_STORE
2065#define vstore_partial_4_6 NO_STORE
2066#define vstore_partial_4_7 NO_STORE
2067#define vstore_partial_4_8 NO_STORE
2068#define vstore_partial_4_9 NO_STORE
2069#define vstore_partial_4_10 NO_STORE
2070#define vstore_partial_4_11 NO_STORE
2071#define vstore_partial_4_12 NO_STORE
2072#define vstore_partial_4_13 NO_STORE
2073#define vstore_partial_4_14 NO_STORE
2074#define vstore_partial_4_15 NO_STORE
2075#define vstore_partial_4_16 NO_STORE
2076
2077#define vstore_partial_8_0 NO_STORE
2078#define vstore_partial_8_1 vstore_partial_1
2079#define vstore_partial_8_2 vstore_partial_2
2080#define vstore_partial_8_3 vstore_partial_3
2081#define vstore_partial_8_4 vstore_partial_4
2082#define vstore_partial_8_5 vstore_partial_5
2083#define vstore_partial_8_6 vstore_partial_6
2084#define vstore_partial_8_7 vstore_partial_7
2085#define vstore_partial_8_8 vstore_partial_8
2086#define vstore_partial_8_9 NO_STORE
2087#define vstore_partial_8_10 NO_STORE
2088#define vstore_partial_8_11 NO_STORE
2089#define vstore_partial_8_12 NO_STORE
2090#define vstore_partial_8_13 NO_STORE
2091#define vstore_partial_8_14 NO_STORE
2092#define vstore_partial_8_15 NO_STORE
2093#define vstore_partial_8_16 NO_STORE
2094
2095#define vstore_partial_16_0 NO_STORE
2096#define vstore_partial_16_1 vstore_partial_1
2097#define vstore_partial_16_2 vstore_partial_2
2098#define vstore_partial_16_3 vstore_partial_3
2099#define vstore_partial_16_4 vstore_partial_4
2100#define vstore_partial_16_5 vstore_partial_5
2101#define vstore_partial_16_6 vstore_partial_6
2102#define vstore_partial_16_7 vstore_partial_7
2103#define vstore_partial_16_8 vstore_partial_8
2104#define vstore_partial_16_9 vstore_partial_9
2105#define vstore_partial_16_10 vstore_partial_10
2106#define vstore_partial_16_11 vstore_partial_11
2107#define vstore_partial_16_12 vstore_partial_12
2108#define vstore_partial_16_13 vstore_partial_13
2109#define vstore_partial_16_14 vstore_partial_14
2110#define vstore_partial_16_15 vstore_partial_15
2111#define vstore_partial_16_16 vstore_partial_16
2112
2113
2114#define vstore_partial_1(DATA, OFFSET, PTR) \
2115    vstore1(DATA.s0, OFFSET, PTR);
2116
2117#define vstore_partial_2(DATA, OFFSET, PTR) \
2118    vstore2(DATA.s01, OFFSET, PTR);
2119
2120#define vstore_partial_3(DATA, OFFSET, PTR) \
2121    vstore3(DATA.s012, OFFSET, PTR);
2122
2123#define vstore_partial_4(DATA, OFFSET, PTR) \
2124    vstore4(DATA.s0123, OFFSET, PTR);
2125
2126#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128    vstore1(DATA.s4, OFFSET, PTR + 4);
2129
2130#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133
2134#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137
2138#define vstore_partial_8(DATA, OFFSET, PTR) \
2139    vstore8(DATA.s01234567, OFFSET, PTR);
2140
2141#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143    vstore1(DATA.s8, OFFSET, PTR + 8);
2144
2145#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148
2149#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152
2153#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156
2157#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160
2161#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164
2165#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168
2169#define vstore_partial_16(DATA, OFFSET, PTR) \
2170    vstore16(DATA, OFFSET, PTR);
2171
2172
2173
2174
2175
2176#define convert_float_sat convert_float
2177#define convert_float1_sat convert_float
2178#define convert_float2_sat convert_float2
2179#define convert_float3_sat convert_float3
2180#define convert_float4_sat convert_float4
2181#define convert_float8_sat convert_float8
2182#define convert_float16_sat convert_float16
2183#define convert_half_sat convert_half
2184#define convert_half1_sat convert_half
2185#define convert_half2_sat convert_half2
2186#define convert_half3_sat convert_half3
2187#define convert_half4_sat convert_half4
2188#define convert_half8_sat convert_half8
2189#define convert_half16_sat convert_half16
2190
2191#define convert_float1 convert_float
2192#define convert_half1 convert_half
2193#define convert_char1 convert_char
2194#define convert_uchar1 convert_uchar
2195#define convert_short1 convert_short
2196#define convert_ushort1 convert_ushort
2197#define convert_int1 convert_int
2198#define convert_uint1 convert_uint
2199#define convert_long1 convert_long
2200#define convert_ulong1 convert_ulong
2201#define convert_double1 convert_double
2202
2203#define convert_char1_sat convert_char_sat
2204#define convert_uchar1_sat convert_uchar_sat
2205#define convert_uchar2_sat convert_uchar2_sat
2206#define convert_uchar3_sat convert_uchar3_sat
2207#define convert_uchar4_sat convert_uchar4_sat
2208#define convert_uchar8_sat convert_uchar8_sat
2209#define convert_uchar16_sat convert_uchar16_sat
2210#define convert_short1_sat convert_short_sat
2211#define convert_ushort1_sat convert_ushort_sat
2212#define convert_int1_sat convert_int_sat
2213#define convert_uint1_sat convert_uint_sat
2214#define convert_long1_sat convert_long_sat
2215#define convert_ulong1_sat convert_ulong_sat
2216#define convert_double1_sat convert_double_sat
2217
2218#define VEC_DATA_TYPE_STR(type, size) type##size
2219#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220
2221#define CONVERT_STR(x, type) (convert_##type((x)))
2222#define CONVERT(x, type) CONVERT_STR(x, type)
2223
2224#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226
2227#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229
2230#define select_vec_dt_uchar(size) uchar##size
2231#define select_vec_dt_char(size) char##size
2232#define select_vec_dt_ushort(size) ushort##size
2233#define select_vec_dt_short(size) short##size
2234#define select_vec_dt_half(size) short##size
2235#define select_vec_dt_uint(size) uint##size
2236#define select_vec_dt_int(size) int##size
2237#define select_vec_dt_float(size) int##size
2238#define select_vec_dt_ulong(size) ulong##size
2239#define select_vec_dt_long(size) long##size
2240
2241#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
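// SELECT_VEC_DATA_TYPE maps a data type to the integer vector type of matching width that
// select() accepts as its mask (e.g. float -> int, half -> short; integer types map to themselves).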
2244
2245#define signed_int_vec_dt_uchar(size) char##size
2246#define signed_int_vec_dt_char(size) char##size
2247#define signed_int_vec_dt_ushort(size) short##size
2248#define signed_int_vec_dt_short(size) short##size
2249#define signed_int_vec_dt_half(size) short##size
2250#define signed_int_vec_dt_uint(size) int##size
2251#define signed_int_vec_dt_int(size) int##size
2252#define signed_int_vec_dt_float(size) int##size
2253#define signed_int_vec_dt_ulong(size) long##size
2254#define signed_int_vec_dt_long(size) long##size
2255
2256#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259
2260#define sum_reduce_1(x) (x)
2261#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2262#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2263#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2264#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2265#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2266
2267#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2268#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269
2270#define prod_reduce_1(x) (x)
2271#define prod_reduce_2(x) ((x).s0) * ((x).s1)
2272#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
2273#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
2274#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
2275#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
2276
2277#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
2278#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279
2280#define max_reduce_1(x) (x)
2281#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286
2287#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
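// The *_REDUCE helpers fold a vector into a scalar by recursive halving, e.g. SUM_REDUCE(v, 4)
// expands to ((v).s01).s0 + ((v).s01).s1 + ((v).s23).s0 + ((v).s23).s1.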
2289
2290#define VECTOR_DECLARATION(name)     \
2291    __global uchar *name##_ptr,      \
2292    uint        name##_stride_x, \
2293    uint        name##_step_x,   \
2294    uint        name##_offset_first_element_in_bytes
2295
2296#define IMAGE_DECLARATION(name)      \
2297    __global uchar *name##_ptr,      \
2298    uint        name##_stride_x, \
2299    uint        name##_step_x,   \
2300    uint        name##_stride_y, \
2301    uint        name##_step_y,   \
2302    uint        name##_offset_first_element_in_bytes
2303
2304#define TENSOR3D_DECLARATION(name)   \
2305    __global uchar *name##_ptr,      \
2306    uint        name##_stride_x, \
2307    uint        name##_step_x,   \
2308    uint        name##_stride_y, \
2309    uint        name##_step_y,   \
2310    uint        name##_stride_z, \
2311    uint        name##_step_z,   \
2312    uint        name##_offset_first_element_in_bytes
2313
2314#define TENSOR4D_DECLARATION(name)   \
2315    __global uchar *name##_ptr,      \
2316    uint        name##_stride_x, \
2317    uint        name##_step_x,   \
2318    uint        name##_stride_y, \
2319    uint        name##_step_y,   \
2320    uint        name##_stride_z, \
2321    uint        name##_step_z,   \
2322    uint        name##_stride_w, \
2323    uint        name##_step_w,   \
2324    uint        name##_offset_first_element_in_bytes
2325
2326#define TENSOR5D_DECLARATION(name)   \
2327    __global uchar *name##_ptr,      \
2328    uint        name##_stride_x, \
2329    uint        name##_step_x,   \
2330    uint        name##_stride_y, \
2331    uint        name##_step_y,   \
2332    uint        name##_stride_z, \
2333    uint        name##_step_z,   \
2334    uint        name##_stride_w, \
2335    uint        name##_step_w,   \
2336    uint        name##_stride_v, \
2337    uint        name##_step_v,   \
2338    uint        name##_offset_first_element_in_bytes
2339
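// The CONVERT_TO_*_STRUCT(_NO_STEP) macros pair each *_DECLARATION above with the matching
// update_*_workitem_ptr() call; the _NO_STEP variants pass zero steps so the returned pointer
// is not advanced per work-item.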
2340#define CONVERT_TO_VECTOR_STRUCT(name) \
2341    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342
2343#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345
2346#define CONVERT_TO_IMAGE_STRUCT(name) \
2347    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348
2349#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351
2352#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2353    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2354
2355#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2356    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2357
2361#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2362    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2363                                 name##_stride_z, name##_step_z)
2364
2365#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2366    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2367
2368#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2369    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2370                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2371
2372#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2373    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2374
2375#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2376    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2377                           name##_stride_z, name##_step_z)
2378
2379
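// Lightweight views over global memory used by the CONVERT_TO_* helpers above: a data pointer, the
// byte offset of the first element and the byte stride of each dimension.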
2380typedef struct Vector
2381{
2382    __global uchar *ptr;
2383    int             offset_first_element_in_bytes;
2384    int             stride_x;
2385} Vector;
2386
2387
2388typedef struct Image
2389{
2390    __global uchar *ptr;
2391    int             offset_first_element_in_bytes;
2392    int             stride_x;
2393    int             stride_y;
2394} Image;
2395
2396
2397typedef struct Tensor3D
2398{
2399    __global uchar *ptr;
2400    int             offset_first_element_in_bytes;
2401    int             stride_x;
2402    int             stride_y;
2403    int             stride_z;
2404} Tensor3D;
2405
2406
2407typedef struct Tensor4D
2408{
2409    __global uchar *ptr;
2410    int             offset_first_element_in_bytes;
2411    int             stride_x;
2412    int             stride_y;
2413    int             stride_z;
2414    int             stride_w;
2415} Tensor4D;
2416
2417
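// update_*_workitem_ptr (this function and the image/tensor3D/tensor4D variants below): initialise
// the corresponding structure and advance its pointer by the first-element offset plus
// get_global_id(i) * step_i per dimension, so ptr addresses the element this work-item processes.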
2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419{
2420    Vector vector =
2421    {
2422        .ptr                           = ptr,
2423        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424        .stride_x                      = stride_x,
2425    };
2426    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427    return vector;
2428}
2429
2430
2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432{
2433    Image img =
2434    {
2435        .ptr                           = ptr,
2436        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437        .stride_x                      = stride_x,
2438        .stride_y                      = stride_y
2439    };
2440    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441    return img;
2442}
2443
2444
2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446{
2447    Image img =
2448    {
2449        .ptr                           = ptr,
2450        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451        .stride_x                      = stride_x,
2452        .stride_y                      = stride_y
2453    };
2454    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455    return img;
2456}
2457
2458
2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460{
2461    Tensor3D tensor =
2462    {
2463        .ptr                           = ptr,
2464        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465        .stride_x                      = stride_x,
2466        .stride_y                      = stride_y,
2467        .stride_z                      = stride_z
2468    };
2469    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470    return tensor;
2471}
2472
2473
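// Same fields as update_tensor3D_workitem_ptr, but ptr is left at the buffer base: neither the
// first-element offset nor any per-work-item step is applied.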
2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475{
2476    Tensor3D tensor =
2477    {
2478        .ptr                           = ptr,
2479        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480        .stride_x                      = stride_x,
2481        .stride_y                      = stride_y,
2482        .stride_z                      = stride_z
2483    };
2484    return tensor;
2485}
2486
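// 4D variant: the third global id is split into the z and w coordinates using mod_size
// (z = gid2 % mod_size, w = gid2 / mod_size).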
2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488                                             uint step_w,
2489                                             uint mod_size)
2490{
2491    Tensor4D tensor =
2492    {
2493        .ptr                           = ptr,
2494        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495        .stride_x                      = stride_x,
2496        .stride_y                      = stride_y,
2497        .stride_z                      = stride_z,
2498        .stride_w                      = stride_w
2499    };
2500
2501    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502    return tensor;
2503}
2504
2505
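// *_offset helpers: return the address of the element at the given coordinates, relative to the
// structure's current pointer (each coordinate is multiplied by the corresponding byte stride).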
2506inline __global const uchar *vector_offset(const Vector *vec, int x)
2507{
2508    return vec->ptr + x * vec->stride_x;
2509}
2510
2511
2512inline __global uchar *offset(const Image *img, int x, int y)
2513{
2514    return img->ptr + x * img->stride_x + y * img->stride_y;
2515}
2516
2517
2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519{
2520    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521}
2522
2523
2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525{
2526    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527}
2528
2529
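// Convert a linear element index within a width x height x depth volume into x/y/z coordinates and
// return the corresponding address (the first-element offset is added here).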
2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531{
2532    uint num_elements = width * height;
2533
2534    const uint z = index / num_elements;
2535
2536    index %= num_elements;
2537
2538    const uint y = index / width;
2539
2540    index %= width;
2541
2542    const uint x = index;
2543
2544    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545}
2546
2547#endif
2548
2549#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
2550#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
2551
2552
2553
2554
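// TILE_VECTOR_SIZE<W> / TILE_VECTOR_TYPE<W>: round a requested tile width W up to the nearest
// OpenCL vector width (1, 2, 3, 4, 8 or 16) and give the matching vector type.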
2555#define TILE_VECTOR_SIZE1 1
2556#define TILE_VECTOR_SIZE2 2
2557#define TILE_VECTOR_SIZE3 3
2558#define TILE_VECTOR_SIZE4 4
2559#define TILE_VECTOR_SIZE5 8
2560#define TILE_VECTOR_SIZE6 8
2561#define TILE_VECTOR_SIZE7 8
2562#define TILE_VECTOR_SIZE8 8
2563#define TILE_VECTOR_SIZE9 16
2564#define TILE_VECTOR_SIZE10 16
2565#define TILE_VECTOR_SIZE11 16
2566#define TILE_VECTOR_SIZE12 16
2567#define TILE_VECTOR_SIZE13 16
2568#define TILE_VECTOR_SIZE14 16
2569#define TILE_VECTOR_SIZE15 16
2570#define TILE_VECTOR_SIZE16 16
2571
2572#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
2573#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
2574#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
2575#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
2576#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
2577#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
2578#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
2579#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
2580#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
2581#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
2582#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
2583#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
2584#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
2585#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
2586#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
2587#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
2588
2589
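// TILE(DATA_TYPE, H, W, BASENAME): declare BASENAME as an array of H rows, each accessible either
// element-wise through .s[] or as a whole vector through .v (padded to the rounded-up width).
// E.g. TILE(float, 4, 3, acc) declares: union { float s[3]; float3 v; } acc[4];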
2590#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
2591#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
2592    union {                                 \
2593        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];                  \
2594        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                     \
2595    } BASENAME[H]
2596
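// TENSOR4D(name, IMAGE|BUFFER): kernel argument list for a 4D tensor; the IMAGE variant adds a
// __read_only image2d_t handle in front of the buffer arguments.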
2597#define TENSOR4D_IMAGE(name)          \
2598    __read_only image2d_t name##_img, \
2599    __global uchar *name##_ptr,       \
2600    uint            name##_stride_x,  \
2601    uint            name##_step_x,    \
2602    uint            name##_stride_y,  \
2603    uint            name##_step_y,    \
2604    uint            name##_stride_z,  \
2605    uint            name##_step_z,    \
2606    uint            name##_stride_w,  \
2607    uint            name##_step_w,    \
2608    uint            name##_offset_first_element_in_bytes
2609
2610#define TENSOR4D_BUFFER(name)    \
2611    __global uchar *name##_ptr,  \
2612    uint        name##_stride_x, \
2613    uint        name##_step_x,   \
2614    uint        name##_stride_y, \
2615    uint        name##_step_y,   \
2616    uint        name##_stride_z, \
2617    uint        name##_step_z,   \
2618    uint        name##_stride_w, \
2619    uint        name##_step_w,   \
2620    uint        name##_offset_first_element_in_bytes
2621
2622#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
2623#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
2624
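// TENSOR4D_T(name, IMAGE|BUFFER) and the _RO_T/_WO_T variants: argument list for a 4D tensor in the
// flattened form used by the tile helpers: data pointer, y/z/w byte strides, the c/w/h/n dimensions
// and the first-element offset; the IMAGE variants prepend an image2d_t handle. TENSOR3D_T is the
// 3D equivalent (y/z strides, w/h/n dimensions).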
2625#define TENSOR4D_T_IMAGE(name)          \
2626    __read_only image2d_t name##_img, \
2627    __global uchar *name##_ptr,       \
2628    uint        name##_stride_y, \
2629    uint        name##_stride_z, \
2630    uint        name##_stride_w, \
2631    uint        name##_c,   \
2632    uint        name##_w,   \
2633    uint        name##_h,   \
2634    uint        name##_n,   \
2635    uint        name##_offset_first_element_in_bytes
2636
2637#define TENSOR4D_T_BUFFER(name)    \
2638    __global uchar *name##_ptr,  \
2639    uint        name##_stride_y, \
2640    uint        name##_stride_z, \
2641    uint        name##_stride_w, \
2642    uint        name##_c,   \
2643    uint        name##_w,   \
2644    uint        name##_h,   \
2645    uint        name##_n,   \
2646    uint        name##_offset_first_element_in_bytes
2647
2648#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
2649
2650
2651#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
2652
2653#define TENSOR4D_RO_T_IMAGE(name)          \
2654    __read_only image2d_t name##_img, \
2655    TENSOR4D_T_BUFFER(name)
2656
2657#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
2658
2659#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
2660
2661
2662#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
2663
2664#define TENSOR4D_WO_T_IMAGE(name)          \
2665    __write_only image2d_t name##_img, \
2666    TENSOR4D_T_BUFFER(name)
2667
2668#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
2669
2670#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
2671
2672
2673#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
2674
2675#define TENSOR3D_T_IMAGE(name)          \
2676    __read_only image2d_t name##_img, \
2677    __global uchar *name##_ptr,       \
2678    uint        name##_stride_y, \
2679    uint        name##_stride_z, \
2680    uint        name##_w,   \
2681    uint        name##_h,   \
2682    uint        name##_n,   \
2683    uint        name##_offset_first_element_in_bytes
2684
2685#define TENSOR3D_T_BUFFER(name)    \
2686    __global uchar *name##_ptr,  \
2687    uint        name##_stride_y, \
2688    uint        name##_stride_z, \
2689    uint        name##_w,   \
2690    uint        name##_h,   \
2691    uint        name##_n,   \
2692    uint        name##_offset_first_element_in_bytes
2693
2694#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
2695#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
2696
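// LOOP_UNROLLING(type, idx, start, step, num, macro): repeat `macro` num times with idx advancing
// by step. Without UNROLL_WITH_PRAGMA the repetition is expanded by the preprocessor
// (LOOP_UNROLLING_1..128); with it, a `#pragma unroll` for-loop is emitted instead.
// E.g. LOOP_UNROLLING(int, i, 0, 1, 4, { dst[i].v = src[i].v; })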
2697#if !defined(UNROLL_WITH_PRAGMA)
2698#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
2699
2700#define LOOP_UNROLLING_1(idx, step, macro) (macro)
2701#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
2702#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
2703#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
2704#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
2705#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
2706#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
2707#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
2708#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
2709#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
2710#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
2711#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
2712#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
2713#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
2714#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
2715#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
2716#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
2717#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
2718#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
2719#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
2720#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
2721#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
2722#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
2723#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
2724#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
2725#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
2726#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
2727#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
2728#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
2729#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
2730#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
2731#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
2732#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
2733#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
2734#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
2735#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
2736#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
2737#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
2738#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
2739#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
2740#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
2741#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
2742#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
2743#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
2744#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
2745#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
2746#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
2747#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
2748#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
2749#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
2750#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
2751#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
2752#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
2753#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
2754#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
2755#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
2756#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
2757#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
2758#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
2759#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
2760#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
2761#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
2762#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
2763#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
2764#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
2765#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
2766#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
2767#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
2768#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
2769#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
2770#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
2771#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
2772#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
2773#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
2774#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
2775#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
2776#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
2777#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
2778#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
2779#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
2780#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
2781#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
2782#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
2783#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
2784#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
2785#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
2786#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
2787#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
2788#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
2789#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
2790#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
2791#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
2792#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
2793#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
2794#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
2795#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
2796#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
2797#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
2798#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
2799#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
2800#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
2801#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
2802#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
2803#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
2804#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
2805#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
2806#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
2807#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
2808#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
2809#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
2810#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
2811#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
2812#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
2813#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
2814#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
2815#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
2816#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
2817#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
2818#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
2819#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
2820#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
2821#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
2822#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
2823#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
2824#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
2825#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
2826#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
2827#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
2828
2829#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
2830    {                                                          \
2831        type idx = start;                                      \
2832        LOOP_UNROLLING_##num(idx, step, macro);                \
2833    }
2834#else
2835#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
2836    {                                                          \
2837        _Pragma("unroll")                                      \
2838        for(type idx = start; idx < (num * step); idx += step) \
2839        {                                                      \
2840            (macro);                                           \
2841        }                                                      \
2842    }
2843#endif
2844#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
2845
2846
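// GET_SPATIAL_IDX: first element along dimension IDX processed by this work-item when N0 elements
// are handled at a time; the index is shifted back by (N0 - PARTIAL_N0) % N0 and clamped at 0, so
// the last block ends exactly at the tensor edge when the size is not a multiple of N0.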
2847#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
2848
2849
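// DOT_PRODUCT_INTEGER8(..., K0, a, b, c): accumulate into c the dot product of the K0 8-bit lanes
// of a and b. For K0 = 2/3/4 the dot()/arm_dot_acc()/arm_dot() instructions are used when the
// corresponding OpenCL extensions are enabled, otherwise a scalar multiply-accumulate fallback is
// used; larger K0 values are built by splitting into smaller chunks.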
2850#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
2851#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
2852#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2853    ({                                                \
2854        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
2855    })
2856#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
2857#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
2858#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
2859#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
2860#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2861#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
2862#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
2863#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
2864#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2865#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
2866#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
2867#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
2868#else
2869#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
2870    ({                                                  \
2871        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
2872        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
2873    })
2874#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
2875    ({                                                  \
2876        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
2877        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
2878    })
2879#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
2880    ({                                                    \
2881        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
2882        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
2883        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
2884        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
2885    })
2886#endif
2887#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2888    ({                                                \
2889        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
2890        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
2891    })
2892#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2893    ({                                                \
2894        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
2895        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
2896    })
2897#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2898    ({                                                \
2899        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
2900        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
2901    })
2902#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2903    ({                                                \
2904        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
2905        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
2906    })
2907#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2908    ({                                                \
2909        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2910        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
2911    })
2912#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2913    ({                                                \
2914        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2915        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
2916    })
2917#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2918    ({                                                \
2919        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2920        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
2921    })
2922#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2923    ({                                                \
2924        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2925        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
2926    })
2927#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2928    ({                                                \
2929        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2930        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);     \
2931    })
2932#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2933    ({                                                \
2934        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2935        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
2936    })
2937#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2938    ({                                                \
2939        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
2940        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);     \
2941    })
2942#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2943    ({                                                 \
2944        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
2945        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
2946    })
2947
2948
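// REDUCE_INTEGER8: sum the K0 lanes of `a` into c by taking the dot product with a vector of ones.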
2949#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
2950#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
2951
2952
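// V_LOAD / V_STORE (below): load or store a WIDTH-element vector at column X (in elements) and row
// Y of TENSOR. The BUFFER variants use vload/vstore on the global pointer; the IMAGE variants
// access the bound image2d_t with X converted to pixel units (X / 4).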
2953#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
2954#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
2955#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
2956    VLOAD(WIDTH)                                                \
2957    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
2958#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
2959
2960
2961#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
2962#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
2963#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
2964    VSTORE(WIDTH)                                                \
2965    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
2966#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
2967
2968
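// T_LOAD: fill the HEIGHT rows of tile dst with WIDTH-element vectors taken from TENSOR, row i
// being read at Y + i * YI_MULTIPLIER.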
2969#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
2970    ({                                                                                                                 \
2971        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
2972        {                                                                                                              \
2973            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
2974        })                                                                                                             \
2975    })
2976
2977
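// T_LOAD_INDIRECT: as T_LOAD, but the row index of each tile entry comes from the indirect_y tile.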
2978#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
2979    ({                                                                                                  \
2980        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
2981        {                                                                                               \
2982            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
2983        })                                                                                              \
2984    })
2985
2986
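// T_LOAD_INDIRECT_WIDTH_SELECT: indirect load that switches to partial vector loads of WIDTH1
// elements (out of WIDTH0) when WIDTH1_CONDITION holds, e.g. for the leftover columns of a tensor;
// tile rows are filled from the last entry backwards.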
2987#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
2988    ({                                                                                                                                                                                             \
2989        if(WIDTH1_CONDITION)                                                                                                                                                                       \
2990        {                                                                                                                                                                                          \
2991            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
2992            {                                                                                                                                                                                      \
2993                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
2994                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
2995            })                                                                                                                                                                                     \
2996        }                                                                                                                                                                                          \
2997        else                                                                                                                                                                                       \
2998        {                                                                                                                                                                                          \
2999            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
3000            {                                                                                                                                                                                      \
3001                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
3002            })                                                                                                                                                                                     \
3003        }                                                                                                                                                                                          \
3004    })
3005
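// T_LOAD_NHWC: load a TILE_HEIGHT x TILE_WIDTH spatial block of TILE_CHANNELS channels from an
// NHWC tensor at batch B, spatial origin (X, Y) and channel offset C. Elements whose spatial
// coordinates fall outside the tensor are skipped, so dst keeps whatever it was initialised to.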
3006#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
3007    ({                                                                                                                                                \
3008        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
3009        {                                                                                                                                             \
3010            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
3011            {                                                                                                                                         \
3012                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
3013                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
3014                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
3015                if(_src_valid_y != 0)                                                                                                                 \
3016                {                                                                                                                                     \
3017                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
3018                }                                                                                                                                     \
3019            })                                                                                                                                        \
3020        })                                                                                                                                            \
3021    })
3022
3023
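// As T_LOAD_NHWC, but with per-axis dilation factors; when BOUNDARY_CHECK is false the bounds test
// is skipped and every element is loaded unconditionally.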
3024#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)         \
3025    ({ \
3026        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
3027        { \
3028            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
3029            { \
3030                int _src_y = (X) + _xk * (DILATION_X); \
3031                int _src_z = ((Y) + _yk * (DILATION_Y)); \
3032                int _src_w    = (B); \
3033                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
3034                if(!(BOUNDARY_CHECK)) \
3035                { \
3036                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
3037                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
3038                } \
3039                else \
3040                { \
3041                    if(_src_valid_y) \
3042                    { \
3043                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
3044                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
3045                    }                                                                                                                                                                                                 \
3046                } \
3047            })                                                                                                                                                                                                             \
3048        })                                                                                                                                                                                                             \
3049    })
3050
3051
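// T_LOAD_NHWC_INDIRECT: NHWC load where the spatial offset of each of the TILE_AREA entries comes
// from the xi/yi tiles instead of a dense TILE_WIDTH x TILE_HEIGHT pattern.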
3052#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
3053    ({                                                                                                                                                                \
3054        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
3055        {                                                                                                                                                             \
3056            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
3057            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
3058            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
3059            if(_src_valid_y != 0)                                                                                                                                     \
3060            {                                                                                                                                                         \
3061                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
3062            }                                                                                                                                                         \
3063        })                                                                                                                                                            \
3064    })
3065
3066
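// T_LOAD2D_INDIRECT: load TILE_AREA rows whose row indices come from yi[0].s[]; the BUFFER variant
// skips negative indices (out-of-bounds rows), the IMAGE variant loads unconditionally.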
3067#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
3068#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
3069#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
3070    ({ \
3071        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
3072        { \
3073            if(yi[0].s[_i] >= 0) \
3074            { \
3075                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
3076            } \
3077        }) \
3078    })
3079
3080#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
3081    ({ \
3082        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
3083        { \
3084            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
3085        }) \
3086    })
3087
3088
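// T_LOAD_NDHWC_INDIRECT: 3D (NDHWC) counterpart of T_LOAD_NHWC_INDIRECT with an extra depth offset
// tile zi and depth bound TENSOR_DEPTH.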
3089#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
3090    ({                                                                                                                                                                \
3091        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
3092        {                                                                                                                                                             \
3093            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                      \
3094            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
3095            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
3096                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
3097            if(_src_valid_y != 0)                                                                                                                                     \
3098            {                                                                                                                                                         \
3099                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
3100            }                                                                                                                                                         \
3101        })                                                                                                                                                            \
3102    })
3103
3104
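// T_STORE_INDIRECT_WIDTH_SELECT: store the tile rows to TENSOR through the indirect_y indices,
// converting to DATA_TYPE; partial stores of WIDTH1 elements are used when WIDTH1_CONDITION holds.
// Rows are written from the last tile entry backwards.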
3105#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
3106    ({                                                                                                                                                                                             \
3107        if(WIDTH1_CONDITION)                                                                                                                                                                       \
3108        {                                                                                                                                                                                          \
3109            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
3110            {                                                                                                                                                                                      \
3111                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
3112                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
3113            })                                                                                                                                                                                     \
3114        }                                                                                                                                                                                          \
3115        else                                                                                                                                                                                       \
3116        {                                                                                                                                                                                          \
3117            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
3118            {                                                                                                                                                                                      \
3119                VSTORE(WIDTH0)                                                                                                                                                                     \
3120                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
3121            })                                                                                                                                                                                     \
3122        }                                                                                                                                                                                          \
3123    })
3124
3125
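// Quantized offset correction: for each accumulator dst[_m0].s[_n0], add
// WEI_OFFSET * sum_k(lhs[_m0].s[k]) and SRC_OFFSET * sum_k(rhs[_n0].s[k]).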
3126#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
3127    ({                                                                                               \
3128        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
3129        {                                                                                            \
3130            ACC_DATA_TYPE _tm = 0;                                                                   \
3131            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
3132            {                                                                                        \
3133                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
3134            })                                                                                       \
3135            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
3136            {                                                                                        \
3137                dst[_m0].s[_n0] += _tm;                                                              \
3138                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
3139                {                                                                                    \
3140                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
3141                })                                                                                   \
3142            })                                                                                       \
3143        })                                                                                          \
3144    })
3145
3146
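// Requantize a tile of 32-bit accumulators to 8 bits. QUANTIZATION_TYPE is pasted onto the macro
// name and selects one of the implementations below (e.g. PER_TENSOR or PER_CHANNEL).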
3147#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
3148#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
3149
3150
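// Per-tensor requantization: gemmlowp-style rounding-doubling high multiply by DST_MULTIPLIER
// (treated as a Q31 fixed-point value), followed by a rounding right shift by DST_SHIFT, the
// addition of DST_OFFSET and a saturating conversion to DST_DATA_TYPE. A negative DST_SHIFT is
// applied as a left shift on the input before the multiply.
// Illustrative example (hypothetical values): with DST_MULTIPLIER = 1 << 30 (i.e. 0.5 in Q31),
// DST_SHIFT = 1 and DST_OFFSET = 0, an accumulator value of 100 becomes 50 after the high
// multiply and 25 after the rounding shift.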
3151#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
3152    ({ \
3153        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
3154        { \
3155            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
3156            { \
3157                SRC_DATA_TYPE _tmp = 0; \
3158                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
3159                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
3160                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
3161                long a_64 = (long)(_src); \
3162                long b_64 = (long)(DST_MULTIPLIER); \
3163                long ab_64 = a_64 * b_64; \
3164                long mask1 = 1 << 30; \
3165                long mask2 = 1 - (1 << 30); \
3166                long is_positive_or_zero = ab_64 >= 0; \
3167                long nudge = select(mask2, mask1, is_positive_or_zero); \
3168                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
3169                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
3170                if(DST_SHIFT >= 0) \
3171                { \
3172                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
3173                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
3174                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
3175                } \
3176                _tmp += DST_OFFSET; \
3177                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
3178            }) \
3179        }) \
3180    })
3181
3182
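// Per-channel variant: same arithmetic as the per-tensor macro, but the multiplier and shift are
// read per output channel from dst_multipliers / dst_shifts. The branch-free rounding uses
// any(_tmp), which for a scalar signed integer returns 1 when the sign bit is set, and select()
// so that the rounded shift is only applied when the per-channel shift is >= 0.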
3183#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
3184    ({ \
3185        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
3186        { \
3187            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
3188            { \
3189                SRC_DATA_TYPE _tmp = 0; \
3190                SRC_DATA_TYPE _tmp2 = 0; \
3191                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
3192                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
3193                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
3194                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
3195                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
3196                long a_64 = (long)(_src); \
3197                long b_64 = (long)(_dst_multiplier); \
3198                long ab_64 = a_64 * b_64; \
3199                long mask1 = 1 << 30; \
3200                long mask2 = 1 - (1 << 30); \
3201                long is_positive_or_zero = ab_64 >= 0; \
3202                long nudge = select(mask2, mask1, is_positive_or_zero); \
3203                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
3204                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
3205                long mask = ((((int)1) << _dst_shift) - (int)1); \
3206                long threshold = (mask >> 1) + any(_tmp); \
3207                _tmp2 = _tmp >> _dst_shift; \
3208                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
3209                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
3210                _tmp += DST_OFFSET; \
3211                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
3212            }) \
3213        }) \
3214    })
3215
3216
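// Asymmetric variant: same per-tensor arithmetic as above, exposed without the per-channel
// multiplier/shift arrays in the parameter list.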
3217#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
3218    ({ \
3219        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
3220        { \
3221            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
3222            { \
3223                SRC_DATA_TYPE _tmp = 0; \
3224                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
3225                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
3226                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
3227                long a_64 = (long)(_src); \
3228                long b_64 = (long)(DST_MULTIPLIER); \
3229                long ab_64 = a_64 * b_64; \
3230                long mask1 = 1 << 30; \
3231                long mask2 = 1 - (1 << 30); \
3232                long is_positive_or_zero = ab_64 >= 0; \
3233                long nudge = select(mask2, mask1, is_positive_or_zero); \
3234                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
3235                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
3236                if(DST_SHIFT >= 0) \
3237                { \
3238                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
3239                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
3240                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
3241                } \
3242                _tmp += DST_OFFSET; \
3243                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
3244            }) \
3245        }) \
3246    })
3247
3248
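// Set every element of row _m0 of tile a to VALUE_TO_SET where the row's mask value
// (mask[_m0].v) equals 0.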
3249#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
3250    ({                                                                                                                                                     \
3251        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
3252        {                                                                                                                                                  \
3253            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
3254            {                                                                                                                                              \
3255                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
3256            })                                                                                                                                             \
3257        })                                                                                                                                                 \
3258    })
3259
3260
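// Apply the activation selected by ACTIVATION_TYPE element-wise to each of the M0 rows of src.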
3261#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
3262    ({                                                                                         \
3263        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
3264        {                                                                                      \
3265            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
3266        })                                                                                     \
3267    })
3268
3269
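// Activation helpers for quantized data: ZERO_VALUE / A_VAL / B_VAL are expected to be given in
// the quantized domain, and hard_swish scales by 1/6 (0.166666667f). ACT_OP_QUANTIZED and
// ACTIVATION_QUANTIZED dispatch on the op token, and T_ACTIVATION_QUANTIZED below applies the
// selected op to every row of a tile.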
3270#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
3271
3272#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
3273
3274#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
3275
3276#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
3277
3278#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)
3279
3280#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
3281#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
3282
3283#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
3284#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
3285#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
3286#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
3287
3288
3289#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
3290    ({ \
3291        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
3292        { \
3293            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
3294        }) \
3295    })
3296
3297
3298#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
3299    ({                                                            \
3300        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
3301        {                                                         \
3302            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
3303        })                                                        \
3304    })
3305
3306
3307#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
3308    ({                                                            \
3309        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
3310        {                                                         \
3311            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
3312        })                                                        \
3313    })
3314
3315#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3316#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3317#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3318
3319#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3320#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3321
3322#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3323
3324#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3325#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3326
3327
3328#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
3329    ({                                                            \
3330        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
3331        {                                                         \
3332            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
3333        })                                                        \
3334    })
3335
3336
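// Broadcast element-wise helpers: T_ELTWISE_BROADCAST_X broadcasts the single row rhs[0] across
// all M0 rows of lhs, while T_ELTWISE_BROADCAST_LHS_X broadcasts lhs[0] across the M0 rows of rhs.
// Both convert the operands to DST_DATA_TYPE before applying T_ELWISE_OP.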
3337#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3338    ({                                                      \
3339        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3340        {                                                   \
3341            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3342        })                                                  \
3343    })
3344
3345
3346#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3347    ({                                                      \
3348        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3349        {                                                   \
3350            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3351        })                                                  \
3352    })
3353
3354#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3355#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3356#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3357#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3358
3359
3360#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3361    ({                                                      \
3362        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3363        {                                                   \
3364            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3365        })                                                  \
3366    })
3367
3368
3369#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
3370    ({                                                      \
3371        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3372        {                                                   \
3373            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3374        })                                                  \
3375    })
3376
3377
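// Tile matrix multiply, dispatched on the LHS/RHS layouts and data types. The NT_T variants here
// (non-transposed LHS times transposed RHS) accumulate with fma() for float/half and with
// DOT_PRODUCT_INTEGER8 for 8-bit integer types.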
3378#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3379#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3380#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3381#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3382#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3383#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3384#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3385#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3386#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
3387    {                                                                                     \
3388        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
3389        {                                                                                 \
3390            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
3391            {                                                                             \
3392                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
3393                {                                                                         \
3394                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
3395                })                                                                        \
3396            })                                                                            \
3397        })                                                                                \
3398    }
3399
3400#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
3401    ({ \
3402        LOOP_UNROLLING(int, _m, 0, 1, M0, \
3403        { \
3404            LOOP_UNROLLING(int, _n, 0, 1, N0, \
3405            { \
3406                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
3407            })                                                                                             \
3408        })                                                                                             \
3409    })
3410
3411#endif
3412
3413
3414#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
3415
3416
3417
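// Depthwise convolution, NHWC layout, floating point. Each work-item computes an M0 x N0 tile of
// the output: M0 consecutive output columns along the width and N0 consecutive output channels,
// where output channel cout reads input channel cout / DEPTH_MULTIPLIER. Geometry (WEI_WIDTH,
// WEI_HEIGHT, strides, dilation, padding, ...) is provided as compile-time definitions.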
3418__kernel void dwc_native_fp_nhwc(
3419    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
3420    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
3421    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
3422#if defined(HAS_BIAS)
3423    ,
3424    VECTOR_DECLARATION(bia)
3425#endif
3426)
3427{
3428
3429
3430#define _IWEI_WIDTH WEI_WIDTH
3431#define _IWEI_HEIGHT WEI_HEIGHT
3432#define _IM0_A M0_A
3433#define _IN0_A N0_A
3434#define _IM0_B _IWEI_WIDTH
3435#define _IN0_B N0
3436#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
3437
3438    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0);
3439    const int xo   = GET_SPATIAL_IDX(1, M0, 0);
3440#if defined(BATCHED_EXECUTION)
3441    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % dst_h;
3442    const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h;
3443#else
3444    const int yo   = GET_SPATIAL_IDX(2, 1, 0);
3445    const int bout = 0;
3446#endif
3447
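    // Map the output coordinates to the top-left input coordinate of the receptive field,
    // including the padding offset.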
3448    int xi = xo * STRIDE_X;
3449    int yi = yo * STRIDE_Y;
3450    xi -= PAD_LEFT;
3451    yi -= PAD_TOP;
3452
3453    TILE(ACC_DATA_TYPE, M0, N0, c);
3454
3455
3456    LOOP_UNROLLING(int, i, 0, 1, M0,
3457    {
3458        c[i].v = 0;
3459    })
3460
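    // Accumulate over the filter rows. For small filters (_IWEI_HEIGHT < 5) the row loop is fully
    // unrolled; otherwise a runtime loop is used. Each iteration loads one dilated input row and
    // the matching weight row, then multiply-accumulates across the filter width (a plain
    // multiply-add is used instead of fma() on Midgard GPUs).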
3461#if _IWEI_HEIGHT < 5
3462    LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
3463#else
3464    for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
3465#endif
3466    {
3467        TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
3468
3469        LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
3470        {
3471            a[i].v = 0;
3472        })
3473
3474
3475        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), SRC_WIDTH, SRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
3476
3477        TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
3478
3479
3480        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
3481
3482
3483
3484        LOOP_UNROLLING(int, m0, 0, 1, M0,
3485        {
3486            LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
3487            {
3488#if GPU_ARCH == GPU_ARCH_MIDGARD
3489                c[m0].v += a[xk + m0].v * b[xk].v;
3490#else
3491                c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
3492#endif
3493            })
3494        })
3495    }
3496#if _IWEI_HEIGHT < 5
3497                      )
3498#endif
3499
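    // Optionally add the per-channel bias (broadcast across the M0 rows), then apply the activation.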
3500#if defined(HAS_BIAS)
3501    TILE(BIA_DATA_TYPE, 1, N0, bias0);
3502
3503    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
3504
3505
3506    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
3507#endif
3508
3509    T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
3510
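    // Store the tile. The first work-item along the channel dimension (global id 0) handles the
    // leftover PARTIAL_N0 channels with VSTORE_PARTIAL; all other work-items store full N0-wide
    // vectors. Output columns past DST_WIDTH are clamped to the last valid column (xi_out).
    // dst_indirect_y is declared for the indirect-store path but is not used by the stores below.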
3511    TILE(uint, M0, 1, dst_indirect_y);
3512
3513    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
3514
3515    if(x_cond)
3516    {
3517        LOOP_UNROLLING(int, m0, 0, 1, M0,
3518        {
3519            int xi_out = min(xo + M0 - 1 - m0, (int)(DST_WIDTH) - 1);
3520            VSTORE_PARTIAL(N0, PARTIAL_N0)
3521            (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
3522        })
3523    }
3524    else
3525    {
3526        LOOP_UNROLLING(int, m0, 0, 1, M0,
3527        {
3528            int xi_out = min(xo + M0 - 1 - m0, (int)(DST_WIDTH) - 1);
3529            VSTORE(N0)
3530            (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
3531        })
3532    }
3533}
3534#endif
3535
3536 )"