xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/nhwc/direct_convolution.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2
3
4
5
// Include guard for the embedded OpenCL helper header.
6#ifndef ARM_COMPUTE_HELPER_H
7#define ARM_COMPUTE_HELPER_H
8
9
10
11
// STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z):
// Store n row-vectors BASENAME##0 .. BASENAME##(n-1), each N0 elements of
// DATA_TYPE, to global memory. Row r is written at PTR + r * STRIDE_Y + Z##r
// (Z##r is a per-row offset variable produced by token pasting). Each macro
// expands the (n-1) variant first, then emits one more VSTORE — so STORE_ROW_16
// unrolls into 16 stores. VSTORE(N0) resolves to the OpenCL vstore<N0> builtin.
12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
13    VSTORE(N0)                                                 \
14    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
15
16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
17    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
18    VSTORE(N0)                                                 \
19    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
20
21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
22    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
23    VSTORE(N0)                                                 \
24    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
25
26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
27    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
28    VSTORE(N0)                                                 \
29    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
30
31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
32    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
33    VSTORE(N0)                                                 \
34    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
35
36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
37    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
38    VSTORE(N0)                                                 \
39    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
40
41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
42    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
43    VSTORE(N0)                                                 \
44    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
45
46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
47    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
48    VSTORE(N0)                                                 \
49    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
50
51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
52    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
53    VSTORE(N0)                                                 \
54    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
55
56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
57    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
58    VSTORE(N0)                                                  \
59    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
60
// Rows 10..15 use hexadecimal row suffixes A..F (BASENAME##A .. BASENAME##F).
61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
62    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
63    VSTORE(N0)                                                  \
64    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
65
66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
67    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
68    VSTORE(N0)                                                  \
69    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
70
71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
72    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
73    VSTORE(N0)                                                  \
74    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
75
76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
77    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
78    VSTORE(N0)                                                  \
79    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
80
81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
82    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
83    VSTORE(N0)                                                  \
84    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
85
86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
87    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
88    VSTORE(N0)                                                  \
89    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
91
92
// CONVERT_STORE_ROW_n: like STORE_ROW_n, but each row is first passed through
// CONVERT_SAT (saturating conversion) to a VEC_DATA_TYPE(DATA_TYPE, N0) vector
// before being vstored — used when the accumulator type is wider than the
// destination DATA_TYPE (e.g. int accumulators stored as uchar).
93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
94    VSTORE(N0)                                                         \
95    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
96
97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
98    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
99    VSTORE(N0)                                                         \
100    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
101
102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
103    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
104    VSTORE(N0)                                                         \
105    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
106
107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
108    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
109    VSTORE(N0)                                                         \
110    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
111
112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
113    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
114    VSTORE(N0)                                                         \
115    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
116
117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
118    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
119    VSTORE(N0)                                                         \
120    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
121
122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
123    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
124    VSTORE(N0)                                                         \
125    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
126
127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
128    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
129    VSTORE(N0)                                                         \
130    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
131
132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
133    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
134    VSTORE(N0)                                                         \
135    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
// CONVERT_STORE_ROW_10: saturating-convert and store rows 0..9 (see
// CONVERT_STORE_ROW_1 for the common contract).
// BUGFIX: the second parameter was named DATA while the body references
// DATA_TYPE, so the caller-supplied type argument was silently ignored and the
// macro only worked when a file-level DATA_TYPE macro (e.g. a -DDATA_TYPE build
// option) happened to be defined. Renamed the parameter to DATA_TYPE — same
// arity and call syntax, now consistent with every sibling CONVERT_STORE_ROW_n.
137#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
138    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
139    VSTORE(N0)                                                          \
140    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141
// CONVERT_STORE_ROW_11..16: continue the recursion with hexadecimal row
// suffixes A..F, mirroring STORE_ROW_11..16.
142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
143    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
144    VSTORE(N0)                                                          \
145    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
146
147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
148    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
149    VSTORE(N0)                                                          \
150    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
151
152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
153    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
154    VSTORE(N0)                                                          \
155    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156
157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
158    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
159    VSTORE(N0)                                                          \
160    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
161
162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
163    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
164    VSTORE(N0)                                                          \
165    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
166
167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
168    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
169    VSTORE(N0)                                                          \
170    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171
172
173
174
// STORE_BLOCK(M0, ...) dispatches to STORE_ROW_<M0> via token pasting. The
// two-level _STR indirection forces M0 to be macro-expanded before pasting,
// so callers may pass M0 as a macro rather than a literal.
175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177
178
179
// Same dispatch for the saturating-convert store variants.
180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182
183
184
// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but each row writes only the first
// STORE_N0 (<= N0) elements of an N0-wide vector via VSTORE_PARTIAL — used for
// leftover columns at tensor boundaries.
185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
186    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
187    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
188
189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
190    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
191    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
192    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
193
194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
195    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
196    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
197    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
198
199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
200    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
201    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
202    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
203
204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
205    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
206    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
207    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
208
209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
210    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
211    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
212    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
213
214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
215    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
216    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
217    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
218
219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
220    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
221    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
222    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
223
224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
225    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
226    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
227    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
228
229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
230    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
231    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
232    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
233
// Rows 10..15 use hexadecimal suffixes A..F, as in STORE_ROW_11..16.
234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
235    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
236    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
237    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
238
239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
240    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
241    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
242    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
243
244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
245    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
246    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
247    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
248
249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
250    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
251    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
252    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
253
254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
255    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
256    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
257    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
258
259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
260    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
261    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
262    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263
264
265
// STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...): store STORE_M0 rows of
// STORE_N0 (<= N0) elements each, dispatched to STORE_ROW_PARTIAL_<STORE_M0>
// via the usual expand-then-paste _STR indirection.
266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268
// Runtime boundary handling in both dimensions: PARTIAL_COND_Y/PARTIAL_COND_X
// select (at run time) whether this work-item is on the bottom/right edge, and
// the matching full/partial row and column counts are used for the store.
269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
270    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
271    {                                                                                                                                                     \
272        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
273    }                                                                                                                                                     \
274    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
275    {                                                                                                                                                     \
276        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
277    }                                                                                                                                                     \
278    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
279    {                                                                                                                                                     \
280        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
281    }                                                                                                                                                     \
282    else                                                                                                                                                  \
283    {                                                                                                                                                     \
284        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
285    }
286
// Boundary handling in X only (leftover columns).
287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
288    if(!(PARTIAL_COND_X))                                                                                         \
289    {                                                                                                             \
290        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
291    }                                                                                                             \
292    else                                                                                                          \
293    {                                                                                                             \
294        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
295    }
296
// Boundary handling in Y only (leftover rows).
297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
298    if(!(PARTIAL_COND_Y))                                                                                         \
299    {                                                                                                             \
300        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
301    }                                                                                                             \
302    else                                                                                                          \
303    {                                                                                                             \
304        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
305    }
306
307
// STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store.
// Based on the build-time leftover sizes PARTIAL_STORE_M0 / PARTIAL_STORE_N0
// (0 means the dimension divides evenly), it compiles down to a full store,
// a Y-only / X-only partial store, or the fully guarded X-and-Y version.
308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
309
310
311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
312
// No leftovers in either dimension: plain full-block store, no branches.
313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
314    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
315
316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
317
// Leftover rows only.
318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
319    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
320
321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
322
// Leftover columns only.
323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
324    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
325
326#else
327
// Leftovers in both dimensions.
328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
329    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
330
331#endif
332
333#endif
334
335
// COMPUTE_M0_START_ROW: first output row for work-item y. When there are
// leftover rows (PARTIAL_STORE_M0 defined), the start row is shifted back by
// (M0 - PARTIAL_STORE_M0) % M0 and clamped at 0, so the last block overlaps
// its predecessor instead of writing out of bounds; otherwise it is simply
// y * M0.
336#if defined(PARTIAL_STORE_M0)
337
338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
339    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
340#else
341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
342    ((uint)(y * M0))
343#endif
344
345
346
// STORE_VECTOR_SELECT: store a single vector, writing only `leftover` elements
// when `cond` is true. Implemented as a one-row STORE_BLOCK_PARTIAL_IN_X with
// STRIDE_Y = 0 and the Z argument set to the literal 0 (Z##0 pastes to `00`).
347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
348    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349
350
// Enable optional OpenCL extensions only when both the build flag and the
// device-side feature macro are present.
351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
352#pragma OPENCL EXTENSION cl_khr_fp16 : enable
353#endif
354
355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
357#endif
358
359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
361#endif
362
363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
364#pragma OPENCL EXTENSION cl_arm_printf : enable
365#endif
366
// Mali GPU architecture identifiers (compared against a GPU_ARCH build option).
367#define GPU_ARCH_MIDGARD 0x100
368#define GPU_ARCH_BIFROST 0x200
369#define GPU_ARCH_VALHALL 0x300
370
371
// Token concatenation helper.
372#define CONCAT(a, b) a##b
373
374
// Identity macro; forces an extra expansion pass on its argument.
375#define EXPAND(x) x
376
377
// Clamp x into [min_val, max_val] using the built-in min/max.
378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
379
380
// REVn: reverse the components of an n-element vector via swizzles.
381#define REV1(x) ((x))
382#define REV2(x) ((x).s10)
383#define REV3(x) ((x).s210)
384#define REV4(x) ((x).s3210)
385#define REV8(x) ((x).s76543210)
386#define REV16(x) ((x).sFEDCBA9876543210)
387
388
389
// REVERSE(x, s): reverse an s-element vector; _STR indirection expands s first.
390#define REVERSE_STR(x, s) REV##s((x))
391#define REVERSE(x, s) REVERSE_STR(x, s)
392
393
394
// ROT<s>_<n>: rotate an s-element vector right by n positions via swizzles.
// n == 0 and n == s are identity. ROTATE(x, s, n) dispatches after expanding
// both s and n.
395#define ROT1_0(x) ((x))
396#define ROT1_1(x) ((x))
397
398#define ROT2_0(x) ((x))
399#define ROT2_1(x) ((x).s10)
400#define ROT2_2(x) ((x))
401
402#define ROT3_0(x) ((x))
403#define ROT3_1(x) ((x).s201)
404#define ROT3_2(x) ((x).s120)
405#define ROT3_3(x) ((x))
406
407#define ROT4_0(x) ((x))
408#define ROT4_1(x) ((x).s3012)
409#define ROT4_2(x) ((x).s2301)
410#define ROT4_3(x) ((x).s1230)
411#define ROT4_4(x) ((x))
412
413#define ROT8_0(x) ((x))
414#define ROT8_1(x) ((x).s70123456)
415#define ROT8_2(x) ((x).s67012345)
416#define ROT8_3(x) ((x).s56701234)
417#define ROT8_4(x) ((x).s45670123)
418#define ROT8_5(x) ((x).s34567012)
419#define ROT8_6(x) ((x).s23456701)
420#define ROT8_7(x) ((x).s12345670)
421#define ROT8_8(x) ((x))
422
423#define ROT16_0(x) ((x))
424#define ROT16_1(x) ((x).sF0123456789ABCDE)
425#define ROT16_2(x) ((x).sEF0123456789ABCD)
426#define ROT16_3(x) ((x).sDEF0123456789ABC)
427#define ROT16_4(x) ((x).sCDEF0123456789AB)
428#define ROT16_5(x) ((x).sBCDEF0123456789A)
429#define ROT16_6(x) ((x).sABCDEF0123456789)
430#define ROT16_7(x) ((x).s9ABCDEF012345678)
431#define ROT16_8(x) ((x).s89ABCDEF01234567)
432#define ROT16_9(x) ((x).s789ABCDEF0123456)
433#define ROT16_10(x) ((x).s6789ABCDEF012345)
434#define ROT16_11(x) ((x).s56789ABCDEF01234)
435#define ROT16_12(x) ((x).s456789ABCDEF0123)
436#define ROT16_13(x) ((x).s3456789ABCDEF012)
437#define ROT16_14(x) ((x).s23456789ABCDEF01)
438#define ROT16_15(x) ((x).s123456789ABCDEF0)
439#define ROT16_16(x) ((x))
440
441
442
443#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445
446
447
// V_OFFSn(dt): literal vector (dt##n)(0, 1, ..., n-1) — sequential offsets of
// element type dt. VEC_OFFS(dt, s) dispatches after expanding s.
448#define V_OFFS1(dt) (dt##1)(0)
449#define V_OFFS2(dt) (dt##2)(0, 1)
450#define V_OFFS3(dt) (dt##3)(0, 1, 2)
451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
454
455
456
457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
459
460
// VLOAD(size) resolves to the OpenCL vload<size> builtin.
461#define VLOAD_STR(size) vload##size
462#define VLOAD(size) VLOAD_STR(size)
463
464
// VLOAD_PARTIAL(size, load_size): load only load_size elements into a
// size-wide vector, via the vload_partial_<size>_<load_size> dispatch table.
465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
467
// No-op placeholder for invalid (load_size > size) or zero-length loads;
// expands to an empty block so the statement compiles away.
468#define NO_LOAD(data, offs, ptr) \
469    {                            \
470    }
471
472
// Dispatch tables vload_partial_<size>_<load_size>: map each (vector size,
// elements to load) pair to the matching vload_partial_<load_size>
// implementation below; combinations with load_size > size (or == 0) are
// invalid and collapse to NO_LOAD.
473#define vload_partial_1_0 NO_LOAD
474#define vload_partial_1_1 vload1
475#define vload_partial_1_2 NO_LOAD
476#define vload_partial_1_3 NO_LOAD
477#define vload_partial_1_4 NO_LOAD
478#define vload_partial_1_5 NO_LOAD
479#define vload_partial_1_6 NO_LOAD
480#define vload_partial_1_7 NO_LOAD
481#define vload_partial_1_8 NO_LOAD
482#define vload_partial_1_9 NO_LOAD
483#define vload_partial_1_10 NO_LOAD
484#define vload_partial_1_11 NO_LOAD
485#define vload_partial_1_12 NO_LOAD
486#define vload_partial_1_13 NO_LOAD
487#define vload_partial_1_14 NO_LOAD
488#define vload_partial_1_15 NO_LOAD
489#define vload_partial_1_16 NO_LOAD
490
491#define vload_partial_2_0 NO_LOAD
492#define vload_partial_2_1 vload_partial_1
493#define vload_partial_2_2 vload_partial_2
494#define vload_partial_2_3 NO_LOAD
495#define vload_partial_2_4 NO_LOAD
496#define vload_partial_2_5 NO_LOAD
497#define vload_partial_2_6 NO_LOAD
498#define vload_partial_2_7 NO_LOAD
499#define vload_partial_2_8 NO_LOAD
500#define vload_partial_2_9 NO_LOAD
501#define vload_partial_2_10 NO_LOAD
502#define vload_partial_2_11 NO_LOAD
503#define vload_partial_2_12 NO_LOAD
504#define vload_partial_2_13 NO_LOAD
505#define vload_partial_2_14 NO_LOAD
506#define vload_partial_2_15 NO_LOAD
507#define vload_partial_2_16 NO_LOAD
508
509#define vload_partial_3_0 NO_LOAD
510#define vload_partial_3_1 vload_partial_1
511#define vload_partial_3_2 vload_partial_2
512#define vload_partial_3_3 vload_partial_3
513#define vload_partial_3_4 NO_LOAD
514#define vload_partial_3_5 NO_LOAD
515#define vload_partial_3_6 NO_LOAD
516#define vload_partial_3_7 NO_LOAD
517#define vload_partial_3_8 NO_LOAD
518#define vload_partial_3_9 NO_LOAD
519#define vload_partial_3_10 NO_LOAD
520#define vload_partial_3_11 NO_LOAD
521#define vload_partial_3_12 NO_LOAD
522#define vload_partial_3_13 NO_LOAD
523#define vload_partial_3_14 NO_LOAD
524#define vload_partial_3_15 NO_LOAD
525#define vload_partial_3_16 NO_LOAD
526
527#define vload_partial_4_0 NO_LOAD
528#define vload_partial_4_1 vload_partial_1
529#define vload_partial_4_2 vload_partial_2
530#define vload_partial_4_3 vload_partial_3
531#define vload_partial_4_4 vload_partial_4
532#define vload_partial_4_5 NO_LOAD
533#define vload_partial_4_6 NO_LOAD
534#define vload_partial_4_7 NO_LOAD
535#define vload_partial_4_8 NO_LOAD
536#define vload_partial_4_9 NO_LOAD
537#define vload_partial_4_10 NO_LOAD
538#define vload_partial_4_11 NO_LOAD
539#define vload_partial_4_12 NO_LOAD
540#define vload_partial_4_13 NO_LOAD
541#define vload_partial_4_14 NO_LOAD
542#define vload_partial_4_15 NO_LOAD
543#define vload_partial_4_16 NO_LOAD
544
545#define vload_partial_8_0 NO_LOAD
546#define vload_partial_8_1 vload_partial_1
547#define vload_partial_8_2 vload_partial_2
548#define vload_partial_8_3 vload_partial_3
549#define vload_partial_8_4 vload_partial_4
550#define vload_partial_8_5 vload_partial_5
551#define vload_partial_8_6 vload_partial_6
552#define vload_partial_8_7 vload_partial_7
553#define vload_partial_8_8 vload_partial_8
554#define vload_partial_8_9 NO_LOAD
555#define vload_partial_8_10 NO_LOAD
556#define vload_partial_8_11 NO_LOAD
557#define vload_partial_8_12 NO_LOAD
558#define vload_partial_8_13 NO_LOAD
559#define vload_partial_8_14 NO_LOAD
560#define vload_partial_8_15 NO_LOAD
561#define vload_partial_8_16 NO_LOAD
562
563#define vload_partial_16_0 NO_LOAD
564#define vload_partial_16_1 vload_partial_1
565#define vload_partial_16_2 vload_partial_2
566#define vload_partial_16_3 vload_partial_3
567#define vload_partial_16_4 vload_partial_4
568#define vload_partial_16_5 vload_partial_5
569#define vload_partial_16_6 vload_partial_6
570#define vload_partial_16_7 vload_partial_7
571#define vload_partial_16_8 vload_partial_8
572#define vload_partial_16_9 vload_partial_9
573#define vload_partial_16_10 vload_partial_10
574#define vload_partial_16_11 vload_partial_11
575#define vload_partial_16_12 vload_partial_12
576#define vload_partial_16_13 vload_partial_13
577#define vload_partial_16_14 vload_partial_14
578#define vload_partial_16_15 vload_partial_15
579#define vload_partial_16_16 vload_partial_16
580
581
// vload_partial_n(DATA, OFFSET, PTR): load exactly n elements from PTR into
// the low n components of DATA via swizzle assignment. Non-power-of-two sizes
// are composed from a power-of-two load plus a smaller tail load at an
// advanced pointer (so OFFSET applies to each sub-load's base, scaled by that
// sub-load's width).
582#define vload_partial_1(DATA, OFFSET, PTR) \
583    DATA.s0 = vload1(OFFSET, PTR);
584
585#define vload_partial_2(DATA, OFFSET, PTR) \
586    DATA.s01 = vload2(OFFSET, PTR);
587
588#define vload_partial_3(DATA, OFFSET, PTR) \
589    DATA.s012 = vload3(OFFSET, PTR);
590
591#define vload_partial_4(DATA, OFFSET, PTR) \
592    DATA.s0123 = vload4(OFFSET, PTR);
593
594#define vload_partial_5(DATA, OFFSET, PTR)    \
595    vload_partial_4(DATA.s0123, OFFSET, PTR); \
596    DATA.s4 = vload1(OFFSET, PTR + 4);
597
598#define vload_partial_6(DATA, OFFSET, PTR)    \
599    vload_partial_4(DATA.s0123, OFFSET, PTR); \
600    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
601
602#define vload_partial_7(DATA, OFFSET, PTR)    \
603    vload_partial_4(DATA.s0123, OFFSET, PTR); \
604    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
605
606#define vload_partial_8(DATA, OFFSET, PTR) \
607    DATA.s01234567 = vload8(OFFSET, PTR);
608
609#define vload_partial_9(DATA, OFFSET, PTR)        \
610    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
611    DATA.s8 = vload1(OFFSET, PTR + 8);
612
613#define vload_partial_10(DATA, OFFSET, PTR)       \
614    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
615    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
616
617#define vload_partial_11(DATA, OFFSET, PTR)       \
618    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
619    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
620
621#define vload_partial_12(DATA, OFFSET, PTR)       \
622    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
623    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
624
// NOTE(review): 13..15 pass the 8-wide swizzle DATA.s89ABCDEF to the smaller
// tail loaders, which then re-swizzle its low components (e.g. .s0123 of
// .s89ABCDEF == .s89AB) — the element counts work out, but it relies on
// nested-swizzle lvalues; confirm against the project's supported compilers.
625#define vload_partial_13(DATA, OFFSET, PTR)       \
626    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
627    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
628
629#define vload_partial_14(DATA, OFFSET, PTR)       \
630    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
631    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
632
633#define vload_partial_15(DATA, OFFSET, PTR)       \
634    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
635    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
636
637#define vload_partial_16(DATA, OFFSET, PTR) \
638    DATA = vload16(OFFSET, PTR);
639
640
641
// A 2D image texel carries 4 channels, so an n-element vector spans
// PIXEL_UNITn = n / 4 texels (only vector sizes 4, 8 and 16 are supported).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

// Two-level expansion so vec_size is macro-expanded before token pasting.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

// read_image2d_<type>x<n>: read n horizontally adjacent texels starting at
// (x_coord, y_coord) and concatenate them into a single 4*n-wide vector.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

// Half-precision variants only exist when the device exposes cl_khr_fp16.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// write_image2d_<type>x<n>: scatter a 4*n-wide vector across n adjacent texels.
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

// Dispatchers: two-level expansion selects read/write_image2d_<data_type>x<n0>.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678
// VSTORE(size) expands to the vstoreN builtin (two-level so size expands first).
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// <type>1 aliases let VEC_DATA_TYPE(type, 1) degrade gracefully to the scalar type.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar analogues of vloadN/vstoreN so size-1 code paths stay uniform.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
696
697
// VSTORE_PARTIAL(size, store_size) expands to vstore_partial_<size>_<store_size>,
// which stores the low store_size lanes of a size-wide vector. Table entries
// where store_size is 0 or exceeds size map to NO_STORE (an empty block), so
// out-of-range combinations compile away instead of writing memory.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

#define NO_STORE(data, offs, ptr) \
    {                             \
    }

// Dispatch table for vector size 1.
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

// Dispatch table for vector size 2.
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

// Dispatch table for vector size 3.
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

// Dispatch table for vector size 4.
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

// Dispatch table for vector size 8.
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

// Dispatch table for vector size 16.
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
813
814
// -- Partial vector stores ------------------------------------------------
// vstore_partial_N(DATA, OFFSET, PTR) stores exactly N elements from the low
// lanes of DATA; mirrors the vload_partial_N composition (chunk + tail).
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

// 5..7 = 4-element chunk + 1..3-element tail.
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

// 9..15 = 8-element chunk + 1..7-element tail.
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// 13..15 pass the 8-wide tail swizzle; the callee's own swizzle selects the
// valid lanes, so only the first 5/6/7 tail elements are written.
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
872
873
874
875
876
// convert_<type>_sat aliases. OpenCL C forbids the _sat modifier when the
// destination type is floating point, so the float/half "_sat" names are
// aliased to the plain (non-saturating) conversions of the SAME type.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// FIX: was aliased to convert_float, which made CONVERT_SAT(x, half) produce
// a float instead of a half (type mismatch, inconsistent with every other
// half-width alias below). Map it to convert_half.
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// convert_<type>1 degrade to the scalar builtin conversions.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Size-1 saturating conversions map to the scalar _sat builtins. The
// self-referential uchar2..16 lines are deliberate no-ops (the preprocessor
// does not re-expand a macro inside its own replacement), kept so every
// convert_uchar*_sat name is spelled out in one place.
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
918
// VEC_DATA_TYPE(type, size) -> type##size (two-level so args expand first;
// size 1 falls back to the scalar aliases defined above).
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Wrappers around the builtin convert_<type>[_sat[_round]] conversions.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// select() needs an integer mask type whose elements match the data type's
// width, hence half -> short and float -> int.
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// Signed integer type with the same element width as the given type
// (unsigned -> signed of equal width; half/float -> short/int).
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960
// Horizontal reductions over a vector's components, built as binary trees of
// pairwise ops. *_REDUCE(x, size) dispatches on the (literal) vector size.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Product of all components.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// Maximum of all components.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
990
// Kernel-argument declaration helpers: each expands to the standard argument
// pack for an N-dimensional tensor — data pointer, a byte stride and a
// per-work-item byte step for each dimension, and the byte offset of the
// first element. Paired with the CONVERT_TO_*_STRUCT macros below.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
1040
// CONVERT_TO_*_STRUCT builds the corresponding view struct from the argument
// pack produced by *_DECLARATION; the _NO_STEP variants pass 0 steps so the
// data pointer is NOT advanced for the current work-item.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1052
// Build an Image view from a 3D tensor argument pack; z participates only in
// the per-work-item pointer update (Image has no z stride).
// FIX: removed a second, byte-identical definition of
// CONVERT_TENSOR3D_TO_IMAGE_STRUCT that followed the _NO_STEP variant —
// a redundant redefinition.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other *_NO_STEP helpers, this zeroes only the x/y
// steps and still passes step_z — presumably so the z offset is kept; confirm
// against kernel call sites before changing.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1061
// 3D/4D view builders; mod_size (4D) is the size used to split the flattened
// z/w global id — see update_tensor4D_workitem_ptr.
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

// Like CONVERT_TO_TENSOR3D_STRUCT but leaves the pointer at the tensor base
// (no per-work-item advance).
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1079
1080
// Lightweight tensor views: a raw byte pointer plus per-dimension byte
// strides. offset_first_element_in_bytes is the byte offset of element
// (0, ...) from ptr before any per-work-item adjustment.

// 1D buffer view.
typedef struct Vector
{
    __global uchar *ptr;                           // current byte position in the buffer
    int             offset_first_element_in_bytes; // byte offset of the first element
    int             stride_x;                      // bytes between consecutive x elements
} Vector;

// 2D buffer view.
typedef struct Image
{
    __global uchar *ptr;                           // current byte position in the buffer
    int             offset_first_element_in_bytes; // byte offset of the first element
    int             stride_x;                      // bytes between consecutive x elements
    int             stride_y;                      // bytes between consecutive rows
} Image;

// 3D buffer view.
typedef struct Tensor3D
{
    __global uchar *ptr;                           // current byte position in the buffer
    int             offset_first_element_in_bytes; // byte offset of the first element
    int             stride_x;                      // bytes between consecutive x elements
    int             stride_y;                      // bytes between consecutive rows
    int             stride_z;                      // bytes between consecutive 2D slices
} Tensor3D;

// 4D buffer view.
typedef struct Tensor4D
{
    __global uchar *ptr;                           // current byte position in the buffer
    int             offset_first_element_in_bytes; // byte offset of the first element
    int             stride_x;                      // bytes between consecutive x elements
    int             stride_y;                      // bytes between consecutive rows
    int             stride_z;                      // bytes between consecutive 2D slices
    int             stride_w;                      // bytes between consecutive 3D volumes
} Tensor4D;
1117
1118
1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120{
1121    Vector vector =
1122    {
1123        .ptr                           = ptr,
1124        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125        .stride_x                      = stride_x,
1126    };
1127    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128    return vector;
1129}
1130
1131
1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133{
1134    Image img =
1135    {
1136        .ptr                           = ptr,
1137        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138        .stride_x                      = stride_x,
1139        .stride_y                      = stride_y
1140    };
1141    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142    return img;
1143}
1144
1145
1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147{
1148    Image img =
1149    {
1150        .ptr                           = ptr,
1151        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152        .stride_x                      = stride_x,
1153        .stride_y                      = stride_y
1154    };
1155    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156    return img;
1157}
1158
1159
1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161{
1162    Tensor3D tensor =
1163    {
1164        .ptr                           = ptr,
1165        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166        .stride_x                      = stride_x,
1167        .stride_y                      = stride_y,
1168        .stride_z                      = stride_z
1169    };
1170    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171    return tensor;
1172}
1173
1174
1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176{
1177    Tensor3D tensor =
1178    {
1179        .ptr                           = ptr,
1180        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181        .stride_x                      = stride_x,
1182        .stride_y                      = stride_y,
1183        .stride_z                      = stride_z
1184    };
1185    return tensor;
1186}
1187
1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189                                             uint step_w,
1190                                             uint mod_size)
1191{
1192    Tensor4D tensor =
1193    {
1194        .ptr                           = ptr,
1195        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196        .stride_x                      = stride_x,
1197        .stride_y                      = stride_y,
1198        .stride_z                      = stride_z,
1199        .stride_w                      = stride_w
1200    };
1201
1202    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203    return tensor;
1204}
1205
1206
1207inline __global const uchar *vector_offset(const Vector *vec, int x)
1208{
1209    return vec->ptr + x * vec->stride_x;
1210}
1211
1212
1213inline __global uchar *offset(const Image *img, int x, int y)
1214{
1215    return img->ptr + x * img->stride_x + y * img->stride_y;
1216}
1217
1218
1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220{
1221    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222}
1223
1224
1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226{
1227    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228}
1229
1230
1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232{
1233    uint num_elements = width * height;
1234
1235    const uint z = index / num_elements;
1236
1237    index %= num_elements;
1238
1239    const uint y = index / width;
1240
1241    index %= width;
1242
1243    const uint x = index;
1244
1245    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246}
1247
1248#endif
1249
// Multiply-accumulate a + b * c. On Bifrost the fused fma form is used;
// elsewhere the plain expression is left for the compiler to schedule.
#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(a, b, c) (fma(c, b, a))
#else
#define MLA(a, b, c) ((b) * (c) + (a))
#endif
1255
1256
// Element-wise activation macros, all with the uniform signature
// (DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL). A_VAL/B_VAL are the two activation
// parameters; ops that do not need them ignore them. ACTIVATION(op, ...)
// dispatches to <op>_op via two-level expansion.

// Hard swish: x * relu6(x + 3) / 6 (0.166666667 approximates 1/6).
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

// Logistic sigmoid: 1 / (1 + exp(-x)).
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))

// Scaled tanh: A * tanh(B * x).
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))

// ReLU: max(0, x).
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))

// Bounded ReLU: min(A, max(0, x)).
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))

// Lower/upper bounded ReLU: clamp x to [B, A].
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

// Leaky ReLU: x if x > 0 else A * x.
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))

// Soft ReLU: log(1 + exp(x)).
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

// ELU: x if x >= 0 else A * (exp(x) - 1); the select mask type must match
// the element width, hence SELECT_VEC_DATA_TYPE.
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))

// Absolute value.
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

// Square.
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)

// Square root.
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

// Linear: A * x + B (via MLA).
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))

// GELU: x * 0.5 * (1 + erf(x / sqrt(2))); 1.41421356237 is sqrt(2).
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))

// Identity (no activation).
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304
// NOTE(review): ARM_COMPUTE_HELPER_H was already defined at the top of this
// embedded file, so this second copy of the helper header (a by-product of
// flattening #includes into the .clembed resource) is skipped entirely by
// the preprocessor.
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

// STORE_ROW_n stores rows 0..n-1: row i is an N0-wide vstore of BASENAME##i
// at PTR + i * STRIDE_Y + Z##i (Z supplies a per-row byte offset); each macro
// recursively chains the previous one.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Rows 10..15 use hex digit suffixes (A..F) for the row index.
#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379
1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382    VSTORE(N0)                                                  \
1383    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384
1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387    VSTORE(N0)                                                  \
1388    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389
1390
1391
// CONVERT_STORE_ROW_n: same recursive row-store pattern as STORE_ROW_n, but
// each row is first converted to DATA_TYPE with saturation (CONVERT_SAT)
// before being stored with VSTORE(N0).
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435
// CONVERT_STORE_ROW_10: store rows 0..9 of BASENAME, converting each row to
// DATA_TYPE with saturation before the store (same contract as the other
// CONVERT_STORE_ROW_n macros).
// Fix: the second parameter was misspelled "DATA" while the body references
// DATA_TYPE; the macro only worked when a DATA_TYPE macro happened to be
// defined globally (e.g. via a -DDATA_TYPE build option). Renaming the
// parameter to DATA_TYPE makes the caller-supplied type take effect, matching
// CONVERT_STORE_ROW_9/11..16.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440
// CONVERT_STORE_ROW_11..16 continue the saturating-convert store chain;
// rows 10..15 use hex digit suffixes A..F on BASENAME and Z.
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



// STORE_BLOCK(M0, ...): store an M0-row block by dispatching to STORE_ROW_M0.
// The _STR indirection forces macro expansion of M0 before token pasting.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



// CONVERT_STORE_BLOCK: as STORE_BLOCK, but through the saturating-convert
// CONVERT_STORE_ROW_M0 chain.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481
1482
1483
// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but each row stores only the first
// STORE_N0 of the N0 vector lanes via VSTORE_PARTIAL(N0, STORE_N0) — used for
// leftover columns at the right edge of a tensor.
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



// STORE_BLOCK_PARTIAL: store STORE_M0 rows of STORE_N0 elements each, from a
// block of N0-wide vectors. The _STR indirection expands STORE_M0 before
// token pasting.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567
// Boundary-aware stores: pick full or partial row/column counts at run time.
// PARTIAL_COND_Y / PARTIAL_COND_X are boolean expressions that are true when
// the current tile is the partial one along that axis; the matching
// PARTIAL_STORE_M0 / PARTIAL_STORE_N0 count is used in that case.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }

// Partial only along X: store M0 rows, with either N0 or PARTIAL_STORE_N0
// elements per row depending on PARTIAL_COND_X.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

// Partial only along Y: store either M0 or PARTIAL_STORE_M0 full rows
// depending on PARTIAL_COND_Y.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
1605
1606
// STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store
// variant based on the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 build options.
// A value of 0 means "no leftover along that axis", so the runtime branch
// for that axis can be dropped entirely.
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)


// No leftover in either axis: plain full-block store, no branches.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Leftover rows only (Y axis).
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

// Leftover columns only (X axis).
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

// Leftover in both axes.
#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif


// COMPUTE_M0_START_ROW: starting row for work-item y processing M0 rows.
// When PARTIAL_STORE_M0 is defined, the last block is shifted up by
// (M0 - PARTIAL_STORE_M0) % M0 (clamped at 0) so all stores stay in bounds.
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif


// STORE_VECTOR_SELECT: store a single row of vec_size elements, storing only
// `leftover` elements when `cond` is true (X-leftover handling for vectors).
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648
1649
// Enable optional OpenCL extensions only when both the build option requests
// them and the device reports support (the cl_* feature macro is defined).
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

// GPU architecture identifiers (compared against a GPU_ARCH build option).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300


// Token pasting helper.
#define CONCAT(a, b) a##b


// Identity macro, used to force an extra expansion pass.
#define EXPAND(x) x


// Clamp x into [min_val, max_val].
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)


// REVn: reverse the component order of an n-component vector via swizzles.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)



// REVERSE(x, s): reverse an s-component vector; _STR expands s first.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
1691
1692
1693
1694#define ROT1_0(x) ((x))
1695#define ROT1_1(x) ((x))
1696
1697#define ROT2_0(x) ((x))
1698#define ROT2_1(x) ((x).s10)
1699#define ROT2_2(x) ((x))
1700
1701#define ROT3_0(x) ((x))
1702#define ROT3_1(x) ((x).s201)
1703#define ROT3_2(x) ((x).s120)
1704#define ROT3_3(x) ((x))
1705
1706#define ROT4_0(x) ((x))
1707#define ROT4_1(x) ((x).s3012)
1708#define ROT4_2(x) ((x).s2301)
1709#define ROT4_3(x) ((x).s1230)
1710#define ROT4_4(x) ((x))
1711
1712#define ROT8_0(x) ((x))
1713#define ROT8_1(x) ((x).s70123456)
1714#define ROT8_2(x) ((x).s67012345)
1715#define ROT8_3(x) ((x).s56701234)
1716#define ROT8_4(x) ((x).s45670123)
1717#define ROT8_5(x) ((x).s34567012)
1718#define ROT8_6(x) ((x).s23456701)
1719#define ROT8_7(x) ((x).s12345670)
1720#define ROT8_8(x) ((x))
1721
1722#define ROT16_0(x) ((x))
1723#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728#define ROT16_6(x) ((x).sABCDEF0123456789)
1729#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738#define ROT16_16(x) ((x))
1739
1740
1741
1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744
1745
1746
1747#define V_OFFS1(dt) (dt##1)(0)
1748#define V_OFFS2(dt) (dt##2)(0, 1)
1749#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753
1754
1755
1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758
1759
// VLOAD(size): expands to the built-in vload<size> function.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)


// VLOAD_PARTIAL(size, load_size): load only load_size elements into a
// size-wide vector, via the vload_partial_<size>_<load_size> table below.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

// NO_LOAD: placeholder for invalid (size, load_size) combinations — expands
// to an empty block so nothing is loaded.
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }


// Dispatch tables: vload_partial_<size>_<load_size> maps to the
// vload_partial_<load_size> implementation when load_size <= size, and to
// NO_LOAD otherwise (load_size 0 also loads nothing).
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
1879
1880
// vload_partial_n: load n elements from PTR (at vector offset OFFSET) into
// the first n components of DATA. Sizes 5..7 and 9..15 are composed from a
// power-of-two bulk load plus a smaller partial load at PTR + 4 or PTR + 8.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// NOTE: 13..15 pass the full upper swizzle .s89ABCDEF; the inner partial
// macros re-swizzle it so only the required low components are written.
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);



// PIXEL_UNIT<n>: number of RGBA texels needed to hold an n-element vector
// (4 elements per texel).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4


// Convert a vector size (4/8/16) to its pixel-unit count; _STR expands first.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948
1949
// Read 1/2/4 adjacent pixels (4 elements each) from a 2D image and widen the
// result into a float4/float8/float16.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

// Half-precision variants, only available when the device exposes cl_khr_fp16.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// Write 1/2/4 adjacent pixels from a float4/float8/float16 vector.
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

// Generic dispatchers: paste data_type ("float"/"half") and pixel-unit count
// n0 (1/2/4) into the concrete macro names above.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977
// VSTORE(size) expands to the vstore<size> built-in; two levels so size is
// macro-expanded before pasting.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// Size-1 "vector" type aliases so generated names like float1 resolve to the
// scalar types.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadn/vstoren: a plain dereference at PTR + OFFSET.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
1995
1996
// Name-paste dispatch: VSTORE_PARTIAL(size, store_size) expands to
// vstore_partial_<size>_<store_size>, resolved via the tables below to either
// a concrete vstore_partial_N or NO_STORE.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// Storing zero elements (or a count larger than the vector) is a no-op.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

// Dispatch table for 1-element vectors: only store counts 0 and 1 are valid.
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

// Dispatch table for 2-element vectors.
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

// Dispatch table for 3-element vectors.
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

// Dispatch table for 4-element vectors.
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

// Dispatch table for 8-element vectors.
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

// Dispatch table for 16-element vectors: every store count 1..16 is valid.
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2112
2113
// Partial store implementations: vstore_partial_N stores the first N lanes of
// DATA. Sizes 5..7 split into a 4-wide store plus a remainder at PTR + 4;
// sizes 9..15 split into an 8-wide store plus a remainder at PTR + 8.
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// 13..15 hand the full .s89abcdef slice to the inner macro, which then picks
// its leading lanes (e.g. .s0123 of DATA.s89abcdef is DATA.s89ab).
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
2171
2172
2173
2174
2175
// "Saturating" conversion aliases for floating-point destinations.
// OpenCL C forbids the _sat modifier on conversions to float/half, so these
// names map to the plain convert_* built-ins instead.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// Fix: this previously aliased convert_float, converting to the wrong
// destination type; every other half variant (convert_half1_sat ..
// convert_half16_sat below) aliases the matching convert_half<N>.
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Size-1 conversion aliases: map convert_<type>1 onto the scalar built-ins.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Size-1 saturating conversion aliases.
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
// NOTE(review): the uchar2..16 entries below are self-identical no-ops, kept
// only so the naming scheme stays uniform.
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
2217
// Build a vector type name from element type and width, e.g.
// VEC_DATA_TYPE(float, 4) -> float4. Two levels so arguments expand first.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Generic conversion wrappers over the convert_* built-ins / aliases above.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

// Saturating conversion: convert_<type>_sat.
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

// Saturating conversion with explicit rounding mode (e.g. rte, rtz).
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229
// Result type of OpenCL select() for a given element type: an integer vector
// of the same element size (half -> short, float -> int, etc.).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// Signed integer vector type of the same element width as the given type
// (unsigned -> signed, half -> short, float -> int).
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259
// Horizontal sum of a vector's lanes, built by recursive halving on swizzles.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Horizontal product of a vector's lanes.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// Horizontal maximum of a vector's lanes.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289
// Kernel-parameter expanders: each macro emits the standard argument list for
// a tensor of the given rank (buffer pointer, per-dimension stride/step pairs,
// and the byte offset of the first element).
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
2339
// Helpers turning the kernel arguments emitted by *_DECLARATION (name##_ptr,
// strides, steps, offset) into per-work-item view structs. The *_NO_STEP
// variants pass 0 as the work-item step so the returned pointer is not
// advanced along that dimension.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other NO_STEP variants, this one still passes
// step_z for the Z dimension — presumably the Z advance is wanted even in the
// no-step case; confirm against callers before changing.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// Fix: a second, byte-identical definition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
// followed here; the redundant redefinition has been removed.

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

// 4D variant: mod_size folds the flattened Z/W index carried by
// get_global_id(2) back into separate Z and W coordinates.
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

// Builds the struct without advancing the pointer for the work-item.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
2378
2379
/** 1D tensor view: buffer pointer plus byte stride along X. */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
    int             stride_x;                      /**< Byte stride along X */
} Vector;

/** 2D tensor view. */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
    int             stride_x;                      /**< Byte stride along X */
    int             stride_y;                      /**< Byte stride along Y */
} Image;

/** 3D tensor view. */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
    int             stride_x;                      /**< Byte stride along X */
    int             stride_y;                      /**< Byte stride along Y */
    int             stride_z;                      /**< Byte stride along Z */
} Tensor3D;

/** 4D tensor view. */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Pointer to the start of the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
    int             stride_x;                      /**< Byte stride along X */
    int             stride_y;                      /**< Byte stride along Y */
    int             stride_z;                      /**< Byte stride along Z */
    int             stride_w;                      /**< Byte stride along W */
} Tensor4D;
2416
2417
2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419{
2420    Vector vector =
2421    {
2422        .ptr                           = ptr,
2423        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424        .stride_x                      = stride_x,
2425    };
2426    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427    return vector;
2428}
2429
2430
2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432{
2433    Image img =
2434    {
2435        .ptr                           = ptr,
2436        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437        .stride_x                      = stride_x,
2438        .stride_y                      = stride_y
2439    };
2440    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441    return img;
2442}
2443
2444
2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446{
2447    Image img =
2448    {
2449        .ptr                           = ptr,
2450        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451        .stride_x                      = stride_x,
2452        .stride_y                      = stride_y
2453    };
2454    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455    return img;
2456}
2457
2458
2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460{
2461    Tensor3D tensor =
2462    {
2463        .ptr                           = ptr,
2464        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465        .stride_x                      = stride_x,
2466        .stride_y                      = stride_y,
2467        .stride_z                      = stride_z
2468    };
2469    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470    return tensor;
2471}
2472
2473
2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475{
2476    Tensor3D tensor =
2477    {
2478        .ptr                           = ptr,
2479        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480        .stride_x                      = stride_x,
2481        .stride_y                      = stride_y,
2482        .stride_z                      = stride_z
2483    };
2484    return tensor;
2485}
2486
2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488                                             uint step_w,
2489                                             uint mod_size)
2490{
2491    Tensor4D tensor =
2492    {
2493        .ptr                           = ptr,
2494        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495        .stride_x                      = stride_x,
2496        .stride_y                      = stride_y,
2497        .stride_z                      = stride_z,
2498        .stride_w                      = stride_w
2499    };
2500
2501    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502    return tensor;
2503}
2504
2505
2506inline __global const uchar *vector_offset(const Vector *vec, int x)
2507{
2508    return vec->ptr + x * vec->stride_x;
2509}
2510
2511
2512inline __global uchar *offset(const Image *img, int x, int y)
2513{
2514    return img->ptr + x * img->stride_x + y * img->stride_y;
2515}
2516
2517
2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519{
2520    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521}
2522
2523
2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525{
2526    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527}
2528
2529
2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531{
2532    uint num_elements = width * height;
2533
2534    const uint z = index / num_elements;
2535
2536    index %= num_elements;
2537
2538    const uint y = index / width;
2539
2540    index %= width;
2541
2542    const uint x = index;
2543
2544    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545}
2546
2547#endif
2548
2549#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
2550#define ARM_COMPUTE_HELPERS_ASYMM_H
2551
2552
2553#ifndef ARM_COMPUTE_HELPER_H
2554#define ARM_COMPUTE_HELPER_H
2555
2556
2557
2558
// STORE_ROW_n: store n row-vectors BASENAME##0..BASENAME##<n-1>, each N0 wide,
// at consecutive STRIDE_Y byte offsets, with per-row offset Z##i. Built by
// recursive expansion of the previous row macro.
// NOTE(review): this is a second copy of these macros (the embed concatenates
// headers); it sits under a repeated "#ifndef ARM_COMPUTE_HELPER_H" and the
// guard was already defined earlier in this file, so this copy is compiled out.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Rows 10..15 use hex-style suffixes A..E for BASENAME and Z.
#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2637
2638
2639
// CONVERT_STORE_ROW_n: like STORE_ROW_n, but each row is first converted to
// VEC_DATA_TYPE(DATA_TYPE, N0) with a saturating conversion before the store.
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2678
2679#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
2680    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
2681    VSTORE(N0)                                                         \
2682    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2683
// Convert-and-store rows 0..9: rows 0..8 via CONVERT_STORE_ROW_9, then row 9
// saturate-converted to an N0-wide vector of DATA_TYPE and stored.
// Fix: the second parameter was named DATA while the replacement list uses
// DATA_TYPE, so the argument was ignored and DATA_TYPE resolved only when a
// global -DDATA_TYPE build macro happened to be defined. Renamed the parameter
// to DATA_TYPE, consistent with every other CONVERT_STORE_ROW_n macro.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2688
// CONVERT_STORE_ROW_11..16: saturate-convert-and-store rows 0..n-1.
// Rows 10..15 use hex suffixes A..F on BASENAME and Z.
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2718
2719
2720
2721
// STORE_BLOCK(M0, ...): store an M0 x N0 block by dispatching to STORE_ROW_M0.
// The _STR indirection forces macro expansion of M0 before the ## paste.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



// Same dispatch for the saturate-converting variant.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2729
2730
2731
// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but each row writes only STORE_N0 of
// the N0 vector elements via VSTORE_PARTIAL (used at right-edge boundaries).
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Rows 10..15 use hex suffixes A..F on BASENAME and Z.
#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2810
2811
2812
// STORE_BLOCK_PARTIAL: store STORE_M0 rows of STORE_N0 elements each (out of
// N0-wide row vectors). The _STR indirection expands STORE_M0 before pasting.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2815
// Boundary-aware block store when the block may be partial in both X and Y:
// picks one of four full/partial row-count x element-count combinations based
// on the runtime conditions PARTIAL_COND_Y (bottom edge) / PARTIAL_COND_X
// (right edge).
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
2833
// Boundary-aware block store when only the X (element-count) dimension can be
// partial: full N0 elements per row unless PARTIAL_COND_X holds.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2843
// Boundary-aware block store when only the Y (row-count) dimension can be
// partial: M0 rows unless PARTIAL_COND_Y holds, then PARTIAL_STORE_M0 rows.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2853
2854
// STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store
// variant. When the build options say no partial region exists in a dimension
// (PARTIAL_STORE_M0/N0 == 0), the corresponding runtime branch is elided.
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)


// No partial region in either dimension: plain full-block store.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Partial only along Y (rows).
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

// Partial only along X (elements).
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

// Partial along both dimensions.
#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
2881
2882
// COMPUTE_M0_START_ROW: first output row for work-item y processing M0 rows.
// With PARTIAL_STORE_M0 defined, the start is shifted back by
// (M0 - PARTIAL_STORE_M0) % M0 (clamped at 0) so the final block stays in
// bounds by overlapping the previous one; otherwise it is simply y * M0.
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif
2891
2892
2893
// STORE_VECTOR_SELECT: store a single vec_size-wide row, writing only
// `leftover` elements when `cond` (right-edge condition) holds.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
2896
2897
// Enable optional OpenCL extensions only when both the build flag and the
// device-reported extension macro are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

// GPU architecture identifiers (compared against a GPU_ARCH build option).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
2917
2918
// Token concatenation (arguments are NOT pre-expanded here).
#define CONCAT(a, b) a##b

// Identity macro, used to force an extra expansion pass.
#define EXPAND(x) x

// Clamp x into [min_val, max_val]; evaluates each argument more than once.
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
2926
2927
// REVn(x): reverse the components of an n-element vector via swizzles.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)



// REVERSE(x, s): dispatch to REV<s>; the _STR indirection expands s first.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
2939
2940
2941
// ROT<s>_<n>(x): rotate the components of an s-element vector by n positions
// via swizzles; n == 0 and n == s are the identity.
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))



// ROTATE(x, s, n): dispatch to ROT<s>_<n>; _STR expands s and n first.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
2992
2993
2994
// V_OFFSs(dt): literal of vector type dt##s holding the offsets 0..s-1.
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)



// VEC_OFFS(dt, s): dispatch to V_OFFS<s>; _STR expands s first.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
3006
3007
// VLOAD(size): expands to the OpenCL built-in vload<size>.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// VLOAD_PARTIAL(size, load_size): loads load_size elements into a size-wide
// vector, via the vload_partial_<size>_<load_size> dispatch table below.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
3014
// NO_LOAD: no-op used for invalid (size, load_size) combinations, so that the
// dispatch table below always expands to something syntactically valid.
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }


// Dispatch table: vload_partial_<size>_<load_size> maps to the
// vload_partial_<load_size> implementation when load_size <= size (and is a
// supported width), otherwise to NO_LOAD.
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
3127
3128
// vload_partial_N(DATA, OFFSET, PTR): load exactly N elements into the low
// components of DATA. Widths 1/2/3/4/8/16 map to the native vloadN built-ins;
// the remaining widths are composed from a power-of-two load plus a smaller
// tail load at PTR + 4 or PTR + 8.
// NOTE(review): the same OFFSET is reused across vloadN built-ins of different
// widths, whose offsets scale by their own width — this is only consistent
// when OFFSET == 0; callers appear to pass 0. TODO confirm.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// 13..15 pass the full 8-wide tail swizzle s89ABCDEF; the inner
// vload_partial_5/6/7 writes only its low 5/6/7 components of that swizzle.
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
3186
3187
3188
// PIXEL_UNIT<vec_size>: number of 4-channel image texels covered by a
// vec_size-wide vector (vec_size / 4).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4


// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT: dispatch with expansion of vec_size.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
3196
3197
// read_image2d_<type>x<n>: read n adjacent texels at (x_coord.., y_coord) and
// pack them into a single (4*n)-wide vector. Half variants require cl_khr_fp16.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// write_image2d_<type>x<n>: inverse of the readers — split a (4*n)-wide vector
// across n adjacent texels.
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif


// READ_IMAGE2D / WRITE_IMAGE2D: dispatch on (data_type, n0) with expansion.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)


#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
3225
// VSTORE(size): expands to the OpenCL built-in vstore<size>.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// <type>1 aliases let size-parameterized macros (e.g. VEC_DATA_TYPE) also
// produce scalar types for size == 1.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadN/vstoreN: plain pointer dereference, with
// OFFSET in single elements.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
3243
3244
// VSTORE_PARTIAL(size, store_size): stores store_size elements of a size-wide
// vector, via the vstore_partial_<size>_<store_size> dispatch table below.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// NO_STORE: no-op used for invalid (size, store_size) combinations.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
3251
3252
// Dispatch table: vstore_partial_<size>_<store_size> maps to the
// vstore_partial_<store_size> implementation when store_size <= size,
// otherwise to NO_STORE (mirrors the vload_partial tables above).
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
3311#define vstore_partial_4_4 vstore_partial_4
3312#define vstore_partial_4_5 NO_STORE
3313#define vstore_partial_4_6 NO_STORE
3314#define vstore_partial_4_7 NO_STORE
3315#define vstore_partial_4_8 NO_STORE
3316#define vstore_partial_4_9 NO_STORE
3317#define vstore_partial_4_10 NO_STORE
3318#define vstore_partial_4_11 NO_STORE
3319#define vstore_partial_4_12 NO_STORE
3320#define vstore_partial_4_13 NO_STORE
3321#define vstore_partial_4_14 NO_STORE
3322#define vstore_partial_4_15 NO_STORE
3323#define vstore_partial_4_16 NO_STORE
3324
3325#define vstore_partial_8_0 NO_STORE
3326#define vstore_partial_8_1 vstore_partial_1
3327#define vstore_partial_8_2 vstore_partial_2
3328#define vstore_partial_8_3 vstore_partial_3
3329#define vstore_partial_8_4 vstore_partial_4
3330#define vstore_partial_8_5 vstore_partial_5
3331#define vstore_partial_8_6 vstore_partial_6
3332#define vstore_partial_8_7 vstore_partial_7
3333#define vstore_partial_8_8 vstore_partial_8
3334#define vstore_partial_8_9 NO_STORE
3335#define vstore_partial_8_10 NO_STORE
3336#define vstore_partial_8_11 NO_STORE
3337#define vstore_partial_8_12 NO_STORE
3338#define vstore_partial_8_13 NO_STORE
3339#define vstore_partial_8_14 NO_STORE
3340#define vstore_partial_8_15 NO_STORE
3341#define vstore_partial_8_16 NO_STORE
3342
3343#define vstore_partial_16_0 NO_STORE
3344#define vstore_partial_16_1 vstore_partial_1
3345#define vstore_partial_16_2 vstore_partial_2
3346#define vstore_partial_16_3 vstore_partial_3
3347#define vstore_partial_16_4 vstore_partial_4
3348#define vstore_partial_16_5 vstore_partial_5
3349#define vstore_partial_16_6 vstore_partial_6
3350#define vstore_partial_16_7 vstore_partial_7
3351#define vstore_partial_16_8 vstore_partial_8
3352#define vstore_partial_16_9 vstore_partial_9
3353#define vstore_partial_16_10 vstore_partial_10
3354#define vstore_partial_16_11 vstore_partial_11
3355#define vstore_partial_16_12 vstore_partial_12
3356#define vstore_partial_16_13 vstore_partial_13
3357#define vstore_partial_16_14 vstore_partial_14
3358#define vstore_partial_16_15 vstore_partial_15
3359#define vstore_partial_16_16 vstore_partial_16
3360
3361
3362#define vstore_partial_1(DATA, OFFSET, PTR) \
3363    vstore1(DATA.s0, OFFSET, PTR);
3364
3365#define vstore_partial_2(DATA, OFFSET, PTR) \
3366    vstore2(DATA.s01, OFFSET, PTR);
3367
3368#define vstore_partial_3(DATA, OFFSET, PTR) \
3369    vstore3(DATA.s012, OFFSET, PTR);
3370
3371#define vstore_partial_4(DATA, OFFSET, PTR) \
3372    vstore4(DATA.s0123, OFFSET, PTR);
3373
3374#define vstore_partial_5(DATA, OFFSET, PTR)    \
3375    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
3376    vstore1(DATA.s4, OFFSET, PTR + 4);
3377
3378#define vstore_partial_6(DATA, OFFSET, PTR)    \
3379    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
3380    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
3381
3382#define vstore_partial_7(DATA, OFFSET, PTR)    \
3383    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
3384    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
3385
3386#define vstore_partial_8(DATA, OFFSET, PTR) \
3387    vstore8(DATA.s01234567, OFFSET, PTR);
3388
3389#define vstore_partial_9(DATA, OFFSET, PTR)        \
3390    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3391    vstore1(DATA.s8, OFFSET, PTR + 8);
3392
3393#define vstore_partial_10(DATA, OFFSET, PTR)       \
3394    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3395    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
3396
3397#define vstore_partial_11(DATA, OFFSET, PTR)       \
3398    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3399    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
3400
3401#define vstore_partial_12(DATA, OFFSET, PTR)       \
3402    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3403    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
3404
3405#define vstore_partial_13(DATA, OFFSET, PTR)       \
3406    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3407    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
3408
3409#define vstore_partial_14(DATA, OFFSET, PTR)       \
3410    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3411    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
3412
3413#define vstore_partial_15(DATA, OFFSET, PTR)       \
3414    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
3415    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
3416
3417#define vstore_partial_16(DATA, OFFSET, PTR) \
3418    vstore16(DATA, OFFSET, PTR);
3419
3420
3421
3422
3423
// Aliases so CONVERT_SAT(x, type) compiles for every type/size: OpenCL provides
// no convert_<float|half>N_sat builtins (saturation is meaningless for
// floating-point destinations), so those map to the plain converts.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): convert_half_sat maps to convert_float, not convert_half —
// looks like a copy/paste slip (result would be float, not half); kept as-is
// since it matches upstream — confirm against callers before changing.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// size == 1 aliases mapping convert_<type>1 onto the scalar builtins.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Saturating size == 1 aliases. The convert_uchar{2,3,4,8,16}_sat defines are
// self-referential no-ops (the preprocessor does not re-expand a macro inside
// its own expansion, so the builtin of the same name is called).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
3465
// VEC_DATA_TYPE(float, 4) -> float4; size 1 works thanks to the <type>1 aliases.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Type conversions: plain, saturating, and saturating with an explicit
// rounding mode (e.g. CONVERT_SAT_ROUND(x, int4, rte)).
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// Integer type with the same element width as `type`, suitable as the third
// argument of select() (the mask type must be an integer of matching width:
// half -> short, float -> int).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// Signed integer type with the same element width as `type` (uchar -> char,
// half -> short, float -> int, ...).
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
3507
// Horizontal reductions over a vector's components, built recursively by
// halving (sizes 1, 2, 3, 4, 8, 16 only). SUM_REDUCE/PROD_REDUCE/MAX_REDUCE
// dispatch on `size` at preprocessing time.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
3537
// Expand to the kernel-argument list describing an n-dimensional tensor:
// base pointer, then (stride, step) per dimension, then the byte offset of the
// first element. `step` is stride * number-of-elements-processed-per-work-item.
// Used in kernel signatures as e.g. `__kernel void k(TENSOR3D_DECLARATION(src))`.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
3587
// Wrap the flattened kernel arguments emitted by the *_DECLARATION macros into
// a per-work-item Vector/Image/Tensor3D/Tensor4D structure. The plain variants
// advance the pointer by the work-item's global id times the per-dimension
// step; the *_NO_STEP variants pass step 0 so the pointer stays at the tensor
// start; *_NO_UPDATE_PTR builds the descriptor without touching the pointer.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other *_NO_STEP variants this keeps name##_step_z
// instead of passing 0 for z — presumably intentional (z-slice selection still
// applies), but verify against callers before changing.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// (A second, byte-identical definition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT that
// followed here was redundant and has been removed.)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

// mod_size splits global id 2 into z (id % mod_size) and w (id / mod_size).
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
3626
3627
/* Per-work-item descriptors built by the update_*_workitem_ptr helpers below:
 * `ptr` is positioned at the work-item's first element; strides are in bytes. */
typedef struct Vector
{
    __global uchar *ptr;                           // Current position in the buffer
    int             offset_first_element_in_bytes; // Byte offset of the first element
    int             stride_x;                      // Bytes between adjacent x elements
} Vector;


typedef struct Image
{
    __global uchar *ptr;                           // Current position in the buffer
    int             offset_first_element_in_bytes; // Byte offset of the first element
    int             stride_x;                      // Bytes between adjacent x elements
    int             stride_y;                      // Bytes between adjacent y rows
} Image;


typedef struct Tensor3D
{
    __global uchar *ptr;                           // Current position in the buffer
    int             offset_first_element_in_bytes; // Byte offset of the first element
    int             stride_x;                      // Bytes between adjacent x elements
    int             stride_y;                      // Bytes between adjacent y rows
    int             stride_z;                      // Bytes between adjacent z slices
} Tensor3D;


typedef struct Tensor4D
{
    __global uchar *ptr;                           // Current position in the buffer
    int             offset_first_element_in_bytes; // Byte offset of the first element
    int             stride_x;                      // Bytes between adjacent x elements
    int             stride_y;                      // Bytes between adjacent y rows
    int             stride_z;                      // Bytes between adjacent z slices
    int             stride_w;                      // Bytes between adjacent w batches
} Tensor4D;
3664
3665
/** Build a Vector and advance its pointer to this work-item's first element.
 *
 * @param ptr                           Base pointer of the buffer
 * @param offset_first_element_in_bytes Byte offset of the first element
 * @param stride_x                      Stride in bytes along x
 * @param step_x                        stride_x * elements processed per work-item (0 keeps the pointer at the start)
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    // Jump to the element this work-item owns along dimension 0.
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}
3677
3678
/** Build an Image and advance its pointer to this work-item's first element.
 *
 * @param ptr                           Base pointer of the buffer
 * @param offset_first_element_in_bytes Byte offset of the first element
 * @param stride_x                      Stride in bytes along x
 * @param step_x                        stride_x * elements processed per work-item along x
 * @param stride_y                      Stride in bytes along y
 * @param step_y                        stride_y * elements processed per work-item along y
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    // Jump to the (x, y) position this work-item owns.
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}
3691
3692
/** Build an Image view of a 3D tensor, advancing the pointer by the work-item's
 *  (x, y, z) position; the z stride is folded into the pointer and not kept in
 *  the returned struct.
 *
 * @param ptr                           Base pointer of the buffer
 * @param offset_first_element_in_bytes Byte offset of the first element
 * @param stride_x/step_x               Stride / per-work-item step in bytes along x
 * @param stride_y/step_y               Stride / per-work-item step in bytes along y
 * @param stride_z/step_z               Stride / per-work-item step in bytes along z (stride_z unused here)
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    // z contributes only to the base pointer; the Image struct stays 2D.
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}
3705
3706
/** Build a Tensor3D and advance its pointer to this work-item's first element.
 *
 * @param ptr                           Base pointer of the buffer
 * @param offset_first_element_in_bytes Byte offset of the first element
 * @param stride_x/step_x               Stride / per-work-item step in bytes along x
 * @param stride_y/step_y               Stride / per-work-item step in bytes along y
 * @param stride_z/step_z               Stride / per-work-item step in bytes along z
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    // Jump to the (x, y, z) position this work-item owns.
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}
3720
3721
3722inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
3723{
3724    Tensor3D tensor =
3725    {
3726        .ptr                           = ptr,
3727        .offset_first_element_in_bytes = offset_first_element_in_bytes,
3728        .stride_x                      = stride_x,
3729        .stride_y                      = stride_y,
3730        .stride_z                      = stride_z
3731    };
3732    return tensor;
3733}
3734
/** Build a Tensor4D and advance its pointer to this work-item's first element.
 *  Global id 2 carries both z and w: z = id2 % mod_size, w = id2 / mod_size.
 *
 * @param ptr                           Base pointer of the buffer
 * @param offset_first_element_in_bytes Byte offset of the first element
 * @param stride_x/step_x               Stride / per-work-item step in bytes along x
 * @param stride_y/step_y               Stride / per-work-item step in bytes along y
 * @param stride_z/step_z               Stride / per-work-item step in bytes along z
 * @param stride_w/step_w               Stride / per-work-item step in bytes along w
 * @param mod_size                      Number of z iterations packed into global id 2
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    // Split dimension 2 of the NDRange into the z and w tensor dimensions.
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
3752
3753
3754inline __global const uchar *vector_offset(const Vector *vec, int x)
3755{
3756    return vec->ptr + x * vec->stride_x;
3757}
3758
3759
3760inline __global uchar *offset(const Image *img, int x, int y)
3761{
3762    return img->ptr + x * img->stride_x + y * img->stride_y;
3763}
3764
3765
3766inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
3767{
3768    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
3769}
3770
3771
3772inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
3773{
3774    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
3775}
3776
3777
3778inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
3779{
3780    uint num_elements = width * height;
3781
3782    const uint z = index / num_elements;
3783
3784    index %= num_elements;
3785
3786    const uint y = index / width;
3787
3788    index %= width;
3789
3790    const uint x = index;
3791
3792    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
3793}
3794
3795#endif
3796
3797
// Convert with explicit round-to-nearest-even: CONVERT_DOWN_RTE(x, int4) ->
// convert_int4_rte(x). Used by the quantize helpers below.
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
3800
3801
3802inline uchar quantize_qasymm8(float input, float offset, float scale)
3803{
3804    float out_f32 = input / scale + offset;
3805    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
3806    return res_u8;
3807}
3808
3809
3810inline float dequantize_qasymm8(uchar input, float offset, float scale)
3811{
3812    return ((float)input - offset) * scale;
3813}
3814
3815
/** Dequantize a QASYMM8_SIGNED (signed 8-bit) value to float.
 *
 * @param input  Quantized value
 * @param offset Quantization zero-point
 * @param scale  Quantization scale
 *
 * @return (input - offset) * scale
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float centered = (float)input - offset;
    return centered * scale;
}
3820
3821
// Generates quantize_<type><size>(): vectorized quantization of a float vector
// (input / scale + offset, round-to-nearest-even, saturated cast to `type`).
#define QUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    {                                                                                                                   \
        VEC_DATA_TYPE(float, size)                                                                                      \
        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
        VEC_DATA_TYPE(type, size)                                                                                       \
        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
        return res;                                                                                                     \
    }
3831
3832
// Generates dequantize_<type><size>(): vectorized dequantization,
// (input - offset) * scale in float.
#define DEQUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    {                                                                                                                     \
        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
    }
3838
3839
// Generates asymm_rounding_divide_by_POW2_<size>(): signed division by
// 2^exponent with rounding to nearest (ties away from zero) — the fixed-point
// "rounding divide by power of two" used in gemmlowp-style requantization.
// select() masks come from SELECT_VEC_DATA_TYPE so the comparison result type
// matches the element width.
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    {                                                                                                                                   \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        zero = (VEC_DATA_TYPE(int, size))0;                                                                                             \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        one = (VEC_DATA_TYPE(int, size))1;                                                                                              \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        mask = (one << exponent) - one;                                                                                                 \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                                          \
        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold));                          \
    }
3853
3854
// Generates asymm_mult<size>(): saturating rounding doubling high multiply of
// two Q-format fixed-point vectors — computes round((a * b * 2) / 2^32) in
// 64-bit, with the single overflow case INT_MIN * INT_MIN saturated to INT_MAX
// (behavior matches gemmlowp's SaturatingRoundingDoublingHighMul / ARM SQRDMULH).
#define ASYMM_MULT_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                        \
        VEC_DATA_TYPE(int, size)                                                                             \
        overflow = a == b && a == INT_MIN;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                            \
        a_64 = convert_long##size(a);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        b_64 = convert_long##size(b);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        ab_64 = a_64 * b_64;                                                                                 \
                                                                                                             \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask1 = 1 << 30;                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask2 = 1 - (1 << 30);                                                                               \
        VEC_DATA_TYPE(long, size)                                                                            \
        is_positive_or_zero = ab_64 >= 0;                                                                    \
        VEC_DATA_TYPE(long, size)                                                                            \
        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero));               \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask = 1ll << 31;                                                                                    \
        VEC_DATA_TYPE(int, size)                                                                             \
        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow));                   \
    }
3881
3882
// Generates asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>():
// fixed-point exp(a) for a in [-1/4, 0) in Q0.31, via a degree-4 Taylor
// polynomial around -1/8 (mirrors gemmlowp's implementation; constant_term is
// exp(-1/8) and constant_1_over_3 is 1/3, both in Q0.31).
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                                               \
        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
        const int k_fractional_bits = 31;                                                                                           \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x2 = ASYMM_MULT(x, x, size);                                                                                                \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
    }
3905
3906
// Lane-wise select between two fixed-point vectors using a bit mask.
// if_mask is expected to be canonical per lane (all-ones or all-zeros, as
// produced by ASYMM_MASK_IF_ZERO / ASYMM_MASK_IF_NON_ZERO), so the two masked
// terms are bitwise disjoint and XOR combines them exactly like OR would.
#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    {                                                                                                                                                                     \
        return (~if_mask & else_val) ^ (if_mask & then_val);                                                                                                              \
    }
3912
3913
// Builds a per-lane bit mask from a predicate on a: all-ones (~0) in lanes
// where a == 0, all-zeros elsewhere. The result feeds ASYMM_SELECT_USING_MASK.
#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                    \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0));   \
    }
3921
3922
// Complement of ASYMM_MASK_IF_ZERO: all-ones (~0) in lanes where a != 0,
// all-zeros elsewhere. The result feeds ASYMM_SELECT_USING_MASK.
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                        \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0));       \
    }
3930
// One step of the "barrel shifter" exponential used by
// asymm_exp_on_negative_values: if the integer part of the input can reach
// 2^exponent (k_integer_bits > exponent), multiply `result` by fp_multiplier
// in the lanes where the corresponding bit of `remainder` is set, and leave
// the other lanes unchanged. fp_multiplier is presumably a Q0 fixed-point
// encoding of exp(-2^exponent) — TODO confirm against the caller's constants.
#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    {                                                                                                                                                                                                         \
        if(k_integer_bits > exponent)                                                                                                                                                                         \
        {                                                                                                                                                                                                     \
            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
        }                                                                                                                                                                                                     \
        \
        return result;                                                                                                                                                                                        \
    }
3944
3945
// Fixed-point exp(a) for a <= 0, with a in Q(k_integer_bits) format.
// Strategy: split a into a multiple of 1/4 plus a remainder in [-1/4, 0),
// evaluate the polynomial approximation on that interval, then fold in one
// precomputed factor per remaining bit via EXP_BARREL_SHIFTER (the integer
// constants 1672461947, 1302514674, ... are presumably Q0 encodings of
// exp(-1/4), exp(-1/2), exp(-1), ... — TODO confirm against gemmlowp).
// Very negative inputs saturate to 0; a == 0 returns Q0 one (INT_MAX).
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
    {                                                                                                                         \
        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
        VEC_DATA_TYPE(int, size)                                                                                              \
        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        mask = k_one_quarter - 1;                                                                                             \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
        VEC_DATA_TYPE(int, size)                                                                                              \
        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
        VEC_DATA_TYPE(int, size)                                                                                              \
        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
        \
        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
        \
        if(k_integer_bits > 5)                                                                                                \
        {                                                                                                                     \
            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
        }                                                                                                                     \
        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
    }
3980
3981
// Multiplies x by 2^exponent with saturation. Negative exponents delegate to
// the rounding right-shift; non-negative exponents left-shift and clamp lanes
// that would overflow to INT_MAX / INT_MIN.
// NOTE(review): for exponent == 0 the expression (1 << 31) overflows signed
// int (UB in C; OpenCL behavior is implementation-dependent) — callers appear
// to pass exponent >= 1 via ASYMM_ONE_OVER_ONE_PLUS_X..., but asymm_rescale
// may pass 0; confirm this is benign on target compilers.
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    {                                                                                                                      \
        if(exponent < 0)                                                                                                   \
        {                                                                                                                  \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
        }                                                                                                                  \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
        VEC_DATA_TYPE(int, size)                                                                                           \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        result = x << exponent;                                                                                            \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
        return result;                                                                                                     \
    }
4003
4004
// Computes (a + b) / 2 with rounding away from zero. The sum is formed in
// 64-bit lanes so a + b cannot overflow, then +/-1 is added according to the
// sum's sign before the truncating division by 2.
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                                         \
        a64 = convert_long##size(a);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        b64 = convert_long##size(b);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sum = a64 + b64;                                                                                                  \
        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
        return convert_int##size((sum + sign) / 2);                                                                       \
    }
4020
4021
// Fixed-point 1 / (1 + a) for a in [0, 1), input and output in Q0.
// half_denominator = (a + 1) / 2 lands in [1/2, 1); the initial guess
// x = 48/17 - 32/17 * half_denominator (Q2 constants, see the variable names)
// is then refined with three Newton-Raphson steps x += x * (1 - hd * x),
// and the final Q2 result is scaled back by 2 (the *2 accounting for the
// earlier halving of the denominator).
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
        VEC_DATA_TYPE(int, size)                                                                             \
        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
        VEC_DATA_TYPE(int, size)                                                                             \
        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
        for(int i = 0; i < 3; i++)                                                                           \
        {                                                                                                    \
            VEC_DATA_TYPE(int, size)                                                                         \
            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
            VEC_DATA_TYPE(int, size)                                                                         \
            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
            VEC_DATA_TYPE(int, size)                                                                         \
            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
        }                                                                                                    \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
    }
4045
4046
// Rescales a fixed-point value from Q(src_integer_bits) to
// Q(dst_integer_bits) by a saturating rounding multiply by
// 2^(src_integer_bits - dst_integer_bits).
#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    {                                                                                                                               \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, src_integer_bits - dst_integer_bits, size);                            \
    }
4053
// Dispatch macros: each expands to the size-suffixed helper generated by the
// *_IMPL macros above (the *_STR level forces expansion of `size`/`type`
// before token pasting).
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
// NOTE(review): the GREATER_THAN_ONE variant shifts by (-left_shift), which
// implies callers pass a non-positive left_shift — confirm at call sites.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
4080
// Requantization helper: multiplies input by a quantized multiplier with a
// signed total shift. A positive shift is applied as a plain left shift
// before the high-precision multiply; a non-positive shift becomes a rounding
// right shift after it.
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    {                                                                                                                           \
        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
    }
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
4089
// Instantiate quantize helpers for the (element type, vector size) pairs the
// kernels use. NOTE(review): coverage is intentionally uneven (e.g. size 4
// has ushort/short but no char/uint, sizes 1-3 have no short types) —
// presumably only combinations referenced by kernels are generated; confirm
// before calling others.
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 2)
QUANTIZE_IMPL(char, 2)
QUANTIZE_IMPL(uint, 2)
QUANTIZE_IMPL(int, 2)
QUANTIZE_IMPL(uchar, 3)
QUANTIZE_IMPL(char, 3)
QUANTIZE_IMPL(uint, 3)
QUANTIZE_IMPL(int, 3)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(int, 4)
QUANTIZE_IMPL(uchar, 8)
QUANTIZE_IMPL(char, 8)
QUANTIZE_IMPL(uint, 8)
QUANTIZE_IMPL(int, 8)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)
4116
// Dequantize helper instantiations, mirroring the QUANTIZE_IMPL set above
// (same deliberately partial type/size coverage).
DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 2)
DEQUANTIZE_IMPL(char, 2)
DEQUANTIZE_IMPL(uint, 2)
DEQUANTIZE_IMPL(int, 2)
DEQUANTIZE_IMPL(uchar, 3)
DEQUANTIZE_IMPL(char, 3)
DEQUANTIZE_IMPL(uint, 3)
DEQUANTIZE_IMPL(int, 3)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(int, 4)
DEQUANTIZE_IMPL(uchar, 8)
DEQUANTIZE_IMPL(char, 8)
DEQUANTIZE_IMPL(uint, 8)
DEQUANTIZE_IMPL(int, 8)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)
4143
// Instantiate every fixed-point helper for all supported OpenCL vector
// widths (1, 2, 3, 4, 8, 16), producing the size-suffixed functions the
// dispatch macros above resolve to.
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)

ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)

ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)

ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)

ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)

MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
4234
4235#endif
4236
4237#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
4238#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
4239
4240
4241
4242
// Maps a tile width W (1..16) to the storage vector width actually used:
// W is rounded up to the next native OpenCL vector size (1, 2, 3, 4, 8, 16).
#define TILE_VECTOR_SIZE1 1
#define TILE_VECTOR_SIZE2 2
#define TILE_VECTOR_SIZE3 3
#define TILE_VECTOR_SIZE4 4
#define TILE_VECTOR_SIZE5 8
#define TILE_VECTOR_SIZE6 8
#define TILE_VECTOR_SIZE7 8
#define TILE_VECTOR_SIZE8 8
#define TILE_VECTOR_SIZE9 16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16

// Companion mapping: the OpenCL vector TYPE holding one tile row of width W
// (e.g. TILE_VECTOR_TYPE5(float) -> float8, matching TILE_VECTOR_SIZE5 = 8).
#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16


// Declares a register-file tile named BASENAME with H rows of W elements.
// Each row is a union, so elements can be addressed per-scalar (.s[i]) or as
// one whole vector (.v) for vectorized loads/stores.
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
    union {                                 \
        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];                  \
        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                     \
    } BASENAME[H]
4284
// Kernel-argument pack for a 4D tensor backed by an image2d: the image handle
// plus the full stride/step/offset buffer description (name##_*).
#define TENSOR4D_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint            name##_stride_x,  \
    uint            name##_step_x,    \
    uint            name##_stride_y,  \
    uint            name##_step_y,    \
    uint            name##_stride_z,  \
    uint            name##_step_z,    \
    uint            name##_stride_w,  \
    uint            name##_step_w,    \
    uint            name##_offset_first_element_in_bytes

// Same 4D tensor argument pack for a plain buffer backing (no image handle).
#define TENSOR4D_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

// Selects TENSOR4D_IMAGE or TENSOR4D_BUFFER via the `type` token
// (IMAGE/BUFFER); the _STR level forces macro expansion of `type` first.
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
4312
// "_T" (tiled/typed) 4D tensor argument pack: instead of per-dimension
// steps, it passes y/z/w strides plus the logical dimensions c, w, h, n.
// Image-backed variant adds the image2d handle in front.
#define TENSOR4D_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Buffer-backed variant of the "_T" 4D tensor argument pack.
#define TENSOR4D_T_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Dispatch on the `type` token (IMAGE/BUFFER).
#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)


#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
4340
// Read-only "_T" 4D tensor: image variant prepends a __read_only image2d_t
// to the ordinary buffer pack; buffer variant is the buffer pack unchanged.
#define TENSOR4D_RO_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)


#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)

// Write-only counterpart: same layout but the image handle is
// __write_only (images cannot be both read and written in OpenCL 1.x).
#define TENSOR4D_WO_T_IMAGE(name)          \
    __write_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)


#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
4362
// 3D analogue of TENSOR4D_T: y/z strides plus logical dimensions w, h, n
// (no channel count, no w-stride). Image variant adds the image2d handle.
#define TENSOR3D_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Buffer-backed variant of the 3D tensor argument pack.
#define TENSOR3D_T_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Dispatch on the `type` token (IMAGE/BUFFER).
#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
4384
// Compile-time loop unrolling. Without UNROLL_WITH_PRAGMA the loop body is
// replicated textually through the recursive LOOP_UNROLLING_<N> macros below;
// with it, a plain for-loop annotated with _Pragma("unroll") is emitted and
// the OpenCL compiler performs the unrolling.
#if !defined(UNROLL_WITH_PRAGMA)
// Advance the induction variable by `step`, then evaluate the body once more.
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

// LOOP_UNROLLING_<N>: evaluate `macro` N times, bumping `idx` by `step`
// between iterations. Each variant reuses the (N-1) variant and appends one
// extra iteration via UNROLL_INCR.
#define LOOP_UNROLLING_1(idx, step, macro) (macro)
#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

// Open a scope, declare the induction variable `idx` of type `type` starting
// at `start`, and expand the body `num` times. `num` must be a plain literal
// in [1, 128] so the token paste LOOP_UNROLLING_##num resolves.
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        type idx = start;                                      \
        LOOP_UNROLLING_##num(idx, step, macro);                \
    }
#else
// Pragma-based variant: emit an ordinary for-loop and let the OpenCL compiler
// unroll it. Note the bound is (num * step), matching the manual expansion.
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        _Pragma("unroll")                                      \
        for(type idx = start; idx < (num * step); idx += step) \
        {                                                      \
            (macro);                                           \
        }                                                      \
    }
#endif
// Public entry point; the indirection expands `num` before it is pasted.
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
4533
4534
// Starting coordinate along global dimension IDX for a work-item that
// processes N0 elements. Every work-item's start is shifted back by the
// constant (N0 - PARTIAL_N0) % N0 (zero when the size is a multiple of N0)
// and clamped to 0, so the last work-item's N0-wide access stays in bounds
// when only PARTIAL_N0 elements remain.
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
4536
4537
// Dot product of two K0-component 8-bit integer vectors `a` and `b`,
// accumulated into `c` (of type C_DATA_TYPE). The _STR indirection expands
// K0 before pasting it into the DOT_PRODUCT<K0>_INTEGER8 selector.
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
// K0 == 1 scalar case: widened multiply-accumulate.
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
    })
// K0 = 2/3/4 variants. When a dot-product extension is available, 2- and
// 3-element inputs are zero-padded to 4 lanes so the 4-way builtin can be
// used; otherwise a portable element-wise fallback is emitted.
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
// Khronos integer dot product extension: dot() builtin, accumulate manually.
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
// Arm extension with fused accumulate: c = arm_dot_acc(a, b, c).
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
// Arm dot-product extension without fused accumulate.
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
// Portable fallback: per-component widened multiply-accumulate.
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
    ({                                                  \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
    ({                                                  \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
    ({                                                    \
        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
    })
#endif
// K0 = 5..16 variants: each splits the vectors into a 4- or 8-lane prefix
// (handled by the accelerated K0 = 4/8 macro) plus a remainder handled by the
// matching smaller variant, all accumulating into the same `c`.
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);     \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);     \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                 \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
    })
4635
4636
// Horizontal sum of the K0 components of `a` into `c`, implemented as a dot
// product of `a` with an all-ones vector of the matching width.
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
4639
4640
// Load a WIDTH-element vector from TENSOR at column X (elements) and row Y
// (scaled by STRIDE_Y bytes), dispatching on TENSOR_TYPE (BUFFER or IMAGE).
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
// Buffer path: vload from ptr + first-element offset + X elements + Y rows.
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH)                                                \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
// Image path: X is divided by 4 because one image pixel packs 4 elements;
// STRIDE_Y is unused here.
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
4647
4648
// Store a WIDTH-element vector VALUES into TENSOR at column X (elements) and
// row Y (scaled by STRIDE_Y bytes); mirrors V_LOAD's BUFFER/IMAGE dispatch.
#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
// Buffer path: vstore at ptr + first-element offset + X elements + Y rows.
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH)                                                \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
// Image path: X is divided by 4 (4 elements per pixel); STRIDE_Y is unused.
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
4655
4656
// Load a HEIGHT x WIDTH tile into dst: row _i of the tile comes from tensor
// row (Y + _i * YI_MULTIPLIER) at column X. YI_MULTIPLIER lets consecutive
// tile rows map to non-consecutive tensor rows.
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
    ({                                                                                                                 \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
        {                                                                                                              \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        })                                                                                                             \
    })
4664
4665
// Load a HEIGHT x WIDTH tile into dst, taking each row's Y coordinate from
// the caller-supplied indirect_y[] table instead of a linear row index.
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
    ({                                                                                                  \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
        {                                                                                               \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        })                                                                                              \
    })
4673
4674
// Indirect tile load with a runtime width choice: when WIDTH1_CONDITION is
// true only WIDTH1 (< WIDTH0) elements per row are loaded via VLOAD_PARTIAL
// (buffer addressing only); otherwise a full WIDTH0 V_LOAD per row is used.
// Rows are visited in reverse (HEIGHT - 1 - _i) in both branches — NOTE
// (review): the reverse order looks deliberate, presumably to match the
// corresponding store path; confirm before changing.
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
    ({                                                                                                                                                                                             \
        if(WIDTH1_CONDITION)                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
        else                                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
    })
4693
// Load a TILE_HEIGHT x TILE_WIDTH spatial patch of TILE_CHANNELS channels
// from an NHWC tensor into dst, starting at batch B, row Y, column X,
// channel C. Spatial coordinates are flattened into a single row index
// (x + y * W + b * W * H) for V_LOAD. Out-of-bounds (padded) positions are
// skipped, leaving the corresponding dst entry untouched — presumably the
// caller pre-initializes dst (e.g. to zero); confirm at call sites.
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
    ({                                                                                                                                                \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
        {                                                                                                                                             \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
            {                                                                                                                                         \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0)                                                                                                                 \
                {                                                                                                                                     \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
                }                                                                                                                                     \
            })                                                                                                                                        \
        })                                                                                                                                            \
    })
4710
4711
// Like T_LOAD_NHWC but with per-axis dilation: tile element (_xk, _yk) reads
// tensor position (X + _xk * DILATION_X, Y + _yk * DILATION_Y). Addressing
// uses explicit y/z/w byte strides (TENSOR##_stride_y/_stride_z/_stride_w),
// so TENSOR must be declared with a tensor type that provides a w stride.
// When BOUNDARY_CHECK is false the load is unconditional; otherwise
// out-of-bounds positions are skipped, leaving dst untouched — presumably
// pre-initialized by the caller; confirm at call sites.
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)         \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w    = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    }                                                                                                                                                                                                 \
                } \
            })                                                                                                                                                                                                             \
        })                                                                                                                                                                                                             \
    })
4738
4739
// Load TILE_AREA vectors of TILE_CHANNELS elements from an NHWC tensor using per-element
// indirect (x, y) offsets supplied in the xi/yi tiles. The flattened row index _src_y is
// x + y * W, plus the batch offset B * W * H. Elements whose (x, y) fall outside
// [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are skipped, leaving dst[_i] untouched
// (callers pre-initialize dst if a padding value is required).
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
    ({                                                                                                                                                                \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
        {                                                                                                                                                             \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0)                                                                                                                                     \
            {                                                                                                                                                         \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
            }                                                                                                                                                         \
        })                                                                                                                                                            \
    })
4753
4754
// 2D indirect load, dispatched on TENSOR_TYPE via token pasting (the two-level
// T_LOAD2D_INDIRECT -> T_LOAD2D_INDIRECT_STR indirection forces macro-argument expansion
// before ## concatenation). Row indices come pre-computed in yi[0].s[_i].
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
// BUFFER variant: a negative row index marks an invalid element, which is skipped
// (dst[_i] is left untouched).
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

// IMAGE variant: no validity check is needed here — out-of-range image reads are handled
// by the image access path itself (presumably via sampler clamping; confirm with V_LOAD's
// IMAGE implementation).
#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })
4775
4776
// 3D (NDHWC) analogue of T_LOAD_NHWC_INDIRECT: per-element indirect offsets in xi/yi/zi,
// flattened as x + y*W + z*W*H plus the batch offset B*W*H*D. Elements outside
// [0,W) x [0,H) x [0,D) are skipped, leaving dst[_i] untouched.
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({                                                                                                                                                                \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
        {                                                                                                                                                             \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                      \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
            if(_src_valid_y != 0)                                                                                                                                     \
            {                                                                                                                                                         \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
            }                                                                                                                                                         \
        })                                                                                                                                                            \
    })
4791
4792
4793#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
4794    ({                                                                                                                                                                                             \
4795        if(WIDTH1_CONDITION)                                                                                                                                                                       \
4796        {                                                                                                                                                                                          \
4797            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
4798            {                                                                                                                                                                                      \
4799                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
4800                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
4801            })                                                                                                                                                                                     \
4802        }                                                                                                                                                                                          \
4803        else                                                                                                                                                                                       \
4804        {                                                                                                                                                                                          \
4805            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
4806            {                                                                                                                                                                                      \
4807                VSTORE(WIDTH0)                                                                                                                                                                     \
4808                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
4809            })                                                                                                                                                                                     \
4810        }                                                                                                                                                                                          \
4811    })
4812
4813
// Quantized-GEMM offset correction: for each output element dst[m][n], add
// WEI_OFFSET * sum_k(lhs[m][k]) (hoisted into _tm once per row) and
// SRC_OFFSET * sum_k(rhs[n][k]). This folds the zero-point cross terms of
// (lhs - src_off) * (rhs - wei_off) back into the raw integer accumulators.
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
    ({                                                                                               \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
        {                                                                                            \
            ACC_DATA_TYPE _tm = 0;                                                                   \
            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
            {                                                                                        \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
            })                                                                                       \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
            {                                                                                        \
                dst[_m0].s[_n0] += _tm;                                                              \
                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
                {                                                                                    \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                })                                                                                   \
            })                                                                                       \
        })                                                                                          \
    })
4833
4834
// Requantization entry point, dispatched on QUANTIZATION_TYPE (PER_TENSOR / PER_CHANNEL /
// ASYMMETRIC) via token pasting; the extra _STR level forces argument expansion before ##.
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
4837
4838
// Per-tensor requantization of M0xN0 accumulators to 8 bits, one DST_MULTIPLIER/DST_SHIFT
// pair for the whole tile. Per element:
//   1. negative DST_SHIFT: pre-scale _src by 2^(-DST_SHIFT);
//   2. rounding doubling high-mul of _src and DST_MULTIPLIER, emulated in 64-bit
//      (nudge = +/-2^30, divide by 2^31), clamped to INT_MAX when both operands are
//      INT_MIN (the one overflowing case) — gemmlowp-style fixed-point multiply;
//   3. non-negative DST_SHIFT: round-to-nearest divide by 2^DST_SHIFT (mask/threshold
//      implement away-from-zero tie handling for negatives);
//   4. add DST_OFFSET and saturate-convert to DST_DATA_TYPE.
// dst_multipliers/dst_shifts are unused here (kept for signature parity with PER_CHANNEL).
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                          \
    })
4869
4870
// Per-channel requantization: identical fixed-point pipeline to T_QUANTIZE8_PER_TENSOR,
// but the multiplier/shift are read per output column (_n0) from dst_multipliers/dst_shifts.
// Unlike the per-tensor variant, the final rounding shift is written branchlessly:
// _tmp2 is the shifted-and-rounded value, selected over _tmp only when _dst_shift >= 0.
// Note: any(_tmp) on a scalar int yields 1 iff the sign bit is set (OpenCL relational
// any()), so `threshold = (mask >> 1) + any(_tmp)` matches the per-tensor
// `_tmp < 0 ? (mask >> 1) + 1 : (mask >> 1)` tie-handling.
// DST_SHIFT/DST_MULTIPLIER parameters are unused here (signature parity with PER_TENSOR).
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                         \
    })
4903
4904
// Asymmetric requantization: same element-wise pipeline as T_QUANTIZE8_PER_TENSOR
// (pre-scale on negative shift, 64-bit rounding doubling high-mul with INT_MIN*INT_MIN
// overflow clamp, round-to-nearest shift for non-negative DST_SHIFT, add DST_OFFSET,
// saturating convert), but without the unused dst_multipliers/dst_shifts parameters.
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                          \
    })
4935
4936
// For each row _m0 whose mask value equals 0, overwrite every element of that row of `a`
// with VALUE_TO_SET; rows with a non-zero mask are left unchanged. The per-row condition
// mask[_m0].v == 0 is broadcast across the N0 columns through select().
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
    ({                                                                                                                                                     \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
        {                                                                                                                                                  \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
            {                                                                                                                                              \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            })                                                                                                                                             \
        })                                                                                                                                                 \
    })
4947
4948
// Apply the ACTIVATION helper (defined elsewhere in this file) row-by-row to an M0xN0
// tile; src and dst may alias. A_VAL/B_VAL are the activation's clamp/scale parameters.
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
    ({                                                                                         \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
        {                                                                                      \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        })                                                                                     \
    })
4956
4957
// Quantized activation primitives. ZERO_VALUE is the quantized zero-point, A_VAL/B_VAL the
// quantized upper/lower clamp bounds. ACTIVATION_QUANTIZED dispatches on the op name via
// token pasting (the ACT_OP_QUANTIZED level forces argument expansion first).
// relu: max with the zero-point.
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

// bounded relu: clamp to [ZERO_VALUE, A_VAL].
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

// lower/upper bounded relu: clamp to [B_VAL, A_VAL].
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

// hard-swish: x * clamp(x + 3, 0, 6) / 6 (0.166666667f = 1/6); the float constants mean
// this form expects DATA_TYPE to be a float type — presumably applied before requantizing;
// confirm against callers.
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

// identity: pass-through.
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
4970
// Elementary binary operators passed as the T_ELWISE_OP argument of the T_ELTWISE*
// macros below; arguments are fully parenthesized against precedence surprises.
#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
4975
4976
// Apply a quantized activation (see ACTIVATION_QUANTIZED above in this file) row-by-row
// to an M0xN0 tile; src and dst may alias.
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        })                                                                                          \
    })
4984
4985
// Element-wise tile addition: dst = lhs + rhs, row by row. DATA_TYPE/N0 are unused in the
// body (kept for signature consistency with the other T_* tile macros).
#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        })                                                        \
    })
4993
4994
// Add a scalar constant (cast to DATA_TYPE, then broadcast by the vector +) to every
// element of the tile: dst = lhs + rhs_constant.
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
        })                                                        \
    })
5002
// Broadcast element-wise wrappers: *_RHS_X variants broadcast rhs[0] across all lhs rows
// (note T_ELTWISE_BROADCAST_ADD_X and *_RHS_X_ADD are intentionally the same mapping);
// *_LHS_X variants broadcast lhs[0] across all rhs rows.
#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
5014
5015
// Multiply every element of the tile by a scalar constant (cast to DATA_TYPE, broadcast
// by the vector *): dst = lhs * rhs_constant.
#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
        })                                                        \
    })
5023
5024
// Element-wise op with rhs broadcast: applies T_ELWISE_OP(lhs[m], rhs[0]) for every row m,
// converting both operands to a DST_DATA_TYPE vector of width N0 first.
#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
5032
5033
// Element-wise op with lhs broadcast: applies T_ELWISE_OP(lhs[0], rhs[m]) for every row m,
// converting both operands to a DST_DATA_TYPE vector of width N0 first.
#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
5041
// Non-broadcast element-wise wrappers: tile-by-tile add / sub / div / mul via T_ELTWISE.
#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
5046
5047
// Element-wise op on two same-shape tiles: dst[m] = T_ELWISE_OP(lhs[m], rhs[m]) per row,
// converting both operands to a DST_DATA_TYPE vector of width N0 first.
#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
5055
5056
// Row-wise floor(): converts each row to a DST_DATA_TYPE vector of width N0 and takes
// floor; DST_DATA_TYPE must therefore be a floating-point type for floor() to resolve.
#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
5064
5065
// Tile matrix-multiply dispatch: first on the LHS/RHS layouts (only NT x T is defined
// here), then on the (lhs, rhs, dst) type triple, routing float/half combinations to the
// fma-based T_MMUL_NT_T_FLOAT and 8-bit integer combinations to the dot-product-based
// T_MMUL_NT_T_INTEGER8.
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
// Floating-point M0xK0 * (N0xK0)^T tile multiply-accumulate: dst[m][n] += lhs[m][.] dot
// rhs[n][.], one fma per (m, n, k). Note this variant uses a plain { } block, not a
// statement expression like its siblings.
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
    {                                                                                     \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
        {                                                                                 \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
            {                                                                             \
                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
                {                                                                         \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                })                                                                        \
            })                                                                            \
        })                                                                                \
    }
5087
// 8-bit integer tile multiply-accumulate: for each (m, n), accumulate the K0-wide dot
// product of lhs row m and rhs row n into dst[m][n] via DOT_PRODUCT_INTEGER8 (which may
// map to hardware dot-product instructions where available — defined elsewhere in this
// file).
#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
    ({ \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            })                                                                                             \
        })                                                                                             \
    })
5098
5099#endif
5100
5101
5102
5103
/** Direct convolution for NHWC tensors: dst = conv(src, wei) [+ bia], with
 *  optional requantization (IS_QUANTIZED) and fused activation.
 *
 * Work decomposition (one work-item computes an M0 x N0 output tile):
 *  - dim 0: output channels, vector width N0, leftover block PARTIAL_N0
 *  - dim 1: flattened output spatial position (x + y * DST_WIDTH), block M0
 *  - dim 2: batch index
 *
 * Compile-time parameters (passed as build options): tensor dimensions
 * (SRC_*/DST_*/WEI_*), STRIDE_X/Y, PAD_LEFT/TOP, data types, tile sizes
 * M0/N0/K0, and quantization constants (SRC_OFFSET, WEI_OFFSET, DST_OFFSET,
 * DST_SHIFT, DST_MULTIPLIER) when IS_QUANTIZED is defined.
 */
__kernel void direct_convolution_nhwc(
    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bia)
#endif
)
{
// Local aliases for the build-time dimension constants; #undef'd at the end.
#define _IWEI_WIDTH WEI_WIDTH
#define _IWEI_HEIGHT WEI_HEIGHT
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _ISRC_CHANNELS SRC_CHANNELS
#define _IDST_WIDTH DST_WIDTH
#define _IDST_HEIGHT DST_HEIGHT
#define _IDST_CHANNELS DST_CHANNELS
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

// The tile that is finally stored: the requantized tile 'cq' when quantized,
// otherwise the raw accumulator tile 'c'.
#if defined(IS_QUANTIZED)
#define _IOUTPUT_TILE cq
#else
#define _IOUTPUT_TILE c
#endif

    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM (output channel) base
    const int mout = GET_SPATIAL_IDX(1, M0, 0);          // flattened spatial base (x + y * DST_WIDTH)
    const int bout = GET_SPATIAL_IDX(2, 1, 0);           // batch index

    // Top-left input coordinate for each of the M0 output positions,
    // already shifted by the padding.
    TILE(int, 1, M0, xi);
    TILE(int, 1, M0, yi);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xi[0].s[i] = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
        yi[0].s[i] = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
        xi[0].s[i] -= PAD_LEFT;
        yi[0].s[i] -= PAD_TOP;
    })

    // Accumulator tile, zero-initialized.
    TILE(ACC_DATA_TYPE, M0, N0, c);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    // Loop over the kernel spatial positions (flattened: i = xk + yk * WEI_WIDTH).
    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

        // Indirect row indices into src for the M0 output positions; set to -1
        // when the tap falls outside the input (padding region).
        TILE(int, 1, M0, my);

        // NOTE: the unrolling variable 'i' here shadows the outer loop counter;
        // inside this block 'i' is the output-row index, not the kernel position.
        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            int x_s    = xi[0].s[i] + xk;
            int y_s    = yi[0].s[i] + yk;
            my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
            my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
            // select(a, b, c) = c ? b : a — any out-of-bounds coordinate
            // collapses the index to the -1 sentinel.
            my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
            my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
        })

        // Main channel loop, K0 channels per iteration.
        int ck = 0;
        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
        {
            TILE(SRC_DATA_TYPE, M0, K0, a);
            TILE(WEI_DATA_TYPE, N0, K0, b);

            // Pre-fill both tiles with ZERO_VALUE; presumably T_LOAD2D_INDIRECT
            // skips rows whose index is the -1 sentinel, so padded taps
            // contribute ZERO_VALUE to the accumulation — confirm against the
            // helper's definition.
            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = ZERO_VALUE;
            })

            // Gather M0 x K0 input values through the 'my' indirect indices.
            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            // Load N0 x K0 weights; 'i' is the outer kernel-position index, so
            // the weight row is (output channel) * (WEI_WIDTH * WEI_HEIGHT) + i.
            T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            // c += a * b^T
            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

            // Quantization zero-point cross terms; presumably a no-op for
            // non-quantized builds where the offsets are zero — confirm.
            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }

// Leftover channel loop (one channel at a time) when SRC_CHANNELS is not a
// multiple of K0; same steps as the main loop with K0 = 1.
#if defined(LEFTOVER_LOOP)

        for(; ck < _ISRC_CHANNELS; ++ck)
        {
            TILE(SRC_DATA_TYPE, M0, 1, a);
            TILE(WEI_DATA_TYPE, N0, 1, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = ZERO_VALUE;
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = ZERO_VALUE;
            })

            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);

            // NOTE(review): tensor type is hard-coded to BUFFER here, unlike the
            // main loop which uses WEI_TENSOR_TYPE — presumably intentional for
            // the scalar leftover path; confirm against the host-side code.
            T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);

            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
        }
#endif
    }

    // Constant zero-point term of the quantized expansion:
    // (kernel taps) * (channels) * SRC_OFFSET * WEI_OFFSET.
    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);

#if defined(HAS_BIAS)
    TILE(BIA_DATA_TYPE, 1, N0, bias0);

    T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);

    // Broadcast the per-channel bias row across the M0 output rows.
    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);

#endif

#if defined(IS_QUANTIZED)

    // Requantize the int accumulators to the destination 8-bit type.
    TILE(DST_DATA_TYPE, M0, N0, cq);

    T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
#endif

    // Fused activation applied in place on the tile that will be stored.
    T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);

    // Destination row indices: clamp to the last spatial row so over-reaching
    // M0 rows rewrite a valid location, then offset by the batch.
    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
        dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
    })

    // True only for the work-item that owns the partial N0 block (x == 0);
    // selects the narrow-store path inside the store helper.
    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;

    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);

#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
})"
5294})"