xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/gemm_utils.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2
3
4
5
6#ifndef ARM_COMPUTE_HELPER_H
7#define ARM_COMPUTE_HELPER_H
8
9
10
11
12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
13    VSTORE(N0)                                                 \
14    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
15
16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
17    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
18    VSTORE(N0)                                                 \
19    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
20
21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
22    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
23    VSTORE(N0)                                                 \
24    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
25
26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
27    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
28    VSTORE(N0)                                                 \
29    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
30
31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
32    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
33    VSTORE(N0)                                                 \
34    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
35
36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
37    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
38    VSTORE(N0)                                                 \
39    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
40
41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
42    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
43    VSTORE(N0)                                                 \
44    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
45
46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
47    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
48    VSTORE(N0)                                                 \
49    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
50
51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
52    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
53    VSTORE(N0)                                                 \
54    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
55
56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
57    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
58    VSTORE(N0)                                                  \
59    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
60
61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
62    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
63    VSTORE(N0)                                                  \
64    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
65
66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
67    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
68    VSTORE(N0)                                                  \
69    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
70
71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
72    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
73    VSTORE(N0)                                                  \
74    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
75
76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
77    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
78    VSTORE(N0)                                                  \
79    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
80
81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
82    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
83    VSTORE(N0)                                                  \
84    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
85
86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
87    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
88    VSTORE(N0)                                                  \
89    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90
91
92
93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
94    VSTORE(N0)                                                         \
95    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
96
97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
98    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
99    VSTORE(N0)                                                         \
100    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
101
102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
103    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
104    VSTORE(N0)                                                         \
105    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
106
107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
108    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
109    VSTORE(N0)                                                         \
110    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
111
112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
113    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
114    VSTORE(N0)                                                         \
115    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
116
117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
118    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
119    VSTORE(N0)                                                         \
120    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
121
122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
123    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
124    VSTORE(N0)                                                         \
125    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
126
127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
128    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
129    VSTORE(N0)                                                         \
130    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
131
132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
133    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
134    VSTORE(N0)                                                         \
135    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136
137#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
138    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
139    VSTORE(N0)                                                     \
140    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141
142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
143    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
144    VSTORE(N0)                                                          \
145    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
146
147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
148    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
149    VSTORE(N0)                                                          \
150    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
151
152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
153    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
154    VSTORE(N0)                                                          \
155    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156
157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
158    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
159    VSTORE(N0)                                                          \
160    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
161
162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
163    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
164    VSTORE(N0)                                                          \
165    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
166
167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
168    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
169    VSTORE(N0)                                                          \
170    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171
172
173
174
175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177
178
179
180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182
183
184
185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
186    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
187    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
188
189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
190    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
191    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
192    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
193
194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
195    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
196    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
197    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
198
199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
200    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
201    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
202    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
203
204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
205    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
206    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
207    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
208
209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
210    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
211    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
212    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
213
214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
215    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
216    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
217    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
218
219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
220    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
221    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
222    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
223
224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
225    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
226    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
227    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
228
229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
230    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
231    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
232    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
233
234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
235    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
236    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
237    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
238
239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
240    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
241    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
242    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
243
244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
245    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
246    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
247    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
248
249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
250    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
251    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
252    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
253
254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
255    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
256    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
257    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
258
259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
260    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
261    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
262    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263
264
265
266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268
269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
270    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
271    {                                                                                                                                                     \
272        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
273    }                                                                                                                                                     \
274    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
275    {                                                                                                                                                     \
276        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
277    }                                                                                                                                                     \
278    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
279    {                                                                                                                                                     \
280        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
281    }                                                                                                                                                     \
282    else                                                                                                                                                  \
283    {                                                                                                                                                     \
284        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
285    }
286
287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
288    if(!(PARTIAL_COND_X))                                                                                         \
289    {                                                                                                             \
290        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
291    }                                                                                                             \
292    else                                                                                                          \
293    {                                                                                                             \
294        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
295    }
296
297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
298    if(!(PARTIAL_COND_Y))                                                                                         \
299    {                                                                                                             \
300        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
301    }                                                                                                             \
302    else                                                                                                          \
303    {                                                                                                             \
304        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
305    }
306
307
308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
309
310
311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
312
313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
314    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
315
316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
317
318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
319    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
320
321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
322
323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
324    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
325
326#else
327
328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
329    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
330
331#endif
332
333#endif
334
335
336#if defined(PARTIAL_STORE_M0)
337
338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
339    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
340#else
341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
342    ((uint)(y * M0))
343#endif
344
345
346
347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
348    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349
350
351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
352#pragma OPENCL EXTENSION cl_khr_fp16 : enable
353#endif
354
355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
357#endif
358
359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
361#endif
362
363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
364#pragma OPENCL EXTENSION cl_arm_printf : enable
365#endif
366
367#define GPU_ARCH_MIDGARD 0x100
368#define GPU_ARCH_BIFROST 0x200
369#define GPU_ARCH_VALHALL 0x300
370
371
372#define CONCAT(a, b) a##b
373
374
375#define EXPAND(x) x
376
377
378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
379
380
381#define REV1(x) ((x))
382#define REV2(x) ((x).s10)
383#define REV3(x) ((x).s210)
384#define REV4(x) ((x).s3210)
385#define REV8(x) ((x).s76543210)
386#define REV16(x) ((x).sFEDCBA9876543210)
387
388
389
390#define REVERSE_STR(x, s) REV##s((x))
391#define REVERSE(x, s) REVERSE_STR(x, s)
392
393
394
395#define ROT1_0(x) ((x))
396#define ROT1_1(x) ((x))
397
398#define ROT2_0(x) ((x))
399#define ROT2_1(x) ((x).s10)
400#define ROT2_2(x) ((x))
401
402#define ROT3_0(x) ((x))
403#define ROT3_1(x) ((x).s201)
404#define ROT3_2(x) ((x).s120)
405#define ROT3_3(x) ((x))
406
407#define ROT4_0(x) ((x))
408#define ROT4_1(x) ((x).s3012)
409#define ROT4_2(x) ((x).s2301)
410#define ROT4_3(x) ((x).s1230)
411#define ROT4_4(x) ((x))
412
413#define ROT8_0(x) ((x))
414#define ROT8_1(x) ((x).s70123456)
415#define ROT8_2(x) ((x).s67012345)
416#define ROT8_3(x) ((x).s56701234)
417#define ROT8_4(x) ((x).s45670123)
418#define ROT8_5(x) ((x).s34567012)
419#define ROT8_6(x) ((x).s23456701)
420#define ROT8_7(x) ((x).s12345670)
421#define ROT8_8(x) ((x))
422
423#define ROT16_0(x) ((x))
424#define ROT16_1(x) ((x).sF0123456789ABCDE)
425#define ROT16_2(x) ((x).sEF0123456789ABCD)
426#define ROT16_3(x) ((x).sDEF0123456789ABC)
427#define ROT16_4(x) ((x).sCDEF0123456789AB)
428#define ROT16_5(x) ((x).sBCDEF0123456789A)
429#define ROT16_6(x) ((x).sABCDEF0123456789)
430#define ROT16_7(x) ((x).s9ABCDEF012345678)
431#define ROT16_8(x) ((x).s89ABCDEF01234567)
432#define ROT16_9(x) ((x).s789ABCDEF0123456)
433#define ROT16_10(x) ((x).s6789ABCDEF012345)
434#define ROT16_11(x) ((x).s56789ABCDEF01234)
435#define ROT16_12(x) ((x).s456789ABCDEF0123)
436#define ROT16_13(x) ((x).s3456789ABCDEF012)
437#define ROT16_14(x) ((x).s23456789ABCDEF01)
438#define ROT16_15(x) ((x).s123456789ABCDEF0)
439#define ROT16_16(x) ((x))
440
441
442
443#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445
446
447
448#define V_OFFS1(dt) (dt##1)(0)
449#define V_OFFS2(dt) (dt##2)(0, 1)
450#define V_OFFS3(dt) (dt##3)(0, 1, 2)
451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
454
455
456
457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
459
460
461#define VLOAD_STR(size) vload##size
462#define VLOAD(size) VLOAD_STR(size)
463
464
465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
467
468#define NO_LOAD(data, offs, ptr) \
469    {                            \
470    }
471
472
473#define vload_partial_1_0 NO_LOAD
474#define vload_partial_1_1 vload1
475#define vload_partial_1_2 NO_LOAD
476#define vload_partial_1_3 NO_LOAD
477#define vload_partial_1_4 NO_LOAD
478#define vload_partial_1_5 NO_LOAD
479#define vload_partial_1_6 NO_LOAD
480#define vload_partial_1_7 NO_LOAD
481#define vload_partial_1_8 NO_LOAD
482#define vload_partial_1_9 NO_LOAD
483#define vload_partial_1_10 NO_LOAD
484#define vload_partial_1_11 NO_LOAD
485#define vload_partial_1_12 NO_LOAD
486#define vload_partial_1_13 NO_LOAD
487#define vload_partial_1_14 NO_LOAD
488#define vload_partial_1_15 NO_LOAD
489#define vload_partial_1_16 NO_LOAD
490
491#define vload_partial_2_0 NO_LOAD
492#define vload_partial_2_1 vload_partial_1
493#define vload_partial_2_2 vload_partial_2
494#define vload_partial_2_3 NO_LOAD
495#define vload_partial_2_4 NO_LOAD
496#define vload_partial_2_5 NO_LOAD
497#define vload_partial_2_6 NO_LOAD
498#define vload_partial_2_7 NO_LOAD
499#define vload_partial_2_8 NO_LOAD
500#define vload_partial_2_9 NO_LOAD
501#define vload_partial_2_10 NO_LOAD
502#define vload_partial_2_11 NO_LOAD
503#define vload_partial_2_12 NO_LOAD
504#define vload_partial_2_13 NO_LOAD
505#define vload_partial_2_14 NO_LOAD
506#define vload_partial_2_15 NO_LOAD
507#define vload_partial_2_16 NO_LOAD
508
509#define vload_partial_3_0 NO_LOAD
510#define vload_partial_3_1 vload_partial_1
511#define vload_partial_3_2 vload_partial_2
512#define vload_partial_3_3 vload_partial_3
513#define vload_partial_3_4 NO_LOAD
514#define vload_partial_3_5 NO_LOAD
515#define vload_partial_3_6 NO_LOAD
516#define vload_partial_3_7 NO_LOAD
517#define vload_partial_3_8 NO_LOAD
518#define vload_partial_3_9 NO_LOAD
519#define vload_partial_3_10 NO_LOAD
520#define vload_partial_3_11 NO_LOAD
521#define vload_partial_3_12 NO_LOAD
522#define vload_partial_3_13 NO_LOAD
523#define vload_partial_3_14 NO_LOAD
524#define vload_partial_3_15 NO_LOAD
525#define vload_partial_3_16 NO_LOAD
526
527#define vload_partial_4_0 NO_LOAD
528#define vload_partial_4_1 vload_partial_1
529#define vload_partial_4_2 vload_partial_2
530#define vload_partial_4_3 vload_partial_3
531#define vload_partial_4_4 vload_partial_4
532#define vload_partial_4_5 NO_LOAD
533#define vload_partial_4_6 NO_LOAD
534#define vload_partial_4_7 NO_LOAD
535#define vload_partial_4_8 NO_LOAD
536#define vload_partial_4_9 NO_LOAD
537#define vload_partial_4_10 NO_LOAD
538#define vload_partial_4_11 NO_LOAD
539#define vload_partial_4_12 NO_LOAD
540#define vload_partial_4_13 NO_LOAD
541#define vload_partial_4_14 NO_LOAD
542#define vload_partial_4_15 NO_LOAD
543#define vload_partial_4_16 NO_LOAD
544
545#define vload_partial_8_0 NO_LOAD
546#define vload_partial_8_1 vload_partial_1
547#define vload_partial_8_2 vload_partial_2
548#define vload_partial_8_3 vload_partial_3
549#define vload_partial_8_4 vload_partial_4
550#define vload_partial_8_5 vload_partial_5
551#define vload_partial_8_6 vload_partial_6
552#define vload_partial_8_7 vload_partial_7
553#define vload_partial_8_8 vload_partial_8
554#define vload_partial_8_9 NO_LOAD
555#define vload_partial_8_10 NO_LOAD
556#define vload_partial_8_11 NO_LOAD
557#define vload_partial_8_12 NO_LOAD
558#define vload_partial_8_13 NO_LOAD
559#define vload_partial_8_14 NO_LOAD
560#define vload_partial_8_15 NO_LOAD
561#define vload_partial_8_16 NO_LOAD
562
563#define vload_partial_16_0 NO_LOAD
564#define vload_partial_16_1 vload_partial_1
565#define vload_partial_16_2 vload_partial_2
566#define vload_partial_16_3 vload_partial_3
567#define vload_partial_16_4 vload_partial_4
568#define vload_partial_16_5 vload_partial_5
569#define vload_partial_16_6 vload_partial_6
570#define vload_partial_16_7 vload_partial_7
571#define vload_partial_16_8 vload_partial_8
572#define vload_partial_16_9 vload_partial_9
573#define vload_partial_16_10 vload_partial_10
574#define vload_partial_16_11 vload_partial_11
575#define vload_partial_16_12 vload_partial_12
576#define vload_partial_16_13 vload_partial_13
577#define vload_partial_16_14 vload_partial_14
578#define vload_partial_16_15 vload_partial_15
579#define vload_partial_16_16 vload_partial_16
580
581
582#define vload_partial_1(DATA, OFFSET, PTR) \
583    DATA.s0 = vload1(OFFSET, PTR);
584
585#define vload_partial_2(DATA, OFFSET, PTR) \
586    DATA.s01 = vload2(OFFSET, PTR);
587
588#define vload_partial_3(DATA, OFFSET, PTR) \
589    DATA.s012 = vload3(OFFSET, PTR);
590
591#define vload_partial_4(DATA, OFFSET, PTR) \
592    DATA.s0123 = vload4(OFFSET, PTR);
593
594#define vload_partial_5(DATA, OFFSET, PTR)    \
595    vload_partial_4(DATA.s0123, OFFSET, PTR); \
596    DATA.s4 = vload1(OFFSET, PTR + 4);
597
598#define vload_partial_6(DATA, OFFSET, PTR)    \
599    vload_partial_4(DATA.s0123, OFFSET, PTR); \
600    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
601
602#define vload_partial_7(DATA, OFFSET, PTR)    \
603    vload_partial_4(DATA.s0123, OFFSET, PTR); \
604    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
605
606#define vload_partial_8(DATA, OFFSET, PTR) \
607    DATA.s01234567 = vload8(OFFSET, PTR);
608
609#define vload_partial_9(DATA, OFFSET, PTR)        \
610    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
611    DATA.s8 = vload1(OFFSET, PTR + 8);
612
613#define vload_partial_10(DATA, OFFSET, PTR)       \
614    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
615    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
616
617#define vload_partial_11(DATA, OFFSET, PTR)       \
618    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
619    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
620
621#define vload_partial_12(DATA, OFFSET, PTR)       \
622    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
623    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
624
625#define vload_partial_13(DATA, OFFSET, PTR)       \
626    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
627    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
628
629#define vload_partial_14(DATA, OFFSET, PTR)       \
630    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
631    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
632
633#define vload_partial_15(DATA, OFFSET, PTR)       \
634    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
635    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
636
637#define vload_partial_16(DATA, OFFSET, PTR) \
638    DATA = vload16(OFFSET, PTR);
639
640
641
642#define PIXEL_UNIT4 1
643#define PIXEL_UNIT8 2
644#define PIXEL_UNIT16 4
645
646
647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
649
650
651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
654
655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
659#endif
660
661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
662#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
664
665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
669#endif
670
671
672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
674
675
676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678
679#define VSTORE_STR(size) vstore##size
680#define VSTORE(size) VSTORE_STR(size)
681
682#define float1 float
683#define half1 half
684#define char1 char
685#define uchar1 uchar
686#define short1 short
687#define ushort1 ushort
688#define int1 int
689#define uint1 uint
690#define long1 long
691#define ulong1 ulong
692#define double1 double
693
694#define vload1(OFFSET, PTR) *(OFFSET + PTR)
695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
696
697
698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
700
701#define NO_STORE(data, offs, ptr) \
702    {                             \
703    }
704
705
706#define vstore_partial_1_0 NO_STORE
707#define vstore_partial_1_1 vstore1
708#define vstore_partial_1_2 NO_STORE
709#define vstore_partial_1_3 NO_STORE
710#define vstore_partial_1_4 NO_STORE
711#define vstore_partial_1_5 NO_STORE
712#define vstore_partial_1_6 NO_STORE
713#define vstore_partial_1_7 NO_STORE
714#define vstore_partial_1_8 NO_STORE
715#define vstore_partial_1_9 NO_STORE
716#define vstore_partial_1_10 NO_STORE
717#define vstore_partial_1_11 NO_STORE
718#define vstore_partial_1_12 NO_STORE
719#define vstore_partial_1_13 NO_STORE
720#define vstore_partial_1_14 NO_STORE
721#define vstore_partial_1_15 NO_STORE
722#define vstore_partial_1_16 NO_STORE
723
724#define vstore_partial_2_0 NO_STORE
725#define vstore_partial_2_1 vstore_partial_1
726#define vstore_partial_2_2 vstore_partial_2
727#define vstore_partial_2_3 NO_STORE
728#define vstore_partial_2_4 NO_STORE
729#define vstore_partial_2_5 NO_STORE
730#define vstore_partial_2_6 NO_STORE
731#define vstore_partial_2_7 NO_STORE
732#define vstore_partial_2_8 NO_STORE
733#define vstore_partial_2_9 NO_STORE
734#define vstore_partial_2_10 NO_STORE
735#define vstore_partial_2_11 NO_STORE
736#define vstore_partial_2_12 NO_STORE
737#define vstore_partial_2_13 NO_STORE
738#define vstore_partial_2_14 NO_STORE
739#define vstore_partial_2_15 NO_STORE
740#define vstore_partial_2_16 NO_STORE
741
742#define vstore_partial_3_0 NO_STORE
743#define vstore_partial_3_1 vstore_partial_1
744#define vstore_partial_3_2 vstore_partial_2
745#define vstore_partial_3_3 vstore_partial_3
746#define vstore_partial_3_4 NO_STORE
747#define vstore_partial_3_5 NO_STORE
748#define vstore_partial_3_6 NO_STORE
749#define vstore_partial_3_7 NO_STORE
750#define vstore_partial_3_8 NO_STORE
751#define vstore_partial_3_9 NO_STORE
752#define vstore_partial_3_10 NO_STORE
753#define vstore_partial_3_11 NO_STORE
754#define vstore_partial_3_12 NO_STORE
755#define vstore_partial_3_13 NO_STORE
756#define vstore_partial_3_14 NO_STORE
757#define vstore_partial_3_15 NO_STORE
758#define vstore_partial_3_16 NO_STORE
759
760#define vstore_partial_4_0 NO_STORE
761#define vstore_partial_4_1 vstore_partial_1
762#define vstore_partial_4_2 vstore_partial_2
763#define vstore_partial_4_3 vstore_partial_3
764#define vstore_partial_4_4 vstore_partial_4
765#define vstore_partial_4_5 NO_STORE
766#define vstore_partial_4_6 NO_STORE
767#define vstore_partial_4_7 NO_STORE
768#define vstore_partial_4_8 NO_STORE
769#define vstore_partial_4_9 NO_STORE
770#define vstore_partial_4_10 NO_STORE
771#define vstore_partial_4_11 NO_STORE
772#define vstore_partial_4_12 NO_STORE
773#define vstore_partial_4_13 NO_STORE
774#define vstore_partial_4_14 NO_STORE
775#define vstore_partial_4_15 NO_STORE
776#define vstore_partial_4_16 NO_STORE
777
778#define vstore_partial_8_0 NO_STORE
779#define vstore_partial_8_1 vstore_partial_1
780#define vstore_partial_8_2 vstore_partial_2
781#define vstore_partial_8_3 vstore_partial_3
782#define vstore_partial_8_4 vstore_partial_4
783#define vstore_partial_8_5 vstore_partial_5
784#define vstore_partial_8_6 vstore_partial_6
785#define vstore_partial_8_7 vstore_partial_7
786#define vstore_partial_8_8 vstore_partial_8
787#define vstore_partial_8_9 NO_STORE
788#define vstore_partial_8_10 NO_STORE
789#define vstore_partial_8_11 NO_STORE
790#define vstore_partial_8_12 NO_STORE
791#define vstore_partial_8_13 NO_STORE
792#define vstore_partial_8_14 NO_STORE
793#define vstore_partial_8_15 NO_STORE
794#define vstore_partial_8_16 NO_STORE
795
796#define vstore_partial_16_0 NO_STORE
797#define vstore_partial_16_1 vstore_partial_1
798#define vstore_partial_16_2 vstore_partial_2
799#define vstore_partial_16_3 vstore_partial_3
800#define vstore_partial_16_4 vstore_partial_4
801#define vstore_partial_16_5 vstore_partial_5
802#define vstore_partial_16_6 vstore_partial_6
803#define vstore_partial_16_7 vstore_partial_7
804#define vstore_partial_16_8 vstore_partial_8
805#define vstore_partial_16_9 vstore_partial_9
806#define vstore_partial_16_10 vstore_partial_10
807#define vstore_partial_16_11 vstore_partial_11
808#define vstore_partial_16_12 vstore_partial_12
809#define vstore_partial_16_13 vstore_partial_13
810#define vstore_partial_16_14 vstore_partial_14
811#define vstore_partial_16_15 vstore_partial_15
812#define vstore_partial_16_16 vstore_partial_16
813
814
815#define vstore_partial_1(DATA, OFFSET, PTR) \
816    vstore1(DATA.s0, OFFSET, PTR);
817
818#define vstore_partial_2(DATA, OFFSET, PTR) \
819    vstore2(DATA.s01, OFFSET, PTR);
820
821#define vstore_partial_3(DATA, OFFSET, PTR) \
822    vstore3(DATA.s012, OFFSET, PTR);
823
824#define vstore_partial_4(DATA, OFFSET, PTR) \
825    vstore4(DATA.s0123, OFFSET, PTR);
826
827#define vstore_partial_5(DATA, OFFSET, PTR)    \
828    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
829    vstore1(DATA.s4, OFFSET, PTR + 4);
830
831#define vstore_partial_6(DATA, OFFSET, PTR)    \
832    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
833    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
834
835#define vstore_partial_7(DATA, OFFSET, PTR)    \
836    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
837    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
838
839#define vstore_partial_8(DATA, OFFSET, PTR) \
840    vstore8(DATA.s01234567, OFFSET, PTR);
841
842#define vstore_partial_9(DATA, OFFSET, PTR)        \
843    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
844    vstore1(DATA.s8, OFFSET, PTR + 8);
845
846#define vstore_partial_10(DATA, OFFSET, PTR)       \
847    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
848    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
849
850#define vstore_partial_11(DATA, OFFSET, PTR)       \
851    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
852    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
853
854#define vstore_partial_12(DATA, OFFSET, PTR)       \
855    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
856    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
857
858#define vstore_partial_13(DATA, OFFSET, PTR)       \
859    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
860    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
861
862#define vstore_partial_14(DATA, OFFSET, PTR)       \
863    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
864    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
865
866#define vstore_partial_15(DATA, OFFSET, PTR)       \
867    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
868    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
869
870#define vstore_partial_16(DATA, OFFSET, PTR) \
871    vstore16(DATA, OFFSET, PTR);
872
873
874
875
876
877#define convert_float_sat convert_float
878#define convert_float1_sat convert_float
879#define convert_float2_sat convert_float2
880#define convert_float3_sat convert_float3
881#define convert_float4_sat convert_float4
882#define convert_float8_sat convert_float8
883#define convert_float16_sat convert_float16
884#define convert_half_sat convert_float
885#define convert_half1_sat convert_half
886#define convert_half2_sat convert_half2
887#define convert_half3_sat convert_half3
888#define convert_half4_sat convert_half4
889#define convert_half8_sat convert_half8
890#define convert_half16_sat convert_half16
891
892#define convert_float1 convert_float
893#define convert_half1 convert_half
894#define convert_char1 convert_char
895#define convert_uchar1 convert_uchar
896#define convert_short1 convert_short
897#define convert_ushort1 convert_ushort
898#define convert_int1 convert_int
899#define convert_uint1 convert_uint
900#define convert_long1 convert_long
901#define convert_ulong1 convert_ulong
902#define convert_double1 convert_double
903
904#define convert_char1_sat convert_char_sat
905#define convert_uchar1_sat convert_uchar_sat
906#define convert_uchar2_sat convert_uchar2_sat
907#define convert_uchar3_sat convert_uchar3_sat
908#define convert_uchar4_sat convert_uchar4_sat
909#define convert_uchar8_sat convert_uchar8_sat
910#define convert_uchar16_sat convert_uchar16_sat
911#define convert_short1_sat convert_short_sat
912#define convert_ushort1_sat convert_ushort_sat
913#define convert_int1_sat convert_int_sat
914#define convert_uint1_sat convert_uint_sat
915#define convert_long1_sat convert_long_sat
916#define convert_ulong1_sat convert_ulong_sat
917#define convert_double1_sat convert_double_sat
918
919#define VEC_DATA_TYPE_STR(type, size) type##size
920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
921
922#define CONVERT_STR(x, type) (convert_##type((x)))
923#define CONVERT(x, type) CONVERT_STR(x, type)
924
925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
927
928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
930
931#define select_vec_dt_uchar(size) uchar##size
932#define select_vec_dt_char(size) char##size
933#define select_vec_dt_ushort(size) ushort##size
934#define select_vec_dt_short(size) short##size
935#define select_vec_dt_half(size) short##size
936#define select_vec_dt_uint(size) uint##size
937#define select_vec_dt_int(size) int##size
938#define select_vec_dt_float(size) int##size
939#define select_vec_dt_ulong(size) ulong##size
940#define select_vec_dt_long(size) long##size
941
942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
945
946#define signed_int_vec_dt_uchar(size) char##size
947#define signed_int_vec_dt_char(size) char##size
948#define signed_int_vec_dt_ushort(size) short##size
949#define signed_int_vec_dt_short(size) short##size
950#define signed_int_vec_dt_half(size) short##size
951#define signed_int_vec_dt_uint(size) int##size
952#define signed_int_vec_dt_int(size) int##size
953#define signed_int_vec_dt_float(size) int##size
954#define signed_int_vec_dt_ulong(size) long##size
955#define signed_int_vec_dt_long(size) long##size
956
957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960
961#define sum_reduce_1(x) (x)
962#define sum_reduce_2(x) ((x).s0) + ((x).s1)
963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
967
968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970
971#define prod_reduce_1(x) (x)
972#define prod_reduce_2(x) ((x).s0) * ((x).s1)
973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
977
978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980
981#define max_reduce_1(x) (x)
982#define max_reduce_2(x) max(((x).s0), ((x).s1))
983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
986#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
987
988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
990
991#define VECTOR_DECLARATION(name)     \
992    __global uchar *name##_ptr,      \
993    uint        name##_stride_x, \
994    uint        name##_step_x,   \
995    uint        name##_offset_first_element_in_bytes
996
997#define IMAGE_DECLARATION(name)      \
998    __global uchar *name##_ptr,      \
999    uint        name##_stride_x, \
1000    uint        name##_step_x,   \
1001    uint        name##_stride_y, \
1002    uint        name##_step_y,   \
1003    uint        name##_offset_first_element_in_bytes
1004
1005#define TENSOR3D_DECLARATION(name)   \
1006    __global uchar *name##_ptr,      \
1007    uint        name##_stride_x, \
1008    uint        name##_step_x,   \
1009    uint        name##_stride_y, \
1010    uint        name##_step_y,   \
1011    uint        name##_stride_z, \
1012    uint        name##_step_z,   \
1013    uint        name##_offset_first_element_in_bytes
1014
1015#define TENSOR4D_DECLARATION(name)   \
1016    __global uchar *name##_ptr,      \
1017    uint        name##_stride_x, \
1018    uint        name##_step_x,   \
1019    uint        name##_stride_y, \
1020    uint        name##_step_y,   \
1021    uint        name##_stride_z, \
1022    uint        name##_step_z,   \
1023    uint        name##_stride_w, \
1024    uint        name##_step_w,   \
1025    uint        name##_offset_first_element_in_bytes
1026
1027#define TENSOR5D_DECLARATION(name)   \
1028    __global uchar *name##_ptr,      \
1029    uint        name##_stride_x, \
1030    uint        name##_step_x,   \
1031    uint        name##_stride_y, \
1032    uint        name##_step_y,   \
1033    uint        name##_stride_z, \
1034    uint        name##_step_z,   \
1035    uint        name##_stride_w, \
1036    uint        name##_step_w,   \
1037    uint        name##_stride_v, \
1038    uint        name##_step_v,   \
1039    uint        name##_offset_first_element_in_bytes
1040
1041#define CONVERT_TO_VECTOR_STRUCT(name) \
1042    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1043
1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1045    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1046
1047#define CONVERT_TO_IMAGE_STRUCT(name) \
1048    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1049
1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1051    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1052
1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1054    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1055
1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1057    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1058
1059#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1060    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1061
1062#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1063    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1064                                 name##_stride_z, name##_step_z)
1065
1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1067    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1068
1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1070    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1071                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1072
1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1074    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1075
1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1077    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1078                           name##_stride_z, name##_step_z)
1079
1080
1081typedef struct Vector
1082{
1083    __global uchar *ptr;
1084    int             offset_first_element_in_bytes;
1085    int             stride_x;
1086} Vector;
1087
1088
1089typedef struct Image
1090{
1091    __global uchar *ptr;
1092    int             offset_first_element_in_bytes;
1093    int             stride_x;
1094    int             stride_y;
1095} Image;
1096
1097
1098typedef struct Tensor3D
1099{
1100    __global uchar *ptr;
1101    int             offset_first_element_in_bytes;
1102    int             stride_x;
1103    int             stride_y;
1104    int             stride_z;
1105} Tensor3D;
1106
1107
1108typedef struct Tensor4D
1109{
1110    __global uchar *ptr;
1111    int             offset_first_element_in_bytes;
1112    int             stride_x;
1113    int             stride_y;
1114    int             stride_z;
1115    int             stride_w;
1116} Tensor4D;
1117
1118
1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120{
1121    Vector vector =
1122    {
1123        .ptr                           = ptr,
1124        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125        .stride_x                      = stride_x,
1126    };
1127    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128    return vector;
1129}
1130
1131
1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133{
1134    Image img =
1135    {
1136        .ptr                           = ptr,
1137        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138        .stride_x                      = stride_x,
1139        .stride_y                      = stride_y
1140    };
1141    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142    return img;
1143}
1144
1145
1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147{
1148    Image img =
1149    {
1150        .ptr                           = ptr,
1151        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152        .stride_x                      = stride_x,
1153        .stride_y                      = stride_y
1154    };
1155    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156    return img;
1157}
1158
1159
1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161{
1162    Tensor3D tensor =
1163    {
1164        .ptr                           = ptr,
1165        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166        .stride_x                      = stride_x,
1167        .stride_y                      = stride_y,
1168        .stride_z                      = stride_z
1169    };
1170    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171    return tensor;
1172}
1173
1174
1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176{
1177    Tensor3D tensor =
1178    {
1179        .ptr                           = ptr,
1180        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181        .stride_x                      = stride_x,
1182        .stride_y                      = stride_y,
1183        .stride_z                      = stride_z
1184    };
1185    return tensor;
1186}
1187
1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189                                             uint step_w,
1190                                             uint mod_size)
1191{
1192    Tensor4D tensor =
1193    {
1194        .ptr                           = ptr,
1195        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196        .stride_x                      = stride_x,
1197        .stride_y                      = stride_y,
1198        .stride_z                      = stride_z,
1199        .stride_w                      = stride_w
1200    };
1201
1202    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203    return tensor;
1204}
1205
1206
1207inline __global const uchar *vector_offset(const Vector *vec, int x)
1208{
1209    return vec->ptr + x * vec->stride_x;
1210}
1211
1212
1213inline __global uchar *offset(const Image *img, int x, int y)
1214{
1215    return img->ptr + x * img->stride_x + y * img->stride_y;
1216}
1217
1218
1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220{
1221    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222}
1223
1224
1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226{
1227    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228}
1229
1230
1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232{
1233    uint num_elements = width * height;
1234
1235    const uint z = index / num_elements;
1236
1237    index %= num_elements;
1238
1239    const uint y = index / width;
1240
1241    index %= width;
1242
1243    const uint x = index;
1244
1245    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246}
1247
1248#endif
1249
1250#if GPU_ARCH == GPU_ARCH_BIFROST
1251#define MLA(a, b, c) (fma(c, b, a))
1252#else
1253#define MLA(a, b, c) ((b) * (c) + (a))
1254#endif
1255
1256
1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
1258
1259
1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
1261
1262
1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
1264
1265
1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
1267
1268
1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
1270
1271
1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1273
1274
1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
1276
1277
1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
1279
1280
1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
1282
1283
1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
1285
1286
1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
1288
1289
1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
1291
1292
1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
1294
1295
1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
1297
1298
1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
1300
1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1302
1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304
1305#ifndef ARM_COMPUTE_HELPER_H
1306#define ARM_COMPUTE_HELPER_H
1307
1308
1309
1310
1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1312    VSTORE(N0)                                                 \
1313    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1314
1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1316    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1317    VSTORE(N0)                                                 \
1318    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1319
1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1321    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1322    VSTORE(N0)                                                 \
1323    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1324
1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1326    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1327    VSTORE(N0)                                                 \
1328    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1329
1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1331    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1332    VSTORE(N0)                                                 \
1333    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1334
1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1336    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1337    VSTORE(N0)                                                 \
1338    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1339
1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1341    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1342    VSTORE(N0)                                                 \
1343    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1344
1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1346    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1347    VSTORE(N0)                                                 \
1348    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1349
1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1351    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1352    VSTORE(N0)                                                 \
1353    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1354
1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1356    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1357    VSTORE(N0)                                                  \
1358    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1359
1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1361    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1362    VSTORE(N0)                                                  \
1363    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1364
1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1366    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1367    VSTORE(N0)                                                  \
1368    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1369
1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1371    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1372    VSTORE(N0)                                                  \
1373    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1374
1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1376    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1377    VSTORE(N0)                                                  \
1378    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379
1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382    VSTORE(N0)                                                  \
1383    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384
1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387    VSTORE(N0)                                                  \
1388    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389
1390
1391
1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1393    VSTORE(N0)                                                         \
1394    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1395
1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1397    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1398    VSTORE(N0)                                                         \
1399    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1400
1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1402    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1403    VSTORE(N0)                                                         \
1404    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1405
1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1407    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1408    VSTORE(N0)                                                         \
1409    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1410
1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1412    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1413    VSTORE(N0)                                                         \
1414    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1415
1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1417    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1418    VSTORE(N0)                                                         \
1419    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1420
1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1422    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1423    VSTORE(N0)                                                         \
1424    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1425
1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1427    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1428    VSTORE(N0)                                                         \
1429    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430
1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433    VSTORE(N0)                                                         \
1434    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435
1436#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
1437    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1438    VSTORE(N0)                                                     \
1439    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440
1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443    VSTORE(N0)                                                          \
1444    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445
1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448    VSTORE(N0)                                                          \
1449    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450
1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453    VSTORE(N0)                                                          \
1454    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455
1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458    VSTORE(N0)                                                          \
1459    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460
1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463    VSTORE(N0)                                                          \
1464    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465
1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468    VSTORE(N0)                                                          \
1469    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470
1471
1472
1473
1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476
1477
1478
1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481
1482
1483
1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1485    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1486    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1487
1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1489    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1490    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1491    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1492
1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1494    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1495    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1496    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1497
1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1499    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1500    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1501    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1502
1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1504    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1505    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1506    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1507
1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1509    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1510    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1511    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1512
1513#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1514    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1515    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1516    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1517
1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1519    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1520    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1521    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1522
1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1524    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1525    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1526    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1527
1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1529    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1530    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1531    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1532
1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1534    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1535    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1536    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1537
1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1539    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1540    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1541    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1542
1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1544    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1545    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1546    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1547
1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1549    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1550    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1551    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1552
1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1554    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1555    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1556    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1557
1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1559    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1560    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1561    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562
1563
1564
1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567
1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1569    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1570    {                                                                                                                                                     \
1571        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1572    }                                                                                                                                                     \
1573    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1574    {                                                                                                                                                     \
1575        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1576    }                                                                                                                                                     \
1577    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1578    {                                                                                                                                                     \
1579        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1580    }                                                                                                                                                     \
1581    else                                                                                                                                                  \
1582    {                                                                                                                                                     \
1583        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1584    }
1585
1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1587    if(!(PARTIAL_COND_X))                                                                                         \
1588    {                                                                                                             \
1589        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1590    }                                                                                                             \
1591    else                                                                                                          \
1592    {                                                                                                             \
1593        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1594    }
1595
1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1597    if(!(PARTIAL_COND_Y))                                                                                         \
1598    {                                                                                                             \
1599        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1600    }                                                                                                             \
1601    else                                                                                                          \
1602    {                                                                                                             \
1603        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1604    }
1605
1606
1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1608
1609
1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1611
1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1613    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1614
1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1616
1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1618    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1619
1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1621
1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1623    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1624
1625#else
1626
1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1628    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1629
1630#endif
1631
1632#endif
1633
1634
1635#if defined(PARTIAL_STORE_M0)
1636
1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1638    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1639#else
1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1641    ((uint)(y * M0))
1642#endif
1643
1644
1645
1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1647    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648
1649
1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1652#endif
1653
1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1656#endif
1657
1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1660#endif
1661
1662#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1663#pragma OPENCL EXTENSION cl_arm_printf : enable
1664#endif
1665
1666#define GPU_ARCH_MIDGARD 0x100
1667#define GPU_ARCH_BIFROST 0x200
1668#define GPU_ARCH_VALHALL 0x300
1669
1670
1671#define CONCAT(a, b) a##b
1672
1673
1674#define EXPAND(x) x
1675
1676
1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678
1679
1680#define REV1(x) ((x))
1681#define REV2(x) ((x).s10)
1682#define REV3(x) ((x).s210)
1683#define REV4(x) ((x).s3210)
1684#define REV8(x) ((x).s76543210)
1685#define REV16(x) ((x).sFEDCBA9876543210)
1686
1687
1688
1689#define REVERSE_STR(x, s) REV##s((x))
1690#define REVERSE(x, s) REVERSE_STR(x, s)
1691
1692
1693
1694#define ROT1_0(x) ((x))
1695#define ROT1_1(x) ((x))
1696
1697#define ROT2_0(x) ((x))
1698#define ROT2_1(x) ((x).s10)
1699#define ROT2_2(x) ((x))
1700
1701#define ROT3_0(x) ((x))
1702#define ROT3_1(x) ((x).s201)
1703#define ROT3_2(x) ((x).s120)
1704#define ROT3_3(x) ((x))
1705
1706#define ROT4_0(x) ((x))
1707#define ROT4_1(x) ((x).s3012)
1708#define ROT4_2(x) ((x).s2301)
1709#define ROT4_3(x) ((x).s1230)
1710#define ROT4_4(x) ((x))
1711
1712#define ROT8_0(x) ((x))
1713#define ROT8_1(x) ((x).s70123456)
1714#define ROT8_2(x) ((x).s67012345)
1715#define ROT8_3(x) ((x).s56701234)
1716#define ROT8_4(x) ((x).s45670123)
1717#define ROT8_5(x) ((x).s34567012)
1718#define ROT8_6(x) ((x).s23456701)
1719#define ROT8_7(x) ((x).s12345670)
1720#define ROT8_8(x) ((x))
1721
1722#define ROT16_0(x) ((x))
1723#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728#define ROT16_6(x) ((x).sABCDEF0123456789)
1729#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738#define ROT16_16(x) ((x))
1739
1740
1741
1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744
1745
1746
1747#define V_OFFS1(dt) (dt##1)(0)
1748#define V_OFFS2(dt) (dt##2)(0, 1)
1749#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753
1754
1755
1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758
1759
1760#define VLOAD_STR(size) vload##size
1761#define VLOAD(size) VLOAD_STR(size)
1762
1763
1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766
1767#define NO_LOAD(data, offs, ptr) \
1768    {                            \
1769    }
1770
1771
1772#define vload_partial_1_0 NO_LOAD
1773#define vload_partial_1_1 vload1
1774#define vload_partial_1_2 NO_LOAD
1775#define vload_partial_1_3 NO_LOAD
1776#define vload_partial_1_4 NO_LOAD
1777#define vload_partial_1_5 NO_LOAD
1778#define vload_partial_1_6 NO_LOAD
1779#define vload_partial_1_7 NO_LOAD
1780#define vload_partial_1_8 NO_LOAD
1781#define vload_partial_1_9 NO_LOAD
1782#define vload_partial_1_10 NO_LOAD
1783#define vload_partial_1_11 NO_LOAD
1784#define vload_partial_1_12 NO_LOAD
1785#define vload_partial_1_13 NO_LOAD
1786#define vload_partial_1_14 NO_LOAD
1787#define vload_partial_1_15 NO_LOAD
1788#define vload_partial_1_16 NO_LOAD
1789
1790#define vload_partial_2_0 NO_LOAD
1791#define vload_partial_2_1 vload_partial_1
1792#define vload_partial_2_2 vload_partial_2
1793#define vload_partial_2_3 NO_LOAD
1794#define vload_partial_2_4 NO_LOAD
1795#define vload_partial_2_5 NO_LOAD
1796#define vload_partial_2_6 NO_LOAD
1797#define vload_partial_2_7 NO_LOAD
1798#define vload_partial_2_8 NO_LOAD
1799#define vload_partial_2_9 NO_LOAD
1800#define vload_partial_2_10 NO_LOAD
1801#define vload_partial_2_11 NO_LOAD
1802#define vload_partial_2_12 NO_LOAD
1803#define vload_partial_2_13 NO_LOAD
1804#define vload_partial_2_14 NO_LOAD
1805#define vload_partial_2_15 NO_LOAD
1806#define vload_partial_2_16 NO_LOAD
1807
1808#define vload_partial_3_0 NO_LOAD
1809#define vload_partial_3_1 vload_partial_1
1810#define vload_partial_3_2 vload_partial_2
1811#define vload_partial_3_3 vload_partial_3
1812#define vload_partial_3_4 NO_LOAD
1813#define vload_partial_3_5 NO_LOAD
1814#define vload_partial_3_6 NO_LOAD
1815#define vload_partial_3_7 NO_LOAD
1816#define vload_partial_3_8 NO_LOAD
1817#define vload_partial_3_9 NO_LOAD
1818#define vload_partial_3_10 NO_LOAD
1819#define vload_partial_3_11 NO_LOAD
1820#define vload_partial_3_12 NO_LOAD
1821#define vload_partial_3_13 NO_LOAD
1822#define vload_partial_3_14 NO_LOAD
1823#define vload_partial_3_15 NO_LOAD
1824#define vload_partial_3_16 NO_LOAD
1825
1826#define vload_partial_4_0 NO_LOAD
1827#define vload_partial_4_1 vload_partial_1
1828#define vload_partial_4_2 vload_partial_2
1829#define vload_partial_4_3 vload_partial_3
1830#define vload_partial_4_4 vload_partial_4
1831#define vload_partial_4_5 NO_LOAD
1832#define vload_partial_4_6 NO_LOAD
1833#define vload_partial_4_7 NO_LOAD
1834#define vload_partial_4_8 NO_LOAD
1835#define vload_partial_4_9 NO_LOAD
1836#define vload_partial_4_10 NO_LOAD
1837#define vload_partial_4_11 NO_LOAD
1838#define vload_partial_4_12 NO_LOAD
1839#define vload_partial_4_13 NO_LOAD
1840#define vload_partial_4_14 NO_LOAD
1841#define vload_partial_4_15 NO_LOAD
1842#define vload_partial_4_16 NO_LOAD
1843
1844#define vload_partial_8_0 NO_LOAD
1845#define vload_partial_8_1 vload_partial_1
1846#define vload_partial_8_2 vload_partial_2
1847#define vload_partial_8_3 vload_partial_3
1848#define vload_partial_8_4 vload_partial_4
1849#define vload_partial_8_5 vload_partial_5
1850#define vload_partial_8_6 vload_partial_6
1851#define vload_partial_8_7 vload_partial_7
1852#define vload_partial_8_8 vload_partial_8
1853#define vload_partial_8_9 NO_LOAD
1854#define vload_partial_8_10 NO_LOAD
1855#define vload_partial_8_11 NO_LOAD
1856#define vload_partial_8_12 NO_LOAD
1857#define vload_partial_8_13 NO_LOAD
1858#define vload_partial_8_14 NO_LOAD
1859#define vload_partial_8_15 NO_LOAD
1860#define vload_partial_8_16 NO_LOAD
1861
1862#define vload_partial_16_0 NO_LOAD
1863#define vload_partial_16_1 vload_partial_1
1864#define vload_partial_16_2 vload_partial_2
1865#define vload_partial_16_3 vload_partial_3
1866#define vload_partial_16_4 vload_partial_4
1867#define vload_partial_16_5 vload_partial_5
1868#define vload_partial_16_6 vload_partial_6
1869#define vload_partial_16_7 vload_partial_7
1870#define vload_partial_16_8 vload_partial_8
1871#define vload_partial_16_9 vload_partial_9
1872#define vload_partial_16_10 vload_partial_10
1873#define vload_partial_16_11 vload_partial_11
1874#define vload_partial_16_12 vload_partial_12
1875#define vload_partial_16_13 vload_partial_13
1876#define vload_partial_16_14 vload_partial_14
1877#define vload_partial_16_15 vload_partial_15
1878#define vload_partial_16_16 vload_partial_16
1879
1880
1881#define vload_partial_1(DATA, OFFSET, PTR) \
1882    DATA.s0 = vload1(OFFSET, PTR);
1883
1884#define vload_partial_2(DATA, OFFSET, PTR) \
1885    DATA.s01 = vload2(OFFSET, PTR);
1886
1887#define vload_partial_3(DATA, OFFSET, PTR) \
1888    DATA.s012 = vload3(OFFSET, PTR);
1889
1890#define vload_partial_4(DATA, OFFSET, PTR) \
1891    DATA.s0123 = vload4(OFFSET, PTR);
1892
1893#define vload_partial_5(DATA, OFFSET, PTR)    \
1894    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895    DATA.s4 = vload1(OFFSET, PTR + 4);
1896
1897#define vload_partial_6(DATA, OFFSET, PTR)    \
1898    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900
1901#define vload_partial_7(DATA, OFFSET, PTR)    \
1902    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904
1905#define vload_partial_8(DATA, OFFSET, PTR) \
1906    DATA.s01234567 = vload8(OFFSET, PTR);
1907
1908#define vload_partial_9(DATA, OFFSET, PTR)        \
1909    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910    DATA.s8 = vload1(OFFSET, PTR + 8);
1911
1912#define vload_partial_10(DATA, OFFSET, PTR)       \
1913    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915
1916#define vload_partial_11(DATA, OFFSET, PTR)       \
1917    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919
1920#define vload_partial_12(DATA, OFFSET, PTR)       \
1921    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923
1924#define vload_partial_13(DATA, OFFSET, PTR)       \
1925    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927
1928#define vload_partial_14(DATA, OFFSET, PTR)       \
1929    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931
1932#define vload_partial_15(DATA, OFFSET, PTR)       \
1933    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935
1936#define vload_partial_16(DATA, OFFSET, PTR) \
1937    DATA = vload16(OFFSET, PTR);
1938
1939
1940
1941#define PIXEL_UNIT4 1
1942#define PIXEL_UNIT8 2
1943#define PIXEL_UNIT16 4
1944
1945
1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948
1949
1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953
1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958#endif
1959
1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963
1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968#endif
1969
1970
1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973
1974
1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977
1978#define VSTORE_STR(size) vstore##size
1979#define VSTORE(size) VSTORE_STR(size)
1980
1981#define float1 float
1982#define half1 half
1983#define char1 char
1984#define uchar1 uchar
1985#define short1 short
1986#define ushort1 ushort
1987#define int1 int
1988#define uint1 uint
1989#define long1 long
1990#define ulong1 ulong
1991#define double1 double
1992
1993#define vload1(OFFSET, PTR) *(OFFSET + PTR)
1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
1995
1996
1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999
2000#define NO_STORE(data, offs, ptr) \
2001    {                             \
2002    }
2003
2004
2005#define vstore_partial_1_0 NO_STORE
2006#define vstore_partial_1_1 vstore1
2007#define vstore_partial_1_2 NO_STORE
2008#define vstore_partial_1_3 NO_STORE
2009#define vstore_partial_1_4 NO_STORE
2010#define vstore_partial_1_5 NO_STORE
2011#define vstore_partial_1_6 NO_STORE
2012#define vstore_partial_1_7 NO_STORE
2013#define vstore_partial_1_8 NO_STORE
2014#define vstore_partial_1_9 NO_STORE
2015#define vstore_partial_1_10 NO_STORE
2016#define vstore_partial_1_11 NO_STORE
2017#define vstore_partial_1_12 NO_STORE
2018#define vstore_partial_1_13 NO_STORE
2019#define vstore_partial_1_14 NO_STORE
2020#define vstore_partial_1_15 NO_STORE
2021#define vstore_partial_1_16 NO_STORE
2022
2023#define vstore_partial_2_0 NO_STORE
2024#define vstore_partial_2_1 vstore_partial_1
2025#define vstore_partial_2_2 vstore_partial_2
2026#define vstore_partial_2_3 NO_STORE
2027#define vstore_partial_2_4 NO_STORE
2028#define vstore_partial_2_5 NO_STORE
2029#define vstore_partial_2_6 NO_STORE
2030#define vstore_partial_2_7 NO_STORE
2031#define vstore_partial_2_8 NO_STORE
2032#define vstore_partial_2_9 NO_STORE
2033#define vstore_partial_2_10 NO_STORE
2034#define vstore_partial_2_11 NO_STORE
2035#define vstore_partial_2_12 NO_STORE
2036#define vstore_partial_2_13 NO_STORE
2037#define vstore_partial_2_14 NO_STORE
2038#define vstore_partial_2_15 NO_STORE
2039#define vstore_partial_2_16 NO_STORE
2040
2041#define vstore_partial_3_0 NO_STORE
2042#define vstore_partial_3_1 vstore_partial_1
2043#define vstore_partial_3_2 vstore_partial_2
2044#define vstore_partial_3_3 vstore_partial_3
2045#define vstore_partial_3_4 NO_STORE
2046#define vstore_partial_3_5 NO_STORE
2047#define vstore_partial_3_6 NO_STORE
2048#define vstore_partial_3_7 NO_STORE
2049#define vstore_partial_3_8 NO_STORE
2050#define vstore_partial_3_9 NO_STORE
2051#define vstore_partial_3_10 NO_STORE
2052#define vstore_partial_3_11 NO_STORE
2053#define vstore_partial_3_12 NO_STORE
2054#define vstore_partial_3_13 NO_STORE
2055#define vstore_partial_3_14 NO_STORE
2056#define vstore_partial_3_15 NO_STORE
2057#define vstore_partial_3_16 NO_STORE
2058
2059#define vstore_partial_4_0 NO_STORE
2060#define vstore_partial_4_1 vstore_partial_1
2061#define vstore_partial_4_2 vstore_partial_2
2062#define vstore_partial_4_3 vstore_partial_3
2063#define vstore_partial_4_4 vstore_partial_4
2064#define vstore_partial_4_5 NO_STORE
2065#define vstore_partial_4_6 NO_STORE
2066#define vstore_partial_4_7 NO_STORE
2067#define vstore_partial_4_8 NO_STORE
2068#define vstore_partial_4_9 NO_STORE
2069#define vstore_partial_4_10 NO_STORE
2070#define vstore_partial_4_11 NO_STORE
2071#define vstore_partial_4_12 NO_STORE
2072#define vstore_partial_4_13 NO_STORE
2073#define vstore_partial_4_14 NO_STORE
2074#define vstore_partial_4_15 NO_STORE
2075#define vstore_partial_4_16 NO_STORE
2076
2077#define vstore_partial_8_0 NO_STORE
2078#define vstore_partial_8_1 vstore_partial_1
2079#define vstore_partial_8_2 vstore_partial_2
2080#define vstore_partial_8_3 vstore_partial_3
2081#define vstore_partial_8_4 vstore_partial_4
2082#define vstore_partial_8_5 vstore_partial_5
2083#define vstore_partial_8_6 vstore_partial_6
2084#define vstore_partial_8_7 vstore_partial_7
2085#define vstore_partial_8_8 vstore_partial_8
2086#define vstore_partial_8_9 NO_STORE
2087#define vstore_partial_8_10 NO_STORE
2088#define vstore_partial_8_11 NO_STORE
2089#define vstore_partial_8_12 NO_STORE
2090#define vstore_partial_8_13 NO_STORE
2091#define vstore_partial_8_14 NO_STORE
2092#define vstore_partial_8_15 NO_STORE
2093#define vstore_partial_8_16 NO_STORE
2094
2095#define vstore_partial_16_0 NO_STORE
2096#define vstore_partial_16_1 vstore_partial_1
2097#define vstore_partial_16_2 vstore_partial_2
2098#define vstore_partial_16_3 vstore_partial_3
2099#define vstore_partial_16_4 vstore_partial_4
2100#define vstore_partial_16_5 vstore_partial_5
2101#define vstore_partial_16_6 vstore_partial_6
2102#define vstore_partial_16_7 vstore_partial_7
2103#define vstore_partial_16_8 vstore_partial_8
2104#define vstore_partial_16_9 vstore_partial_9
2105#define vstore_partial_16_10 vstore_partial_10
2106#define vstore_partial_16_11 vstore_partial_11
2107#define vstore_partial_16_12 vstore_partial_12
2108#define vstore_partial_16_13 vstore_partial_13
2109#define vstore_partial_16_14 vstore_partial_14
2110#define vstore_partial_16_15 vstore_partial_15
2111#define vstore_partial_16_16 vstore_partial_16
2112
2113
2114#define vstore_partial_1(DATA, OFFSET, PTR) \
2115    vstore1(DATA.s0, OFFSET, PTR);
2116
2117#define vstore_partial_2(DATA, OFFSET, PTR) \
2118    vstore2(DATA.s01, OFFSET, PTR);
2119
2120#define vstore_partial_3(DATA, OFFSET, PTR) \
2121    vstore3(DATA.s012, OFFSET, PTR);
2122
2123#define vstore_partial_4(DATA, OFFSET, PTR) \
2124    vstore4(DATA.s0123, OFFSET, PTR);
2125
2126#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128    vstore1(DATA.s4, OFFSET, PTR + 4);
2129
2130#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133
2134#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137
2138#define vstore_partial_8(DATA, OFFSET, PTR) \
2139    vstore8(DATA.s01234567, OFFSET, PTR);
2140
2141#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143    vstore1(DATA.s8, OFFSET, PTR + 8);
2144
2145#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148
2149#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152
2153#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156
2157#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160
2161#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164
2165#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168
2169#define vstore_partial_16(DATA, OFFSET, PTR) \
2170    vstore16(DATA, OFFSET, PTR);
2171
2172
2173
2174
2175
2176#define convert_float_sat convert_float
2177#define convert_float1_sat convert_float
2178#define convert_float2_sat convert_float2
2179#define convert_float3_sat convert_float3
2180#define convert_float4_sat convert_float4
2181#define convert_float8_sat convert_float8
2182#define convert_float16_sat convert_float16
2183#define convert_half_sat convert_float
2184#define convert_half1_sat convert_half
2185#define convert_half2_sat convert_half2
2186#define convert_half3_sat convert_half3
2187#define convert_half4_sat convert_half4
2188#define convert_half8_sat convert_half8
2189#define convert_half16_sat convert_half16
2190
2191#define convert_float1 convert_float
2192#define convert_half1 convert_half
2193#define convert_char1 convert_char
2194#define convert_uchar1 convert_uchar
2195#define convert_short1 convert_short
2196#define convert_ushort1 convert_ushort
2197#define convert_int1 convert_int
2198#define convert_uint1 convert_uint
2199#define convert_long1 convert_long
2200#define convert_ulong1 convert_ulong
2201#define convert_double1 convert_double
2202
2203#define convert_char1_sat convert_char_sat
2204#define convert_uchar1_sat convert_uchar_sat
2205#define convert_uchar2_sat convert_uchar2_sat
2206#define convert_uchar3_sat convert_uchar3_sat
2207#define convert_uchar4_sat convert_uchar4_sat
2208#define convert_uchar8_sat convert_uchar8_sat
2209#define convert_uchar16_sat convert_uchar16_sat
2210#define convert_short1_sat convert_short_sat
2211#define convert_ushort1_sat convert_ushort_sat
2212#define convert_int1_sat convert_int_sat
2213#define convert_uint1_sat convert_uint_sat
2214#define convert_long1_sat convert_long_sat
2215#define convert_ulong1_sat convert_ulong_sat
2216#define convert_double1_sat convert_double_sat
2217
2218#define VEC_DATA_TYPE_STR(type, size) type##size
2219#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220
2221#define CONVERT_STR(x, type) (convert_##type((x)))
2222#define CONVERT(x, type) CONVERT_STR(x, type)
2223
2224#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226
2227#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229
2230#define select_vec_dt_uchar(size) uchar##size
2231#define select_vec_dt_char(size) char##size
2232#define select_vec_dt_ushort(size) ushort##size
2233#define select_vec_dt_short(size) short##size
2234#define select_vec_dt_half(size) short##size
2235#define select_vec_dt_uint(size) uint##size
2236#define select_vec_dt_int(size) int##size
2237#define select_vec_dt_float(size) int##size
2238#define select_vec_dt_ulong(size) ulong##size
2239#define select_vec_dt_long(size) long##size
2240
2241#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2244
2245#define signed_int_vec_dt_uchar(size) char##size
2246#define signed_int_vec_dt_char(size) char##size
2247#define signed_int_vec_dt_ushort(size) short##size
2248#define signed_int_vec_dt_short(size) short##size
2249#define signed_int_vec_dt_half(size) short##size
2250#define signed_int_vec_dt_uint(size) int##size
2251#define signed_int_vec_dt_int(size) int##size
2252#define signed_int_vec_dt_float(size) int##size
2253#define signed_int_vec_dt_ulong(size) long##size
2254#define signed_int_vec_dt_long(size) long##size
2255
2256#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259
2260#define sum_reduce_1(x) (x)
2261#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2262#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2263#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2264#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2265#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2266
2267#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2268#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269
2270#define prod_reduce_1(x) (x)
2271#define prod_reduce_2(x) ((x).s0) * ((x).s1)
2272#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
2273#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
2274#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
2275#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
2276
2277#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
2278#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279
2280#define max_reduce_1(x) (x)
2281#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286
2287#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289
2290#define VECTOR_DECLARATION(name)     \
2291    __global uchar *name##_ptr,      \
2292    uint        name##_stride_x, \
2293    uint        name##_step_x,   \
2294    uint        name##_offset_first_element_in_bytes
2295
2296#define IMAGE_DECLARATION(name)      \
2297    __global uchar *name##_ptr,      \
2298    uint        name##_stride_x, \
2299    uint        name##_step_x,   \
2300    uint        name##_stride_y, \
2301    uint        name##_step_y,   \
2302    uint        name##_offset_first_element_in_bytes
2303
2304#define TENSOR3D_DECLARATION(name)   \
2305    __global uchar *name##_ptr,      \
2306    uint        name##_stride_x, \
2307    uint        name##_step_x,   \
2308    uint        name##_stride_y, \
2309    uint        name##_step_y,   \
2310    uint        name##_stride_z, \
2311    uint        name##_step_z,   \
2312    uint        name##_offset_first_element_in_bytes
2313
2314#define TENSOR4D_DECLARATION(name)   \
2315    __global uchar *name##_ptr,      \
2316    uint        name##_stride_x, \
2317    uint        name##_step_x,   \
2318    uint        name##_stride_y, \
2319    uint        name##_step_y,   \
2320    uint        name##_stride_z, \
2321    uint        name##_step_z,   \
2322    uint        name##_stride_w, \
2323    uint        name##_step_w,   \
2324    uint        name##_offset_first_element_in_bytes
2325
2326#define TENSOR5D_DECLARATION(name)   \
2327    __global uchar *name##_ptr,      \
2328    uint        name##_stride_x, \
2329    uint        name##_step_x,   \
2330    uint        name##_stride_y, \
2331    uint        name##_step_y,   \
2332    uint        name##_stride_z, \
2333    uint        name##_step_z,   \
2334    uint        name##_stride_w, \
2335    uint        name##_step_w,   \
2336    uint        name##_stride_v, \
2337    uint        name##_step_v,   \
2338    uint        name##_offset_first_element_in_bytes
2339
2340#define CONVERT_TO_VECTOR_STRUCT(name) \
2341    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342
2343#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345
2346#define CONVERT_TO_IMAGE_STRUCT(name) \
2347    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348
2349#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351
2352#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2353    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2354
2355#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2356    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2357
2358#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2359    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2360
2361#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2362    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2363                                 name##_stride_z, name##_step_z)
2364
2365#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2366    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2367
2368#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2369    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2370                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2371
2372#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2373    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2374
2375#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2376    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2377                           name##_stride_z, name##_step_z)
2378
2379
2380typedef struct Vector
2381{
2382    __global uchar *ptr;
2383    int             offset_first_element_in_bytes;
2384    int             stride_x;
2385} Vector;
2386
2387
2388typedef struct Image
2389{
2390    __global uchar *ptr;
2391    int             offset_first_element_in_bytes;
2392    int             stride_x;
2393    int             stride_y;
2394} Image;
2395
2396
2397typedef struct Tensor3D
2398{
2399    __global uchar *ptr;
2400    int             offset_first_element_in_bytes;
2401    int             stride_x;
2402    int             stride_y;
2403    int             stride_z;
2404} Tensor3D;
2405
2406
2407typedef struct Tensor4D
2408{
2409    __global uchar *ptr;
2410    int             offset_first_element_in_bytes;
2411    int             stride_x;
2412    int             stride_y;
2413    int             stride_z;
2414    int             stride_w;
2415} Tensor4D;
2416
2417
2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419{
2420    Vector vector =
2421    {
2422        .ptr                           = ptr,
2423        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424        .stride_x                      = stride_x,
2425    };
2426    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427    return vector;
2428}
2429
2430
2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432{
2433    Image img =
2434    {
2435        .ptr                           = ptr,
2436        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437        .stride_x                      = stride_x,
2438        .stride_y                      = stride_y
2439    };
2440    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441    return img;
2442}
2443
2444
2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446{
2447    Image img =
2448    {
2449        .ptr                           = ptr,
2450        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451        .stride_x                      = stride_x,
2452        .stride_y                      = stride_y
2453    };
2454    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455    return img;
2456}
2457
2458
2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460{
2461    Tensor3D tensor =
2462    {
2463        .ptr                           = ptr,
2464        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465        .stride_x                      = stride_x,
2466        .stride_y                      = stride_y,
2467        .stride_z                      = stride_z
2468    };
2469    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470    return tensor;
2471}
2472
2473
2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475{
2476    Tensor3D tensor =
2477    {
2478        .ptr                           = ptr,
2479        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480        .stride_x                      = stride_x,
2481        .stride_y                      = stride_y,
2482        .stride_z                      = stride_z
2483    };
2484    return tensor;
2485}
2486
2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488                                             uint step_w,
2489                                             uint mod_size)
2490{
2491    Tensor4D tensor =
2492    {
2493        .ptr                           = ptr,
2494        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495        .stride_x                      = stride_x,
2496        .stride_y                      = stride_y,
2497        .stride_z                      = stride_z,
2498        .stride_w                      = stride_w
2499    };
2500
2501    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502    return tensor;
2503}
2504
2505
2506inline __global const uchar *vector_offset(const Vector *vec, int x)
2507{
2508    return vec->ptr + x * vec->stride_x;
2509}
2510
2511
2512inline __global uchar *offset(const Image *img, int x, int y)
2513{
2514    return img->ptr + x * img->stride_x + y * img->stride_y;
2515}
2516
2517
2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519{
2520    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521}
2522
2523
2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525{
2526    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527}
2528
2529
2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531{
2532    uint num_elements = width * height;
2533
2534    const uint z = index / num_elements;
2535
2536    index %= num_elements;
2537
2538    const uint y = index / width;
2539
2540    index %= width;
2541
2542    const uint x = index;
2543
2544    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545}
2546
2547#endif
2548
2549
2550#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
2551#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
2552
2553
2554#define scalar_access_0_1(x) ((x).s0)
2555#define scalar_access_0_2(x) ((x).s01)
2556#define scalar_access_0_3(x) ((x).s012)
2557#define scalar_access_0_4(x) ((x).s0123)
2558#define scalar_access_0_8(x) ((x).s01234567)
2559#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
2560
2561
2562#define scalar_access_1_1(x) ((x).s1)
2563#define scalar_access_1_2(x) ((x).s12)
2564#define scalar_access_1_3(x) ((x).s123)
2565#define scalar_access_1_4(x) ((x).s1234)
2566#define scalar_access_1_8(x) ((x).s12345678)
2567
2568
2569#define scalar_access_2_1(x) ((x).s2)
2570#define scalar_access_2_2(x) ((x).s23)
2571#define scalar_access_2_3(x) ((x).s234)
2572#define scalar_access_2_4(x) ((x).s2345)
2573#define scalar_access_2_8(x) ((x).s23456789)
2574
2575
2576#define scalar_access_3_1(x) ((x).s3)
2577#define scalar_access_3_2(x) ((x).s34)
2578#define scalar_access_3_3(x) ((x).s345)
2579#define scalar_access_3_4(x) ((x).s3456)
2580#define scalar_access_3_8(x) ((x).s3456789A)
2581
2582
2583#define scalar_access_4_1(x) ((x).s4)
2584#define scalar_access_4_2(x) ((x).s45)
2585#define scalar_access_4_3(x) ((x).s456)
2586#define scalar_access_4_4(x) ((x).s4567)
2587#define scalar_access_4_8(x) ((x).s456789AB)
2588
2589
2590#define scalar_access_8_1(x) ((x).s8)
2591#define scalar_access_8_2(x) ((x).s89)
2592#define scalar_access_8_3(x) ((x).s89A)
2593#define scalar_access_8_4(x) ((x).s89AB)
2594#define scalar_access_8_8(x) ((x).s89ABCDEF)
2595
2596
2597#define scalar_access_12_1(x) ((x).sC)
2598#define scalar_access_12_2(x) ((x).sCD)
2599#define scalar_access_12_3(x) ((x).sCDE)
2600#define scalar_access_12_4(x) ((x).sCDEF)
2601
2602
2603#define scalar_access_16_1(x) ((x).sF)
2604
2605
2606#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2607    ({})
2608
2609#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2610    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
2611
2612#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2613    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2614    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
2615
2616#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2617    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2618    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
2619
2620#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2621    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2622    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
2623
2624#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2625    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2626    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
2627
2628#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2629    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2630    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
2631
2632#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2633    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2634    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
2635
2636#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2637    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2638    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2639
2640#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2641    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2642    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2643
2644#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2645    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
2646    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2647
2648#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2649    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2650    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
2651
2652#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2653    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2654    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
2655
2656#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2657    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2658    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2659
2660#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2661    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2662    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2663
2664#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2665    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2666    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2667
2668#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2669    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2670    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2671
2672
2673
2674#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2675#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676
2677
2678
2679#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2680    ({})
2681
2682#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2683    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2684
2685#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2686    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2687
2688#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2689    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2690
2691#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2692    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2693
2694#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2695    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2696    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2697
2698#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2699    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2700    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2701
2702#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2703    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2704    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2705
2706#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2707    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2708
2709#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2710    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin);        \
2711    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2712
2713#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2714    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2715    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2716
2717#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2718    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2719    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2720
2721#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2722    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2723    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2724
2725#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2726    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2727    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2728    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2729
2730#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2731    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin);                          \
2732    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2733    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2734
2735#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2736    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2737    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2738    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2739
2740#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2741    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2742
2743
2744
2745#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2746#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747
2748
2749#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2750    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2751    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2752
2753#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2754    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2755    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2756    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2757
2758#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2759    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2760    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2761    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2762
2763#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2764    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2765    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2766    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2767
2768#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2769    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2770    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2771    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2772
2773#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2774    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2775    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2776    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2777
2778#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2779    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2780    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2781    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2782
2783#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2784    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2785    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2786    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2787
2788#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2789    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2790    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2791    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2792
2793#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2794    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2795    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2796    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2797
2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2799    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2800    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2801    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2802
2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2804    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2805    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2806    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807
2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812
2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817
2818#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822
2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827
2828
2829
2830
2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833
2834
2835
2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839
2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844
2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849
2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854
2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859
2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864
2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869
2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874
2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879
2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884
2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889
2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894
2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899
2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904
2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909
2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914
2915
2916
2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919
2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922    {                                                                                                                                                            \
2923        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924    }                                                                                                                                                            \
2925    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926    {                                                                                                                                                            \
2927        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928    }                                                                                                                                                            \
2929    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930    {                                                                                                                                                            \
2931        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932    }                                                                                                                                                            \
2933    else                                                                                                                                                         \
2934    {                                                                                                                                                            \
2935        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936    }
2937
2938#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939    if(!(PARTIAL_COND_X))                                                                                                \
2940    {                                                                                                                    \
2941        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942    }                                                                                                                    \
2943    else                                                                                                                 \
2944    {                                                                                                                    \
2945        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946    }
2947
2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949    if(!(PARTIAL_COND_Y))                                                                                                \
2950    {                                                                                                                    \
2951        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952    }                                                                                                                    \
2953    else                                                                                                                 \
2954    {                                                                                                                    \
2955        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956    }
2957
2958
2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960
2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963
2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965
2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969
2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971
2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975
2976#else
2977
2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981
2982#endif
2983
2984
2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987
2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991
2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995
2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999
3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003
3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007
3008#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011
3012#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015
3016#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019
3020#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023
3024#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027
3028#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031
3032#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035
3036#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039
3040#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043
3044#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047
3048
3049
3050#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052
3053
3054
3055#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057    BASENAME##0;                                                                            \
3058    if(Y_MASK##0 != 0)                                                                      \
3059        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060    else                                                                                    \
3061        BASENAME##0 = 0;
3062
3063#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066    BASENAME##1;                                                                            \
3067    if(Y_MASK##1 != 0)                                                                      \
3068        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069    else                                                                                    \
3070        BASENAME##1 = 0;
3071
3072#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075    BASENAME##2;                                                                            \
3076    if(Y_MASK##2 != 0)                                                                      \
3077        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078    else                                                                                    \
3079        BASENAME##2 = 0;
3080
3081#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084    BASENAME##3;                                                                            \
3085    if(Y_MASK##3 != 0)                                                                      \
3086        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087    else                                                                                    \
3088        BASENAME##3 = 0;
3089
3090#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093    BASENAME##4;                                                                            \
3094    if(Y_MASK##4 != 0)                                                                      \
3095        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096    else                                                                                    \
3097        BASENAME##4 = 0;
3098
3099#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102    BASENAME##5;                                                                            \
3103    if(Y_MASK##5 != 0)                                                                      \
3104        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105    else                                                                                    \
3106        BASENAME##5 = 0;
3107
3108#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111    BASENAME##6;                                                                            \
3112    if(Y_MASK##6 != 0)                                                                      \
3113        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114    else                                                                                    \
3115        BASENAME##6 = 0;
3116
3117#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120    BASENAME##7;                                                                            \
3121    if(Y_MASK##7 != 0)                                                                      \
3122        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123    else                                                                                    \
3124        BASENAME##7 = 0;
3125
3126#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129    BASENAME##8;                                                                            \
3130    if(Y_MASK##8 != 0)                                                                      \
3131        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132    else                                                                                    \
3133        BASENAME##8 = 0;
3134
3135#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138    BASENAME##9;                                                                            \
3139    if(Y_MASK##9 != 0)                                                                      \
3140        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141    else                                                                                    \
3142        BASENAME##9 = 0;
3143
3144#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147    BASENAME##A;                                                                            \
3148    if(Y_MASK##A != 0)                                                                      \
3149        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150    else                                                                                    \
3151        BASENAME##A = 0;
3152
3153#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156    BASENAME##B;                                                                            \
3157    if(Y_MASK##B != 0)                                                                      \
3158        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159    else                                                                                    \
3160        BASENAME##B = 0;
3161
3162#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165    BASENAME##C;                                                                            \
3166    if(Y_MASK##C != 0)                                                                      \
3167        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168    else                                                                                    \
3169        BASENAME##C = 0;
3170
3171#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174    BASENAME##D;                                                                            \
3175    if(Y_MASK##D != 0)                                                                      \
3176        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177    else                                                                                    \
3178        BASENAME##D = 0;
3179
3180#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183    BASENAME##E;                                                                            \
3184    if(Y_MASK##E != 0)                                                                      \
3185        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186    else                                                                                    \
3187        BASENAME##E = 0;
3188
3189#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192    BASENAME##F;                                                                            \
3193    if(Y_MASK##F != 0)                                                                      \
3194        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195    else                                                                                    \
3196        BASENAME##F = 0;
3197
3198
3199#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201
3202
3203#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3204    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3205    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
3206
3207#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3208    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3209    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3210    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
3211
3212#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3213    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3214    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3215    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
3216
3217#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3218    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3219    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3220    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
3221
3222#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3223    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3224    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3225    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
3226
3227#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3228    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3229    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3230    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
3231
3232#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3233    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3234    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3235    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
3236
3237#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3238    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3239    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3240    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
3241
3242#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3243    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3244    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3245    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
3246
3247#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3248    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
3249    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3250    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
3251
3252#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3253    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3254    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3255    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
3256
3257#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3258    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3259    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3260    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
3261
3262#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3263    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3264    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3265    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
3266
3267#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3268    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3269    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3270    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
3271
3272#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3273    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3274    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3275    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
3276
3277#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3278    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3279    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3280    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
3281
3282
3283
3284
3285#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3286#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287
3288
3289
3290#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3291    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3292    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
3293    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
3294
3295#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3296    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3297    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3298    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
3299    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
3300
3301#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3302    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3303    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3304    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
3305    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
3306
3307#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3308    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3309    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3310    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
3311    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
3312
3313#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3314    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3315    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3316    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
3317    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
3318
3319#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3320    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3321    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3322    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
3323    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
3324
3325#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3326    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3327    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3328    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
3329    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
3330
3331#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3332    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3333    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3334    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
3335    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
3336
3337
3338
3339
3340#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3341#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342
3343
3344
3345#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
3346    BASENAME##0 *= (DATA_TYPE)SCALE;
3347
3348#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
3349    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
3350    BASENAME##1 *= (DATA_TYPE)SCALE;
3351
3352#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
3353    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
3354    BASENAME##2 *= (DATA_TYPE)SCALE;
3355
3356#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
3357    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
3358    BASENAME##3 *= (DATA_TYPE)SCALE;
3359
3360#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
3361    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
3362    BASENAME##4 *= (DATA_TYPE)SCALE;
3363
3364#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
3365    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
3366    BASENAME##5 *= (DATA_TYPE)SCALE;
3367
3368#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
3369    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
3370    BASENAME##6 *= (DATA_TYPE)SCALE;
3371
3372#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
3373    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
3374    BASENAME##7 *= (DATA_TYPE)SCALE;
3375
3376#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
3377    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
3378    BASENAME##8 *= (DATA_TYPE)SCALE;
3379
3380#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
3381    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
3382    BASENAME##9 *= (DATA_TYPE)SCALE;
3383
3384#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
3385    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
3386    BASENAME##A *= (DATA_TYPE)SCALE;
3387
3388#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
3389    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
3390    BASENAME##B *= (DATA_TYPE)SCALE;
3391
3392#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
3393    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
3394    BASENAME##C *= (DATA_TYPE)SCALE;
3395
3396#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
3397    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
3398    BASENAME##D *= (DATA_TYPE)SCALE;
3399
3400#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
3401    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
3402    BASENAME##E *= (DATA_TYPE)SCALE;
3403
3404#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
3405    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
3406    BASENAME##F *= (DATA_TYPE)SCALE;
3407
3408
3409
3410#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
3411#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412
3413
3414
3415#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
3416    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
3417#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
3418    VEC_DATA_TYPE(TYPE, 2)                         \
3419    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
3420#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
3421    VEC_DATA_TYPE(TYPE, 3)                         \
3422    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
3423#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
3424    VEC_DATA_TYPE(TYPE, 4)                         \
3425    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
3426#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
3427    VEC_DATA_TYPE(TYPE, 8)                         \
3428    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
3429#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
3430    VEC_DATA_TYPE(TYPE, 16)                         \
3431    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3432
3433
3434
3435#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
3436    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
3437#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
3438    VEC_DATA_TYPE(TYPE, 2)                                \
3439    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
3440#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
3441    VEC_DATA_TYPE(TYPE, 3)                                \
3442    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
3443#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
3444    VEC_DATA_TYPE(TYPE, 4)                                \
3445    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
3446#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
3447    VEC_DATA_TYPE(TYPE, 8)                                \
3448    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
3449#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
3450    VEC_DATA_TYPE(TYPE, 16)                                \
3451    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
3452
3453
3454
3455#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
3456    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
3457#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
3458    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
3459    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
3460#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
3461    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
3462    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
3463#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
3464    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
3465    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
3466#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
3467    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
3468    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
3469    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
3470    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
3471    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
3472#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
3473    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
3474    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
3475    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
3476    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
3477    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
3478    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
3479    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
3480    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
3481    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482
3483
3484
3485
3486#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
3487    CONCAT(COLUMN_VECTOR, K0)                          \
3488    (IDX_COL, BASENAME, BS, TYPE);
3489
3490
3491#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
3492    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
3493    (IDX_COL, BASENAME, BS, TYPE);
3494
3495
3496#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
3497    CONCAT(TRANSPOSE_K0X, N0)                       \
3498    (K0, BASENAME, BS, TYPE);
3499
3500
3501#define ADD_ROW_1(BASENAME, BIAS) \
3502    BASENAME##0 += BIAS##0;
3503
3504#define ADD_ROW_2(BASENAME, BIAS) \
3505    ADD_ROW_1(BASENAME, BIAS)     \
3506    BASENAME##1 += BIAS##1;
3507
3508#define ADD_ROW_3(BASENAME, BIAS) \
3509    ADD_ROW_2(BASENAME, BIAS)     \
3510    BASENAME##2 += BIAS##2;
3511
3512#define ADD_ROW_4(BASENAME, BIAS) \
3513    ADD_ROW_3(BASENAME, BIAS)     \
3514    BASENAME##3 += BIAS##3;
3515
3516#define ADD_ROW_5(BASENAME, BIAS) \
3517    ADD_ROW_4(BASENAME, BIAS)     \
3518    BASENAME##4 += BIAS##4;
3519
3520#define ADD_ROW_6(BASENAME, BIAS) \
3521    ADD_ROW_5(BASENAME, BIAS)     \
3522    BASENAME##5 += BIAS##5;
3523
3524#define ADD_ROW_7(BASENAME, BIAS) \
3525    ADD_ROW_6(BASENAME, BIAS)     \
3526    BASENAME##6 += BIAS##6;
3527
3528#define ADD_ROW_8(BASENAME, BIAS) \
3529    ADD_ROW_7(BASENAME, BIAS)     \
3530    BASENAME##7 += BIAS##7;
3531
3532#define ADD_ROW_9(BASENAME, BIAS) \
3533    ADD_ROW_8(BASENAME, BIAS)     \
3534    BASENAME##8 += BIAS##8;
3535
3536#define ADD_ROW_10(BASENAME, BIAS) \
3537    ADD_ROW_9(BASENAME, BIAS)      \
3538    BASENAME##9 += BIAS##9;
3539
3540#define ADD_ROW_11(BASENAME, BIAS) \
3541    ADD_ROW_10(BASENAME, BIAS)     \
3542    BASENAME##A += BIAS##A;
3543
3544#define ADD_ROW_12(BASENAME, BIAS) \
3545    ADD_ROW_11(BASENAME, BIAS)     \
3546    BASENAME##B += BIAS##B;
3547
3548#define ADD_ROW_13(BASENAME, BIAS) \
3549    ADD_ROW_12(BASENAME, BIAS)     \
3550    BASENAME##C += BIAS##C;
3551
3552#define ADD_ROW_14(BASENAME, BIAS) \
3553    ADD_ROW_13(BASENAME, BIAS)     \
3554    BASENAME##D += BIAS##D;
3555
3556#define ADD_ROW_15(BASENAME, BIAS) \
3557    ADD_ROW_14(BASENAME, BIAS)     \
3558    BASENAME##E += BIAS##E;
3559
3560#define ADD_ROW_16(BASENAME, BIAS) \
3561    ADD_ROW_15(BASENAME, BIAS)     \
3562    BASENAME##F += BIAS##F;
3563
3564
3565
3566
3567#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
3568#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569
3570
3571
3572#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
3573    BASENAME##0 += BIAS;
3574
3575#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
3576    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
3577    BASENAME##1 += BIAS;
3578
3579#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
3580    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
3581    BASENAME##2 += BIAS;
3582
3583#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
3584    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
3585    BASENAME##3 += BIAS;
3586
3587#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
3588    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
3589    BASENAME##4 += BIAS;
3590
3591#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
3592    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
3593    BASENAME##5 += BIAS;
3594
3595#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
3596    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
3597    BASENAME##6 += BIAS;
3598
3599#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
3600    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
3601    BASENAME##7 += BIAS;
3602
3603#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
3604    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
3605    BASENAME##8 += BIAS;
3606
3607#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
3608    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
3609    BASENAME##9 += BIAS;
3610
3611#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
3612    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
3613    BASENAME##A += BIAS;
3614
3615#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
3616    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
3617    BASENAME##B += BIAS;
3618
3619#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
3620    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
3621    BASENAME##C += BIAS;
3622
3623#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
3624    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
3625    BASENAME##D += BIAS;
3626
3627#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
3628    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
3629    BASENAME##E += BIAS;
3630
3631#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
3632    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
3633    BASENAME##F += BIAS;
3634
3635
3636#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
3637#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638
3639
3640
3641#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3642    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
3643
3644#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3645    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3646    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
3647
3648#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3649    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3650    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
3651
3652#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3653    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3654    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
3655
3656#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3657    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3658    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
3659
3660#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3661    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3662    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
3663
3664#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3665    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3666    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
3667
3668#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3669    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3670    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
3671
3672#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3673    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3674    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
3675
3676#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3677    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
3678    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
3679
3680#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3681    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3682    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
3683
3684#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3685    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3686    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
3687
3688#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3689    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3690    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
3691
3692#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3693    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3694    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
3695
3696#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3697    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3698    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
3699
3700#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3701    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3702    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
3703
3704
3705
3706#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3707#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708
3709
3710
3711#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3712    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3713    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
3714
3715#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3716    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3717    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3718    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
3719
3720#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3721    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3722    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3723    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
3724
3725#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3726    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3727    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3728    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
3729
3730#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3731    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3732    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3733    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
3734
3735#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3736    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3737    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3738    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
3739
3740#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3741    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3742    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3743    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
3744
3745#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3746    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3747    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3748    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
3749
3750#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3751    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3752    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3753    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
3754
3755#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3756    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
3757    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3758    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
3759
3760#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3761    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3762    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3763    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
3764
3765#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3766    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3767    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3768    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
3769
3770#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3771    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3772    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3773    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
3774
3775#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3776    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3777    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3778    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
3779
3780#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3781    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3782    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3783    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
3784
3785#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3786    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3787    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3788    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
3789
3790
3791
3792#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3793#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794
3795
3796#ifndef ARM_COMPUTE_HELPER_H
3797#define ARM_COMPUTE_HELPER_H
3798
3799
3800
3801
3802#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3803    VSTORE(N0)                                                 \
3804    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3805
3806#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3807    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3808    VSTORE(N0)                                                 \
3809    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3810
3811#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3812    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3813    VSTORE(N0)                                                 \
3814    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3815
3816#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3817    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3818    VSTORE(N0)                                                 \
3819    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3820
3821#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3822    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3823    VSTORE(N0)                                                 \
3824    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3825
3826#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3827    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3828    VSTORE(N0)                                                 \
3829    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3830
3831#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3832    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3833    VSTORE(N0)                                                 \
3834    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3835
3836#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3837    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3838    VSTORE(N0)                                                 \
3839    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3840
3841#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3842    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3843    VSTORE(N0)                                                 \
3844    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3845
3846#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3847    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
3848    VSTORE(N0)                                                  \
3849    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3850
3851#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3852    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3853    VSTORE(N0)                                                  \
3854    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
3855
3856#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3857    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3858    VSTORE(N0)                                                  \
3859    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
3860
3861#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3862    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3863    VSTORE(N0)                                                  \
3864    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
3865
3866#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3867    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3868    VSTORE(N0)                                                  \
3869    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3870
3871#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3872    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3873    VSTORE(N0)                                                  \
3874    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3875
3876#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3877    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3878    VSTORE(N0)                                                  \
3879    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3880
3881
3882
3883#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3884    VSTORE(N0)                                                         \
3885    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3886
3887#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3888    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3889    VSTORE(N0)                                                         \
3890    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3891
3892#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3893    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3894    VSTORE(N0)                                                         \
3895    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3896
3897#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3898    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3899    VSTORE(N0)                                                         \
3900    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3901
3902#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3903    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3904    VSTORE(N0)                                                         \
3905    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3906
3907#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3908    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3909    VSTORE(N0)                                                         \
3910    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3911
3912#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3913    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3914    VSTORE(N0)                                                         \
3915    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3916
3917#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3918    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3919    VSTORE(N0)                                                         \
3920    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3921
3922#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3923    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3924    VSTORE(N0)                                                         \
3925    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3926
3927#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
3928    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3929    VSTORE(N0)                                                     \
3930    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3931
3932#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3933    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3934    VSTORE(N0)                                                          \
3935    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
3936
3937#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3938    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3939    VSTORE(N0)                                                          \
3940    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
3941
3942#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3943    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3944    VSTORE(N0)                                                          \
3945    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
3946
3947#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3948    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3949    VSTORE(N0)                                                          \
3950    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3951
3952#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3953    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3954    VSTORE(N0)                                                          \
3955    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3956
3957#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3958    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3959    VSTORE(N0)                                                          \
3960    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3961
3962
3963
3964
3965#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3966#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3967
3968
3969
3970#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3971#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3972
3973
3974
3975#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3976    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3977    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3978
3979#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3980    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3981    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3982    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3983
3984#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3985    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3986    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3987    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3988
3989#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3990    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3991    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3992    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3993
3994#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3995    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3996    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3997    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3998
3999#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4000    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4001    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4002    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
4003
4004#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4005    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4006    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4007    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
4008
4009#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4010    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4011    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4012    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
4013
4014#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4015    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4016    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4017    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
4018
4019#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4020    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
4021    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4022    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
4023
4024#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4025    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4026    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4027    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
4028
4029#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4030    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4031    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4032    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
4033
4034#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4035    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4036    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4037    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
4038
4039#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4040    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4041    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4042    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
4043
4044#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4045    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4046    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4047    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
4048
4049#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4050    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4051    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4052    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
4053
4054
4055
4056#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4057#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4058
4059#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4060    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
4061    {                                                                                                                                                     \
4062        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
4063    }                                                                                                                                                     \
4064    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
4065    {                                                                                                                                                     \
4066        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4067    }                                                                                                                                                     \
4068    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
4069    {                                                                                                                                                     \
4070        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4071    }                                                                                                                                                     \
4072    else                                                                                                                                                  \
4073    {                                                                                                                                                     \
4074        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
4075    }
4076
4077#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
4078    if(!(PARTIAL_COND_X))                                                                                         \
4079    {                                                                                                             \
4080        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4081    }                                                                                                             \
4082    else                                                                                                          \
4083    {                                                                                                             \
4084        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4085    }
4086
4087#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
4088    if(!(PARTIAL_COND_Y))                                                                                         \
4089    {                                                                                                             \
4090        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4091    }                                                                                                             \
4092    else                                                                                                          \
4093    {                                                                                                             \
4094        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4095    }
4096
4097
4098#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
4099
4100
4101#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
4102
4103#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4104    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4105
4106#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
4107
4108#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4109    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
4110
4111#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
4112
4113#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4114    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
4115
4116#else
4117
4118#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4119    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
4120
4121#endif
4122
4123#endif
4124
4125
4126#if defined(PARTIAL_STORE_M0)
4127
4128#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
4129    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
4130#else
4131#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
4132    ((uint)(y * M0))
4133#endif
4134
4135
4136
4137#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
4138    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
4139
4140
4141#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4142#pragma OPENCL EXTENSION cl_khr_fp16 : enable
4143#endif
4144
4145#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
4146#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
4147#endif
4148
4149#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
4150#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
4151#endif
4152
4153#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
4154#pragma OPENCL EXTENSION cl_arm_printf : enable
4155#endif
4156
4157#define GPU_ARCH_MIDGARD 0x100
4158#define GPU_ARCH_BIFROST 0x200
4159#define GPU_ARCH_VALHALL 0x300
4160
4161
4162#define CONCAT(a, b) a##b
4163
4164
4165#define EXPAND(x) x
4166
4167
4168#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
4169
4170
4171#define REV1(x) ((x))
4172#define REV2(x) ((x).s10)
4173#define REV3(x) ((x).s210)
4174#define REV4(x) ((x).s3210)
4175#define REV8(x) ((x).s76543210)
4176#define REV16(x) ((x).sFEDCBA9876543210)
4177
4178
4179
4180#define REVERSE_STR(x, s) REV##s((x))
4181#define REVERSE(x, s) REVERSE_STR(x, s)
4182
4183
4184
4185#define ROT1_0(x) ((x))
4186#define ROT1_1(x) ((x))
4187
4188#define ROT2_0(x) ((x))
4189#define ROT2_1(x) ((x).s10)
4190#define ROT2_2(x) ((x))
4191
4192#define ROT3_0(x) ((x))
4193#define ROT3_1(x) ((x).s201)
4194#define ROT3_2(x) ((x).s120)
4195#define ROT3_3(x) ((x))
4196
4197#define ROT4_0(x) ((x))
4198#define ROT4_1(x) ((x).s3012)
4199#define ROT4_2(x) ((x).s2301)
4200#define ROT4_3(x) ((x).s1230)
4201#define ROT4_4(x) ((x))
4202
4203#define ROT8_0(x) ((x))
4204#define ROT8_1(x) ((x).s70123456)
4205#define ROT8_2(x) ((x).s67012345)
4206#define ROT8_3(x) ((x).s56701234)
4207#define ROT8_4(x) ((x).s45670123)
4208#define ROT8_5(x) ((x).s34567012)
4209#define ROT8_6(x) ((x).s23456701)
4210#define ROT8_7(x) ((x).s12345670)
4211#define ROT8_8(x) ((x))
4212
4213#define ROT16_0(x) ((x))
4214#define ROT16_1(x) ((x).sF0123456789ABCDE)
4215#define ROT16_2(x) ((x).sEF0123456789ABCD)
4216#define ROT16_3(x) ((x).sDEF0123456789ABC)
4217#define ROT16_4(x) ((x).sCDEF0123456789AB)
4218#define ROT16_5(x) ((x).sBCDEF0123456789A)
4219#define ROT16_6(x) ((x).sABCDEF0123456789)
4220#define ROT16_7(x) ((x).s9ABCDEF012345678)
4221#define ROT16_8(x) ((x).s89ABCDEF01234567)
4222#define ROT16_9(x) ((x).s789ABCDEF0123456)
4223#define ROT16_10(x) ((x).s6789ABCDEF012345)
4224#define ROT16_11(x) ((x).s56789ABCDEF01234)
4225#define ROT16_12(x) ((x).s456789ABCDEF0123)
4226#define ROT16_13(x) ((x).s3456789ABCDEF012)
4227#define ROT16_14(x) ((x).s23456789ABCDEF01)
4228#define ROT16_15(x) ((x).s123456789ABCDEF0)
4229#define ROT16_16(x) ((x))
4230
4231
4232
4233#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
4234#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
4235
4236
4237
4238#define V_OFFS1(dt) (dt##1)(0)
4239#define V_OFFS2(dt) (dt##2)(0, 1)
4240#define V_OFFS3(dt) (dt##3)(0, 1, 2)
4241#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
4242#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
4243#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
4244
4245
4246
4247#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
4248#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
4249
4250
4251#define VLOAD_STR(size) vload##size
4252#define VLOAD(size) VLOAD_STR(size)
4253
4254
4255#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
4256#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
4257
4258#define NO_LOAD(data, offs, ptr) \
4259    {                            \
4260    }
4261
4262
4263#define vload_partial_1_0 NO_LOAD
4264#define vload_partial_1_1 vload1
4265#define vload_partial_1_2 NO_LOAD
4266#define vload_partial_1_3 NO_LOAD
4267#define vload_partial_1_4 NO_LOAD
4268#define vload_partial_1_5 NO_LOAD
4269#define vload_partial_1_6 NO_LOAD
4270#define vload_partial_1_7 NO_LOAD
4271#define vload_partial_1_8 NO_LOAD
4272#define vload_partial_1_9 NO_LOAD
4273#define vload_partial_1_10 NO_LOAD
4274#define vload_partial_1_11 NO_LOAD
4275#define vload_partial_1_12 NO_LOAD
4276#define vload_partial_1_13 NO_LOAD
4277#define vload_partial_1_14 NO_LOAD
4278#define vload_partial_1_15 NO_LOAD
4279#define vload_partial_1_16 NO_LOAD
4280
4281#define vload_partial_2_0 NO_LOAD
4282#define vload_partial_2_1 vload_partial_1
4283#define vload_partial_2_2 vload_partial_2
4284#define vload_partial_2_3 NO_LOAD
4285#define vload_partial_2_4 NO_LOAD
4286#define vload_partial_2_5 NO_LOAD
4287#define vload_partial_2_6 NO_LOAD
4288#define vload_partial_2_7 NO_LOAD
4289#define vload_partial_2_8 NO_LOAD
4290#define vload_partial_2_9 NO_LOAD
4291#define vload_partial_2_10 NO_LOAD
4292#define vload_partial_2_11 NO_LOAD
4293#define vload_partial_2_12 NO_LOAD
4294#define vload_partial_2_13 NO_LOAD
4295#define vload_partial_2_14 NO_LOAD
4296#define vload_partial_2_15 NO_LOAD
4297#define vload_partial_2_16 NO_LOAD
4298
4299#define vload_partial_3_0 NO_LOAD
4300#define vload_partial_3_1 vload_partial_1
4301#define vload_partial_3_2 vload_partial_2
4302#define vload_partial_3_3 vload_partial_3
4303#define vload_partial_3_4 NO_LOAD
4304#define vload_partial_3_5 NO_LOAD
4305#define vload_partial_3_6 NO_LOAD
4306#define vload_partial_3_7 NO_LOAD
4307#define vload_partial_3_8 NO_LOAD
4308#define vload_partial_3_9 NO_LOAD
4309#define vload_partial_3_10 NO_LOAD
4310#define vload_partial_3_11 NO_LOAD
4311#define vload_partial_3_12 NO_LOAD
4312#define vload_partial_3_13 NO_LOAD
4313#define vload_partial_3_14 NO_LOAD
4314#define vload_partial_3_15 NO_LOAD
4315#define vload_partial_3_16 NO_LOAD
4316
4317#define vload_partial_4_0 NO_LOAD
4318#define vload_partial_4_1 vload_partial_1
4319#define vload_partial_4_2 vload_partial_2
4320#define vload_partial_4_3 vload_partial_3
4321#define vload_partial_4_4 vload_partial_4
4322#define vload_partial_4_5 NO_LOAD
4323#define vload_partial_4_6 NO_LOAD
4324#define vload_partial_4_7 NO_LOAD
4325#define vload_partial_4_8 NO_LOAD
4326#define vload_partial_4_9 NO_LOAD
4327#define vload_partial_4_10 NO_LOAD
4328#define vload_partial_4_11 NO_LOAD
4329#define vload_partial_4_12 NO_LOAD
4330#define vload_partial_4_13 NO_LOAD
4331#define vload_partial_4_14 NO_LOAD
4332#define vload_partial_4_15 NO_LOAD
4333#define vload_partial_4_16 NO_LOAD
4334
4335#define vload_partial_8_0 NO_LOAD
4336#define vload_partial_8_1 vload_partial_1
4337#define vload_partial_8_2 vload_partial_2
4338#define vload_partial_8_3 vload_partial_3
4339#define vload_partial_8_4 vload_partial_4
4340#define vload_partial_8_5 vload_partial_5
4341#define vload_partial_8_6 vload_partial_6
4342#define vload_partial_8_7 vload_partial_7
4343#define vload_partial_8_8 vload_partial_8
4344#define vload_partial_8_9 NO_LOAD
4345#define vload_partial_8_10 NO_LOAD
4346#define vload_partial_8_11 NO_LOAD
4347#define vload_partial_8_12 NO_LOAD
4348#define vload_partial_8_13 NO_LOAD
4349#define vload_partial_8_14 NO_LOAD
4350#define vload_partial_8_15 NO_LOAD
4351#define vload_partial_8_16 NO_LOAD
4352
4353#define vload_partial_16_0 NO_LOAD
4354#define vload_partial_16_1 vload_partial_1
4355#define vload_partial_16_2 vload_partial_2
4356#define vload_partial_16_3 vload_partial_3
4357#define vload_partial_16_4 vload_partial_4
4358#define vload_partial_16_5 vload_partial_5
4359#define vload_partial_16_6 vload_partial_6
4360#define vload_partial_16_7 vload_partial_7
4361#define vload_partial_16_8 vload_partial_8
4362#define vload_partial_16_9 vload_partial_9
4363#define vload_partial_16_10 vload_partial_10
4364#define vload_partial_16_11 vload_partial_11
4365#define vload_partial_16_12 vload_partial_12
4366#define vload_partial_16_13 vload_partial_13
4367#define vload_partial_16_14 vload_partial_14
4368#define vload_partial_16_15 vload_partial_15
4369#define vload_partial_16_16 vload_partial_16
4370
4371
4372#define vload_partial_1(DATA, OFFSET, PTR) \
4373    DATA.s0 = vload1(OFFSET, PTR);
4374
4375#define vload_partial_2(DATA, OFFSET, PTR) \
4376    DATA.s01 = vload2(OFFSET, PTR);
4377
4378#define vload_partial_3(DATA, OFFSET, PTR) \
4379    DATA.s012 = vload3(OFFSET, PTR);
4380
4381#define vload_partial_4(DATA, OFFSET, PTR) \
4382    DATA.s0123 = vload4(OFFSET, PTR);
4383
4384#define vload_partial_5(DATA, OFFSET, PTR)    \
4385    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4386    DATA.s4 = vload1(OFFSET, PTR + 4);
4387
4388#define vload_partial_6(DATA, OFFSET, PTR)    \
4389    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4390    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
4391
4392#define vload_partial_7(DATA, OFFSET, PTR)    \
4393    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4394    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
4395
4396#define vload_partial_8(DATA, OFFSET, PTR) \
4397    DATA.s01234567 = vload8(OFFSET, PTR);
4398
4399#define vload_partial_9(DATA, OFFSET, PTR)        \
4400    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4401    DATA.s8 = vload1(OFFSET, PTR + 8);
4402
4403#define vload_partial_10(DATA, OFFSET, PTR)       \
4404    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4405    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
4406
4407#define vload_partial_11(DATA, OFFSET, PTR)       \
4408    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4409    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
4410
4411#define vload_partial_12(DATA, OFFSET, PTR)       \
4412    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4413    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
4414
4415#define vload_partial_13(DATA, OFFSET, PTR)       \
4416    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4417    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
4418
4419#define vload_partial_14(DATA, OFFSET, PTR)       \
4420    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4421    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
4422
4423#define vload_partial_15(DATA, OFFSET, PTR)       \
4424    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4425    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
4426
4427#define vload_partial_16(DATA, OFFSET, PTR) \
4428    DATA = vload16(OFFSET, PTR);
4429
4430
4431
4432#define PIXEL_UNIT4 1
4433#define PIXEL_UNIT8 2
4434#define PIXEL_UNIT16 4
4435
4436
4437#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
4438#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
4439
4440
4441#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
4442#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
4443#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
4444
4445#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4446#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
4447#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
4448#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
4449#endif
4450
4451#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
4452#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
4453#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4454
4455#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4456#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
4457#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
4458#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4459#endif
4460
4461
4462#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
4463#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
4464
4465
4466#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
4467#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
4468
4469#define VSTORE_STR(size) vstore##size
4470#define VSTORE(size) VSTORE_STR(size)
4471
4472#define float1 float
4473#define half1 half
4474#define char1 char
4475#define uchar1 uchar
4476#define short1 short
4477#define ushort1 ushort
4478#define int1 int
4479#define uint1 uint
4480#define long1 long
4481#define ulong1 ulong
4482#define double1 double
4483
4484#define vload1(OFFSET, PTR) *(OFFSET + PTR)
4485#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
4486
4487
4488#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
4489#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
4490
4491#define NO_STORE(data, offs, ptr) \
4492    {                             \
4493    }
4494
4495
4496#define vstore_partial_1_0 NO_STORE
4497#define vstore_partial_1_1 vstore1
4498#define vstore_partial_1_2 NO_STORE
4499#define vstore_partial_1_3 NO_STORE
4500#define vstore_partial_1_4 NO_STORE
4501#define vstore_partial_1_5 NO_STORE
4502#define vstore_partial_1_6 NO_STORE
4503#define vstore_partial_1_7 NO_STORE
4504#define vstore_partial_1_8 NO_STORE
4505#define vstore_partial_1_9 NO_STORE
4506#define vstore_partial_1_10 NO_STORE
4507#define vstore_partial_1_11 NO_STORE
4508#define vstore_partial_1_12 NO_STORE
4509#define vstore_partial_1_13 NO_STORE
4510#define vstore_partial_1_14 NO_STORE
4511#define vstore_partial_1_15 NO_STORE
4512#define vstore_partial_1_16 NO_STORE
4513
4514#define vstore_partial_2_0 NO_STORE
4515#define vstore_partial_2_1 vstore_partial_1
4516#define vstore_partial_2_2 vstore_partial_2
4517#define vstore_partial_2_3 NO_STORE
4518#define vstore_partial_2_4 NO_STORE
4519#define vstore_partial_2_5 NO_STORE
4520#define vstore_partial_2_6 NO_STORE
4521#define vstore_partial_2_7 NO_STORE
4522#define vstore_partial_2_8 NO_STORE
4523#define vstore_partial_2_9 NO_STORE
4524#define vstore_partial_2_10 NO_STORE
4525#define vstore_partial_2_11 NO_STORE
4526#define vstore_partial_2_12 NO_STORE
4527#define vstore_partial_2_13 NO_STORE
4528#define vstore_partial_2_14 NO_STORE
4529#define vstore_partial_2_15 NO_STORE
4530#define vstore_partial_2_16 NO_STORE
4531
4532#define vstore_partial_3_0 NO_STORE
4533#define vstore_partial_3_1 vstore_partial_1
4534#define vstore_partial_3_2 vstore_partial_2
4535#define vstore_partial_3_3 vstore_partial_3
4536#define vstore_partial_3_4 NO_STORE
4537#define vstore_partial_3_5 NO_STORE
4538#define vstore_partial_3_6 NO_STORE
4539#define vstore_partial_3_7 NO_STORE
4540#define vstore_partial_3_8 NO_STORE
4541#define vstore_partial_3_9 NO_STORE
4542#define vstore_partial_3_10 NO_STORE
4543#define vstore_partial_3_11 NO_STORE
4544#define vstore_partial_3_12 NO_STORE
4545#define vstore_partial_3_13 NO_STORE
4546#define vstore_partial_3_14 NO_STORE
4547#define vstore_partial_3_15 NO_STORE
4548#define vstore_partial_3_16 NO_STORE
4549
4550#define vstore_partial_4_0 NO_STORE
4551#define vstore_partial_4_1 vstore_partial_1
4552#define vstore_partial_4_2 vstore_partial_2
4553#define vstore_partial_4_3 vstore_partial_3
4554#define vstore_partial_4_4 vstore_partial_4
4555#define vstore_partial_4_5 NO_STORE
4556#define vstore_partial_4_6 NO_STORE
4557#define vstore_partial_4_7 NO_STORE
4558#define vstore_partial_4_8 NO_STORE
4559#define vstore_partial_4_9 NO_STORE
4560#define vstore_partial_4_10 NO_STORE
4561#define vstore_partial_4_11 NO_STORE
4562#define vstore_partial_4_12 NO_STORE
4563#define vstore_partial_4_13 NO_STORE
4564#define vstore_partial_4_14 NO_STORE
4565#define vstore_partial_4_15 NO_STORE
4566#define vstore_partial_4_16 NO_STORE
4567
4568#define vstore_partial_8_0 NO_STORE
4569#define vstore_partial_8_1 vstore_partial_1
4570#define vstore_partial_8_2 vstore_partial_2
4571#define vstore_partial_8_3 vstore_partial_3
4572#define vstore_partial_8_4 vstore_partial_4
4573#define vstore_partial_8_5 vstore_partial_5
4574#define vstore_partial_8_6 vstore_partial_6
4575#define vstore_partial_8_7 vstore_partial_7
4576#define vstore_partial_8_8 vstore_partial_8
4577#define vstore_partial_8_9 NO_STORE
4578#define vstore_partial_8_10 NO_STORE
4579#define vstore_partial_8_11 NO_STORE
4580#define vstore_partial_8_12 NO_STORE
4581#define vstore_partial_8_13 NO_STORE
4582#define vstore_partial_8_14 NO_STORE
4583#define vstore_partial_8_15 NO_STORE
4584#define vstore_partial_8_16 NO_STORE
4585
4586#define vstore_partial_16_0 NO_STORE
4587#define vstore_partial_16_1 vstore_partial_1
4588#define vstore_partial_16_2 vstore_partial_2
4589#define vstore_partial_16_3 vstore_partial_3
4590#define vstore_partial_16_4 vstore_partial_4
4591#define vstore_partial_16_5 vstore_partial_5
4592#define vstore_partial_16_6 vstore_partial_6
4593#define vstore_partial_16_7 vstore_partial_7
4594#define vstore_partial_16_8 vstore_partial_8
4595#define vstore_partial_16_9 vstore_partial_9
4596#define vstore_partial_16_10 vstore_partial_10
4597#define vstore_partial_16_11 vstore_partial_11
4598#define vstore_partial_16_12 vstore_partial_12
4599#define vstore_partial_16_13 vstore_partial_13
4600#define vstore_partial_16_14 vstore_partial_14
4601#define vstore_partial_16_15 vstore_partial_15
4602#define vstore_partial_16_16 vstore_partial_16
4603
4604
4605#define vstore_partial_1(DATA, OFFSET, PTR) \
4606    vstore1(DATA.s0, OFFSET, PTR);
4607
4608#define vstore_partial_2(DATA, OFFSET, PTR) \
4609    vstore2(DATA.s01, OFFSET, PTR);
4610
4611#define vstore_partial_3(DATA, OFFSET, PTR) \
4612    vstore3(DATA.s012, OFFSET, PTR);
4613
4614#define vstore_partial_4(DATA, OFFSET, PTR) \
4615    vstore4(DATA.s0123, OFFSET, PTR);
4616
4617#define vstore_partial_5(DATA, OFFSET, PTR)    \
4618    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4619    vstore1(DATA.s4, OFFSET, PTR + 4);
4620
4621#define vstore_partial_6(DATA, OFFSET, PTR)    \
4622    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4623    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
4624
4625#define vstore_partial_7(DATA, OFFSET, PTR)    \
4626    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4627    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
4628
4629#define vstore_partial_8(DATA, OFFSET, PTR) \
4630    vstore8(DATA.s01234567, OFFSET, PTR);
4631
4632#define vstore_partial_9(DATA, OFFSET, PTR)        \
4633    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4634    vstore1(DATA.s8, OFFSET, PTR + 8);
4635
4636#define vstore_partial_10(DATA, OFFSET, PTR)       \
4637    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4638    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
4639
4640#define vstore_partial_11(DATA, OFFSET, PTR)       \
4641    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4642    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
4643
4644#define vstore_partial_12(DATA, OFFSET, PTR)       \
4645    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4646    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
4647
4648#define vstore_partial_13(DATA, OFFSET, PTR)       \
4649    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4650    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
4651
4652#define vstore_partial_14(DATA, OFFSET, PTR)       \
4653    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4654    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
4655
4656#define vstore_partial_15(DATA, OFFSET, PTR)       \
4657    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4658    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
4659
4660#define vstore_partial_16(DATA, OFFSET, PTR) \
4661    vstore16(DATA, OFFSET, PTR);
4662
4663
4664
4665
4666
4667#define convert_float_sat convert_float
4668#define convert_float1_sat convert_float
4669#define convert_float2_sat convert_float2
4670#define convert_float3_sat convert_float3
4671#define convert_float4_sat convert_float4
4672#define convert_float8_sat convert_float8
4673#define convert_float16_sat convert_float16
4674#define convert_half_sat convert_float
4675#define convert_half1_sat convert_half
4676#define convert_half2_sat convert_half2
4677#define convert_half3_sat convert_half3
4678#define convert_half4_sat convert_half4
4679#define convert_half8_sat convert_half8
4680#define convert_half16_sat convert_half16
4681
4682#define convert_float1 convert_float
4683#define convert_half1 convert_half
4684#define convert_char1 convert_char
4685#define convert_uchar1 convert_uchar
4686#define convert_short1 convert_short
4687#define convert_ushort1 convert_ushort
4688#define convert_int1 convert_int
4689#define convert_uint1 convert_uint
4690#define convert_long1 convert_long
4691#define convert_ulong1 convert_ulong
4692#define convert_double1 convert_double
4693
4694#define convert_char1_sat convert_char_sat
4695#define convert_uchar1_sat convert_uchar_sat
4696#define convert_uchar2_sat convert_uchar2_sat
4697#define convert_uchar3_sat convert_uchar3_sat
4698#define convert_uchar4_sat convert_uchar4_sat
4699#define convert_uchar8_sat convert_uchar8_sat
4700#define convert_uchar16_sat convert_uchar16_sat
4701#define convert_short1_sat convert_short_sat
4702#define convert_ushort1_sat convert_ushort_sat
4703#define convert_int1_sat convert_int_sat
4704#define convert_uint1_sat convert_uint_sat
4705#define convert_long1_sat convert_long_sat
4706#define convert_ulong1_sat convert_ulong_sat
4707#define convert_double1_sat convert_double_sat
4708
4709#define VEC_DATA_TYPE_STR(type, size) type##size
4710#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
4711
4712#define CONVERT_STR(x, type) (convert_##type((x)))
4713#define CONVERT(x, type) CONVERT_STR(x, type)
4714
4715#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
4716#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
4717
4718#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
4719#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
4720
4721#define select_vec_dt_uchar(size) uchar##size
4722#define select_vec_dt_char(size) char##size
4723#define select_vec_dt_ushort(size) ushort##size
4724#define select_vec_dt_short(size) short##size
4725#define select_vec_dt_half(size) short##size
4726#define select_vec_dt_uint(size) uint##size
4727#define select_vec_dt_int(size) int##size
4728#define select_vec_dt_float(size) int##size
4729#define select_vec_dt_ulong(size) ulong##size
4730#define select_vec_dt_long(size) long##size
4731
4732#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
4733#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
4734#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
4735
4736#define signed_int_vec_dt_uchar(size) char##size
4737#define signed_int_vec_dt_char(size) char##size
4738#define signed_int_vec_dt_ushort(size) short##size
4739#define signed_int_vec_dt_short(size) short##size
4740#define signed_int_vec_dt_half(size) short##size
4741#define signed_int_vec_dt_uint(size) int##size
4742#define signed_int_vec_dt_int(size) int##size
4743#define signed_int_vec_dt_float(size) int##size
4744#define signed_int_vec_dt_ulong(size) long##size
4745#define signed_int_vec_dt_long(size) long##size
4746
4747#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
4748#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
4749#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
4750
4751#define sum_reduce_1(x) (x)
4752#define sum_reduce_2(x) ((x).s0) + ((x).s1)
4753#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
4754#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
4755#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
4756#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
4757
4758#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
4759#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
4760
4761#define prod_reduce_1(x) (x)
4762#define prod_reduce_2(x) ((x).s0) * ((x).s1)
4763#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
4764#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
4765#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
4766#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
4767
4768#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
4769#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
4770
4771#define max_reduce_1(x) (x)
4772#define max_reduce_2(x) max(((x).s0), ((x).s1))
4773#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
4774#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
4775#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
4776#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
4777
4778#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
4779#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
4780
4781#define VECTOR_DECLARATION(name)     \
4782    __global uchar *name##_ptr,      \
4783    uint        name##_stride_x, \
4784    uint        name##_step_x,   \
4785    uint        name##_offset_first_element_in_bytes
4786
4787#define IMAGE_DECLARATION(name)      \
4788    __global uchar *name##_ptr,      \
4789    uint        name##_stride_x, \
4790    uint        name##_step_x,   \
4791    uint        name##_stride_y, \
4792    uint        name##_step_y,   \
4793    uint        name##_offset_first_element_in_bytes
4794
4795#define TENSOR3D_DECLARATION(name)   \
4796    __global uchar *name##_ptr,      \
4797    uint        name##_stride_x, \
4798    uint        name##_step_x,   \
4799    uint        name##_stride_y, \
4800    uint        name##_step_y,   \
4801    uint        name##_stride_z, \
4802    uint        name##_step_z,   \
4803    uint        name##_offset_first_element_in_bytes
4804
4805#define TENSOR4D_DECLARATION(name)   \
4806    __global uchar *name##_ptr,      \
4807    uint        name##_stride_x, \
4808    uint        name##_step_x,   \
4809    uint        name##_stride_y, \
4810    uint        name##_step_y,   \
4811    uint        name##_stride_z, \
4812    uint        name##_step_z,   \
4813    uint        name##_stride_w, \
4814    uint        name##_step_w,   \
4815    uint        name##_offset_first_element_in_bytes
4816
4817#define TENSOR5D_DECLARATION(name)   \
4818    __global uchar *name##_ptr,      \
4819    uint        name##_stride_x, \
4820    uint        name##_step_x,   \
4821    uint        name##_stride_y, \
4822    uint        name##_step_y,   \
4823    uint        name##_stride_z, \
4824    uint        name##_step_z,   \
4825    uint        name##_stride_w, \
4826    uint        name##_step_w,   \
4827    uint        name##_stride_v, \
4828    uint        name##_step_v,   \
4829    uint        name##_offset_first_element_in_bytes
4830
4831#define CONVERT_TO_VECTOR_STRUCT(name) \
4832    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
4833
4834#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
4835    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
4836
4837#define CONVERT_TO_IMAGE_STRUCT(name) \
4838    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
4839
4840#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
4841    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
4842
4843#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
4844    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
4845
4846#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
4847    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
4848
4849#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
4850    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
4851
4852#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
4853    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4854                                 name##_stride_z, name##_step_z)
4855
4856#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
4857    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
4858
4859#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
4860    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4861                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
4862
4863#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
4864    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
4865
4866#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
4867    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4868                           name##_stride_z, name##_step_z)
4869
4870
4871typedef struct Vector
4872{
4873    __global uchar *ptr;
4874    int             offset_first_element_in_bytes;
4875    int             stride_x;
4876} Vector;
4877
4878
4879typedef struct Image
4880{
4881    __global uchar *ptr;
4882    int             offset_first_element_in_bytes;
4883    int             stride_x;
4884    int             stride_y;
4885} Image;
4886
4887
4888typedef struct Tensor3D
4889{
4890    __global uchar *ptr;
4891    int             offset_first_element_in_bytes;
4892    int             stride_x;
4893    int             stride_y;
4894    int             stride_z;
4895} Tensor3D;
4896
4897
4898typedef struct Tensor4D
4899{
4900    __global uchar *ptr;
4901    int             offset_first_element_in_bytes;
4902    int             stride_x;
4903    int             stride_y;
4904    int             stride_z;
4905    int             stride_w;
4906} Tensor4D;
4907
4908
4909inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
4910{
4911    Vector vector =
4912    {
4913        .ptr                           = ptr,
4914        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4915        .stride_x                      = stride_x,
4916    };
4917    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
4918    return vector;
4919}
4920
4921
4922inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
4923{
4924    Image img =
4925    {
4926        .ptr                           = ptr,
4927        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4928        .stride_x                      = stride_x,
4929        .stride_y                      = stride_y
4930    };
4931    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
4932    return img;
4933}
4934
4935
4936inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4937{
4938    Image img =
4939    {
4940        .ptr                           = ptr,
4941        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4942        .stride_x                      = stride_x,
4943        .stride_y                      = stride_y
4944    };
4945    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4946    return img;
4947}
4948
4949
4950inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4951{
4952    Tensor3D tensor =
4953    {
4954        .ptr                           = ptr,
4955        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4956        .stride_x                      = stride_x,
4957        .stride_y                      = stride_y,
4958        .stride_z                      = stride_z
4959    };
4960    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4961    return tensor;
4962}
4963
4964
4965inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4966{
4967    Tensor3D tensor =
4968    {
4969        .ptr                           = ptr,
4970        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4971        .stride_x                      = stride_x,
4972        .stride_y                      = stride_y,
4973        .stride_z                      = stride_z
4974    };
4975    return tensor;
4976}
4977
4978inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
4979                                             uint step_w,
4980                                             uint mod_size)
4981{
4982    Tensor4D tensor =
4983    {
4984        .ptr                           = ptr,
4985        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4986        .stride_x                      = stride_x,
4987        .stride_y                      = stride_y,
4988        .stride_z                      = stride_z,
4989        .stride_w                      = stride_w
4990    };
4991
4992    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
4993    return tensor;
4994}
4995
4996
4997inline __global const uchar *vector_offset(const Vector *vec, int x)
4998{
4999    return vec->ptr + x * vec->stride_x;
5000}
5001
5002
5003inline __global uchar *offset(const Image *img, int x, int y)
5004{
5005    return img->ptr + x * img->stride_x + y * img->stride_y;
5006}
5007
5008
5009inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
5010{
5011    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
5012}
5013
5014
5015inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
5016{
5017    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
5018}
5019
5020
5021inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
5022{
5023    uint num_elements = width * height;
5024
5025    const uint z = index / num_elements;
5026
5027    index %= num_elements;
5028
5029    const uint y = index / width;
5030
5031    index %= width;
5032
5033    const uint x = index;
5034
5035    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
5036}
5037
5038#endif
5039
5040#ifndef ARM_COMPUTE_REPEAT_H
5041#define ARM_COMPUTE_REPEAT_H
5042
5043
5044#ifndef ARM_COMPUTE_HELPER_H
5045#define ARM_COMPUTE_HELPER_H
5046
5047
5048
5049
5050#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5051    VSTORE(N0)                                                 \
5052    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5053
5054#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5055    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5056    VSTORE(N0)                                                 \
5057    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5058
5059#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5060    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5061    VSTORE(N0)                                                 \
5062    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5063
5064#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5065    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5066    VSTORE(N0)                                                 \
5067    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5068
5069#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5070    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5071    VSTORE(N0)                                                 \
5072    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5073
5074#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5075    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5076    VSTORE(N0)                                                 \
5077    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5078
5079#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5080    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5081    VSTORE(N0)                                                 \
5082    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5083
5084#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5085    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5086    VSTORE(N0)                                                 \
5087    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5088
5089#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5090    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5091    VSTORE(N0)                                                 \
5092    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5093
5094#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5095    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5096    VSTORE(N0)                                                  \
5097    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5098
5099#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5100    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5101    VSTORE(N0)                                                  \
5102    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5103
5104#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5105    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5106    VSTORE(N0)                                                  \
5107    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5108
5109#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5110    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5111    VSTORE(N0)                                                  \
5112    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5113
5114#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5115    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5116    VSTORE(N0)                                                  \
5117    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5118
5119#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5120    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5121    VSTORE(N0)                                                  \
5122    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5123
5124#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5125    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5126    VSTORE(N0)                                                  \
5127    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5128
5129
5130
5131#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5132    VSTORE(N0)                                                         \
5133    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5134
5135#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5136    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5137    VSTORE(N0)                                                         \
5138    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5139
5140#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5141    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5142    VSTORE(N0)                                                         \
5143    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5144
5145#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5146    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5147    VSTORE(N0)                                                         \
5148    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5149
5150#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5151    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5152    VSTORE(N0)                                                         \
5153    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5154
5155#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5156    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5157    VSTORE(N0)                                                         \
5158    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5159
5160#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5161    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5162    VSTORE(N0)                                                         \
5163    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5164
5165#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5166    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5167    VSTORE(N0)                                                         \
5168    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5169
5170#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5171    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5172    VSTORE(N0)                                                         \
5173    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5174
5175#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
5176    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5177    VSTORE(N0)                                                     \
5178    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5179
5180#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5181    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5182    VSTORE(N0)                                                          \
5183    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5184
5185#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5186    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5187    VSTORE(N0)                                                          \
5188    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5189
5190#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5191    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5192    VSTORE(N0)                                                          \
5193    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5194
5195#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5196    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5197    VSTORE(N0)                                                          \
5198    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5199
5200#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5201    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5202    VSTORE(N0)                                                          \
5203    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5204
5205#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5206    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5207    VSTORE(N0)                                                          \
5208    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5209
5210
5211
5212
5213#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5214#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5215
5216
5217
5218#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5219#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5220
5221
5222
5223#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5224    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5225    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5226
5227#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5228    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5229    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5230    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5231
5232#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5233    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5234    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5235    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5236
5237#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5238    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5239    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5240    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5241
5242#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5243    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5244    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5245    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5246
5247#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5248    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5249    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5250    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5251
5252#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5253    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5254    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5255    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5256
5257#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5258    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5259    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5260    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5261
5262#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5263    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5264    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5265    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5266
5267#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5268    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5269    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5270    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5271
5272#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5273    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5274    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5275    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5276
5277#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5278    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5279    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5280    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5281
5282#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5283    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5284    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5285    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5286
5287#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5288    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5289    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5290    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5291
5292#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5293    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5294    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5295    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5296
5297#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5298    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5299    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5300    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5301
5302
5303
5304#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5305#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5306
5307#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5308    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
5309    {                                                                                                                                                     \
5310        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
5311    }                                                                                                                                                     \
5312    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
5313    {                                                                                                                                                     \
5314        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5315    }                                                                                                                                                     \
5316    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
5317    {                                                                                                                                                     \
5318        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5319    }                                                                                                                                                     \
5320    else                                                                                                                                                  \
5321    {                                                                                                                                                     \
5322        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
5323    }
5324
5325#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
5326    if(!(PARTIAL_COND_X))                                                                                         \
5327    {                                                                                                             \
5328        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
5329    }                                                                                                             \
5330    else                                                                                                          \
5331    {                                                                                                             \
5332        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
5333    }
5334
5335#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
5336    if(!(PARTIAL_COND_Y))                                                                                         \
5337    {                                                                                                             \
5338        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
5339    }                                                                                                             \
5340    else                                                                                                          \
5341    {                                                                                                             \
5342        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
5343    }
5344
5345
5346#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
5347
5348
5349#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
5350
5351#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5352    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5353
5354#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
5355
5356#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5357    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
5358
5359#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
5360
5361#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5362    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
5363
5364#else
5365
5366#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5367    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
5368
5369#endif
5370
5371#endif
5372
5373
5374#if defined(PARTIAL_STORE_M0)
5375
5376#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
5377    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
5378#else
5379#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
5380    ((uint)(y * M0))
5381#endif
5382
5383
5384
5385#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
5386    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
5387
5388
5389#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
5390#pragma OPENCL EXTENSION cl_khr_fp16 : enable
5391#endif
5392
5393#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
5394#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
5395#endif
5396
5397#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
5398#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
5399#endif
5400
5401#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
5402#pragma OPENCL EXTENSION cl_arm_printf : enable
5403#endif
5404
5405#define GPU_ARCH_MIDGARD 0x100
5406#define GPU_ARCH_BIFROST 0x200
5407#define GPU_ARCH_VALHALL 0x300
5408
5409
5410#define CONCAT(a, b) a##b
5411
5412
5413#define EXPAND(x) x
5414
5415
5416#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
5417
5418
5419#define REV1(x) ((x))
5420#define REV2(x) ((x).s10)
5421#define REV3(x) ((x).s210)
5422#define REV4(x) ((x).s3210)
5423#define REV8(x) ((x).s76543210)
5424#define REV16(x) ((x).sFEDCBA9876543210)
5425
5426
5427
5428#define REVERSE_STR(x, s) REV##s((x))
5429#define REVERSE(x, s) REVERSE_STR(x, s)
5430
5431
5432
5433#define ROT1_0(x) ((x))
5434#define ROT1_1(x) ((x))
5435
5436#define ROT2_0(x) ((x))
5437#define ROT2_1(x) ((x).s10)
5438#define ROT2_2(x) ((x))
5439
5440#define ROT3_0(x) ((x))
5441#define ROT3_1(x) ((x).s201)
5442#define ROT3_2(x) ((x).s120)
5443#define ROT3_3(x) ((x))
5444
5445#define ROT4_0(x) ((x))
5446#define ROT4_1(x) ((x).s3012)
5447#define ROT4_2(x) ((x).s2301)
5448#define ROT4_3(x) ((x).s1230)
5449#define ROT4_4(x) ((x))
5450
5451#define ROT8_0(x) ((x))
5452#define ROT8_1(x) ((x).s70123456)
5453#define ROT8_2(x) ((x).s67012345)
5454#define ROT8_3(x) ((x).s56701234)
5455#define ROT8_4(x) ((x).s45670123)
5456#define ROT8_5(x) ((x).s34567012)
5457#define ROT8_6(x) ((x).s23456701)
5458#define ROT8_7(x) ((x).s12345670)
5459#define ROT8_8(x) ((x))
5460
5461#define ROT16_0(x) ((x))
5462#define ROT16_1(x) ((x).sF0123456789ABCDE)
5463#define ROT16_2(x) ((x).sEF0123456789ABCD)
5464#define ROT16_3(x) ((x).sDEF0123456789ABC)
5465#define ROT16_4(x) ((x).sCDEF0123456789AB)
5466#define ROT16_5(x) ((x).sBCDEF0123456789A)
5467#define ROT16_6(x) ((x).sABCDEF0123456789)
5468#define ROT16_7(x) ((x).s9ABCDEF012345678)
5469#define ROT16_8(x) ((x).s89ABCDEF01234567)
5470#define ROT16_9(x) ((x).s789ABCDEF0123456)
5471#define ROT16_10(x) ((x).s6789ABCDEF012345)
5472#define ROT16_11(x) ((x).s56789ABCDEF01234)
5473#define ROT16_12(x) ((x).s456789ABCDEF0123)
5474#define ROT16_13(x) ((x).s3456789ABCDEF012)
5475#define ROT16_14(x) ((x).s23456789ABCDEF01)
5476#define ROT16_15(x) ((x).s123456789ABCDEF0)
5477#define ROT16_16(x) ((x))
5478
5479
5480
5481#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
5482#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
5483
5484
5485
5486#define V_OFFS1(dt) (dt##1)(0)
5487#define V_OFFS2(dt) (dt##2)(0, 1)
5488#define V_OFFS3(dt) (dt##3)(0, 1, 2)
5489#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
5490#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
5491#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
5492
5493
5494
5495#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
5496#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
5497
5498
5499#define VLOAD_STR(size) vload##size
5500#define VLOAD(size) VLOAD_STR(size)
5501
5502
5503#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
5504#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
5505
5506#define NO_LOAD(data, offs, ptr) \
5507    {                            \
5508    }
5509
5510
5511#define vload_partial_1_0 NO_LOAD
5512#define vload_partial_1_1 vload1
5513#define vload_partial_1_2 NO_LOAD
5514#define vload_partial_1_3 NO_LOAD
5515#define vload_partial_1_4 NO_LOAD
5516#define vload_partial_1_5 NO_LOAD
5517#define vload_partial_1_6 NO_LOAD
5518#define vload_partial_1_7 NO_LOAD
5519#define vload_partial_1_8 NO_LOAD
5520#define vload_partial_1_9 NO_LOAD
5521#define vload_partial_1_10 NO_LOAD
5522#define vload_partial_1_11 NO_LOAD
5523#define vload_partial_1_12 NO_LOAD
5524#define vload_partial_1_13 NO_LOAD
5525#define vload_partial_1_14 NO_LOAD
5526#define vload_partial_1_15 NO_LOAD
5527#define vload_partial_1_16 NO_LOAD
5528
5529#define vload_partial_2_0 NO_LOAD
5530#define vload_partial_2_1 vload_partial_1
5531#define vload_partial_2_2 vload_partial_2
5532#define vload_partial_2_3 NO_LOAD
5533#define vload_partial_2_4 NO_LOAD
5534#define vload_partial_2_5 NO_LOAD
5535#define vload_partial_2_6 NO_LOAD
5536#define vload_partial_2_7 NO_LOAD
5537#define vload_partial_2_8 NO_LOAD
5538#define vload_partial_2_9 NO_LOAD
5539#define vload_partial_2_10 NO_LOAD
5540#define vload_partial_2_11 NO_LOAD
5541#define vload_partial_2_12 NO_LOAD
5542#define vload_partial_2_13 NO_LOAD
5543#define vload_partial_2_14 NO_LOAD
5544#define vload_partial_2_15 NO_LOAD
5545#define vload_partial_2_16 NO_LOAD
5546
5547#define vload_partial_3_0 NO_LOAD
5548#define vload_partial_3_1 vload_partial_1
5549#define vload_partial_3_2 vload_partial_2
5550#define vload_partial_3_3 vload_partial_3
5551#define vload_partial_3_4 NO_LOAD
5552#define vload_partial_3_5 NO_LOAD
5553#define vload_partial_3_6 NO_LOAD
5554#define vload_partial_3_7 NO_LOAD
5555#define vload_partial_3_8 NO_LOAD
5556#define vload_partial_3_9 NO_LOAD
5557#define vload_partial_3_10 NO_LOAD
5558#define vload_partial_3_11 NO_LOAD
5559#define vload_partial_3_12 NO_LOAD
5560#define vload_partial_3_13 NO_LOAD
5561#define vload_partial_3_14 NO_LOAD
5562#define vload_partial_3_15 NO_LOAD
5563#define vload_partial_3_16 NO_LOAD
5564
5565#define vload_partial_4_0 NO_LOAD
5566#define vload_partial_4_1 vload_partial_1
5567#define vload_partial_4_2 vload_partial_2
5568#define vload_partial_4_3 vload_partial_3
5569#define vload_partial_4_4 vload_partial_4
5570#define vload_partial_4_5 NO_LOAD
5571#define vload_partial_4_6 NO_LOAD
5572#define vload_partial_4_7 NO_LOAD
5573#define vload_partial_4_8 NO_LOAD
5574#define vload_partial_4_9 NO_LOAD
5575#define vload_partial_4_10 NO_LOAD
5576#define vload_partial_4_11 NO_LOAD
5577#define vload_partial_4_12 NO_LOAD
5578#define vload_partial_4_13 NO_LOAD
5579#define vload_partial_4_14 NO_LOAD
5580#define vload_partial_4_15 NO_LOAD
5581#define vload_partial_4_16 NO_LOAD
5582
5583#define vload_partial_8_0 NO_LOAD
5584#define vload_partial_8_1 vload_partial_1
5585#define vload_partial_8_2 vload_partial_2
5586#define vload_partial_8_3 vload_partial_3
5587#define vload_partial_8_4 vload_partial_4
5588#define vload_partial_8_5 vload_partial_5
5589#define vload_partial_8_6 vload_partial_6
5590#define vload_partial_8_7 vload_partial_7
5591#define vload_partial_8_8 vload_partial_8
5592#define vload_partial_8_9 NO_LOAD
5593#define vload_partial_8_10 NO_LOAD
5594#define vload_partial_8_11 NO_LOAD
5595#define vload_partial_8_12 NO_LOAD
5596#define vload_partial_8_13 NO_LOAD
5597#define vload_partial_8_14 NO_LOAD
5598#define vload_partial_8_15 NO_LOAD
5599#define vload_partial_8_16 NO_LOAD
5600
5601#define vload_partial_16_0 NO_LOAD
5602#define vload_partial_16_1 vload_partial_1
5603#define vload_partial_16_2 vload_partial_2
5604#define vload_partial_16_3 vload_partial_3
5605#define vload_partial_16_4 vload_partial_4
5606#define vload_partial_16_5 vload_partial_5
5607#define vload_partial_16_6 vload_partial_6
5608#define vload_partial_16_7 vload_partial_7
5609#define vload_partial_16_8 vload_partial_8
5610#define vload_partial_16_9 vload_partial_9
5611#define vload_partial_16_10 vload_partial_10
5612#define vload_partial_16_11 vload_partial_11
5613#define vload_partial_16_12 vload_partial_12
5614#define vload_partial_16_13 vload_partial_13
5615#define vload_partial_16_14 vload_partial_14
5616#define vload_partial_16_15 vload_partial_15
5617#define vload_partial_16_16 vload_partial_16
5618
5619
5620#define vload_partial_1(DATA, OFFSET, PTR) \
5621    DATA.s0 = vload1(OFFSET, PTR);
5622
5623#define vload_partial_2(DATA, OFFSET, PTR) \
5624    DATA.s01 = vload2(OFFSET, PTR);
5625
5626#define vload_partial_3(DATA, OFFSET, PTR) \
5627    DATA.s012 = vload3(OFFSET, PTR);
5628
5629#define vload_partial_4(DATA, OFFSET, PTR) \
5630    DATA.s0123 = vload4(OFFSET, PTR);
5631
5632#define vload_partial_5(DATA, OFFSET, PTR)    \
5633    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5634    DATA.s4 = vload1(OFFSET, PTR + 4);
5635
5636#define vload_partial_6(DATA, OFFSET, PTR)    \
5637    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5638    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
5639
5640#define vload_partial_7(DATA, OFFSET, PTR)    \
5641    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5642    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
5643
5644#define vload_partial_8(DATA, OFFSET, PTR) \
5645    DATA.s01234567 = vload8(OFFSET, PTR);
5646
5647#define vload_partial_9(DATA, OFFSET, PTR)        \
5648    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5649    DATA.s8 = vload1(OFFSET, PTR + 8);
5650
5651#define vload_partial_10(DATA, OFFSET, PTR)       \
5652    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5653    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
5654
5655#define vload_partial_11(DATA, OFFSET, PTR)       \
5656    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5657    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
5658
5659#define vload_partial_12(DATA, OFFSET, PTR)       \
5660    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5661    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
5662
5663#define vload_partial_13(DATA, OFFSET, PTR)       \
5664    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5665    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
5666
5667#define vload_partial_14(DATA, OFFSET, PTR)       \
5668    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5669    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
5670
5671#define vload_partial_15(DATA, OFFSET, PTR)       \
5672    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5673    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
5674
5675#define vload_partial_16(DATA, OFFSET, PTR) \
5676    DATA = vload16(OFFSET, PTR);
5677
5678
5679
5680#define PIXEL_UNIT4 1
5681#define PIXEL_UNIT8 2
5682#define PIXEL_UNIT16 4
5683
5684
5685#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
5686#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
5687
5688
5689#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
5690#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
5691#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
5692
5693#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
5694#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
5695#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
5696#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
5697#endif
5698
5699#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
5700#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
5701#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
5702
5703#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
5704#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
5705#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
5706#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
5707#endif
5708
5709
5710#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
5711#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
5712
5713
5714#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
5715#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
5716
5717#define VSTORE_STR(size) vstore##size
5718#define VSTORE(size) VSTORE_STR(size)
5719
5720#define float1 float
5721#define half1 half
5722#define char1 char
5723#define uchar1 uchar
5724#define short1 short
5725#define ushort1 ushort
5726#define int1 int
5727#define uint1 uint
5728#define long1 long
5729#define ulong1 ulong
5730#define double1 double
5731
5732#define vload1(OFFSET, PTR) *(OFFSET + PTR)
5733#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
5734
5735
5736#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
5737#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
5738
5739#define NO_STORE(data, offs, ptr) \
5740    {                             \
5741    }
5742
5743
5744#define vstore_partial_1_0 NO_STORE
5745#define vstore_partial_1_1 vstore1
5746#define vstore_partial_1_2 NO_STORE
5747#define vstore_partial_1_3 NO_STORE
5748#define vstore_partial_1_4 NO_STORE
5749#define vstore_partial_1_5 NO_STORE
5750#define vstore_partial_1_6 NO_STORE
5751#define vstore_partial_1_7 NO_STORE
5752#define vstore_partial_1_8 NO_STORE
5753#define vstore_partial_1_9 NO_STORE
5754#define vstore_partial_1_10 NO_STORE
5755#define vstore_partial_1_11 NO_STORE
5756#define vstore_partial_1_12 NO_STORE
5757#define vstore_partial_1_13 NO_STORE
5758#define vstore_partial_1_14 NO_STORE
5759#define vstore_partial_1_15 NO_STORE
5760#define vstore_partial_1_16 NO_STORE
5761
5762#define vstore_partial_2_0 NO_STORE
5763#define vstore_partial_2_1 vstore_partial_1
5764#define vstore_partial_2_2 vstore_partial_2
5765#define vstore_partial_2_3 NO_STORE
5766#define vstore_partial_2_4 NO_STORE
5767#define vstore_partial_2_5 NO_STORE
5768#define vstore_partial_2_6 NO_STORE
5769#define vstore_partial_2_7 NO_STORE
5770#define vstore_partial_2_8 NO_STORE
5771#define vstore_partial_2_9 NO_STORE
5772#define vstore_partial_2_10 NO_STORE
5773#define vstore_partial_2_11 NO_STORE
5774#define vstore_partial_2_12 NO_STORE
5775#define vstore_partial_2_13 NO_STORE
5776#define vstore_partial_2_14 NO_STORE
5777#define vstore_partial_2_15 NO_STORE
5778#define vstore_partial_2_16 NO_STORE
5779
5780#define vstore_partial_3_0 NO_STORE
5781#define vstore_partial_3_1 vstore_partial_1
5782#define vstore_partial_3_2 vstore_partial_2
5783#define vstore_partial_3_3 vstore_partial_3
5784#define vstore_partial_3_4 NO_STORE
5785#define vstore_partial_3_5 NO_STORE
5786#define vstore_partial_3_6 NO_STORE
5787#define vstore_partial_3_7 NO_STORE
5788#define vstore_partial_3_8 NO_STORE
5789#define vstore_partial_3_9 NO_STORE
5790#define vstore_partial_3_10 NO_STORE
5791#define vstore_partial_3_11 NO_STORE
5792#define vstore_partial_3_12 NO_STORE
5793#define vstore_partial_3_13 NO_STORE
5794#define vstore_partial_3_14 NO_STORE
5795#define vstore_partial_3_15 NO_STORE
5796#define vstore_partial_3_16 NO_STORE
5797
5798#define vstore_partial_4_0 NO_STORE
5799#define vstore_partial_4_1 vstore_partial_1
5800#define vstore_partial_4_2 vstore_partial_2
5801#define vstore_partial_4_3 vstore_partial_3
5802#define vstore_partial_4_4 vstore_partial_4
5803#define vstore_partial_4_5 NO_STORE
5804#define vstore_partial_4_6 NO_STORE
5805#define vstore_partial_4_7 NO_STORE
5806#define vstore_partial_4_8 NO_STORE
5807#define vstore_partial_4_9 NO_STORE
5808#define vstore_partial_4_10 NO_STORE
5809#define vstore_partial_4_11 NO_STORE
5810#define vstore_partial_4_12 NO_STORE
5811#define vstore_partial_4_13 NO_STORE
5812#define vstore_partial_4_14 NO_STORE
5813#define vstore_partial_4_15 NO_STORE
5814#define vstore_partial_4_16 NO_STORE
5815
5816#define vstore_partial_8_0 NO_STORE
5817#define vstore_partial_8_1 vstore_partial_1
5818#define vstore_partial_8_2 vstore_partial_2
5819#define vstore_partial_8_3 vstore_partial_3
5820#define vstore_partial_8_4 vstore_partial_4
5821#define vstore_partial_8_5 vstore_partial_5
5822#define vstore_partial_8_6 vstore_partial_6
5823#define vstore_partial_8_7 vstore_partial_7
5824#define vstore_partial_8_8 vstore_partial_8
5825#define vstore_partial_8_9 NO_STORE
5826#define vstore_partial_8_10 NO_STORE
5827#define vstore_partial_8_11 NO_STORE
5828#define vstore_partial_8_12 NO_STORE
5829#define vstore_partial_8_13 NO_STORE
5830#define vstore_partial_8_14 NO_STORE
5831#define vstore_partial_8_15 NO_STORE
5832#define vstore_partial_8_16 NO_STORE
5833
5834#define vstore_partial_16_0 NO_STORE
5835#define vstore_partial_16_1 vstore_partial_1
5836#define vstore_partial_16_2 vstore_partial_2
5837#define vstore_partial_16_3 vstore_partial_3
5838#define vstore_partial_16_4 vstore_partial_4
5839#define vstore_partial_16_5 vstore_partial_5
5840#define vstore_partial_16_6 vstore_partial_6
5841#define vstore_partial_16_7 vstore_partial_7
5842#define vstore_partial_16_8 vstore_partial_8
5843#define vstore_partial_16_9 vstore_partial_9
5844#define vstore_partial_16_10 vstore_partial_10
5845#define vstore_partial_16_11 vstore_partial_11
5846#define vstore_partial_16_12 vstore_partial_12
5847#define vstore_partial_16_13 vstore_partial_13
5848#define vstore_partial_16_14 vstore_partial_14
5849#define vstore_partial_16_15 vstore_partial_15
5850#define vstore_partial_16_16 vstore_partial_16
5851
5852
5853#define vstore_partial_1(DATA, OFFSET, PTR) \
5854    vstore1(DATA.s0, OFFSET, PTR);
5855
5856#define vstore_partial_2(DATA, OFFSET, PTR) \
5857    vstore2(DATA.s01, OFFSET, PTR);
5858
5859#define vstore_partial_3(DATA, OFFSET, PTR) \
5860    vstore3(DATA.s012, OFFSET, PTR);
5861
5862#define vstore_partial_4(DATA, OFFSET, PTR) \
5863    vstore4(DATA.s0123, OFFSET, PTR);
5864
5865#define vstore_partial_5(DATA, OFFSET, PTR)    \
5866    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5867    vstore1(DATA.s4, OFFSET, PTR + 4);
5868
5869#define vstore_partial_6(DATA, OFFSET, PTR)    \
5870    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5871    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
5872
5873#define vstore_partial_7(DATA, OFFSET, PTR)    \
5874    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5875    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
5876
5877#define vstore_partial_8(DATA, OFFSET, PTR) \
5878    vstore8(DATA.s01234567, OFFSET, PTR);
5879
5880#define vstore_partial_9(DATA, OFFSET, PTR)        \
5881    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5882    vstore1(DATA.s8, OFFSET, PTR + 8);
5883
5884#define vstore_partial_10(DATA, OFFSET, PTR)       \
5885    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5886    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
5887
5888#define vstore_partial_11(DATA, OFFSET, PTR)       \
5889    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5890    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
5891
5892#define vstore_partial_12(DATA, OFFSET, PTR)       \
5893    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5894    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
5895
5896#define vstore_partial_13(DATA, OFFSET, PTR)       \
5897    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5898    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
5899
5900#define vstore_partial_14(DATA, OFFSET, PTR)       \
5901    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5902    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
5903
5904#define vstore_partial_15(DATA, OFFSET, PTR)       \
5905    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5906    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
5907
5908#define vstore_partial_16(DATA, OFFSET, PTR) \
5909    vstore16(DATA, OFFSET, PTR);
5910
5911
5912
5913
5914
5915#define convert_float_sat convert_float
5916#define convert_float1_sat convert_float
5917#define convert_float2_sat convert_float2
5918#define convert_float3_sat convert_float3
5919#define convert_float4_sat convert_float4
5920#define convert_float8_sat convert_float8
5921#define convert_float16_sat convert_float16
5922#define convert_half_sat convert_float
5923#define convert_half1_sat convert_half
5924#define convert_half2_sat convert_half2
5925#define convert_half3_sat convert_half3
5926#define convert_half4_sat convert_half4
5927#define convert_half8_sat convert_half8
5928#define convert_half16_sat convert_half16
5929
5930#define convert_float1 convert_float
5931#define convert_half1 convert_half
5932#define convert_char1 convert_char
5933#define convert_uchar1 convert_uchar
5934#define convert_short1 convert_short
5935#define convert_ushort1 convert_ushort
5936#define convert_int1 convert_int
5937#define convert_uint1 convert_uint
5938#define convert_long1 convert_long
5939#define convert_ulong1 convert_ulong
5940#define convert_double1 convert_double
5941
5942#define convert_char1_sat convert_char_sat
5943#define convert_uchar1_sat convert_uchar_sat
5944#define convert_uchar2_sat convert_uchar2_sat
5945#define convert_uchar3_sat convert_uchar3_sat
5946#define convert_uchar4_sat convert_uchar4_sat
5947#define convert_uchar8_sat convert_uchar8_sat
5948#define convert_uchar16_sat convert_uchar16_sat
5949#define convert_short1_sat convert_short_sat
5950#define convert_ushort1_sat convert_ushort_sat
5951#define convert_int1_sat convert_int_sat
5952#define convert_uint1_sat convert_uint_sat
5953#define convert_long1_sat convert_long_sat
5954#define convert_ulong1_sat convert_ulong_sat
5955#define convert_double1_sat convert_double_sat
5956
5957#define VEC_DATA_TYPE_STR(type, size) type##size
5958#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
5959
5960#define CONVERT_STR(x, type) (convert_##type((x)))
5961#define CONVERT(x, type) CONVERT_STR(x, type)
5962
5963#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
5964#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
5965
5966#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
5967#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
5968
5969#define select_vec_dt_uchar(size) uchar##size
5970#define select_vec_dt_char(size) char##size
5971#define select_vec_dt_ushort(size) ushort##size
5972#define select_vec_dt_short(size) short##size
5973#define select_vec_dt_half(size) short##size
5974#define select_vec_dt_uint(size) uint##size
5975#define select_vec_dt_int(size) int##size
5976#define select_vec_dt_float(size) int##size
5977#define select_vec_dt_ulong(size) ulong##size
5978#define select_vec_dt_long(size) long##size
5979
5980#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
5981#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
5982#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
5983
5984#define signed_int_vec_dt_uchar(size) char##size
5985#define signed_int_vec_dt_char(size) char##size
5986#define signed_int_vec_dt_ushort(size) short##size
5987#define signed_int_vec_dt_short(size) short##size
5988#define signed_int_vec_dt_half(size) short##size
5989#define signed_int_vec_dt_uint(size) int##size
5990#define signed_int_vec_dt_int(size) int##size
5991#define signed_int_vec_dt_float(size) int##size
5992#define signed_int_vec_dt_ulong(size) long##size
5993#define signed_int_vec_dt_long(size) long##size
5994
5995#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
5996#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
5997#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
5998
5999#define sum_reduce_1(x) (x)
6000#define sum_reduce_2(x) ((x).s0) + ((x).s1)
6001#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
6002#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
6003#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
6004#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
6005
6006#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
6007#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
6008
6009#define prod_reduce_1(x) (x)
6010#define prod_reduce_2(x) ((x).s0) * ((x).s1)
6011#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
6012#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
6013#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
6014#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
6015
6016#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
6017#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
6018
6019#define max_reduce_1(x) (x)
6020#define max_reduce_2(x) max(((x).s0), ((x).s1))
6021#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
6022#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
6023#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
6024#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
6025
6026#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
6027#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
6028
6029#define VECTOR_DECLARATION(name)     \
6030    __global uchar *name##_ptr,      \
6031    uint        name##_stride_x, \
6032    uint        name##_step_x,   \
6033    uint        name##_offset_first_element_in_bytes
6034
6035#define IMAGE_DECLARATION(name)      \
6036    __global uchar *name##_ptr,      \
6037    uint        name##_stride_x, \
6038    uint        name##_step_x,   \
6039    uint        name##_stride_y, \
6040    uint        name##_step_y,   \
6041    uint        name##_offset_first_element_in_bytes
6042
6043#define TENSOR3D_DECLARATION(name)   \
6044    __global uchar *name##_ptr,      \
6045    uint        name##_stride_x, \
6046    uint        name##_step_x,   \
6047    uint        name##_stride_y, \
6048    uint        name##_step_y,   \
6049    uint        name##_stride_z, \
6050    uint        name##_step_z,   \
6051    uint        name##_offset_first_element_in_bytes
6052
6053#define TENSOR4D_DECLARATION(name)   \
6054    __global uchar *name##_ptr,      \
6055    uint        name##_stride_x, \
6056    uint        name##_step_x,   \
6057    uint        name##_stride_y, \
6058    uint        name##_step_y,   \
6059    uint        name##_stride_z, \
6060    uint        name##_step_z,   \
6061    uint        name##_stride_w, \
6062    uint        name##_step_w,   \
6063    uint        name##_offset_first_element_in_bytes
6064
6065#define TENSOR5D_DECLARATION(name)   \
6066    __global uchar *name##_ptr,      \
6067    uint        name##_stride_x, \
6068    uint        name##_step_x,   \
6069    uint        name##_stride_y, \
6070    uint        name##_step_y,   \
6071    uint        name##_stride_z, \
6072    uint        name##_step_z,   \
6073    uint        name##_stride_w, \
6074    uint        name##_step_w,   \
6075    uint        name##_stride_v, \
6076    uint        name##_step_v,   \
6077    uint        name##_offset_first_element_in_bytes
6078
6079#define CONVERT_TO_VECTOR_STRUCT(name) \
6080    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
6081
6082#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
6083    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
6084
6085#define CONVERT_TO_IMAGE_STRUCT(name) \
6086    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
6087
6088#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
6089    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
6090
6091#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
6092    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
6093
6094#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
6095    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
6096
6097#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
6098    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
6099
6100#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
6101    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
6102                                 name##_stride_z, name##_step_z)
6103
6104#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
6105    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
6106
6107#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
6108    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
6109                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
6110
6111#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
6112    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
6113
6114#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
6115    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
6116                           name##_stride_z, name##_step_z)
6117
6118
6119typedef struct Vector
6120{
6121    __global uchar *ptr;
6122    int             offset_first_element_in_bytes;
6123    int             stride_x;
6124} Vector;
6125
6126
6127typedef struct Image
6128{
6129    __global uchar *ptr;
6130    int             offset_first_element_in_bytes;
6131    int             stride_x;
6132    int             stride_y;
6133} Image;
6134
6135
6136typedef struct Tensor3D
6137{
6138    __global uchar *ptr;
6139    int             offset_first_element_in_bytes;
6140    int             stride_x;
6141    int             stride_y;
6142    int             stride_z;
6143} Tensor3D;
6144
6145
6146typedef struct Tensor4D
6147{
6148    __global uchar *ptr;
6149    int             offset_first_element_in_bytes;
6150    int             stride_x;
6151    int             stride_y;
6152    int             stride_z;
6153    int             stride_w;
6154} Tensor4D;
6155
6156
6157inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
6158{
6159    Vector vector =
6160    {
6161        .ptr                           = ptr,
6162        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6163        .stride_x                      = stride_x,
6164    };
6165    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
6166    return vector;
6167}
6168
6169
6170inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
6171{
6172    Image img =
6173    {
6174        .ptr                           = ptr,
6175        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6176        .stride_x                      = stride_x,
6177        .stride_y                      = stride_y
6178    };
6179    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
6180    return img;
6181}
6182
6183
6184inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6185{
6186    Image img =
6187    {
6188        .ptr                           = ptr,
6189        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6190        .stride_x                      = stride_x,
6191        .stride_y                      = stride_y
6192    };
6193    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6194    return img;
6195}
6196
6197
6198inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6199{
6200    Tensor3D tensor =
6201    {
6202        .ptr                           = ptr,
6203        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6204        .stride_x                      = stride_x,
6205        .stride_y                      = stride_y,
6206        .stride_z                      = stride_z
6207    };
6208    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6209    return tensor;
6210}
6211
6212
6213inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6214{
6215    Tensor3D tensor =
6216    {
6217        .ptr                           = ptr,
6218        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6219        .stride_x                      = stride_x,
6220        .stride_y                      = stride_y,
6221        .stride_z                      = stride_z
6222    };
6223    return tensor;
6224}
6225
6226inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
6227                                             uint step_w,
6228                                             uint mod_size)
6229{
6230    Tensor4D tensor =
6231    {
6232        .ptr                           = ptr,
6233        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6234        .stride_x                      = stride_x,
6235        .stride_y                      = stride_y,
6236        .stride_z                      = stride_z,
6237        .stride_w                      = stride_w
6238    };
6239
6240    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
6241    return tensor;
6242}
6243
6244
6245inline __global const uchar *vector_offset(const Vector *vec, int x)
6246{
6247    return vec->ptr + x * vec->stride_x;
6248}
6249
6250
6251inline __global uchar *offset(const Image *img, int x, int y)
6252{
6253    return img->ptr + x * img->stride_x + y * img->stride_y;
6254}
6255
6256
6257inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
6258{
6259    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
6260}
6261
6262
6263inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
6264{
6265    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
6266}
6267
6268
6269inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
6270{
6271    uint num_elements = width * height;
6272
6273    const uint z = index / num_elements;
6274
6275    index %= num_elements;
6276
6277    const uint y = index / width;
6278
6279    index %= width;
6280
6281    const uint x = index;
6282
6283    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
6284}
6285
6286#endif
6287
6288
6289
6290#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
6291#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
6292    P_X##_DEF(1, P_A, P_B, P_C);       \
6293    REPEAT_3_1(P_X, P_A, P_B, P_C)
6294#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
6295    P_X##_DEF(2, P_A, P_B, P_C);       \
6296    REPEAT_3_2(P_X, P_A, P_B, P_C)
6297#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
6298    P_X##_DEF(3, P_A, P_B, P_C);       \
6299    REPEAT_3_3(P_X, P_A, P_B, P_C)
6300#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
6301    P_X##_DEF(4, P_A, P_B, P_C);       \
6302    REPEAT_3_4(P_X, P_A, P_B, P_C)
6303#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
6304    P_X##_DEF(5, P_A, P_B, P_C);       \
6305    REPEAT_3_5(P_X, P_A, P_B, P_C)
6306#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
6307    P_X##_DEF(6, P_A, P_B, P_C);       \
6308    REPEAT_3_6(P_X, P_A, P_B, P_C)
6309#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
6310    P_X##_DEF(7, P_A, P_B, P_C);       \
6311    REPEAT_3_7(P_X, P_A, P_B, P_C)
6312#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
6313    P_X##_DEF(8, P_A, P_B, P_C);       \
6314    REPEAT_3_8(P_X, P_A, P_B, P_C)
6315#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
6316    P_X##_DEF(9, P_A, P_B, P_C);        \
6317    REPEAT_3_9(P_X, P_A, P_B, P_C)
6318#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
6319    P_X##_DEF(A, P_A, P_B, P_C);        \
6320    REPEAT_3_10(P_X, P_A, P_B, P_C)
6321#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
6322    P_X##_DEF(B, P_A, P_B, P_C);        \
6323    REPEAT_3_11(P_X, P_A, P_B, P_C)
6324#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
6325    P_X##_DEF(C, P_A, P_B, P_C);        \
6326    REPEAT_3_12(P_X, P_A, P_B, P_C)
6327#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
6328    P_X##_DEF(D, P_A, P_B, P_C);        \
6329    REPEAT_3_13(P_X, P_A, P_B, P_C)
6330#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
6331    P_X##_DEF(E, P_A, P_B, P_C);        \
6332    REPEAT_3_14(P_X, P_A, P_B, P_C)
6333#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
6334    P_X##_DEF(F, P_A, P_B, P_C);        \
6335    REPEAT_3_15(P_X, P_A, P_B, P_C)
6336
6337#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
6338#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
6339
6340
6341#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
6342#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
6343    P_X##_DEF(1, P_A, P_B, P_C, P_D);       \
6344    REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
6345#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
6346    P_X##_DEF(2, P_A, P_B, P_C, P_D);       \
6347    REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
6348#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
6349    P_X##_DEF(3, P_A, P_B, P_C, P_D);       \
6350    REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
6351#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
6352    P_X##_DEF(4, P_A, P_B, P_C, P_D);       \
6353    REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
6354#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
6355    P_X##_DEF(5, P_A, P_B, P_C, P_D);       \
6356    REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
6357#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
6358    P_X##_DEF(6, P_A, P_B, P_C, P_D);       \
6359    REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
6360#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
6361    P_X##_DEF(7, P_A, P_B, P_C, P_D);       \
6362    REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
6363#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
6364    P_X##_DEF(8, P_A, P_B, P_C, P_D);       \
6365    REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
6366#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
6367    P_X##_DEF(9, P_A, P_B, P_C, P_D);        \
6368    REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
6369#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
6370    P_X##_DEF(A, P_A, P_B, P_C, P_D);        \
6371    REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
6372#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
6373    P_X##_DEF(B, P_A, P_B, P_C, P_D);        \
6374    REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
6375#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
6376    P_X##_DEF(C, P_A, P_B, P_C, P_D);        \
6377    REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
6378#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
6379    P_X##_DEF(D, P_A, P_B, P_C, P_D);        \
6380    REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
6381#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
6382    P_X##_DEF(E, P_A, P_B, P_C, P_D);        \
6383    REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
6384#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
6385    P_X##_DEF(F, P_A, P_B, P_C, P_D);        \
6386    REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
6387
6388#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D)
6389#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
6390
6391
6392#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
6393#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
6394
6395
6396#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
6397#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
6398
6399
6400#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
6401#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
6402
6403
6404#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
6405#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
6406
6407
6408#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
6409#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
6410
6411
6412#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
6413#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
6414
6415
6416#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
6417#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
6418
6419
6420#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
6421#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
6422
6423
6424#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
6425#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
6426
6427
6428#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
6429#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
6430
6431
6432#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
6433#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
6434
6435
6436#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                     \
6437    ({                                                                                                        \
6438        VEC_DATA_TYPE(int, N0)                                                                                \
6439        VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
6440        VEC_DATA_TYPE(int, N0)                                                                                \
6441        VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
6442        VAR##ID           = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0);                     \
6443    })
6444#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
6445
6446#endif
6447
6448#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6449#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6450
6451
6452
6453
6454#define TILE_VECTOR_SIZE1 1
6455#define TILE_VECTOR_SIZE2 2
6456#define TILE_VECTOR_SIZE3 3
6457#define TILE_VECTOR_SIZE4 4
6458#define TILE_VECTOR_SIZE5 8
6459#define TILE_VECTOR_SIZE6 8
6460#define TILE_VECTOR_SIZE7 8
6461#define TILE_VECTOR_SIZE8 8
6462#define TILE_VECTOR_SIZE9 16
6463#define TILE_VECTOR_SIZE10 16
6464#define TILE_VECTOR_SIZE11 16
6465#define TILE_VECTOR_SIZE12 16
6466#define TILE_VECTOR_SIZE13 16
6467#define TILE_VECTOR_SIZE14 16
6468#define TILE_VECTOR_SIZE15 16
6469#define TILE_VECTOR_SIZE16 16
6470
6471#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
6472#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
6473#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
6474#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
6475#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
6476#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
6477#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
6478#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
6479#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
6480#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
6481#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
6482#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
6483#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
6484#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
6485#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
6486#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
6487
6488
6489#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
6490#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
6491    union {                                 \
6492        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];                  \
6493        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                     \
6494    } BASENAME[H]
6495
6496#define TENSOR4D_IMAGE(name)          \
6497    __read_only image2d_t name##_img, \
6498    __global uchar *name##_ptr,       \
6499    uint            name##_stride_x,  \
6500    uint            name##_step_x,    \
6501    uint            name##_stride_y,  \
6502    uint            name##_step_y,    \
6503    uint            name##_stride_z,  \
6504    uint            name##_step_z,    \
6505    uint            name##_stride_w,  \
6506    uint            name##_step_w,    \
6507    uint            name##_offset_first_element_in_bytes
6508
6509#define TENSOR4D_BUFFER(name)    \
6510    __global uchar *name##_ptr,  \
6511    uint        name##_stride_x, \
6512    uint        name##_step_x,   \
6513    uint        name##_stride_y, \
6514    uint        name##_step_y,   \
6515    uint        name##_stride_z, \
6516    uint        name##_step_z,   \
6517    uint        name##_stride_w, \
6518    uint        name##_step_w,   \
6519    uint        name##_offset_first_element_in_bytes
6520
6521#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
6522#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
6523
6524#define TENSOR4D_T_IMAGE(name)          \
6525    __read_only image2d_t name##_img, \
6526    __global uchar *name##_ptr,       \
6527    uint        name##_stride_y, \
6528    uint        name##_stride_z, \
6529    uint        name##_stride_w, \
6530    uint        name##_c,   \
6531    uint        name##_w,   \
6532    uint        name##_h,   \
6533    uint        name##_n,   \
6534    uint        name##_offset_first_element_in_bytes
6535
6536#define TENSOR4D_T_BUFFER(name)    \
6537    __global uchar *name##_ptr,  \
6538    uint        name##_stride_y, \
6539    uint        name##_stride_z, \
6540    uint        name##_stride_w, \
6541    uint        name##_c,   \
6542    uint        name##_w,   \
6543    uint        name##_h,   \
6544    uint        name##_n,   \
6545    uint        name##_offset_first_element_in_bytes
6546
6547#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
6548
6549
6550#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
6551
6552#define TENSOR4D_RO_T_IMAGE(name)          \
6553    __read_only image2d_t name##_img, \
6554    TENSOR4D_T_BUFFER(name)
6555
6556#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
6557
6558#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
6559
6560
6561#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
6562
6563#define TENSOR4D_WO_T_IMAGE(name)          \
6564    __write_only image2d_t name##_img, \
6565    TENSOR4D_T_BUFFER(name)
6566
6567#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
6568
6569#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
6570
6571
6572#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
6573
6574#define TENSOR3D_T_IMAGE(name)          \
6575    __read_only image2d_t name##_img, \
6576    __global uchar *name##_ptr,       \
6577    uint        name##_stride_y, \
6578    uint        name##_stride_z, \
6579    uint        name##_w,   \
6580    uint        name##_h,   \
6581    uint        name##_n,   \
6582    uint        name##_offset_first_element_in_bytes
6583
6584#define TENSOR3D_T_BUFFER(name)    \
6585    __global uchar *name##_ptr,  \
6586    uint        name##_stride_y, \
6587    uint        name##_stride_z, \
6588    uint        name##_w,   \
6589    uint        name##_h,   \
6590    uint        name##_n,   \
6591    uint        name##_offset_first_element_in_bytes
6592
6593#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
6594#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
6595
6596#if !defined(UNROLL_WITH_PRAGMA)
6597#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
6598
6599#define LOOP_UNROLLING_1(idx, step, macro) (macro)
6600#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
6601#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
6602#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
6603#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
6604#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
6605#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
6606#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
6607#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
6608#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
6609#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
6610#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
6611#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
6612#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
6613#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
6614#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
6615#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
6616#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
6617#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
6618#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
6619#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
6620#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
6621#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
6622#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
6623#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
6624#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
6625#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
6626#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
6627#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
6628#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
6629#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
6630#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
6631#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
6632#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
6633#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
6634#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
6635#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
6636#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
6637#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
6638#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
6639#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
6640#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
6641#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
6642#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
6643#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
6644#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
6645#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
6646#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
6647#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
6648#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
6649#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
6650#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
6651#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
6652#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
6653#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
6654#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
6655#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
6656#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
6657#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
6658#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
6659#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
6660#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
6661#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
6662#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
6663#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
6664#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
6665#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
6666#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
6667#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
6668#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
6669#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
6670#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
6671#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
6672#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
6673#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
6674#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
6675#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
6676#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
6677#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
6678#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
6679#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
6680#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
6681#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
6682#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
6683#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
6684#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
6685#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
6686#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
6687#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
6688#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
6689#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
6690#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
6691#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
6692#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
6693#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
6694#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
6695#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
6696#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
6697#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
6698#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
6699#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
6700#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
6701#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
6702#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
6703#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
6704#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
6705#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
6706#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
6707#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
6708#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
6709#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
6710#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
6711#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
6712#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
6713#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
6714#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
6715#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
6716#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
6717#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
6718#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
6719#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
6720#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
6721#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
6722#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
6723#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
6724#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
6725#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
6726#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
6727
6728#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
6729    {                                                          \
6730        type idx = start;                                      \
6731        LOOP_UNROLLING_##num(idx, step, macro);                \
6732    }
6733#else
6734#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
6735    {                                                          \
6736        _Pragma("unroll")                                      \
6737        for(type idx = start; idx < (num * step); idx += step) \
6738        {                                                      \
6739            (macro);                                           \
6740        }                                                      \
6741    }
6742#endif
6743#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
6744
6745
6746#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
6747
6748
6749#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
6750#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
6751#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6752    ({                                                \
6753        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
6754    })
6755#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
6756#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
6757#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
6758#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
6759#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
6760#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
6761#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
6762#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
6763#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
6764#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
6765#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
6766#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
6767#else
6768#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
6769    ({                                                  \
6770        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
6771        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
6772    })
6773#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
6774    ({                                                  \
6775        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
6776        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
6777    })
6778#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
6779    ({                                                    \
6780        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
6781        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
6782        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
6783        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
6784    })
6785#endif
6786#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6787    ({                                                \
6788        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6789        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
6790    })
6791#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6792    ({                                                \
6793        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6794        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
6795    })
6796#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6797    ({                                                \
6798        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6799        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
6800    })
6801#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6802    ({                                                \
6803        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
6804        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
6805    })
6806#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6807    ({                                                \
6808        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6809        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
6810    })
6811#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6812    ({                                                \
6813        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6814        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
6815    })
6816#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6817    ({                                                \
6818        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6819        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
6820    })
6821#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6822    ({                                                \
6823        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6824        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
6825    })
6826#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6827    ({                                                \
6828        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6829        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);     \
6830    })
6831#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6832    ({                                                \
6833        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6834        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
6835    })
6836#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6837    ({                                                \
6838        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6839        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);     \
6840    })
6841#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6842    ({                                                 \
6843        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
6844        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
6845    })
6846
6847
6848#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
6849#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
6850
6851
6852#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
6853#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
6854#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
6855    VLOAD(WIDTH)                                                \
6856    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
6857#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
6858
6859
6860#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
6861#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
6862#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
6863    VSTORE(WIDTH)                                                \
6864    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
6865#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
6866
6867
6868#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
6869    ({                                                                                                                 \
6870        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
6871        {                                                                                                              \
6872            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
6873        })                                                                                                             \
6874    })
6875
6876
6877#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
6878    ({                                                                                                  \
6879        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
6880        {                                                                                               \
6881            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
6882        })                                                                                              \
6883    })
6884
6885
6886#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
6887    ({                                                                                                                                                                                             \
6888        if(WIDTH1_CONDITION)                                                                                                                                                                       \
6889        {                                                                                                                                                                                          \
6890            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
6891            {                                                                                                                                                                                      \
6892                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
6893                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
6894            })                                                                                                                                                                                     \
6895        }                                                                                                                                                                                          \
6896        else                                                                                                                                                                                       \
6897        {                                                                                                                                                                                          \
6898            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
6899            {                                                                                                                                                                                      \
6900                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
6901            })                                                                                                                                                                                     \
6902        }                                                                                                                                                                                          \
6903    })
6904
6905#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
6906    ({                                                                                                                                                \
6907        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
6908        {                                                                                                                                             \
6909            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
6910            {                                                                                                                                         \
6911                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
6912                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
6913                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
6914                if(_src_valid_y != 0)                                                                                                                 \
6915                {                                                                                                                                     \
6916                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
6917                }                                                                                                                                     \
6918            })                                                                                                                                        \
6919        })                                                                                                                                            \
6920    })
6921
6922
6923#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)         \
6924    ({ \
6925        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
6926        { \
6927            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
6928            { \
6929                int _src_y = (X) + _xk * (DILATION_X); \
6930                int _src_z = ((Y) + _yk * (DILATION_Y)); \
6931                int _src_w    = (B); \
6932                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
6933                if(!(BOUNDARY_CHECK)) \
6934                { \
6935                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
6936                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
6937                } \
6938                else \
6939                { \
6940                    if(_src_valid_y) \
6941                    { \
6942                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
6943                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
6944                    }                                                                                                                                                                                                 \
6945                } \
6946            })                                                                                                                                                                                                             \
6947        })                                                                                                                                                                                                             \
6948    })
6949
6950
6951#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
6952    ({                                                                                                                                                                \
6953        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
6954        {                                                                                                                                                             \
6955            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
6956            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
6957            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
6958            if(_src_valid_y != 0)                                                                                                                                     \
6959            {                                                                                                                                                         \
6960                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
6961            }                                                                                                                                                         \
6962        })                                                                                                                                                            \
6963    })
6964
6965
6966#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
6967#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
6968#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
6969    ({ \
6970        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
6971        { \
6972            if(yi[0].s[_i] >= 0) \
6973            { \
6974                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
6975            } \
6976        }) \
6977    })
6978
6979#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
6980    ({ \
6981        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
6982        { \
6983            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
6984        }) \
6985    })
6986
6987
6988#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
6989    ({                                                                                                                                                                \
6990        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
6991        {                                                                                                                                                             \
6992            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                      \
6993            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
6994            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
6995                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
6996            if(_src_valid_y != 0)                                                                                                                                     \
6997            {                                                                                                                                                         \
6998                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
6999            }                                                                                                                                                         \
7000        })                                                                                                                                                            \
7001    })
7002
7003
7004#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
7005    ({                                                                                                                                                                                             \
7006        if(WIDTH1_CONDITION)                                                                                                                                                                       \
7007        {                                                                                                                                                                                          \
7008            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
7009            {                                                                                                                                                                                      \
7010                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
7011                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
7012            })                                                                                                                                                                                     \
7013        }                                                                                                                                                                                          \
7014        else                                                                                                                                                                                       \
7015        {                                                                                                                                                                                          \
7016            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
7017            {                                                                                                                                                                                      \
7018                VSTORE(WIDTH0)                                                                                                                                                                     \
7019                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
7020            })                                                                                                                                                                                     \
7021        }                                                                                                                                                                                          \
7022    })
7023
7024
7025#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
7026    ({                                                                                               \
7027        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
7028        {                                                                                            \
7029            ACC_DATA_TYPE _tm = 0;                                                                   \
7030            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
7031            {                                                                                        \
7032                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
7033            })                                                                                       \
7034            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
7035            {                                                                                        \
7036                dst[_m0].s[_n0] += _tm;                                                              \
7037                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
7038                {                                                                                    \
7039                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
7040                })                                                                                   \
7041            })                                                                                       \
7042        })                                                                                          \
7043    })
7044
7045
7046#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
7047#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
7048
7049
7050#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
7051    ({ \
7052        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7053        { \
7054            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7055            { \
7056                SRC_DATA_TYPE _tmp = 0; \
7057                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7058                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
7059                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
7060                long a_64 = (long)(_src); \
7061                long b_64 = (long)(DST_MULTIPLIER); \
7062                long ab_64 = a_64 * b_64; \
7063                long mask1 = 1 << 30; \
7064                long mask2 = 1 - (1 << 30); \
7065                long is_positive_or_zero = ab_64 >= 0; \
7066                long nudge = select(mask2, mask1, is_positive_or_zero); \
7067                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7068                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7069                if(DST_SHIFT >= 0) \
7070                { \
7071                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
7072                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
7073                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
7074                } \
7075                _tmp += DST_OFFSET; \
7076                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7077            })                                                                                                                                          \
7078        })                                                                                                                                          \
7079    })
7080
7081
7082#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
7083    ({ \
7084        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7085        { \
7086            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7087            { \
7088                SRC_DATA_TYPE _tmp = 0; \
7089                SRC_DATA_TYPE _tmp2 = 0; \
7090                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7091                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
7092                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
7093                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
7094                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
7095                long a_64 = (long)(_src); \
7096                long b_64 = (long)(_dst_multiplier); \
7097                long ab_64 = a_64 * b_64; \
7098                long mask1 = 1 << 30; \
7099                long mask2 = 1 - (1 << 30); \
7100                long is_positive_or_zero = ab_64 >= 0; \
7101                long nudge = select(mask2, mask1, is_positive_or_zero); \
7102                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7103                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7104                long mask = ((((int)1) << _dst_shift) - (int)1); \
7105                long threshold = (mask >> 1) + any(_tmp); \
7106                _tmp2 = _tmp >> _dst_shift; \
7107                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
7108                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
7109                _tmp += DST_OFFSET; \
7110                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7111            })                                                                                                                                          \
7112        })                                                                                                                                         \
7113    })
7114
7115
7116#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
7117    ({ \
7118        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7119        { \
7120            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7121            { \
7122                SRC_DATA_TYPE _tmp = 0; \
7123                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7124                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
7125                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
7126                long a_64 = (long)(_src); \
7127                long b_64 = (long)(DST_MULTIPLIER); \
7128                long ab_64 = a_64 * b_64; \
7129                long mask1 = 1 << 30; \
7130                long mask2 = 1 - (1 << 30); \
7131                long is_positive_or_zero = ab_64 >= 0; \
7132                long nudge = select(mask2, mask1, is_positive_or_zero); \
7133                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7134                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7135                if(DST_SHIFT >= 0) \
7136                { \
7137                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
7138                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
7139                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
7140                } \
7141                _tmp += DST_OFFSET; \
7142                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7143            })                                                                                                                                          \
7144        })                                                                                                                                          \
7145    })
7146
7147
7148#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
7149    ({                                                                                                                                                     \
7150        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
7151        {                                                                                                                                                  \
7152            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
7153            {                                                                                                                                              \
7154                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
7155            })                                                                                                                                             \
7156        })                                                                                                                                                 \
7157    })
7158
7159
7160#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
7161    ({                                                                                         \
7162        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
7163        {                                                                                      \
7164            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
7165        })                                                                                     \
7166    })
7167
7168
7169#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
7170
7171#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
7172
7173#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
7174
7175#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
7176
7177#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)
7178
7179#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7180#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7181
7182#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
7183#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
7184#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
7185#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
7186
7187
7188#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
7189    ({ \
7190        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7191        { \
7192            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
7193        })                                                                                          \
7194    })
7195
7196
7197#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
7198    ({                                                            \
7199        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7200        {                                                         \
7201            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
7202        })                                                        \
7203    })
7204
7205
7206#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7207    ({                                                            \
7208        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7209        {                                                         \
7210            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
7211        })                                                        \
7212    })
7213
7214#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7215#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7216#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7217
7218#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7219#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7220
7221#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7222
7223#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7224#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7225
7226
7227#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7228    ({                                                            \
7229        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7230        {                                                         \
7231            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
7232        })                                                        \
7233    })
7234
7235
7236#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7237    ({                                                      \
7238        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7239        {                                                   \
7240            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7241        })                                                  \
7242    })
7243
7244
7245#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7246    ({                                                      \
7247        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7248        {                                                   \
7249            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7250        })                                                  \
7251    })
7252
7253#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7254#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7255#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7256#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7257
7258
7259#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7260    ({                                                      \
7261        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7262        {                                                   \
7263            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7264        })                                                  \
7265    })
7266
7267
7268#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
7269    ({                                                      \
7270        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7271        {                                                   \
7272            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7273        })                                                  \
7274    })
7275
7276
7277#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7278#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7279#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7280#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7281#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7282#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7283#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7284#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7285#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
7286    {                                                                                     \
7287        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
7288        {                                                                                 \
7289            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
7290            {                                                                             \
7291                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
7292                {                                                                         \
7293                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
7294                })                                                                        \
7295            })                                                                            \
7296        })                                                                                \
7297    }
7298
7299#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
7300    ({ \
7301        LOOP_UNROLLING(int, _m, 0, 1, M0, \
7302        { \
7303            LOOP_UNROLLING(int, _n, 0, 1, N0, \
7304            { \
7305                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
7306            })                                                                                             \
7307        })                                                                                             \
7308    })
7309
7310#endif
7311
7312#if defined(RESHAPE_LHS_NT)
7313
7314__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
7315                                         TENSOR3D_T(dst, BUFFER),
7316                                         const int M,
7317                                         const int V0)
7318{
7319
7320#define BLOCK_SIZE ((M0) * (K0))
7321
7322
7323#if defined(INTERLEAVE)
7324#define OUTPUT_OFFSET_X (K0)
7325#else
7326#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7327#endif
7328
7329
7330#if defined(INTERLEAVE)
7331#define OUTPUT_STEP_X (K0) * (V0)
7332#else
7333#define OUTPUT_STEP_X (K0)
7334#endif
7335
7336    const int x = GET_SPATIAL_IDX(0, 1, 0);
7337    const int y = GET_SPATIAL_IDX(1, 1, 0);
7338    const int z = GET_SPATIAL_IDX(2, 1, 0);
7339
7340    const int xi = x * K0;
7341    const int yi = y * M0;
7342
7343    const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
7344    const int yo = (y / V0);
7345
7346
7347    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
7348    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7349
7350    TILE(DATA_TYPE, M0, K0, in);
7351
7352
7353    LOOP_UNROLLING(int, _i, 0, 1, M0,
7354    {
7355        in[_i].v = 0;
7356    });
7357
7358    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
7359    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
7360
7361    TILE(uint, M0, 1, in_indirect_y);
7362    LOOP_UNROLLING(int, _i, 0, 1, M0,
7363    {
7364        in_indirect_y[_i].v = _i;
7365
7366    });
7367#if PARTIAL_M0 != 0
7368    if(y_cond)
7369    {
7370        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7371    }
7372    else
7373#endif
7374    {
7375        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7376    }
7377
7378
7379    TILE(uint, M0, 1, dst_indirect_y);
7380    LOOP_UNROLLING(int, _i, 0, 1, M0,
7381    {
7382        dst_indirect_y[_i].v = _i;
7383    });
7384
7385    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
7386#undef BLOCK_SIZE
7387#undef OUTPUT_OFFSET_X
7388#undef OUTPUT_STEP_X
7389}
7390#endif
7391
7392#if defined(RESHAPE_LHS_T)
7393
7394__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
7395                                        TENSOR3D_T(dst, BUFFER),
7396                                        const int M,
7397                                        const int V0)
7398{
7399
7400#define BLOCK_SIZE ((M0) * (K0))
7401
7402
7403#if defined(INTERLEAVE)
7404#define OUTPUT_OFFSET_X (M0)
7405#else
7406#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7407#endif
7408
7409
7410#if defined(INTERLEAVE)
7411#define OUTPUT_STEP_X (M0) * (V0)
7412#else
7413#define OUTPUT_STEP_X (M0)
7414#endif
7415
7416    const int x = GET_SPATIAL_IDX(0, 1, 0);
7417    const int y = GET_SPATIAL_IDX(1, 1, 0);
7418    const int z = GET_SPATIAL_IDX(2, 1, 0);
7419
7420    const int xi = x * K0;
7421    const int yi = y * M0;
7422
7423    const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
7424    const int yo = (y / V0);
7425
7426
7427    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
7428    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7429
7430    TILE(DATA_TYPE, M0, K0, in);
7431    TILE(DATA_TYPE, K0, M0, in_tr);
7432
7433
7434    LOOP_UNROLLING(int, _i, 0, 1, M0,
7435    {
7436        in[_i].v = 0;
7437    });
7438
7439
7440    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
7441    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
7442
7443    TILE(uint, M0, 1, in_indirect_y);
7444    LOOP_UNROLLING(int, _i, 0, 1, M0,
7445    {
7446        in_indirect_y[_i].v = _i;
7447
7448    });
7449#if PARTIAL_M0 != 0
7450    if(y_cond)
7451    {
7452        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7453    }
7454    else
7455#endif
7456    {
7457        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7458    }
7459
7460    LOOP_UNROLLING(int, m0, 0, 1, M0,
7461    {
7462        LOOP_UNROLLING(int, k0, 0, 1, K0,
7463        {
7464            in_tr[k0].s[m0] = in[m0].s[k0];
7465        })
7466    });
7467
7468    TILE(uint, K0, 1, dst_indirect_y);
7469    LOOP_UNROLLING(int, _i, 0, 1, K0,
7470    {
7471        dst_indirect_y[_i].v = _i;
7472    });
7473
7474
7475    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
7476
7477#undef BLOCK_SIZE
7478#undef OUTPUT_OFFSET_X
7479#undef OUTPUT_STEP_X
7480}
7481#endif
7482
7483#if defined(RESHAPE_RHS_NT)
7484
7485__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
7486                                         TENSOR3D_T(dst, BUFFER),
7487                                         const int H0)
7488{
7489
7490#define BLOCK_SIZE ((K0) * (N0))
7491
7492
7493#if defined(INTERLEAVE)
7494#define OUTPUT_OFFSET_X (N0)
7495#else
7496#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7497#endif
7498
7499
7500#if defined(INTERLEAVE)
7501#define OUTPUT_STEP_X (N0) * (H0)
7502#else
7503#define OUTPUT_STEP_X (N0)
7504#endif
7505
7506    const int x = GET_SPATIAL_IDX(0, 1, 0);
7507    const int y = GET_SPATIAL_IDX(1, 1, 0);
7508    const int z = GET_SPATIAL_IDX(2, 1, 0);
7509
7510    const int xi = x * N0;
7511    const int yi = y * K0;
7512
7513    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
7514    const int yo = (x / H0);
7515
7516    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
7517    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7518
7519    TILE(DATA_TYPE, K0, N0, in);
7520
7521
7522    for(int i = 0; i < K0; ++i)
7523    {
7524        in[i].v = 0;
7525    }
7526
7527
7528    for(int i = 0; i < K0; ++i)
7529    {
7530        if(yi + i < src_h)
7531        {
7532            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
7533        }
7534    }
7535
7536    TILE(uint, K0, 1, dst_indirect_y);
7537    for(int i = 0; i < K0; ++i)
7538    {
7539        dst_indirect_y[i].v = i;
7540    }
7541
7542    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
7543
7544#undef BLOCK_SIZE
7545#undef OUTPUT_OFFSET_X
7546#undef OUTPUT_STEP_X
7547}
7548#endif
7549
7550#if defined(RESHAPE_RHS_T)
7551
7552__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
7553                                        TENSOR3D_T(dst, BUFFER),
7554                                        const int H0)
7555{
7556
7557#define BLOCK_SIZE ((K0) * (N0))
7558
7559
7560#if defined(INTERLEAVE)
7561#define OUTPUT_OFFSET_X (K0)
7562#else
7563#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7564#endif
7565
7566
7567#if defined(INTERLEAVE)
7568#define OUTPUT_STEP_X (K0) * (H0)
7569#else
7570#define OUTPUT_STEP_X (K0)
7571#endif
7572
7573    const int x = GET_SPATIAL_IDX(0, 1, 0);
7574    const int y = GET_SPATIAL_IDX(1, 1, 0);
7575    const int z = GET_SPATIAL_IDX(2, 1, 0);
7576
7577    const int xi = x * N0;
7578    const int yi = y * K0;
7579
7580    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
7581    const int yo = (x / H0);
7582
7583    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
7584    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7585
7586    TILE(DATA_TYPE, K0, N0, in);
7587    TILE(DATA_TYPE, N0, K0, in_tr);
7588
7589
7590    for(int i = 0; i < K0; ++i)
7591    {
7592        in[i].v = 0;
7593    }
7594
7595
7596    for(int i = 0; i < K0; ++i)
7597    {
7598        if(yi + i < src_h)
7599        {
7600            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
7601        }
7602    }
7603
7604
7605    for(int k0 = 0; k0 < K0; ++k0)
7606    {
7607        for(int n0 = 0; n0 < N0; ++n0)
7608        {
7609            in_tr[n0].s[k0] = in[k0].s[n0];
7610        }
7611    }
7612
7613    TILE(uint, N0, 1, dst_indirect_y);
7614    for(int i = 0; i < N0; ++i)
7615    {
7616        dst_indirect_y[i].v = i;
7617    }
7618
7619    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
7620
7621#undef BLOCK_SIZE
7622#undef OUTPUT_OFFSET_X
7623#undef OUTPUT_STEP_X
7624}
7625
7626#endif  )"