xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/nchw/batchnormalization_layer.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2
3#ifndef ARM_COMPUTE_HELPER_H
4#define ARM_COMPUTE_HELPER_H
5
6
7
8
9#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
10    VSTORE(N0)                                                 \
11    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
12
13#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
14    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
15    VSTORE(N0)                                                 \
16    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
17
18#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
19    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
20    VSTORE(N0)                                                 \
21    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
22
23#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
24    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
25    VSTORE(N0)                                                 \
26    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
27
28#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
29    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
30    VSTORE(N0)                                                 \
31    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
32
33#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
34    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
35    VSTORE(N0)                                                 \
36    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
37
38#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
39    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
40    VSTORE(N0)                                                 \
41    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
42
43#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
44    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
45    VSTORE(N0)                                                 \
46    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
47
48#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
49    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
50    VSTORE(N0)                                                 \
51    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
52
53#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
54    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
55    VSTORE(N0)                                                  \
56    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
57
58#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
59    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
60    VSTORE(N0)                                                  \
61    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
62
63#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
64    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
65    VSTORE(N0)                                                  \
66    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
67
68#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
69    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
70    VSTORE(N0)                                                  \
71    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
72
73#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
74    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
75    VSTORE(N0)                                                  \
76    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
77
78#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
79    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
80    VSTORE(N0)                                                  \
81    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
82
83#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
84    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
85    VSTORE(N0)                                                  \
86    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
87
88
89
90#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
91    VSTORE(N0)                                                         \
92    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
93
94#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
95    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
96    VSTORE(N0)                                                         \
97    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
98
99#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
100    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
101    VSTORE(N0)                                                         \
102    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
103
104#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
105    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
106    VSTORE(N0)                                                         \
107    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
108
109#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
110    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
111    VSTORE(N0)                                                         \
112    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
113
114#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
115    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
116    VSTORE(N0)                                                         \
117    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
118
119#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
120    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
121    VSTORE(N0)                                                         \
122    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
123
124#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
125    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
126    VSTORE(N0)                                                         \
127    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
128
129#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
130    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
131    VSTORE(N0)                                                         \
132    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
133
134#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
135    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
136    VSTORE(N0)                                                     \
137    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
138
139#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
140    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
141    VSTORE(N0)                                                          \
142    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
143
144#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
145    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
146    VSTORE(N0)                                                          \
147    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
148
149#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
150    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
151    VSTORE(N0)                                                          \
152    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
153
154#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
155    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
156    VSTORE(N0)                                                          \
157    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
158
159#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
160    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
161    VSTORE(N0)                                                          \
162    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
163
164#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
165    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
166    VSTORE(N0)                                                          \
167    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168
169
170
171
172#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
173#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
174
175
176
177#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
178#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179
180
181
182#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
183    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
184    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
185
186#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
187    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
188    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
189    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
190
191#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
192    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
193    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
194    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
195
196#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
197    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
198    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
199    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
200
201#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
202    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
203    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
204    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
205
206#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
207    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
208    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
209    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
210
211#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
212    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
213    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
214    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
215
216#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
217    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
218    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
219    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
220
221#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
222    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
223    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
224    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
225
226#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
227    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
228    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
229    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
230
231#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
232    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
233    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
234    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
235
236#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
237    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
238    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
239    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
240
241#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
242    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
243    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
244    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
245
246#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
247    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
248    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
249    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
250
251#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
252    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
253    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
254    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
255
256#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
257    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
258    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
259    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
260
261
262
263#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
264#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
265
266#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
267    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
268    {                                                                                                                                                     \
269        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
270    }                                                                                                                                                     \
271    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
272    {                                                                                                                                                     \
273        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
274    }                                                                                                                                                     \
275    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
276    {                                                                                                                                                     \
277        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
278    }                                                                                                                                                     \
279    else                                                                                                                                                  \
280    {                                                                                                                                                     \
281        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
282    }
283
284#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
285    if(!(PARTIAL_COND_X))                                                                                         \
286    {                                                                                                             \
287        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
288    }                                                                                                             \
289    else                                                                                                          \
290    {                                                                                                             \
291        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
292    }
293
294#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
295    if(!(PARTIAL_COND_Y))                                                                                         \
296    {                                                                                                             \
297        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
298    }                                                                                                             \
299    else                                                                                                          \
300    {                                                                                                             \
301        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
302    }
303
304
305#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
306
307
308#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
309
310#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
311    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
312
313#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
314
315#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
316    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
317
318#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
319
320#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
321    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
322
323#else
324
325#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
326    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
327
328#endif
329
330#endif
331
332
333#if defined(PARTIAL_STORE_M0)
334
335#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
336    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
337#else
338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
339    ((uint)(y * M0))
340#endif
341
342
343
344#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
345    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
346
347
348#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
349#pragma OPENCL EXTENSION cl_khr_fp16 : enable
350#endif
351
352#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
353#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
354#endif
355
356#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
357#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
358#endif
359
360#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
361#pragma OPENCL EXTENSION cl_arm_printf : enable
362#endif
363
364#define GPU_ARCH_MIDGARD 0x100
365#define GPU_ARCH_BIFROST 0x200
366#define GPU_ARCH_VALHALL 0x300
367
368
369#define CONCAT(a, b) a##b
370
371
372#define EXPAND(x) x
373
374
375#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
376
377
378#define REV1(x) ((x))
379#define REV2(x) ((x).s10)
380#define REV3(x) ((x).s210)
381#define REV4(x) ((x).s3210)
382#define REV8(x) ((x).s76543210)
383#define REV16(x) ((x).sFEDCBA9876543210)
384
385
386
387#define REVERSE_STR(x, s) REV##s((x))
388#define REVERSE(x, s) REVERSE_STR(x, s)
389
390
391
392#define ROT1_0(x) ((x))
393#define ROT1_1(x) ((x))
394
395#define ROT2_0(x) ((x))
396#define ROT2_1(x) ((x).s10)
397#define ROT2_2(x) ((x))
398
399#define ROT3_0(x) ((x))
400#define ROT3_1(x) ((x).s201)
401#define ROT3_2(x) ((x).s120)
402#define ROT3_3(x) ((x))
403
404#define ROT4_0(x) ((x))
405#define ROT4_1(x) ((x).s3012)
406#define ROT4_2(x) ((x).s2301)
407#define ROT4_3(x) ((x).s1230)
408#define ROT4_4(x) ((x))
409
410#define ROT8_0(x) ((x))
411#define ROT8_1(x) ((x).s70123456)
412#define ROT8_2(x) ((x).s67012345)
413#define ROT8_3(x) ((x).s56701234)
414#define ROT8_4(x) ((x).s45670123)
415#define ROT8_5(x) ((x).s34567012)
416#define ROT8_6(x) ((x).s23456701)
417#define ROT8_7(x) ((x).s12345670)
418#define ROT8_8(x) ((x))
419
420#define ROT16_0(x) ((x))
421#define ROT16_1(x) ((x).sF0123456789ABCDE)
422#define ROT16_2(x) ((x).sEF0123456789ABCD)
423#define ROT16_3(x) ((x).sDEF0123456789ABC)
424#define ROT16_4(x) ((x).sCDEF0123456789AB)
425#define ROT16_5(x) ((x).sBCDEF0123456789A)
426#define ROT16_6(x) ((x).sABCDEF0123456789)
427#define ROT16_7(x) ((x).s9ABCDEF012345678)
428#define ROT16_8(x) ((x).s89ABCDEF01234567)
429#define ROT16_9(x) ((x).s789ABCDEF0123456)
430#define ROT16_10(x) ((x).s6789ABCDEF012345)
431#define ROT16_11(x) ((x).s56789ABCDEF01234)
432#define ROT16_12(x) ((x).s456789ABCDEF0123)
433#define ROT16_13(x) ((x).s3456789ABCDEF012)
434#define ROT16_14(x) ((x).s23456789ABCDEF01)
435#define ROT16_15(x) ((x).s123456789ABCDEF0)
436#define ROT16_16(x) ((x))
437
438
439
440#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
441#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
442
443
444
445#define V_OFFS1(dt) (dt##1)(0)
446#define V_OFFS2(dt) (dt##2)(0, 1)
447#define V_OFFS3(dt) (dt##3)(0, 1, 2)
448#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
449#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
450#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
451
452
453
454#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
455#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
456
457
458#define VLOAD_STR(size) vload##size
459#define VLOAD(size) VLOAD_STR(size)
460
461
462#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
463#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
464
465#define NO_LOAD(data, offs, ptr) \
466    {                            \
467    }
468
469
470#define vload_partial_1_0 NO_LOAD
471#define vload_partial_1_1 vload1
472#define vload_partial_1_2 NO_LOAD
473#define vload_partial_1_3 NO_LOAD
474#define vload_partial_1_4 NO_LOAD
475#define vload_partial_1_5 NO_LOAD
476#define vload_partial_1_6 NO_LOAD
477#define vload_partial_1_7 NO_LOAD
478#define vload_partial_1_8 NO_LOAD
479#define vload_partial_1_9 NO_LOAD
480#define vload_partial_1_10 NO_LOAD
481#define vload_partial_1_11 NO_LOAD
482#define vload_partial_1_12 NO_LOAD
483#define vload_partial_1_13 NO_LOAD
484#define vload_partial_1_14 NO_LOAD
485#define vload_partial_1_15 NO_LOAD
486#define vload_partial_1_16 NO_LOAD
487
488#define vload_partial_2_0 NO_LOAD
489#define vload_partial_2_1 vload_partial_1
490#define vload_partial_2_2 vload_partial_2
491#define vload_partial_2_3 NO_LOAD
492#define vload_partial_2_4 NO_LOAD
493#define vload_partial_2_5 NO_LOAD
494#define vload_partial_2_6 NO_LOAD
495#define vload_partial_2_7 NO_LOAD
496#define vload_partial_2_8 NO_LOAD
497#define vload_partial_2_9 NO_LOAD
498#define vload_partial_2_10 NO_LOAD
499#define vload_partial_2_11 NO_LOAD
500#define vload_partial_2_12 NO_LOAD
501#define vload_partial_2_13 NO_LOAD
502#define vload_partial_2_14 NO_LOAD
503#define vload_partial_2_15 NO_LOAD
504#define vload_partial_2_16 NO_LOAD
505
506#define vload_partial_3_0 NO_LOAD
507#define vload_partial_3_1 vload_partial_1
508#define vload_partial_3_2 vload_partial_2
509#define vload_partial_3_3 vload_partial_3
510#define vload_partial_3_4 NO_LOAD
511#define vload_partial_3_5 NO_LOAD
512#define vload_partial_3_6 NO_LOAD
513#define vload_partial_3_7 NO_LOAD
514#define vload_partial_3_8 NO_LOAD
515#define vload_partial_3_9 NO_LOAD
516#define vload_partial_3_10 NO_LOAD
517#define vload_partial_3_11 NO_LOAD
518#define vload_partial_3_12 NO_LOAD
519#define vload_partial_3_13 NO_LOAD
520#define vload_partial_3_14 NO_LOAD
521#define vload_partial_3_15 NO_LOAD
522#define vload_partial_3_16 NO_LOAD
523
524#define vload_partial_4_0 NO_LOAD
525#define vload_partial_4_1 vload_partial_1
526#define vload_partial_4_2 vload_partial_2
527#define vload_partial_4_3 vload_partial_3
528#define vload_partial_4_4 vload_partial_4
529#define vload_partial_4_5 NO_LOAD
530#define vload_partial_4_6 NO_LOAD
531#define vload_partial_4_7 NO_LOAD
532#define vload_partial_4_8 NO_LOAD
533#define vload_partial_4_9 NO_LOAD
534#define vload_partial_4_10 NO_LOAD
535#define vload_partial_4_11 NO_LOAD
536#define vload_partial_4_12 NO_LOAD
537#define vload_partial_4_13 NO_LOAD
538#define vload_partial_4_14 NO_LOAD
539#define vload_partial_4_15 NO_LOAD
540#define vload_partial_4_16 NO_LOAD
541
542#define vload_partial_8_0 NO_LOAD
543#define vload_partial_8_1 vload_partial_1
544#define vload_partial_8_2 vload_partial_2
545#define vload_partial_8_3 vload_partial_3
546#define vload_partial_8_4 vload_partial_4
547#define vload_partial_8_5 vload_partial_5
548#define vload_partial_8_6 vload_partial_6
549#define vload_partial_8_7 vload_partial_7
550#define vload_partial_8_8 vload_partial_8
551#define vload_partial_8_9 NO_LOAD
552#define vload_partial_8_10 NO_LOAD
553#define vload_partial_8_11 NO_LOAD
554#define vload_partial_8_12 NO_LOAD
555#define vload_partial_8_13 NO_LOAD
556#define vload_partial_8_14 NO_LOAD
557#define vload_partial_8_15 NO_LOAD
558#define vload_partial_8_16 NO_LOAD
559
560#define vload_partial_16_0 NO_LOAD
561#define vload_partial_16_1 vload_partial_1
562#define vload_partial_16_2 vload_partial_2
563#define vload_partial_16_3 vload_partial_3
564#define vload_partial_16_4 vload_partial_4
565#define vload_partial_16_5 vload_partial_5
566#define vload_partial_16_6 vload_partial_6
567#define vload_partial_16_7 vload_partial_7
568#define vload_partial_16_8 vload_partial_8
569#define vload_partial_16_9 vload_partial_9
570#define vload_partial_16_10 vload_partial_10
571#define vload_partial_16_11 vload_partial_11
572#define vload_partial_16_12 vload_partial_12
573#define vload_partial_16_13 vload_partial_13
574#define vload_partial_16_14 vload_partial_14
575#define vload_partial_16_15 vload_partial_15
576#define vload_partial_16_16 vload_partial_16
577
578
579#define vload_partial_1(DATA, OFFSET, PTR) \
580    DATA.s0 = vload1(OFFSET, PTR);
581
582#define vload_partial_2(DATA, OFFSET, PTR) \
583    DATA.s01 = vload2(OFFSET, PTR);
584
585#define vload_partial_3(DATA, OFFSET, PTR) \
586    DATA.s012 = vload3(OFFSET, PTR);
587
588#define vload_partial_4(DATA, OFFSET, PTR) \
589    DATA.s0123 = vload4(OFFSET, PTR);
590
591#define vload_partial_5(DATA, OFFSET, PTR)    \
592    vload_partial_4(DATA.s0123, OFFSET, PTR); \
593    DATA.s4 = vload1(OFFSET, PTR + 4);
594
595#define vload_partial_6(DATA, OFFSET, PTR)    \
596    vload_partial_4(DATA.s0123, OFFSET, PTR); \
597    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
598
599#define vload_partial_7(DATA, OFFSET, PTR)    \
600    vload_partial_4(DATA.s0123, OFFSET, PTR); \
601    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
602
603#define vload_partial_8(DATA, OFFSET, PTR) \
604    DATA.s01234567 = vload8(OFFSET, PTR);
605
606#define vload_partial_9(DATA, OFFSET, PTR)        \
607    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
608    DATA.s8 = vload1(OFFSET, PTR + 8);
609
610#define vload_partial_10(DATA, OFFSET, PTR)       \
611    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
612    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
613
614#define vload_partial_11(DATA, OFFSET, PTR)       \
615    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
616    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
617
618#define vload_partial_12(DATA, OFFSET, PTR)       \
619    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
620    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
621
622#define vload_partial_13(DATA, OFFSET, PTR)       \
623    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
624    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
625
626#define vload_partial_14(DATA, OFFSET, PTR)       \
627    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
628    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
629
630#define vload_partial_15(DATA, OFFSET, PTR)       \
631    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
632    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
633
634#define vload_partial_16(DATA, OFFSET, PTR) \
635    DATA = vload16(OFFSET, PTR);
636
637
638
639#define PIXEL_UNIT4 1
640#define PIXEL_UNIT8 2
641#define PIXEL_UNIT16 4
642
643
644#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
645#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
646
647
648#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
649#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
650#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
651
652#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
653#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
654#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
655#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
656#endif
657
658#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
659#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
660#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
661
662#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
663#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
664#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
665#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
666#endif
667
668
669#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
670#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
671
672
673#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
674#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
675
676#define VSTORE_STR(size) vstore##size
677#define VSTORE(size) VSTORE_STR(size)
678
679#define float1 float
680#define half1 half
681#define char1 char
682#define uchar1 uchar
683#define short1 short
684#define ushort1 ushort
685#define int1 int
686#define uint1 uint
687#define long1 long
688#define ulong1 ulong
689#define double1 double
690
691#define vload1(OFFSET, PTR) *(OFFSET + PTR)
692#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
693
694
695#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
696#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
697
698#define NO_STORE(data, offs, ptr) \
699    {                             \
700    }
701
702
703#define vstore_partial_1_0 NO_STORE
704#define vstore_partial_1_1 vstore1
705#define vstore_partial_1_2 NO_STORE
706#define vstore_partial_1_3 NO_STORE
707#define vstore_partial_1_4 NO_STORE
708#define vstore_partial_1_5 NO_STORE
709#define vstore_partial_1_6 NO_STORE
710#define vstore_partial_1_7 NO_STORE
711#define vstore_partial_1_8 NO_STORE
712#define vstore_partial_1_9 NO_STORE
713#define vstore_partial_1_10 NO_STORE
714#define vstore_partial_1_11 NO_STORE
715#define vstore_partial_1_12 NO_STORE
716#define vstore_partial_1_13 NO_STORE
717#define vstore_partial_1_14 NO_STORE
718#define vstore_partial_1_15 NO_STORE
719#define vstore_partial_1_16 NO_STORE
720
721#define vstore_partial_2_0 NO_STORE
722#define vstore_partial_2_1 vstore_partial_1
723#define vstore_partial_2_2 vstore_partial_2
724#define vstore_partial_2_3 NO_STORE
725#define vstore_partial_2_4 NO_STORE
726#define vstore_partial_2_5 NO_STORE
727#define vstore_partial_2_6 NO_STORE
728#define vstore_partial_2_7 NO_STORE
729#define vstore_partial_2_8 NO_STORE
730#define vstore_partial_2_9 NO_STORE
731#define vstore_partial_2_10 NO_STORE
732#define vstore_partial_2_11 NO_STORE
733#define vstore_partial_2_12 NO_STORE
734#define vstore_partial_2_13 NO_STORE
735#define vstore_partial_2_14 NO_STORE
736#define vstore_partial_2_15 NO_STORE
737#define vstore_partial_2_16 NO_STORE
738
739#define vstore_partial_3_0 NO_STORE
740#define vstore_partial_3_1 vstore_partial_1
741#define vstore_partial_3_2 vstore_partial_2
742#define vstore_partial_3_3 vstore_partial_3
743#define vstore_partial_3_4 NO_STORE
744#define vstore_partial_3_5 NO_STORE
745#define vstore_partial_3_6 NO_STORE
746#define vstore_partial_3_7 NO_STORE
747#define vstore_partial_3_8 NO_STORE
748#define vstore_partial_3_9 NO_STORE
749#define vstore_partial_3_10 NO_STORE
750#define vstore_partial_3_11 NO_STORE
751#define vstore_partial_3_12 NO_STORE
752#define vstore_partial_3_13 NO_STORE
753#define vstore_partial_3_14 NO_STORE
754#define vstore_partial_3_15 NO_STORE
755#define vstore_partial_3_16 NO_STORE
756
757#define vstore_partial_4_0 NO_STORE
758#define vstore_partial_4_1 vstore_partial_1
759#define vstore_partial_4_2 vstore_partial_2
760#define vstore_partial_4_3 vstore_partial_3
761#define vstore_partial_4_4 vstore_partial_4
762#define vstore_partial_4_5 NO_STORE
763#define vstore_partial_4_6 NO_STORE
764#define vstore_partial_4_7 NO_STORE
765#define vstore_partial_4_8 NO_STORE
766#define vstore_partial_4_9 NO_STORE
767#define vstore_partial_4_10 NO_STORE
768#define vstore_partial_4_11 NO_STORE
769#define vstore_partial_4_12 NO_STORE
770#define vstore_partial_4_13 NO_STORE
771#define vstore_partial_4_14 NO_STORE
772#define vstore_partial_4_15 NO_STORE
773#define vstore_partial_4_16 NO_STORE
774
775#define vstore_partial_8_0 NO_STORE
776#define vstore_partial_8_1 vstore_partial_1
777#define vstore_partial_8_2 vstore_partial_2
778#define vstore_partial_8_3 vstore_partial_3
779#define vstore_partial_8_4 vstore_partial_4
780#define vstore_partial_8_5 vstore_partial_5
781#define vstore_partial_8_6 vstore_partial_6
782#define vstore_partial_8_7 vstore_partial_7
783#define vstore_partial_8_8 vstore_partial_8
784#define vstore_partial_8_9 NO_STORE
785#define vstore_partial_8_10 NO_STORE
786#define vstore_partial_8_11 NO_STORE
787#define vstore_partial_8_12 NO_STORE
788#define vstore_partial_8_13 NO_STORE
789#define vstore_partial_8_14 NO_STORE
790#define vstore_partial_8_15 NO_STORE
791#define vstore_partial_8_16 NO_STORE
792
793#define vstore_partial_16_0 NO_STORE
794#define vstore_partial_16_1 vstore_partial_1
795#define vstore_partial_16_2 vstore_partial_2
796#define vstore_partial_16_3 vstore_partial_3
797#define vstore_partial_16_4 vstore_partial_4
798#define vstore_partial_16_5 vstore_partial_5
799#define vstore_partial_16_6 vstore_partial_6
800#define vstore_partial_16_7 vstore_partial_7
801#define vstore_partial_16_8 vstore_partial_8
802#define vstore_partial_16_9 vstore_partial_9
803#define vstore_partial_16_10 vstore_partial_10
804#define vstore_partial_16_11 vstore_partial_11
805#define vstore_partial_16_12 vstore_partial_12
806#define vstore_partial_16_13 vstore_partial_13
807#define vstore_partial_16_14 vstore_partial_14
808#define vstore_partial_16_15 vstore_partial_15
809#define vstore_partial_16_16 vstore_partial_16
810
811
812#define vstore_partial_1(DATA, OFFSET, PTR) \
813    vstore1(DATA.s0, OFFSET, PTR);
814
815#define vstore_partial_2(DATA, OFFSET, PTR) \
816    vstore2(DATA.s01, OFFSET, PTR);
817
818#define vstore_partial_3(DATA, OFFSET, PTR) \
819    vstore3(DATA.s012, OFFSET, PTR);
820
821#define vstore_partial_4(DATA, OFFSET, PTR) \
822    vstore4(DATA.s0123, OFFSET, PTR);
823
824#define vstore_partial_5(DATA, OFFSET, PTR)    \
825    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
826    vstore1(DATA.s4, OFFSET, PTR + 4);
827
828#define vstore_partial_6(DATA, OFFSET, PTR)    \
829    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
830    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
831
832#define vstore_partial_7(DATA, OFFSET, PTR)    \
833    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
834    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
835
836#define vstore_partial_8(DATA, OFFSET, PTR) \
837    vstore8(DATA.s01234567, OFFSET, PTR);
838
839#define vstore_partial_9(DATA, OFFSET, PTR)        \
840    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
841    vstore1(DATA.s8, OFFSET, PTR + 8);
842
843#define vstore_partial_10(DATA, OFFSET, PTR)       \
844    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
845    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
846
847#define vstore_partial_11(DATA, OFFSET, PTR)       \
848    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
849    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
850
851#define vstore_partial_12(DATA, OFFSET, PTR)       \
852    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
853    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
854
855#define vstore_partial_13(DATA, OFFSET, PTR)       \
856    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
857    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
858
859#define vstore_partial_14(DATA, OFFSET, PTR)       \
860    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
861    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
862
863#define vstore_partial_15(DATA, OFFSET, PTR)       \
864    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
865    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
866
867#define vstore_partial_16(DATA, OFFSET, PTR) \
868    vstore16(DATA, OFFSET, PTR);
869
870
871
872
873
874#define convert_float_sat convert_float
875#define convert_float1_sat convert_float
876#define convert_float2_sat convert_float2
877#define convert_float3_sat convert_float3
878#define convert_float4_sat convert_float4
879#define convert_float8_sat convert_float8
880#define convert_float16_sat convert_float16
881#define convert_half_sat convert_float
882#define convert_half1_sat convert_half
883#define convert_half2_sat convert_half2
884#define convert_half3_sat convert_half3
885#define convert_half4_sat convert_half4
886#define convert_half8_sat convert_half8
887#define convert_half16_sat convert_half16
888
889#define convert_float1 convert_float
890#define convert_half1 convert_half
891#define convert_char1 convert_char
892#define convert_uchar1 convert_uchar
893#define convert_short1 convert_short
894#define convert_ushort1 convert_ushort
895#define convert_int1 convert_int
896#define convert_uint1 convert_uint
897#define convert_long1 convert_long
898#define convert_ulong1 convert_ulong
899#define convert_double1 convert_double
900
901#define convert_char1_sat convert_char_sat
902#define convert_uchar1_sat convert_uchar_sat
903#define convert_uchar2_sat convert_uchar2_sat
904#define convert_uchar3_sat convert_uchar3_sat
905#define convert_uchar4_sat convert_uchar4_sat
906#define convert_uchar8_sat convert_uchar8_sat
907#define convert_uchar16_sat convert_uchar16_sat
908#define convert_short1_sat convert_short_sat
909#define convert_ushort1_sat convert_ushort_sat
910#define convert_int1_sat convert_int_sat
911#define convert_uint1_sat convert_uint_sat
912#define convert_long1_sat convert_long_sat
913#define convert_ulong1_sat convert_ulong_sat
914#define convert_double1_sat convert_double_sat
915
916#define VEC_DATA_TYPE_STR(type, size) type##size
917#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
918
919#define CONVERT_STR(x, type) (convert_##type((x)))
920#define CONVERT(x, type) CONVERT_STR(x, type)
921
922#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
923#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
924
925#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
926#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
927
928#define select_vec_dt_uchar(size) uchar##size
929#define select_vec_dt_char(size) char##size
930#define select_vec_dt_ushort(size) ushort##size
931#define select_vec_dt_short(size) short##size
932#define select_vec_dt_half(size) short##size
933#define select_vec_dt_uint(size) uint##size
934#define select_vec_dt_int(size) int##size
935#define select_vec_dt_float(size) int##size
936#define select_vec_dt_ulong(size) ulong##size
937#define select_vec_dt_long(size) long##size
938
939#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
940#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
941#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
942
943#define signed_int_vec_dt_uchar(size) char##size
944#define signed_int_vec_dt_char(size) char##size
945#define signed_int_vec_dt_ushort(size) short##size
946#define signed_int_vec_dt_short(size) short##size
947#define signed_int_vec_dt_half(size) short##size
948#define signed_int_vec_dt_uint(size) int##size
949#define signed_int_vec_dt_int(size) int##size
950#define signed_int_vec_dt_float(size) int##size
951#define signed_int_vec_dt_ulong(size) long##size
952#define signed_int_vec_dt_long(size) long##size
953
954#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
955#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
956#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
957
958#define sum_reduce_1(x) (x)
959#define sum_reduce_2(x) ((x).s0) + ((x).s1)
960#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
961#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
962#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
963#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
964
965#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
966#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
967
968#define prod_reduce_1(x) (x)
969#define prod_reduce_2(x) ((x).s0) * ((x).s1)
970#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
971#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
972#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
973#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
974
975#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
976#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
977
978#define max_reduce_1(x) (x)
979#define max_reduce_2(x) max(((x).s0), ((x).s1))
980#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
981#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
982#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
983#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
984
985#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
986#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
987
988#define VECTOR_DECLARATION(name)     \
989    __global uchar *name##_ptr,      \
990    uint        name##_stride_x, \
991    uint        name##_step_x,   \
992    uint        name##_offset_first_element_in_bytes
993
994#define IMAGE_DECLARATION(name)      \
995    __global uchar *name##_ptr,      \
996    uint        name##_stride_x, \
997    uint        name##_step_x,   \
998    uint        name##_stride_y, \
999    uint        name##_step_y,   \
1000    uint        name##_offset_first_element_in_bytes
1001
1002#define TENSOR3D_DECLARATION(name)   \
1003    __global uchar *name##_ptr,      \
1004    uint        name##_stride_x, \
1005    uint        name##_step_x,   \
1006    uint        name##_stride_y, \
1007    uint        name##_step_y,   \
1008    uint        name##_stride_z, \
1009    uint        name##_step_z,   \
1010    uint        name##_offset_first_element_in_bytes
1011
1012#define TENSOR4D_DECLARATION(name)   \
1013    __global uchar *name##_ptr,      \
1014    uint        name##_stride_x, \
1015    uint        name##_step_x,   \
1016    uint        name##_stride_y, \
1017    uint        name##_step_y,   \
1018    uint        name##_stride_z, \
1019    uint        name##_step_z,   \
1020    uint        name##_stride_w, \
1021    uint        name##_step_w,   \
1022    uint        name##_offset_first_element_in_bytes
1023
1024#define TENSOR5D_DECLARATION(name)   \
1025    __global uchar *name##_ptr,      \
1026    uint        name##_stride_x, \
1027    uint        name##_step_x,   \
1028    uint        name##_stride_y, \
1029    uint        name##_step_y,   \
1030    uint        name##_stride_z, \
1031    uint        name##_step_z,   \
1032    uint        name##_stride_w, \
1033    uint        name##_step_w,   \
1034    uint        name##_stride_v, \
1035    uint        name##_step_v,   \
1036    uint        name##_offset_first_element_in_bytes
1037
1038#define CONVERT_TO_VECTOR_STRUCT(name) \
1039    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1040
1041#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1042    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1043
1044#define CONVERT_TO_IMAGE_STRUCT(name) \
1045    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1046
1047#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1048    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1049
1050#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1051    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1052
1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1054    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1055
1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1057    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1058
1059#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1060    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1061                                 name##_stride_z, name##_step_z)
1062
1063#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1064    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1065
1066#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1067    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1068                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1069
1070#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1071    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1072
1073#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1074    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1075                           name##_stride_z, name##_step_z)
1076
1077
1078typedef struct Vector
1079{
1080    __global uchar *ptr;
1081    int             offset_first_element_in_bytes;
1082    int             stride_x;
1083} Vector;
1084
1085
1086typedef struct Image
1087{
1088    __global uchar *ptr;
1089    int             offset_first_element_in_bytes;
1090    int             stride_x;
1091    int             stride_y;
1092} Image;
1093
1094
1095typedef struct Tensor3D
1096{
1097    __global uchar *ptr;
1098    int             offset_first_element_in_bytes;
1099    int             stride_x;
1100    int             stride_y;
1101    int             stride_z;
1102} Tensor3D;
1103
1104
1105typedef struct Tensor4D
1106{
1107    __global uchar *ptr;
1108    int             offset_first_element_in_bytes;
1109    int             stride_x;
1110    int             stride_y;
1111    int             stride_z;
1112    int             stride_w;
1113} Tensor4D;
1114
1115
1116inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1117{
1118    Vector vector =
1119    {
1120        .ptr                           = ptr,
1121        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1122        .stride_x                      = stride_x,
1123    };
1124    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1125    return vector;
1126}
1127
1128
1129inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1130{
1131    Image img =
1132    {
1133        .ptr                           = ptr,
1134        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1135        .stride_x                      = stride_x,
1136        .stride_y                      = stride_y
1137    };
1138    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1139    return img;
1140}
1141
1142
1143inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1144{
1145    Image img =
1146    {
1147        .ptr                           = ptr,
1148        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1149        .stride_x                      = stride_x,
1150        .stride_y                      = stride_y
1151    };
1152    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1153    return img;
1154}
1155
1156
1157inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1158{
1159    Tensor3D tensor =
1160    {
1161        .ptr                           = ptr,
1162        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1163        .stride_x                      = stride_x,
1164        .stride_y                      = stride_y,
1165        .stride_z                      = stride_z
1166    };
1167    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1168    return tensor;
1169}
1170
1171
1172inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1173{
1174    Tensor3D tensor =
1175    {
1176        .ptr                           = ptr,
1177        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1178        .stride_x                      = stride_x,
1179        .stride_y                      = stride_y,
1180        .stride_z                      = stride_z
1181    };
1182    return tensor;
1183}
1184
1185inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1186                                             uint step_w,
1187                                             uint mod_size)
1188{
1189    Tensor4D tensor =
1190    {
1191        .ptr                           = ptr,
1192        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1193        .stride_x                      = stride_x,
1194        .stride_y                      = stride_y,
1195        .stride_z                      = stride_z,
1196        .stride_w                      = stride_w
1197    };
1198
1199    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1200    return tensor;
1201}
1202
1203
1204inline __global const uchar *vector_offset(const Vector *vec, int x)
1205{
1206    return vec->ptr + x * vec->stride_x;
1207}
1208
1209
1210inline __global uchar *offset(const Image *img, int x, int y)
1211{
1212    return img->ptr + x * img->stride_x + y * img->stride_y;
1213}
1214
1215
1216inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1217{
1218    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1219}
1220
1221
1222inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1223{
1224    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1225}
1226
1227
1228inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1229{
1230    uint num_elements = width * height;
1231
1232    const uint z = index / num_elements;
1233
1234    index %= num_elements;
1235
1236    const uint y = index / width;
1237
1238    index %= width;
1239
1240    const uint x = index;
1241
1242    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1243}
1244
1245#endif
1246
1247#define ADD_OP(a, b) ((a) + (b))
1248#define SUB_OP(a, b) ((a) - (b))
1249#define MUL_OP(a, b) ((a) * (b))
1250#define INVSQRT_OP(a) rsqrt((a))
1251#define SQCVT_SAT(a) (a)
1252
1253#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
1254
1255
1256
1257#ifndef ARM_COMPUTE_HELPER_H
1258#define ARM_COMPUTE_HELPER_H
1259
1260
1261
1262
1263#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1264    VSTORE(N0)                                                 \
1265    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1266
1267#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1268    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1269    VSTORE(N0)                                                 \
1270    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1271
1272#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1273    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1274    VSTORE(N0)                                                 \
1275    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1276
1277#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1278    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1279    VSTORE(N0)                                                 \
1280    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1281
1282#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1283    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1284    VSTORE(N0)                                                 \
1285    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1286
1287#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1288    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1289    VSTORE(N0)                                                 \
1290    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1291
1292#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1293    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1294    VSTORE(N0)                                                 \
1295    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1296
1297#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1298    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1299    VSTORE(N0)                                                 \
1300    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1301
1302#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1303    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1304    VSTORE(N0)                                                 \
1305    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1306
1307#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1308    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1309    VSTORE(N0)                                                  \
1310    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1311
1312#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1313    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1314    VSTORE(N0)                                                  \
1315    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1316
1317#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1318    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1319    VSTORE(N0)                                                  \
1320    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1321
1322#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1323    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1324    VSTORE(N0)                                                  \
1325    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1326
1327#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1328    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1329    VSTORE(N0)                                                  \
1330    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1331
1332#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1333    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1334    VSTORE(N0)                                                  \
1335    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1336
1337#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1338    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1339    VSTORE(N0)                                                  \
1340    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1341
1342
1343
1344#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1345    VSTORE(N0)                                                         \
1346    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1347
1348#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1349    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1350    VSTORE(N0)                                                         \
1351    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1352
1353#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1354    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1355    VSTORE(N0)                                                         \
1356    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1357
1358#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1359    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1360    VSTORE(N0)                                                         \
1361    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1362
1363#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1364    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1365    VSTORE(N0)                                                         \
1366    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1367
1368#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1369    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1370    VSTORE(N0)                                                         \
1371    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1372
1373#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1374    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1375    VSTORE(N0)                                                         \
1376    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1377
1378#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1379    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1380    VSTORE(N0)                                                         \
1381    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1382
1383#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1384    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1385    VSTORE(N0)                                                         \
1386    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1387
1388#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
1389    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1390    VSTORE(N0)                                                     \
1391    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1392
1393#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1394    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1395    VSTORE(N0)                                                          \
1396    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1397
1398#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1399    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1400    VSTORE(N0)                                                          \
1401    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1402
1403#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1404    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1405    VSTORE(N0)                                                          \
1406    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1407
1408#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1409    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1410    VSTORE(N0)                                                          \
1411    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1412
1413#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1414    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1415    VSTORE(N0)                                                          \
1416    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1417
1418#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1419    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1420    VSTORE(N0)                                                          \
1421    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1422
1423
1424
1425
1426#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1427#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1428
1429
1430
1431#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1432#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1433
1434
1435
1436#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1437    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1438    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1439
1440#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1441    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1442    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1443    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1444
1445#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1446    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1447    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1448    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1449
1450#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1451    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1452    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1453    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1454
1455#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1456    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1457    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1458    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1459
1460#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1461    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1462    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1463    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1464
1465#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1466    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1467    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1468    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1469
1470#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1471    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1472    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1473    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1474
1475#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1476    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1477    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1478    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1479
1480#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1481    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1482    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1483    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1484
1485#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1486    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1487    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1488    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1489
1490#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1491    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1492    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1493    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1494
1495#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1496    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1497    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1498    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1499
1500#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1501    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1502    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1503    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1504
1505#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1506    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1507    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1508    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1509
1510#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1511    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1512    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1513    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1514
1515
1516
1517#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1518#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1519
1520#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1521    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1522    {                                                                                                                                                     \
1523        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1524    }                                                                                                                                                     \
1525    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1526    {                                                                                                                                                     \
1527        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1528    }                                                                                                                                                     \
1529    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1530    {                                                                                                                                                     \
1531        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1532    }                                                                                                                                                     \
1533    else                                                                                                                                                  \
1534    {                                                                                                                                                     \
1535        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1536    }
1537
1538#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1539    if(!(PARTIAL_COND_X))                                                                                         \
1540    {                                                                                                             \
1541        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1542    }                                                                                                             \
1543    else                                                                                                          \
1544    {                                                                                                             \
1545        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1546    }
1547
1548#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1549    if(!(PARTIAL_COND_Y))                                                                                         \
1550    {                                                                                                             \
1551        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1552    }                                                                                                             \
1553    else                                                                                                          \
1554    {                                                                                                             \
1555        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1556    }
1557
1558
1559#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1560
1561
1562#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1563
1564#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1565    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566
1567#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1568
1569#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1570    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1571
1572#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1573
1574#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1575    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1576
1577#else
1578
1579#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1580    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1581
1582#endif
1583
1584#endif
1585
1586
1587#if defined(PARTIAL_STORE_M0)
1588
1589#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1590    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1591#else
1592#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1593    ((uint)(y * M0))
1594#endif
1595
1596
1597
1598#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1599    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1600
1601
1602#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1603#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1604#endif
1605
1606#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1607#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1608#endif
1609
1610#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1611#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1612#endif
1613
1614#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1615#pragma OPENCL EXTENSION cl_arm_printf : enable
1616#endif
1617
1618#define GPU_ARCH_MIDGARD 0x100
1619#define GPU_ARCH_BIFROST 0x200
1620#define GPU_ARCH_VALHALL 0x300
1621
1622
1623#define CONCAT(a, b) a##b
1624
1625
1626#define EXPAND(x) x
1627
1628
1629#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1630
1631
1632#define REV1(x) ((x))
1633#define REV2(x) ((x).s10)
1634#define REV3(x) ((x).s210)
1635#define REV4(x) ((x).s3210)
1636#define REV8(x) ((x).s76543210)
1637#define REV16(x) ((x).sFEDCBA9876543210)
1638
1639
1640
1641#define REVERSE_STR(x, s) REV##s((x))
1642#define REVERSE(x, s) REVERSE_STR(x, s)
1643
1644
1645
1646#define ROT1_0(x) ((x))
1647#define ROT1_1(x) ((x))
1648
1649#define ROT2_0(x) ((x))
1650#define ROT2_1(x) ((x).s10)
1651#define ROT2_2(x) ((x))
1652
1653#define ROT3_0(x) ((x))
1654#define ROT3_1(x) ((x).s201)
1655#define ROT3_2(x) ((x).s120)
1656#define ROT3_3(x) ((x))
1657
1658#define ROT4_0(x) ((x))
1659#define ROT4_1(x) ((x).s3012)
1660#define ROT4_2(x) ((x).s2301)
1661#define ROT4_3(x) ((x).s1230)
1662#define ROT4_4(x) ((x))
1663
1664#define ROT8_0(x) ((x))
1665#define ROT8_1(x) ((x).s70123456)
1666#define ROT8_2(x) ((x).s67012345)
1667#define ROT8_3(x) ((x).s56701234)
1668#define ROT8_4(x) ((x).s45670123)
1669#define ROT8_5(x) ((x).s34567012)
1670#define ROT8_6(x) ((x).s23456701)
1671#define ROT8_7(x) ((x).s12345670)
1672#define ROT8_8(x) ((x))
1673
1674#define ROT16_0(x) ((x))
1675#define ROT16_1(x) ((x).sF0123456789ABCDE)
1676#define ROT16_2(x) ((x).sEF0123456789ABCD)
1677#define ROT16_3(x) ((x).sDEF0123456789ABC)
1678#define ROT16_4(x) ((x).sCDEF0123456789AB)
1679#define ROT16_5(x) ((x).sBCDEF0123456789A)
1680#define ROT16_6(x) ((x).sABCDEF0123456789)
1681#define ROT16_7(x) ((x).s9ABCDEF012345678)
1682#define ROT16_8(x) ((x).s89ABCDEF01234567)
1683#define ROT16_9(x) ((x).s789ABCDEF0123456)
1684#define ROT16_10(x) ((x).s6789ABCDEF012345)
1685#define ROT16_11(x) ((x).s56789ABCDEF01234)
1686#define ROT16_12(x) ((x).s456789ABCDEF0123)
1687#define ROT16_13(x) ((x).s3456789ABCDEF012)
1688#define ROT16_14(x) ((x).s23456789ABCDEF01)
1689#define ROT16_15(x) ((x).s123456789ABCDEF0)
1690#define ROT16_16(x) ((x))
1691
1692
1693
1694#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1695#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1696
1697
1698
1699#define V_OFFS1(dt) (dt##1)(0)
1700#define V_OFFS2(dt) (dt##2)(0, 1)
1701#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1702#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1703#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1704#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1705
1706
1707
1708#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1709#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1710
1711
1712#define VLOAD_STR(size) vload##size
1713#define VLOAD(size) VLOAD_STR(size)
1714
1715
1716#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1717#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1718
1719#define NO_LOAD(data, offs, ptr) \
1720    {                            \
1721    }
1722
1723
1724#define vload_partial_1_0 NO_LOAD
1725#define vload_partial_1_1 vload1
1726#define vload_partial_1_2 NO_LOAD
1727#define vload_partial_1_3 NO_LOAD
1728#define vload_partial_1_4 NO_LOAD
1729#define vload_partial_1_5 NO_LOAD
1730#define vload_partial_1_6 NO_LOAD
1731#define vload_partial_1_7 NO_LOAD
1732#define vload_partial_1_8 NO_LOAD
1733#define vload_partial_1_9 NO_LOAD
1734#define vload_partial_1_10 NO_LOAD
1735#define vload_partial_1_11 NO_LOAD
1736#define vload_partial_1_12 NO_LOAD
1737#define vload_partial_1_13 NO_LOAD
1738#define vload_partial_1_14 NO_LOAD
1739#define vload_partial_1_15 NO_LOAD
1740#define vload_partial_1_16 NO_LOAD
1741
1742#define vload_partial_2_0 NO_LOAD
1743#define vload_partial_2_1 vload_partial_1
1744#define vload_partial_2_2 vload_partial_2
1745#define vload_partial_2_3 NO_LOAD
1746#define vload_partial_2_4 NO_LOAD
1747#define vload_partial_2_5 NO_LOAD
1748#define vload_partial_2_6 NO_LOAD
1749#define vload_partial_2_7 NO_LOAD
1750#define vload_partial_2_8 NO_LOAD
1751#define vload_partial_2_9 NO_LOAD
1752#define vload_partial_2_10 NO_LOAD
1753#define vload_partial_2_11 NO_LOAD
1754#define vload_partial_2_12 NO_LOAD
1755#define vload_partial_2_13 NO_LOAD
1756#define vload_partial_2_14 NO_LOAD
1757#define vload_partial_2_15 NO_LOAD
1758#define vload_partial_2_16 NO_LOAD
1759
1760#define vload_partial_3_0 NO_LOAD
1761#define vload_partial_3_1 vload_partial_1
1762#define vload_partial_3_2 vload_partial_2
1763#define vload_partial_3_3 vload_partial_3
1764#define vload_partial_3_4 NO_LOAD
1765#define vload_partial_3_5 NO_LOAD
1766#define vload_partial_3_6 NO_LOAD
1767#define vload_partial_3_7 NO_LOAD
1768#define vload_partial_3_8 NO_LOAD
1769#define vload_partial_3_9 NO_LOAD
1770#define vload_partial_3_10 NO_LOAD
1771#define vload_partial_3_11 NO_LOAD
1772#define vload_partial_3_12 NO_LOAD
1773#define vload_partial_3_13 NO_LOAD
1774#define vload_partial_3_14 NO_LOAD
1775#define vload_partial_3_15 NO_LOAD
1776#define vload_partial_3_16 NO_LOAD
1777
1778#define vload_partial_4_0 NO_LOAD
1779#define vload_partial_4_1 vload_partial_1
1780#define vload_partial_4_2 vload_partial_2
1781#define vload_partial_4_3 vload_partial_3
1782#define vload_partial_4_4 vload_partial_4
1783#define vload_partial_4_5 NO_LOAD
1784#define vload_partial_4_6 NO_LOAD
1785#define vload_partial_4_7 NO_LOAD
1786#define vload_partial_4_8 NO_LOAD
1787#define vload_partial_4_9 NO_LOAD
1788#define vload_partial_4_10 NO_LOAD
1789#define vload_partial_4_11 NO_LOAD
1790#define vload_partial_4_12 NO_LOAD
1791#define vload_partial_4_13 NO_LOAD
1792#define vload_partial_4_14 NO_LOAD
1793#define vload_partial_4_15 NO_LOAD
1794#define vload_partial_4_16 NO_LOAD
1795
1796#define vload_partial_8_0 NO_LOAD
1797#define vload_partial_8_1 vload_partial_1
1798#define vload_partial_8_2 vload_partial_2
1799#define vload_partial_8_3 vload_partial_3
1800#define vload_partial_8_4 vload_partial_4
1801#define vload_partial_8_5 vload_partial_5
1802#define vload_partial_8_6 vload_partial_6
1803#define vload_partial_8_7 vload_partial_7
1804#define vload_partial_8_8 vload_partial_8
1805#define vload_partial_8_9 NO_LOAD
1806#define vload_partial_8_10 NO_LOAD
1807#define vload_partial_8_11 NO_LOAD
1808#define vload_partial_8_12 NO_LOAD
1809#define vload_partial_8_13 NO_LOAD
1810#define vload_partial_8_14 NO_LOAD
1811#define vload_partial_8_15 NO_LOAD
1812#define vload_partial_8_16 NO_LOAD
1813
1814#define vload_partial_16_0 NO_LOAD
1815#define vload_partial_16_1 vload_partial_1
1816#define vload_partial_16_2 vload_partial_2
1817#define vload_partial_16_3 vload_partial_3
1818#define vload_partial_16_4 vload_partial_4
1819#define vload_partial_16_5 vload_partial_5
1820#define vload_partial_16_6 vload_partial_6
1821#define vload_partial_16_7 vload_partial_7
1822#define vload_partial_16_8 vload_partial_8
1823#define vload_partial_16_9 vload_partial_9
1824#define vload_partial_16_10 vload_partial_10
1825#define vload_partial_16_11 vload_partial_11
1826#define vload_partial_16_12 vload_partial_12
1827#define vload_partial_16_13 vload_partial_13
1828#define vload_partial_16_14 vload_partial_14
1829#define vload_partial_16_15 vload_partial_15
1830#define vload_partial_16_16 vload_partial_16
1831
1832
1833#define vload_partial_1(DATA, OFFSET, PTR) \
1834    DATA.s0 = vload1(OFFSET, PTR);
1835
1836#define vload_partial_2(DATA, OFFSET, PTR) \
1837    DATA.s01 = vload2(OFFSET, PTR);
1838
1839#define vload_partial_3(DATA, OFFSET, PTR) \
1840    DATA.s012 = vload3(OFFSET, PTR);
1841
1842#define vload_partial_4(DATA, OFFSET, PTR) \
1843    DATA.s0123 = vload4(OFFSET, PTR);
1844
1845#define vload_partial_5(DATA, OFFSET, PTR)    \
1846    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1847    DATA.s4 = vload1(OFFSET, PTR + 4);
1848
1849#define vload_partial_6(DATA, OFFSET, PTR)    \
1850    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1851    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1852
1853#define vload_partial_7(DATA, OFFSET, PTR)    \
1854    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1855    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1856
1857#define vload_partial_8(DATA, OFFSET, PTR) \
1858    DATA.s01234567 = vload8(OFFSET, PTR);
1859
1860#define vload_partial_9(DATA, OFFSET, PTR)        \
1861    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1862    DATA.s8 = vload1(OFFSET, PTR + 8);
1863
1864#define vload_partial_10(DATA, OFFSET, PTR)       \
1865    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1866    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1867
1868#define vload_partial_11(DATA, OFFSET, PTR)       \
1869    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1870    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1871
1872#define vload_partial_12(DATA, OFFSET, PTR)       \
1873    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1874    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1875
1876#define vload_partial_13(DATA, OFFSET, PTR)       \
1877    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1878    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1879
1880#define vload_partial_14(DATA, OFFSET, PTR)       \
1881    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1882    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1883
1884#define vload_partial_15(DATA, OFFSET, PTR)       \
1885    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1886    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1887
1888#define vload_partial_16(DATA, OFFSET, PTR) \
1889    DATA = vload16(OFFSET, PTR);
1890
1891
1892
1893#define PIXEL_UNIT4 1
1894#define PIXEL_UNIT8 2
1895#define PIXEL_UNIT16 4
1896
1897
1898#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1899#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1900
1901
1902#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1903#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1904#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1905
1906#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1907#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1908#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1909#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1910#endif
1911
1912#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1913#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1914#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1915
1916#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1917#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1918#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1919#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1920#endif
1921
1922
1923#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1924#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1925
1926
1927#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1928#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1929
1930#define VSTORE_STR(size) vstore##size
1931#define VSTORE(size) VSTORE_STR(size)
1932
1933#define float1 float
1934#define half1 half
1935#define char1 char
1936#define uchar1 uchar
1937#define short1 short
1938#define ushort1 ushort
1939#define int1 int
1940#define uint1 uint
1941#define long1 long
1942#define ulong1 ulong
1943#define double1 double
1944
1945#define vload1(OFFSET, PTR) *(OFFSET + PTR)
1946#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
1947
1948
1949#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1950#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1951
1952#define NO_STORE(data, offs, ptr) \
1953    {                             \
1954    }
1955
1956
1957#define vstore_partial_1_0 NO_STORE
1958#define vstore_partial_1_1 vstore1
1959#define vstore_partial_1_2 NO_STORE
1960#define vstore_partial_1_3 NO_STORE
1961#define vstore_partial_1_4 NO_STORE
1962#define vstore_partial_1_5 NO_STORE
1963#define vstore_partial_1_6 NO_STORE
1964#define vstore_partial_1_7 NO_STORE
1965#define vstore_partial_1_8 NO_STORE
1966#define vstore_partial_1_9 NO_STORE
1967#define vstore_partial_1_10 NO_STORE
1968#define vstore_partial_1_11 NO_STORE
1969#define vstore_partial_1_12 NO_STORE
1970#define vstore_partial_1_13 NO_STORE
1971#define vstore_partial_1_14 NO_STORE
1972#define vstore_partial_1_15 NO_STORE
1973#define vstore_partial_1_16 NO_STORE
1974
1975#define vstore_partial_2_0 NO_STORE
1976#define vstore_partial_2_1 vstore_partial_1
1977#define vstore_partial_2_2 vstore_partial_2
1978#define vstore_partial_2_3 NO_STORE
1979#define vstore_partial_2_4 NO_STORE
1980#define vstore_partial_2_5 NO_STORE
1981#define vstore_partial_2_6 NO_STORE
1982#define vstore_partial_2_7 NO_STORE
1983#define vstore_partial_2_8 NO_STORE
1984#define vstore_partial_2_9 NO_STORE
1985#define vstore_partial_2_10 NO_STORE
1986#define vstore_partial_2_11 NO_STORE
1987#define vstore_partial_2_12 NO_STORE
1988#define vstore_partial_2_13 NO_STORE
1989#define vstore_partial_2_14 NO_STORE
1990#define vstore_partial_2_15 NO_STORE
1991#define vstore_partial_2_16 NO_STORE
1992
1993#define vstore_partial_3_0 NO_STORE
1994#define vstore_partial_3_1 vstore_partial_1
1995#define vstore_partial_3_2 vstore_partial_2
1996#define vstore_partial_3_3 vstore_partial_3
1997#define vstore_partial_3_4 NO_STORE
1998#define vstore_partial_3_5 NO_STORE
1999#define vstore_partial_3_6 NO_STORE
2000#define vstore_partial_3_7 NO_STORE
2001#define vstore_partial_3_8 NO_STORE
2002#define vstore_partial_3_9 NO_STORE
2003#define vstore_partial_3_10 NO_STORE
2004#define vstore_partial_3_11 NO_STORE
2005#define vstore_partial_3_12 NO_STORE
2006#define vstore_partial_3_13 NO_STORE
2007#define vstore_partial_3_14 NO_STORE
2008#define vstore_partial_3_15 NO_STORE
2009#define vstore_partial_3_16 NO_STORE
2010
2011#define vstore_partial_4_0 NO_STORE
2012#define vstore_partial_4_1 vstore_partial_1
2013#define vstore_partial_4_2 vstore_partial_2
2014#define vstore_partial_4_3 vstore_partial_3
2015#define vstore_partial_4_4 vstore_partial_4
2016#define vstore_partial_4_5 NO_STORE
2017#define vstore_partial_4_6 NO_STORE
2018#define vstore_partial_4_7 NO_STORE
2019#define vstore_partial_4_8 NO_STORE
2020#define vstore_partial_4_9 NO_STORE
2021#define vstore_partial_4_10 NO_STORE
2022#define vstore_partial_4_11 NO_STORE
2023#define vstore_partial_4_12 NO_STORE
2024#define vstore_partial_4_13 NO_STORE
2025#define vstore_partial_4_14 NO_STORE
2026#define vstore_partial_4_15 NO_STORE
2027#define vstore_partial_4_16 NO_STORE
2028
2029#define vstore_partial_8_0 NO_STORE
2030#define vstore_partial_8_1 vstore_partial_1
2031#define vstore_partial_8_2 vstore_partial_2
2032#define vstore_partial_8_3 vstore_partial_3
2033#define vstore_partial_8_4 vstore_partial_4
2034#define vstore_partial_8_5 vstore_partial_5
2035#define vstore_partial_8_6 vstore_partial_6
2036#define vstore_partial_8_7 vstore_partial_7
2037#define vstore_partial_8_8 vstore_partial_8
2038#define vstore_partial_8_9 NO_STORE
2039#define vstore_partial_8_10 NO_STORE
2040#define vstore_partial_8_11 NO_STORE
2041#define vstore_partial_8_12 NO_STORE
2042#define vstore_partial_8_13 NO_STORE
2043#define vstore_partial_8_14 NO_STORE
2044#define vstore_partial_8_15 NO_STORE
2045#define vstore_partial_8_16 NO_STORE
2046
2047#define vstore_partial_16_0 NO_STORE
2048#define vstore_partial_16_1 vstore_partial_1
2049#define vstore_partial_16_2 vstore_partial_2
2050#define vstore_partial_16_3 vstore_partial_3
2051#define vstore_partial_16_4 vstore_partial_4
2052#define vstore_partial_16_5 vstore_partial_5
2053#define vstore_partial_16_6 vstore_partial_6
2054#define vstore_partial_16_7 vstore_partial_7
2055#define vstore_partial_16_8 vstore_partial_8
2056#define vstore_partial_16_9 vstore_partial_9
2057#define vstore_partial_16_10 vstore_partial_10
2058#define vstore_partial_16_11 vstore_partial_11
2059#define vstore_partial_16_12 vstore_partial_12
2060#define vstore_partial_16_13 vstore_partial_13
2061#define vstore_partial_16_14 vstore_partial_14
2062#define vstore_partial_16_15 vstore_partial_15
2063#define vstore_partial_16_16 vstore_partial_16
2064
2065
2066#define vstore_partial_1(DATA, OFFSET, PTR) \
2067    vstore1(DATA.s0, OFFSET, PTR);
2068
2069#define vstore_partial_2(DATA, OFFSET, PTR) \
2070    vstore2(DATA.s01, OFFSET, PTR);
2071
2072#define vstore_partial_3(DATA, OFFSET, PTR) \
2073    vstore3(DATA.s012, OFFSET, PTR);
2074
2075#define vstore_partial_4(DATA, OFFSET, PTR) \
2076    vstore4(DATA.s0123, OFFSET, PTR);
2077
2078#define vstore_partial_5(DATA, OFFSET, PTR)    \
2079    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2080    vstore1(DATA.s4, OFFSET, PTR + 4);
2081
2082#define vstore_partial_6(DATA, OFFSET, PTR)    \
2083    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2084    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2085
2086#define vstore_partial_7(DATA, OFFSET, PTR)    \
2087    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2088    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2089
2090#define vstore_partial_8(DATA, OFFSET, PTR) \
2091    vstore8(DATA.s01234567, OFFSET, PTR);
2092
2093#define vstore_partial_9(DATA, OFFSET, PTR)        \
2094    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2095    vstore1(DATA.s8, OFFSET, PTR + 8);
2096
2097#define vstore_partial_10(DATA, OFFSET, PTR)       \
2098    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2099    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2100
2101#define vstore_partial_11(DATA, OFFSET, PTR)       \
2102    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2103    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2104
2105#define vstore_partial_12(DATA, OFFSET, PTR)       \
2106    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2107    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2108
2109#define vstore_partial_13(DATA, OFFSET, PTR)       \
2110    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2111    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2112
2113#define vstore_partial_14(DATA, OFFSET, PTR)       \
2114    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2115    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2116
2117#define vstore_partial_15(DATA, OFFSET, PTR)       \
2118    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2119    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2120
2121#define vstore_partial_16(DATA, OFFSET, PTR) \
2122    vstore16(DATA, OFFSET, PTR);
2123
2124
2125
2126
2127
2128#define convert_float_sat convert_float
2129#define convert_float1_sat convert_float
2130#define convert_float2_sat convert_float2
2131#define convert_float3_sat convert_float3
2132#define convert_float4_sat convert_float4
2133#define convert_float8_sat convert_float8
2134#define convert_float16_sat convert_float16
2135#define convert_half_sat convert_float
2136#define convert_half1_sat convert_half
2137#define convert_half2_sat convert_half2
2138#define convert_half3_sat convert_half3
2139#define convert_half4_sat convert_half4
2140#define convert_half8_sat convert_half8
2141#define convert_half16_sat convert_half16
2142
2143#define convert_float1 convert_float
2144#define convert_half1 convert_half
2145#define convert_char1 convert_char
2146#define convert_uchar1 convert_uchar
2147#define convert_short1 convert_short
2148#define convert_ushort1 convert_ushort
2149#define convert_int1 convert_int
2150#define convert_uint1 convert_uint
2151#define convert_long1 convert_long
2152#define convert_ulong1 convert_ulong
2153#define convert_double1 convert_double
2154
2155#define convert_char1_sat convert_char_sat
2156#define convert_uchar1_sat convert_uchar_sat
2157#define convert_uchar2_sat convert_uchar2_sat
2158#define convert_uchar3_sat convert_uchar3_sat
2159#define convert_uchar4_sat convert_uchar4_sat
2160#define convert_uchar8_sat convert_uchar8_sat
2161#define convert_uchar16_sat convert_uchar16_sat
2162#define convert_short1_sat convert_short_sat
2163#define convert_ushort1_sat convert_ushort_sat
2164#define convert_int1_sat convert_int_sat
2165#define convert_uint1_sat convert_uint_sat
2166#define convert_long1_sat convert_long_sat
2167#define convert_ulong1_sat convert_ulong_sat
2168#define convert_double1_sat convert_double_sat
2169
2170#define VEC_DATA_TYPE_STR(type, size) type##size
2171#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2172
2173#define CONVERT_STR(x, type) (convert_##type((x)))
2174#define CONVERT(x, type) CONVERT_STR(x, type)
2175
2176#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2177#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2178
2179#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2180#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2181
2182#define select_vec_dt_uchar(size) uchar##size
2183#define select_vec_dt_char(size) char##size
2184#define select_vec_dt_ushort(size) ushort##size
2185#define select_vec_dt_short(size) short##size
2186#define select_vec_dt_half(size) short##size
2187#define select_vec_dt_uint(size) uint##size
2188#define select_vec_dt_int(size) int##size
2189#define select_vec_dt_float(size) int##size
2190#define select_vec_dt_ulong(size) ulong##size
2191#define select_vec_dt_long(size) long##size
2192
2193#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2194#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2195#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2196
2197#define signed_int_vec_dt_uchar(size) char##size
2198#define signed_int_vec_dt_char(size) char##size
2199#define signed_int_vec_dt_ushort(size) short##size
2200#define signed_int_vec_dt_short(size) short##size
2201#define signed_int_vec_dt_half(size) short##size
2202#define signed_int_vec_dt_uint(size) int##size
2203#define signed_int_vec_dt_int(size) int##size
2204#define signed_int_vec_dt_float(size) int##size
2205#define signed_int_vec_dt_ulong(size) long##size
2206#define signed_int_vec_dt_long(size) long##size
2207
2208#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2209#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2210#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2211
2212#define sum_reduce_1(x) (x)
2213#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2214#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2215#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2216#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2217#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2218
2219#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2220#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2221
2222#define prod_reduce_1(x) (x)
2223#define prod_reduce_2(x) ((x).s0) * ((x).s1)
2224#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
2225#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
2226#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
2227#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
2228
2229#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
2230#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2231
2232#define max_reduce_1(x) (x)
2233#define max_reduce_2(x) max(((x).s0), ((x).s1))
2234#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2235#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2236#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2237#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2238
2239#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2240#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2241
2242#define VECTOR_DECLARATION(name)     \
2243    __global uchar *name##_ptr,      \
2244    uint        name##_stride_x, \
2245    uint        name##_step_x,   \
2246    uint        name##_offset_first_element_in_bytes
2247
2248#define IMAGE_DECLARATION(name)      \
2249    __global uchar *name##_ptr,      \
2250    uint        name##_stride_x, \
2251    uint        name##_step_x,   \
2252    uint        name##_stride_y, \
2253    uint        name##_step_y,   \
2254    uint        name##_offset_first_element_in_bytes
2255
2256#define TENSOR3D_DECLARATION(name)   \
2257    __global uchar *name##_ptr,      \
2258    uint        name##_stride_x, \
2259    uint        name##_step_x,   \
2260    uint        name##_stride_y, \
2261    uint        name##_step_y,   \
2262    uint        name##_stride_z, \
2263    uint        name##_step_z,   \
2264    uint        name##_offset_first_element_in_bytes
2265
2266#define TENSOR4D_DECLARATION(name)   \
2267    __global uchar *name##_ptr,      \
2268    uint        name##_stride_x, \
2269    uint        name##_step_x,   \
2270    uint        name##_stride_y, \
2271    uint        name##_step_y,   \
2272    uint        name##_stride_z, \
2273    uint        name##_step_z,   \
2274    uint        name##_stride_w, \
2275    uint        name##_step_w,   \
2276    uint        name##_offset_first_element_in_bytes
2277
2278#define TENSOR5D_DECLARATION(name)   \
2279    __global uchar *name##_ptr,      \
2280    uint        name##_stride_x, \
2281    uint        name##_step_x,   \
2282    uint        name##_stride_y, \
2283    uint        name##_step_y,   \
2284    uint        name##_stride_z, \
2285    uint        name##_step_z,   \
2286    uint        name##_stride_w, \
2287    uint        name##_step_w,   \
2288    uint        name##_stride_v, \
2289    uint        name##_step_v,   \
2290    uint        name##_offset_first_element_in_bytes
2291
2292#define CONVERT_TO_VECTOR_STRUCT(name) \
2293    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2294
2295#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2296    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2297
2298#define CONVERT_TO_IMAGE_STRUCT(name) \
2299    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2300
2301#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2302    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2303
2304#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2305    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2306
2307#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2308    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2309
2310#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2311    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2312
2313#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2314    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2315                                 name##_stride_z, name##_step_z)
2316
2317#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2318    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2319
2320#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2321    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2322                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2323
2324#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2325    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2326
2327#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2328    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2329                           name##_stride_z, name##_step_z)
2330
2331
2332typedef struct Vector
2333{
2334    __global uchar *ptr;
2335    int             offset_first_element_in_bytes;
2336    int             stride_x;
2337} Vector;
2338
2339
2340typedef struct Image
2341{
2342    __global uchar *ptr;
2343    int             offset_first_element_in_bytes;
2344    int             stride_x;
2345    int             stride_y;
2346} Image;
2347
2348
2349typedef struct Tensor3D
2350{
2351    __global uchar *ptr;
2352    int             offset_first_element_in_bytes;
2353    int             stride_x;
2354    int             stride_y;
2355    int             stride_z;
2356} Tensor3D;
2357
2358
2359typedef struct Tensor4D
2360{
2361    __global uchar *ptr;
2362    int             offset_first_element_in_bytes;
2363    int             stride_x;
2364    int             stride_y;
2365    int             stride_z;
2366    int             stride_w;
2367} Tensor4D;
2368
2369
2370inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2371{
2372    Vector vector =
2373    {
2374        .ptr                           = ptr,
2375        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2376        .stride_x                      = stride_x,
2377    };
2378    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2379    return vector;
2380}
2381
2382
2383inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2384{
2385    Image img =
2386    {
2387        .ptr                           = ptr,
2388        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2389        .stride_x                      = stride_x,
2390        .stride_y                      = stride_y
2391    };
2392    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2393    return img;
2394}
2395
2396
2397inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2398{
2399    Image img =
2400    {
2401        .ptr                           = ptr,
2402        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2403        .stride_x                      = stride_x,
2404        .stride_y                      = stride_y
2405    };
2406    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2407    return img;
2408}
2409
2410
2411inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2412{
2413    Tensor3D tensor =
2414    {
2415        .ptr                           = ptr,
2416        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2417        .stride_x                      = stride_x,
2418        .stride_y                      = stride_y,
2419        .stride_z                      = stride_z
2420    };
2421    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2422    return tensor;
2423}
2424
2425
2426inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2427{
2428    Tensor3D tensor =
2429    {
2430        .ptr                           = ptr,
2431        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2432        .stride_x                      = stride_x,
2433        .stride_y                      = stride_y,
2434        .stride_z                      = stride_z
2435    };
2436    return tensor;
2437}
2438
2439inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2440                                             uint step_w,
2441                                             uint mod_size)
2442{
2443    Tensor4D tensor =
2444    {
2445        .ptr                           = ptr,
2446        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2447        .stride_x                      = stride_x,
2448        .stride_y                      = stride_y,
2449        .stride_z                      = stride_z,
2450        .stride_w                      = stride_w
2451    };
2452
2453    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2454    return tensor;
2455}
2456
2457
2458inline __global const uchar *vector_offset(const Vector *vec, int x)
2459{
2460    return vec->ptr + x * vec->stride_x;
2461}
2462
2463
2464inline __global uchar *offset(const Image *img, int x, int y)
2465{
2466    return img->ptr + x * img->stride_x + y * img->stride_y;
2467}
2468
2469
2470inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2471{
2472    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2473}
2474
2475
2476inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2477{
2478    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2479}
2480
2481
2482inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2483{
2484    uint num_elements = width * height;
2485
2486    const uint z = index / num_elements;
2487
2488    index %= num_elements;
2489
2490    const uint y = index / width;
2491
2492    index %= width;
2493
2494    const uint x = index;
2495
2496    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2497}
2498
2499#endif
2500
2501#if GPU_ARCH == GPU_ARCH_BIFROST
2502#define MLA(a, b, c) (fma(c, b, a))
2503#else
2504#define MLA(a, b, c) ((b) * (c) + (a))
2505#endif
2506
2507
2508#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
2509
2510
2511#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
2512
2513
2514#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
2515
2516
2517#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
2518
2519
2520#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
2521
2522
2523#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
2524
2525
2526#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
2527
2528
2529#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
2530
2531
2532#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
2533
2534
2535#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
2536
2537
2538#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
2539
2540
2541#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
2542
2543
2544#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
2545
2546
2547#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
2548
2549
2550#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
2551
2552#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
2553
2554#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
2555
2556
2557__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
2558#ifndef IN_PLACE
2559                                            TENSOR3D_DECLARATION(output),
2560#endif
2561                                            VECTOR_DECLARATION(mean),
2562                                            VECTOR_DECLARATION(var),
2563#ifndef USE_DEFAULT_BETA
2564                                            VECTOR_DECLARATION(beta),
2565#endif
2566#ifndef USE_DEFAULT_GAMMA
2567                                            VECTOR_DECLARATION(gamma),
2568#endif
2569                                            float epsilon)
2570{
2571    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
2572#ifdef IN_PLACE
2573    Tensor3D out = in;
2574#else
2575    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
2576#endif
2577    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
2578    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
2579#ifndef USE_DEFAULT_BETA
2580    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
2581#endif
2582#ifndef USE_DEFAULT_GAMMA
2583    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
2584#endif
2585
2586    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2587    data = 0;
2588    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2589    denominator = 0;
2590    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2591    numerator = 0;
2592    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2593    x_bar = 0;
2594    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2595    res = 0;
2596
2597    const int current_slice = get_global_id(2);
2598
2599    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
2600    denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
2601    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
2602
2603
2604    numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
2605    numerator = SUB_OP(data, numerator);
2606    x_bar     = MUL_OP(numerator, denominator);
2607
2608#ifndef USE_DEFAULT_GAMMA
2609    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2610    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
2611
2612    res = MUL_OP(gamma_vec, x_bar);
2613#else
2614
2615    res = x_bar;
2616#endif
2617
2618#ifndef USE_DEFAULT_BETA
2619    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
2620    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
2621
2622    res = ADD_OP(res, beta_vec);
2623#endif
2624
2625    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
2626
2627    VSTORE(VEC_SIZE)
2628    (res, 0, (__global DATA_TYPE *)out.ptr);
2629}
2630#endif  )"