xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/nhwc/pooling_layer.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2
3#ifndef ARM_COMPUTE_HELPER_H
4#define ARM_COMPUTE_HELPER_H
5
6
7
8
/** Row-by-row vector store helpers (STORE_ROW_1 .. STORE_ROW_16).
 *
 * STORE_ROW_n expands to n consecutive VSTORE(N0) statements. The statement for
 * row i stores BASENAME##i, with element offset 0, to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Rows 10..15 use the hexadecimal suffixes A..F for both BASENAME and Z.
 * Every STORE_ROW_n (n > 1) first expands STORE_ROW_(n-1) and then appends the
 * store of its own last row. Each generated statement already ends with ';'.
 * VSTORE is not defined in this part of the file.
 *
 * @param[in] N0        Vector width forwarded to VSTORE
 * @param[in] DATA_TYPE Element type used for the destination pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME##0, BASENAME##1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...)
 */
9#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
10    VSTORE(N0)                                                 \
11    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
12
13#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
14    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
15    VSTORE(N0)                                                 \
16    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
17
18#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
19    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
20    VSTORE(N0)                                                 \
21    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
22
23#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
24    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
25    VSTORE(N0)                                                 \
26    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
27
28#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
29    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
30    VSTORE(N0)                                                 \
31    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
32
33#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
34    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
35    VSTORE(N0)                                                 \
36    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
37
38#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
39    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
40    VSTORE(N0)                                                 \
41    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
42
43#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
44    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
45    VSTORE(N0)                                                 \
46    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
47
48#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
49    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
50    VSTORE(N0)                                                 \
51    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
52
53#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
54    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
55    VSTORE(N0)                                                  \
56    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
57
58#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
59    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
60    VSTORE(N0)                                                  \
61    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
62
63#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
64    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
65    VSTORE(N0)                                                  \
66    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
67
68#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
69    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
70    VSTORE(N0)                                                  \
71    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
72
73#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
74    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
75    VSTORE(N0)                                                  \
76    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
77
78#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
79    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
80    VSTORE(N0)                                                  \
81    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
82
83#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
84    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
85    VSTORE(N0)                                                  \
86    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
87
88
89
/** Row-by-row converting vector store helpers (CONVERT_STORE_ROW_1 .. CONVERT_STORE_ROW_16).
 *
 * CONVERT_STORE_ROW_n expands to n consecutive VSTORE(N0) statements. The statement
 * for row i stores CONVERT_SAT(BASENAME##i, VEC_DATA_TYPE(DATA_TYPE, N0)), with element
 * offset 0, to (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Rows 10..15 use the hexadecimal suffixes A..F for both BASENAME and Z.
 * Every CONVERT_STORE_ROW_n (n > 1) first expands CONVERT_STORE_ROW_(n-1) and then
 * appends the store of its own last row. Each generated statement already ends with ';'.
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are not defined in this part of the file.
 *
 * Fix: CONVERT_STORE_ROW_10 used to name its second parameter DATA while its body
 * referred to DATA_TYPE, so the caller's type was dropped for rows 0..9 and any
 * file-level DATA_TYPE macro was used instead. The parameter is now DATA_TYPE,
 * matching every other macro of the family.
 *
 * @param[in] N0        Vector width forwarded to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Destination element type (conversion target and pointer cast)
 * @param[in] BASENAME  Prefix of the row variables (BASENAME##0, BASENAME##1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...)
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* Second parameter was previously misspelled as DATA; see the family comment above. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168
169
170
171
/** Store a block of M0 rows by dispatching to STORE_ROW_##M0.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that macro arguments (e.g. M0) are
 * expanded before STORE_BLOCK_STR pastes M0 onto STORE_ROW_.
 * The remaining arguments are forwarded unchanged to STORE_ROW_n.
 */
172#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
173#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
174
175
176
/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_##M0.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR so that macro arguments
 * (e.g. M0) are expanded before M0 is pasted onto CONVERT_STORE_ROW_.
 * The remaining arguments are forwarded unchanged to CONVERT_STORE_ROW_n.
 */
177#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
178#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179
180
181
/** Row-by-row partial vector store helpers (STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_16).
 *
 * STORE_ROW_PARTIAL_n expands to n consecutive VSTORE_PARTIAL(N0, STORE_N0) statements.
 * The statement for row i stores BASENAME##i, with element offset 0, to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Rows 10..15 use the hexadecimal suffixes A..F for both BASENAME and Z.
 * Every STORE_ROW_PARTIAL_n (n > 1) first expands STORE_ROW_PARTIAL_(n-1) and then
 * appends the store of its own last row. Each generated statement already ends with ';'.
 * VSTORE_PARTIAL is not defined in this part of the file.
 *
 * @param[in] N0        First argument of VSTORE_PARTIAL (presumably the full vector width - confirm)
 * @param[in] STORE_N0  Second argument of VSTORE_PARTIAL (presumably the number of elements to write - confirm)
 * @param[in] DATA_TYPE Element type used for the destination pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME##0, BASENAME##1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...)
 */
182#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
183    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
184    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
185
186#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
187    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
188    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
189    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
190
191#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
192    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
193    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
194    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
195
196#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
197    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
198    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
199    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
200
201#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
202    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
203    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
204    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
205
206#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
207    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
208    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
209    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
210
211#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
212    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
213    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
214    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
215
216#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
217    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
218    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
219    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
220
221#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
222    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
223    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
224    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
225
226#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
227    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
228    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
229    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
230
231#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
232    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
233    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
234    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
235
236#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
237    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
238    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
239    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
240
241#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
242    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
243    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
244    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
245
246#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
247    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
248    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
249    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
250
251#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
252    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
253    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
254    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
255
256#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
257    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
258    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
259    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
260
261
262
/** Partially store a block by dispatching to STORE_ROW_PARTIAL_##STORE_M0.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that macro arguments
 * are expanded before STORE_M0 is pasted onto STORE_ROW_PARTIAL_.
 * Note the argument order changes on dispatch: this macro takes
 * (STORE_M0, STORE_N0, N0, ...) while STORE_ROW_PARTIAL_n receives (N0, STORE_N0, ...).
 */
263#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
264#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
265
/** Store a block, selecting at run time between full and partial sizes in both x and y.
 *
 * Expands to an if / else-if chain over the two conditions:
 *  - neither condition true:      STORE_BLOCK_PARTIAL(M0, N0, ...)
 *  - only PARTIAL_COND_Y true:    STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, ...)
 *  - only PARTIAL_COND_X true:    STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, ...)
 *  - both true:                   STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, ...)
 * The expansion is a bare if/else statement (not wrapped in do { } while(0)).
 *
 * @param[in] M0, N0                               Full block height / vector width
 * @param[in] DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_M0                     Row count used when PARTIAL_COND_Y is true
 * @param[in] PARTIAL_STORE_N0                     Store width used when PARTIAL_COND_X is true
 * @param[in] PARTIAL_COND_Y                       Condition selecting the partial row count
 * @param[in] PARTIAL_COND_X                       Condition selecting the partial store width
 */
266#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
267    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
268    {                                                                                                                                                     \
269        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
270    }                                                                                                                                                     \
271    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
272    {                                                                                                                                                     \
273        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
274    }                                                                                                                                                     \
275    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
276    {                                                                                                                                                     \
277        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
278    }                                                                                                                                                     \
279    else                                                                                                                                                  \
280    {                                                                                                                                                     \
281        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
282    }
283
/** Store a block, selecting at run time between the full and the partial width in x.
 *
 * When PARTIAL_COND_X is false, expands STORE_BLOCK_PARTIAL(M0, N0, N0, ...);
 * otherwise STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, ...). All M0 rows are
 * written in both branches. The expansion is a bare if/else statement.
 *
 * @param[in] M0, N0                               Block height / vector width
 * @param[in] DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_N0                     Store width used when PARTIAL_COND_X is true
 * @param[in] PARTIAL_COND_X                       Condition selecting the partial store width
 */
284#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
285    if(!(PARTIAL_COND_X))                                                                                         \
286    {                                                                                                             \
287        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
288    }                                                                                                             \
289    else                                                                                                          \
290    {                                                                                                             \
291        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
292    }
293
/** Store a block, selecting at run time between the full and the partial row count in y.
 *
 * When PARTIAL_COND_Y is false, expands STORE_BLOCK_PARTIAL(M0, N0, N0, ...);
 * otherwise STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, ...). The full width N0
 * is written in both branches. The expansion is a bare if/else statement.
 *
 * @param[in] M0, N0                               Block height / vector width
 * @param[in] DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_M0                     Row count used when PARTIAL_COND_Y is true
 * @param[in] PARTIAL_COND_Y                       Condition selecting the partial row count
 */
294#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
295    if(!(PARTIAL_COND_Y))                                                                                         \
296    {                                                                                                             \
297        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
298    }                                                                                                             \
299    else                                                                                                          \
300    {                                                                                                             \
301        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
302    }
303
304
/** Boundary-aware block store, specialised at preprocessing time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * build-time macros. Their values pick one of four implementations:
 *  - M0 == 0 and N0 == 0: plain STORE_BLOCK (the partial arguments and conditions are ignored)
 *  - M0 >  0 and N0 == 0: STORE_BLOCK_PARTIAL_IN_Y
 *  - M0 == 0 and N0 >  0: STORE_BLOCK_PARTIAL_IN_X
 *  - otherwise:           STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * (M0 / N0 above refer to the build-time PARTIAL_STORE_M0 / PARTIAL_STORE_N0 values.)
 * The macro parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow the
 * build-time macros inside the definition and are forwarded as given by the caller.
 */
305#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
306
307
308#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
309
310#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
311    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
312
313#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
314
315#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
316    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
317
318#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
319
320#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
321    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
322
323#else
324
325#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
326    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
327
328#endif
329
330#endif
331
332
/** Compute the first row handled by work-item y when rows are processed in blocks of M0.
 *
 * When PARTIAL_STORE_M0 is defined at build time, the result is
 *   max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * i.e. the nominal start row y * M0 moved back by (M0 - PARTIAL_STORE_M0) % M0 and
 * clamped at 0. Otherwise the result is simply y * M0. Both forms are cast to uint.
 *
 * Every macro argument is parenthesized in the expansion so that expression
 * arguments (for example "a + b" for y) keep their intended grouping.
 *
 * @param[in] y                Block index along the row dimension
 * @param[in] M0               Number of rows per block
 * @param[in] PARTIAL_STORE_M0 Partial row count; unused when the build-time macro of
 *                             the same name is not defined
 *
 * @return The start row as a uint
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
341
342
343
/** Store a single vector, writing either the full width or a leftover width.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with M0 = 1, STRIDE_Y = 0 and Z = 0:
 * when cond is false, vec_size elements are stored; when cond is true, leftover
 * elements are stored. Because M0 is 1 only row 0 is written, so the stored
 * variable is basename##0, and Z##0 pastes to the literal 00 (zero).
 *
 * @param[in] basename  Prefix of the vector variable (the variable used is basename##0)
 * @param[in] data_type Element type used for the destination pointer cast
 * @param[in] ptr       Destination address
 * @param[in] vec_size  Full vector width
 * @param[in] leftover  Store width used when cond is true
 * @param[in] cond      Condition selecting the leftover width
 */
344#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
345    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
346
347
/* Optional OpenCL extensions. Each one is enabled only when both the corresponding
 * ARM_COMPUTE_* build flag and the extension's own feature macro are defined. */

/* Half-precision floating point (cl_khr_fp16). */
348#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
349#pragma OPENCL EXTENSION cl_khr_fp16 : enable
350#endif
351
/* Arm 8-bit integer dot product. */
352#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
353#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
354#endif
355
/* Arm 8-bit integer dot product with accumulation. */
356#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
357#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
358#endif
359
/* Arm printf extension, only for debug builds (ARM_COMPUTE_DEBUG_ENABLED). */
360#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
361#pragma OPENCL EXTENSION cl_arm_printf : enable
362#endif
363
/* Numeric identifiers for GPU architectures (Midgard, Bifrost, Valhall).
 * They are not used in this part of the file; presumably compared against a
 * build-time architecture selector elsewhere - confirm against the users. */
364#define GPU_ARCH_MIDGARD 0x100
365#define GPU_ARCH_BIFROST 0x200
366#define GPU_ARCH_VALHALL 0x300
367
368
/** Paste the two tokens a and b together. The arguments are pasted as written:
 * macros passed as a or b are not expanded first. */
369#define CONCAT(a, b) a##b
370
371
/** Expand to the argument itself; useful to force one extra round of macro expansion. */
372#define EXPAND(x) x
373
374
/** Clamp x to the range [min_val, max_val] using min(max(x, min_val), max_val). */
375#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
376
377
/** Reverse the element order of a vector.
 *
 * REVs(x) is a swizzle that lists the s components of x from last to first
 * (REV1 returns x unchanged). REVERSE(x, s) forwards to REVERSE_STR so that s is
 * macro-expanded before being pasted onto REV. Supported sizes: 1, 2, 3, 4, 8, 16.
 */
378#define REV1(x) ((x))
379#define REV2(x) ((x).s10)
380#define REV3(x) ((x).s210)
381#define REV4(x) ((x).s3210)
382#define REV8(x) ((x).s76543210)
383#define REV16(x) ((x).sFEDCBA9876543210)
384
385
386
387#define REVERSE_STR(x, s) REV##s((x))
388#define REVERSE(x, s) REVERSE_STR(x, s)
389
390
391
/** Rotate the elements of a vector.
 *
 * ROTs_n(x) is a swizzle of the s-element vector x in which result element i is
 * taken from element (i - n) mod s of x, i.e. elements move n positions towards
 * higher indices and wrap around. n == 0 and n == s both return x unchanged.
 * ROTATE(x, s, n) forwards to ROTATE_STR so that s and n are macro-expanded
 * before being pasted into the ROTs_n name. Supported sizes: 1, 2, 3, 4, 8, 16.
 */
392#define ROT1_0(x) ((x))
393#define ROT1_1(x) ((x))
394
395#define ROT2_0(x) ((x))
396#define ROT2_1(x) ((x).s10)
397#define ROT2_2(x) ((x))
398
399#define ROT3_0(x) ((x))
400#define ROT3_1(x) ((x).s201)
401#define ROT3_2(x) ((x).s120)
402#define ROT3_3(x) ((x))
403
404#define ROT4_0(x) ((x))
405#define ROT4_1(x) ((x).s3012)
406#define ROT4_2(x) ((x).s2301)
407#define ROT4_3(x) ((x).s1230)
408#define ROT4_4(x) ((x))
409
410#define ROT8_0(x) ((x))
411#define ROT8_1(x) ((x).s70123456)
412#define ROT8_2(x) ((x).s67012345)
413#define ROT8_3(x) ((x).s56701234)
414#define ROT8_4(x) ((x).s45670123)
415#define ROT8_5(x) ((x).s34567012)
416#define ROT8_6(x) ((x).s23456701)
417#define ROT8_7(x) ((x).s12345670)
418#define ROT8_8(x) ((x))
419
420#define ROT16_0(x) ((x))
421#define ROT16_1(x) ((x).sF0123456789ABCDE)
422#define ROT16_2(x) ((x).sEF0123456789ABCD)
423#define ROT16_3(x) ((x).sDEF0123456789ABC)
424#define ROT16_4(x) ((x).sCDEF0123456789AB)
425#define ROT16_5(x) ((x).sBCDEF0123456789A)
426#define ROT16_6(x) ((x).sABCDEF0123456789)
427#define ROT16_7(x) ((x).s9ABCDEF012345678)
428#define ROT16_8(x) ((x).s89ABCDEF01234567)
429#define ROT16_9(x) ((x).s789ABCDEF0123456)
430#define ROT16_10(x) ((x).s6789ABCDEF012345)
431#define ROT16_11(x) ((x).s56789ABCDEF01234)
432#define ROT16_12(x) ((x).s456789ABCDEF0123)
433#define ROT16_13(x) ((x).s3456789ABCDEF012)
434#define ROT16_14(x) ((x).s23456789ABCDEF01)
435#define ROT16_15(x) ((x).s123456789ABCDEF0)
436#define ROT16_16(x) ((x))
437
438
439
440#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
441#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
442
443
444
/** Build a vector literal holding the increasing sequence 0, 1, ..., s - 1.
 *
 * V_OFFSs(dt) pastes the size onto the type name dt (dt##s) and initialises the
 * resulting vector type with 0 .. s-1. VEC_OFFS(dt, s) forwards to VEC_OFFS_STR so
 * that s is macro-expanded before being pasted onto V_OFFS.
 * Supported sizes: 1, 2, 3, 4, 8, 16. Note that size 1 uses the type name dt##1.
 */
445#define V_OFFS1(dt) (dt##1)(0)
446#define V_OFFS2(dt) (dt##2)(0, 1)
447#define V_OFFS3(dt) (dt##3)(0, 1, 2)
448#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
449#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
450#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
451
452
453
454#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
455#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
456
457
/** Produce the name vload<size> (e.g. VLOAD(4) -> vload4).
 * VLOAD forwards to VLOAD_STR so that size is macro-expanded before pasting. */
458#define VLOAD_STR(size) vload##size
459#define VLOAD(size) VLOAD_STR(size)
460
461
/** Produce the name vload_partial_<size>_<load_size>, which the alias table below
 * maps to a concrete partial-load helper or to NO_LOAD.
 * VLOAD_PARTIAL forwards to VLOAD_PARTIAL_STR so that both arguments are
 * macro-expanded before pasting. */
462#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
463#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
464
/** Placeholder for unsupported or empty partial loads: expands to an empty block
 * and ignores all three arguments, so nothing is read or written. */
465#define NO_LOAD(data, offs, ptr) \
466    {                            \
467    }
468
469
/* Alias table vload_partial_<size>_<load_size>, selected through VLOAD_PARTIAL(size, load_size).
 *
 * For vector sizes 2, 3, 4, 8 and 16: load sizes 1..size map to the helper
 * vload_partial_<load_size>; load size 0 and any load size larger than the vector
 * size map to NO_LOAD (nothing is loaded).
 *
 * For vector size 1 only load size 1 is mapped, and it maps to vload1.
 * NOTE(review): every other entry names a three-argument (DATA, OFFSET, PTR) helper,
 * while vload1 is invoked with two arguments (OFFSET, PTR) elsewhere in this file;
 * its definition is not visible here - confirm that vload_partial_1_1 can be called
 * with the same three arguments as the other entries.
 */
470#define vload_partial_1_0 NO_LOAD
471#define vload_partial_1_1 vload1
472#define vload_partial_1_2 NO_LOAD
473#define vload_partial_1_3 NO_LOAD
474#define vload_partial_1_4 NO_LOAD
475#define vload_partial_1_5 NO_LOAD
476#define vload_partial_1_6 NO_LOAD
477#define vload_partial_1_7 NO_LOAD
478#define vload_partial_1_8 NO_LOAD
479#define vload_partial_1_9 NO_LOAD
480#define vload_partial_1_10 NO_LOAD
481#define vload_partial_1_11 NO_LOAD
482#define vload_partial_1_12 NO_LOAD
483#define vload_partial_1_13 NO_LOAD
484#define vload_partial_1_14 NO_LOAD
485#define vload_partial_1_15 NO_LOAD
486#define vload_partial_1_16 NO_LOAD
487
/* Vector size 2 */
488#define vload_partial_2_0 NO_LOAD
489#define vload_partial_2_1 vload_partial_1
490#define vload_partial_2_2 vload_partial_2
491#define vload_partial_2_3 NO_LOAD
492#define vload_partial_2_4 NO_LOAD
493#define vload_partial_2_5 NO_LOAD
494#define vload_partial_2_6 NO_LOAD
495#define vload_partial_2_7 NO_LOAD
496#define vload_partial_2_8 NO_LOAD
497#define vload_partial_2_9 NO_LOAD
498#define vload_partial_2_10 NO_LOAD
499#define vload_partial_2_11 NO_LOAD
500#define vload_partial_2_12 NO_LOAD
501#define vload_partial_2_13 NO_LOAD
502#define vload_partial_2_14 NO_LOAD
503#define vload_partial_2_15 NO_LOAD
504#define vload_partial_2_16 NO_LOAD
505
/* Vector size 3 */
506#define vload_partial_3_0 NO_LOAD
507#define vload_partial_3_1 vload_partial_1
508#define vload_partial_3_2 vload_partial_2
509#define vload_partial_3_3 vload_partial_3
510#define vload_partial_3_4 NO_LOAD
511#define vload_partial_3_5 NO_LOAD
512#define vload_partial_3_6 NO_LOAD
513#define vload_partial_3_7 NO_LOAD
514#define vload_partial_3_8 NO_LOAD
515#define vload_partial_3_9 NO_LOAD
516#define vload_partial_3_10 NO_LOAD
517#define vload_partial_3_11 NO_LOAD
518#define vload_partial_3_12 NO_LOAD
519#define vload_partial_3_13 NO_LOAD
520#define vload_partial_3_14 NO_LOAD
521#define vload_partial_3_15 NO_LOAD
522#define vload_partial_3_16 NO_LOAD
523
/* Vector size 4 */
524#define vload_partial_4_0 NO_LOAD
525#define vload_partial_4_1 vload_partial_1
526#define vload_partial_4_2 vload_partial_2
527#define vload_partial_4_3 vload_partial_3
528#define vload_partial_4_4 vload_partial_4
529#define vload_partial_4_5 NO_LOAD
530#define vload_partial_4_6 NO_LOAD
531#define vload_partial_4_7 NO_LOAD
532#define vload_partial_4_8 NO_LOAD
533#define vload_partial_4_9 NO_LOAD
534#define vload_partial_4_10 NO_LOAD
535#define vload_partial_4_11 NO_LOAD
536#define vload_partial_4_12 NO_LOAD
537#define vload_partial_4_13 NO_LOAD
538#define vload_partial_4_14 NO_LOAD
539#define vload_partial_4_15 NO_LOAD
540#define vload_partial_4_16 NO_LOAD
541
/* Vector size 8 */
542#define vload_partial_8_0 NO_LOAD
543#define vload_partial_8_1 vload_partial_1
544#define vload_partial_8_2 vload_partial_2
545#define vload_partial_8_3 vload_partial_3
546#define vload_partial_8_4 vload_partial_4
547#define vload_partial_8_5 vload_partial_5
548#define vload_partial_8_6 vload_partial_6
549#define vload_partial_8_7 vload_partial_7
550#define vload_partial_8_8 vload_partial_8
551#define vload_partial_8_9 NO_LOAD
552#define vload_partial_8_10 NO_LOAD
553#define vload_partial_8_11 NO_LOAD
554#define vload_partial_8_12 NO_LOAD
555#define vload_partial_8_13 NO_LOAD
556#define vload_partial_8_14 NO_LOAD
557#define vload_partial_8_15 NO_LOAD
558#define vload_partial_8_16 NO_LOAD
559
/* Vector size 16 */
560#define vload_partial_16_0 NO_LOAD
561#define vload_partial_16_1 vload_partial_1
562#define vload_partial_16_2 vload_partial_2
563#define vload_partial_16_3 vload_partial_3
564#define vload_partial_16_4 vload_partial_4
565#define vload_partial_16_5 vload_partial_5
566#define vload_partial_16_6 vload_partial_6
567#define vload_partial_16_7 vload_partial_7
568#define vload_partial_16_8 vload_partial_8
569#define vload_partial_16_9 vload_partial_9
570#define vload_partial_16_10 vload_partial_10
571#define vload_partial_16_11 vload_partial_11
572#define vload_partial_16_12 vload_partial_12
573#define vload_partial_16_13 vload_partial_13
574#define vload_partial_16_14 vload_partial_14
575#define vload_partial_16_15 vload_partial_15
576#define vload_partial_16_16 vload_partial_16
577
578
/** Partial vector loads: vload_partial_n(DATA, OFFSET, PTR) fills the first n
 * components of DATA from memory at PTR and leaves the remaining components untouched.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 are a single vload<n> assigned to the matching swizzle
 * of DATA (size 16 assigns the whole of DATA). The other sizes are composed:
 *  - 5, 6, 7:    vload_partial_4 into DATA.s0123, then 1 / 2 / 3 elements from PTR + 4
 *  - 9..12:      vload_partial_8 into DATA.s01234567, then 1..4 elements from PTR + 8
 *  - 13, 14, 15: vload_partial_8 into DATA.s01234567, then vload_partial_5 / 6 / 7 applied
 *                to the upper half DATA.s89ABCDEF (which fills its first 5 / 6 / 7 components)
 *
 * Each expansion consists of complete statements ending with ';'.
 *
 * NOTE(review): the composed variants pass the same OFFSET to every sub-load while
 * advancing PTR by a fixed element count. The OpenCL vloadn built-ins scale the
 * offset by n, so the pieces presumably line up only when OFFSET is 0 - confirm
 * that callers always pass 0. vload1 is not defined in this part of the file.
 *
 * @param[in,out] DATA   Destination vector (lvalue); only the first n components are written
 * @param[in]     OFFSET Offset argument forwarded to the underlying vload<n>
 * @param[in]     PTR    Source address
 */
579#define vload_partial_1(DATA, OFFSET, PTR) \
580    DATA.s0 = vload1(OFFSET, PTR);
581
582#define vload_partial_2(DATA, OFFSET, PTR) \
583    DATA.s01 = vload2(OFFSET, PTR);
584
585#define vload_partial_3(DATA, OFFSET, PTR) \
586    DATA.s012 = vload3(OFFSET, PTR);
587
588#define vload_partial_4(DATA, OFFSET, PTR) \
589    DATA.s0123 = vload4(OFFSET, PTR);
590
591#define vload_partial_5(DATA, OFFSET, PTR)    \
592    vload_partial_4(DATA.s0123, OFFSET, PTR); \
593    DATA.s4 = vload1(OFFSET, PTR + 4);
594
595#define vload_partial_6(DATA, OFFSET, PTR)    \
596    vload_partial_4(DATA.s0123, OFFSET, PTR); \
597    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
598
599#define vload_partial_7(DATA, OFFSET, PTR)    \
600    vload_partial_4(DATA.s0123, OFFSET, PTR); \
601    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
602
603#define vload_partial_8(DATA, OFFSET, PTR) \
604    DATA.s01234567 = vload8(OFFSET, PTR);
605
606#define vload_partial_9(DATA, OFFSET, PTR)        \
607    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
608    DATA.s8 = vload1(OFFSET, PTR + 8);
609
610#define vload_partial_10(DATA, OFFSET, PTR)       \
611    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
612    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
613
614#define vload_partial_11(DATA, OFFSET, PTR)       \
615    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
616    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
617
618#define vload_partial_12(DATA, OFFSET, PTR)       \
619    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
620    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
621
622#define vload_partial_13(DATA, OFFSET, PTR)       \
623    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
624    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
625
626#define vload_partial_14(DATA, OFFSET, PTR)       \
627    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
628    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
629
630#define vload_partial_15(DATA, OFFSET, PTR)       \
631    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
632    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
633
634#define vload_partial_16(DATA, OFFSET, PTR) \
635    DATA = vload16(OFFSET, PTR);
636
637
638
/* Pixel units per vector size: a vector of 4, 8 or 16 elements corresponds to
 * 1, 2 or 4 units (one unit per four elements). */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* Maps a vector size (4, 8 or 16) to its PIXEL_UNIT<size> constant.
 * The two-level definition lets vec_size itself be a macro: it is expanded
 * before being pasted onto PIXEL_UNIT. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
646
647
/* read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally adjacent
 * texels starting at (x_coord, y_coord) with read_imagef / read_imageh and
 * pack them into a 4*n-wide vector.
 *
 * NOTE: the expansion ends with ';'. That is kept as-is because callers
 * outside this block may rely on it; as a consequence these macros cannot be
 * used in the middle of an expression.
 *
 * x_coord is parenthesized where an offset is added to it.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)((x_coord) + 1, y_coord)), read_imagef(img, (int2)((x_coord) + 2, y_coord)), read_imagef(img, (int2)((x_coord) + 3, y_coord)));

/* Half-precision variants exist only when FP16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)((x_coord) + 2 - 1, y_coord)), read_imageh(img, (int2)((x_coord) + 2, y_coord)), read_imageh(img, (int2)((x_coord) + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write the 4*n-wide
 * vector `values` as n horizontally adjacent texels starting at
 * (x_coord, y_coord). The expansion also ends with ';'. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567), write_imagef(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), write_imagef(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567), write_imageh(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), write_imageh(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));
#endif
667
668
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. The *_STR level performs the token pasting;
 * the outer level exists so that data_type and n0 are macro-expanded first. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same two-level expansion. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
675
/* VSTORE(size) names the vstore<size> function; the indirection through
 * VSTORE_STR lets `size` be a macro that is expanded before pasting. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
678
/* "Vector of size 1" spellings: <type>1 is the scalar <type>, so that names
 * built by pasting a type and a size of 1 resolve to a real type. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double
690
/* Scalar counterparts of vloadN / vstoreN.
 *
 * vload1(OFFSET, PTR)        reads the element at PTR + OFFSET.
 * vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET.
 *
 * Arguments and the whole expansion are parenthesized so that compound
 * arguments (e.g. a conditional expression) and surrounding operators cannot
 * change how the expansion is parsed.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
693
694
/* VSTORE_PARTIAL(size, store_size) names the macro
 * vstore_partial_<size>_<store_size>; both arguments are macro-expanded before
 * being pasted thanks to the *_STR indirection. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder with the same argument list as the store macros; it ignores its
 * arguments and expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
701
702
/* Dispatch table vstore_partial_<size>_<n>, selected via
 * VSTORE_PARTIAL(size, n): how to store the first n elements of a <size>-wide
 * vector.
 *
 *   n == 0 or n > size  -> NO_STORE
 *   1 <= n <= size      -> vstore_partial_<n>
 *
 * The only exception is size 1, n 1, which maps to vstore1 rather than
 * vstore_partial_1 (vstore_partial_1 accesses DATA.s0).
 */

/* size 1 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* size 2 */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* size 3 */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* size 4 */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* size 8 */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* size 16 */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
810
811
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first n lanes of the vector
 * DATA to memory at PTR.
 *
 * Each macro expands to one or more complete statements (the trailing ';' is
 * part of the expansion). Sizes 1, 2, 3, 4, 8 and 16 map to a single store;
 * the other sizes are composed of a 4- or 8-lane store followed by a store of
 * the remaining lanes at PTR + 4 or PTR + 8.
 *
 * NOTE: vstoreN scales OFFSET by N elements while the remainder store is
 * addressed relative to PTR + 4 / PTR + 8, so the composed sizes only write a
 * contiguous range when OFFSET is 0 -- presumably what callers pass; confirm.
 *
 * PTR is parenthesized wherever it takes part in pointer arithmetic, and the
 * hexadecimal lane selectors use upper case like the vload_partial_* macros.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, (PTR) + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, (PTR) + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, (PTR) + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, (PTR) + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, (PTR) + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, (PTR) + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, (PTR) + 8);

/* Sizes 13..15 hand the whole upper half (s89ABCDEF) to the 5/6/7-lane macro,
 * which then stores only the leading lanes of that half. */
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
869
870
871
872
873
/* Saturating-conversion names for floating-point destinations map to the
 * plain conversion of the same destination type, so that
 * convert_<type>_sat spellings built by token pasting resolve for float and
 * half as well.
 *
 * convert_half_sat resolves to convert_half (half destination), consistent
 * with convert_half1_sat and with the float aliases.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
888
/* Size-1 conversion names: convert_<type>1 is the scalar convert_<type>, so
 * that a conversion name pasted from a type and a vector size of 1 resolves. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double
900
/* Size-1 saturating conversion names: convert_<type>1_sat is the scalar
 * convert_<type>_sat for the integer types.
 *
 * The convert_uchar<N>_sat entries are defined as themselves and therefore
 * change nothing when expanded; they are kept so the names stay defined.
 *
 * convert_double1_sat maps to the plain convert_double: OpenCL C only provides
 * the _sat modifier for conversions to integer types, so there is no
 * convert_double_sat to forward to (same approach as the float/half aliases).
 */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double
915
/* VEC_DATA_TYPE(type, size): the type name <type><size>, e.g. float4.
 * As with the other *_STR pairs, the outer macro expands its arguments before
 * the inner one pastes them. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type): (convert_<type>((x))). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type): (convert_<type>_sat((x))). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round): (convert_<type>_sat_<round>((x))). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
927
/* select_vec_dt_<type>(size): integer vector type name associated with <type>.
 * Integer types map to themselves; half maps to short and float maps to int.
 * NOTE(review): presumably the mask type to use with select() for operands of
 * <type> -- confirm against the callers. */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

/* SELECT_VEC_DATA_TYPE(type, size): dispatch to select_vec_dt_<type>(size).
 * SELECT_DATA_TYPE(type): the same with size 1. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
942
/* signed_int_vec_dt_<type>(size): signed integer vector type name associated
 * with <type>: uchar/char -> char, ushort/short/half -> short,
 * uint/int/float -> int, ulong/long -> long. */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

/* SIGNED_INT_VEC_DATA_TYPE(type, size): dispatch to
 * signed_int_vec_dt_<type>(size).
 * SIGNED_INT_DATA_TYPE(type): the same with size 1. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
957
/* sum_reduce_<n>(x): sum of the n lanes of vector x, for n in
 * {1, 2, 3, 4, 8, 16}.
 *
 * Every expansion is a single parenthesized expression, so it can be combined
 * with other operators (e.g. 2 * SUM_REDUCE(v, 4)). The lanes are added
 * strictly left to right, lane 0 first. x is evaluated once per lane, so it
 * should be free of side effects.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (((x).s0) + ((x).s1) + ((x).s2))
#define sum_reduce_4(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3))
#define sum_reduce_8(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7))
#define sum_reduce_16(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7) + ((x).s8) + ((x).s9) + ((x).sA) + ((x).sB) + ((x).sC) + ((x).sD) + ((x).sE) + ((x).sF))

/* SUM_REDUCE(x, size): dispatch to sum_reduce_<size>(x); `size` is
 * macro-expanded before pasting. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
967
/* prod_reduce_<n>(x): product of the n lanes of vector x, for n in
 * {1, 2, 3, 4, 8, 16}.
 *
 * Every expansion is a single parenthesized expression, so it can be combined
 * with other operators (e.g. a / PROD_REDUCE(v, 2)). The lanes are multiplied
 * strictly left to right, lane 0 first. x is evaluated once per lane, so it
 * should be free of side effects.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (((x).s0) * ((x).s1) * ((x).s2))
#define prod_reduce_4(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3))
#define prod_reduce_8(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7))
#define prod_reduce_16(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7) * ((x).s8) * ((x).s9) * ((x).sA) * ((x).sB) * ((x).sC) * ((x).sD) * ((x).sE) * ((x).sF))

/* PROD_REDUCE(x, size): dispatch to prod_reduce_<size>(x); `size` is
 * macro-expanded before pasting. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
977
/* max_reduce_<n>(x): maximum over the n lanes of vector x, for n in
 * {1, 2, 3, 4, 8, 16}, built from nested max() calls on the two halves of the
 * vector (max_reduce_3 combines lanes 0-1 with lane 2). */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/* MAX_REDUCE(x, size): dispatch to max_reduce_<size>(x); `size` is
 * macro-expanded before pasting. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
987
/* <KIND>_DECLARATION(name): expands to the list of kernel parameters that
 * describe one buffer argument called `name`:
 *
 *   __global uchar *name_ptr                    base pointer
 *   uint name_stride_<d>, name_step_<d>         one pair per dimension
 *   uint name_offset_first_element_in_bytes     offset of the first element
 *
 * The variants differ only in the dimensions they list: x (vector), x y
 * (image), x y z (3D), x y z w (4D) and x y z w v (5D).
 */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
1037
/* CONVERT_*_STRUCT(name): forward the kernel parameters of buffer `name`
 * (name_ptr, name_offset_first_element_in_bytes, name_stride_*, name_step_*)
 * to the matching helper function.
 *
 * The *_NO_STEP variants pass 0 in place of step arguments. Note that
 * CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP zeroes only the x and y steps and
 * still forwards name_step_z.
 *
 * CONVERT_TO_TENSOR4D_STRUCT* additionally forward mod_size.
 */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/* Forwards to tensor3D_ptr_no_update instead of update_tensor3D_workitem_ptr. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1076
1077
1078typedef struct Vector
1079{
1080    __global uchar *ptr;
1081    int             offset_first_element_in_bytes;
1082    int             stride_x;
1083} Vector;
1084
1085
1086typedef struct Image
1087{
1088    __global uchar *ptr;
1089    int             offset_first_element_in_bytes;
1090    int             stride_x;
1091    int             stride_y;
1092} Image;
1093
1094
1095typedef struct Tensor3D
1096{
1097    __global uchar *ptr;
1098    int             offset_first_element_in_bytes;
1099    int             stride_x;
1100    int             stride_y;
1101    int             stride_z;
1102} Tensor3D;
1103
1104
1105typedef struct Tensor4D
1106{
1107    __global uchar *ptr;
1108    int             offset_first_element_in_bytes;
1109    int             stride_x;
1110    int             stride_y;
1111    int             stride_z;
1112    int             stride_w;
1113} Tensor4D;
1114
1115
1116inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1117{
1118    Vector vector =
1119    {
1120        .ptr                           = ptr,
1121        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1122        .stride_x                      = stride_x,
1123    };
1124    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1125    return vector;
1126}
1127
1128
1129inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1130{
1131    Image img =
1132    {
1133        .ptr                           = ptr,
1134        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1135        .stride_x                      = stride_x,
1136        .stride_y                      = stride_y
1137    };
1138    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1139    return img;
1140}
1141
1142
1143inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1144{
1145    Image img =
1146    {
1147        .ptr                           = ptr,
1148        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1149        .stride_x                      = stride_x,
1150        .stride_y                      = stride_y
1151    };
1152    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1153    return img;
1154}
1155
1156
1157inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1158{
1159    Tensor3D tensor =
1160    {
1161        .ptr                           = ptr,
1162        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1163        .stride_x                      = stride_x,
1164        .stride_y                      = stride_y,
1165        .stride_z                      = stride_z
1166    };
1167    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1168    return tensor;
1169}
1170
1171
1172inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1173{
1174    Tensor3D tensor =
1175    {
1176        .ptr                           = ptr,
1177        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1178        .stride_x                      = stride_x,
1179        .stride_y                      = stride_y,
1180        .stride_z                      = stride_z
1181    };
1182    return tensor;
1183}
1184
1185inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1186                                             uint step_w,
1187                                             uint mod_size)
1188{
1189    Tensor4D tensor =
1190    {
1191        .ptr                           = ptr,
1192        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1193        .stride_x                      = stride_x,
1194        .stride_y                      = stride_y,
1195        .stride_z                      = stride_z,
1196        .stride_w                      = stride_w
1197    };
1198
1199    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1200    return tensor;
1201}
1202
1203
1204inline __global const uchar *vector_offset(const Vector *vec, int x)
1205{
1206    return vec->ptr + x * vec->stride_x;
1207}
1208
1209
1210inline __global uchar *offset(const Image *img, int x, int y)
1211{
1212    return img->ptr + x * img->stride_x + y * img->stride_y;
1213}
1214
1215
1216inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1217{
1218    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1219}
1220
1221
1222inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1223{
1224    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1225}
1226
1227
1228inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1229{
1230    uint num_elements = width * height;
1231
1232    const uint z = index / num_elements;
1233
1234    index %= num_elements;
1235
1236    const uint y = index / width;
1237
1238    index %= width;
1239
1240    const uint x = index;
1241
1242    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1243}
1244
1245#endif
1246
1247#ifndef ARM_COMPUTE_REPEAT_H
1248#define ARM_COMPUTE_REPEAT_H
1249
1250
1251#ifndef ARM_COMPUTE_HELPER_H
1252#define ARM_COMPUTE_HELPER_H
1253
1254
1255
1256
/* STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z): store n rows.
 *
 * Row i (i = 0..n-1, written as a hexadecimal digit 0..F in the pasted names)
 * stores the variable BASENAME<i> with VSTORE(N0) at
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>)
 * Each macro expands to the statements of the previous one plus one more
 * store; every store statement carries its own ';'.
 *
 * PTR and STRIDE_Y are parenthesized so that compound arguments (for example a
 * stride written as a sum) keep their meaning inside the address expression.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)((PTR) + 0 * (STRIDE_Y) + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)((PTR) + 1 * (STRIDE_Y) + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)((PTR) + 2 * (STRIDE_Y) + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)((PTR) + 3 * (STRIDE_Y) + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)((PTR) + 4 * (STRIDE_Y) + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)((PTR) + 5 * (STRIDE_Y) + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)((PTR) + 6 * (STRIDE_Y) + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)((PTR) + 7 * (STRIDE_Y) + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)((PTR) + 8 * (STRIDE_Y) + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)((PTR) + 9 * (STRIDE_Y) + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)((PTR) + 10 * (STRIDE_Y) + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)((PTR) + 11 * (STRIDE_Y) + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)((PTR) + 12 * (STRIDE_Y) + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)((PTR) + 13 * (STRIDE_Y) + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)((PTR) + 14 * (STRIDE_Y) + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)((PTR) + 15 * (STRIDE_Y) + Z##F));
1335
1336
1337
/** CONVERT_STORE_ROW_1 .. CONVERT_STORE_ROW_9: convert and store the first 1 .. 9 rows.
 *
 * Same layout as the STORE_ROW_n family, except that each row variable BASENAME##i is
 * first passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) before being
 * handed to VSTORE(N0). CONVERT_SAT and VEC_DATA_TYPE are defined outside this section
 * (presumably a saturating conversion to an N0-wide vector of DATA_TYPE - confirm there).
 *
 * N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE - destination element type
 * BASENAME  - prefix of the row variables
 * PTR       - destination base address
 * STRIDE_Y  - distance between consecutive rows, in units of PTR arithmetic
 * Z         - prefix of the per-row additional offsets
 *
 * Each expansion already ends with a semicolon.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1381
/** Convert and store the first 10 rows: expands CONVERT_STORE_ROW_9, then writes row 9.
 *
 * Row 9 is BASENAME##9, passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0))
 * and stored with VSTORE(N0) at PTR + 9 * STRIDE_Y + Z##9.
 *
 * N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE - destination element type
 * BASENAME  - prefix of the row variables
 * PTR       - destination base address
 * STRIDE_Y  - distance between consecutive rows, in units of PTR arithmetic
 * Z         - prefix of the per-row additional offsets
 *
 * The second parameter has to be spelled DATA_TYPE because the replacement list uses
 * that name. With any other spelling the caller's type argument is dropped and
 * DATA_TYPE resolves to whatever it means at the expansion site.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1386
/** CONVERT_STORE_ROW_11 .. CONVERT_STORE_ROW_16: convert and store the first 11 .. 16 rows.
 *
 * CONVERT_STORE_ROW_n expands CONVERT_STORE_ROW_(n-1) and then writes one more row:
 * BASENAME##i is passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and stored
 * with VSTORE(N0) at PTR + i * STRIDE_Y + Z##i. Rows 10 .. 15 use the pasted suffixes
 * A .. F.
 *
 * N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE - destination element type
 * BASENAME  - prefix of the row variables
 * PTR       - destination base address
 * STRIDE_Y  - distance between consecutive rows, in units of PTR arithmetic
 * Z         - prefix of the per-row additional offsets
 *
 * Each expansion already ends with a semicolon.
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1416
1417
1418
1419
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before it is
 * pasted onto the STORE_ROW_ prefix. The remaining arguments are forwarded unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1422
1423
1424
/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR so that M0 is macro-expanded
 * before it is pasted onto the CONVERT_STORE_ROW_ prefix. The remaining arguments are
 * forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1427
1428
1429
/** STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_16: partially store the first 1 .. 16 rows.
 *
 * STORE_ROW_PARTIAL_n expands STORE_ROW_PARTIAL_(n-1) and then writes one more row with
 * VSTORE_PARTIAL(N0, STORE_N0). Row i is taken from BASENAME##i and written to
 * PTR + i * STRIDE_Y + Z##i; rows 10 .. 15 use the pasted suffixes A .. F.
 *
 * N0        - width of the row vectors
 * STORE_N0  - store size selector forwarded to VSTORE_PARTIAL together with N0
 * DATA_TYPE - element type used in the __global destination pointer cast
 * BASENAME  - prefix of the row variables
 * PTR       - destination base address
 * STRIDE_Y  - distance between consecutive rows, in units of PTR arithmetic
 * Z         - prefix of the per-row additional offsets
 *
 * Each expansion already ends with a semicolon.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1508
1509
1510
/** Partially store a block by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto the STORE_ROW_PARTIAL_ prefix.
 * Note the argument order: this macro takes (STORE_M0, STORE_N0, N0, ...) while the
 * row macros take (N0, STORE_N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1513
/** Store a block that may be partial along both x and y.
 *
 * Chooses one of four STORE_BLOCK_PARTIAL calls from the two conditions:
 *   neither condition set  -> M0 rows,               N0 elements per row
 *   only PARTIAL_COND_Y    -> PARTIAL_STORE_M0 rows, N0 elements per row
 *   only PARTIAL_COND_X    -> M0 rows,               PARTIAL_STORE_N0 elements per row
 *   both conditions set    -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 *
 * The expansion is a bare if / else-if chain (no enclosing braces, no trailing
 * semicolon), and the condition arguments may be evaluated more than once.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } else { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1531
/** Store a block that may be partial along x only.
 *
 * When PARTIAL_COND_X is false, all N0 elements of each of the M0 rows are stored;
 * otherwise only PARTIAL_STORE_N0 elements per row are stored. Both paths go through
 * STORE_BLOCK_PARTIAL. The expansion is a bare if / else statement.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } else { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1541
/** Store a block that may be partial along y only.
 *
 * When PARTIAL_COND_Y is false, all M0 rows are stored; otherwise only
 * PARTIAL_STORE_M0 rows are stored. Every stored row is written with its full N0
 * elements. Both paths go through STORE_BLOCK_PARTIAL. The expansion is a bare
 * if / else statement.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } else { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1551
1552
/** Boundary-aware block store, specialised at preprocessing time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined where this
 * section is preprocessed. Their values select the implementation:
 *   M0 == 0 and N0 == 0 -> plain STORE_BLOCK (no partial handling)
 *   M0 >  0 and N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   anything else       -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * All four variants share one parameter list, so call sites do not depend on which
 * variant is active; arguments a variant does not need are simply dropped.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
1579
1580
/** Compute the first row of the block with index y, for blocks of M0 rows.
 *
 * When PARTIAL_STORE_M0 is defined where this section is preprocessed, the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * so every block start is moved back by (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at
 * row 0. Otherwise the result is simply y * M0 and the third argument is unused.
 * The value is cast to uint in both variants.
 *
 * y                - block index along the row axis
 * M0               - number of rows per block
 * PARTIAL_STORE_M0 - number of rows in the partial block
 *
 * Every argument is parenthesised in the expansion so that compound expressions such
 * as a + b can be passed safely.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1589
1590
1591
/** Store one vector, either completely or only its first leftover elements.
 *
 * Thin wrapper around STORE_BLOCK_PARTIAL_IN_X for a block of a single row: M0 is 1,
 * STRIDE_Y is 0 and Z is 0. Passing 0 as Z makes the Z##0 paste in the row macros
 * produce the literal 00, i.e. no additional offset.
 *
 * basename  - prefix of the vector variable (the row macros use basename##0)
 * data_type - element type
 * ptr       - destination address
 * vec_size  - full vector width
 * leftover  - number of elements stored when cond is true
 * cond      - true selects the partial store, false the full store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1594
1595
/* Optional OpenCL extensions.
 * Each extension is enabled only when both its ARM_COMPUTE_* switch and the macro named
 * after the extension itself are defined. */

/* cl_khr_fp16 */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

/* cl_arm_integer_dot_product_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

/* cl_arm_printf, only for builds with ARM_COMPUTE_DEBUG_ENABLED */
#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
1611
/* Numeric identifiers for the GPU architecture families named in this file. */
#define GPU_ARCH_MIDGARD  0x100
#define GPU_ARCH_BIFROST  0x200
#define GPU_ARCH_VALHALL  0x300
1615
1616
/* Paste the tokens a and b together. The arguments are pasted as written, i.e. they are
 * not macro-expanded first. */
#define CONCAT(a, b) a##b

/* Expand to the argument itself; wrapping something in EXPAND forces one more round of
 * macro expansion on it. */
#define EXPAND(x) x

/* Limit x to the range [min_val, max_val] using the min and max functions. */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1624
1625
/* REVn(x): the n-element vector x with its components in reverse order, written as a
 * swizzle. REV1 returns x unchanged. */
#define REV1(x)  ((x))
#define REV2(x)  ((x).s10)
#define REV3(x)  ((x).s210)
#define REV4(x)  ((x).s3210)
#define REV8(x)  ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/* REVERSE(x, s): reverse the s-element vector x. The extra REVERSE_STR level lets s be
 * macro-expanded before it is pasted onto the REV prefix. Supported s: 1, 2, 3, 4, 8, 16. */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s)     REVERSE_STR(x, s)
1637
1638
1639
/* ROTs_n(x): the s-element vector x rotated by n positions, written as a swizzle.
 * Element i of the input ends up at position (i + n) modulo s, so n == 0 and n == s both
 * return x unchanged. */

/* s == 1 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

/* s == 2 */
#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

/* s == 3 */
#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

/* s == 4 */
#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

/* s == 8 */
#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

/* s == 16 */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n): rotate the s-element vector x by n positions. The extra ROTATE_STR
 * level lets s and n be macro-expanded before they are pasted into the ROTs_n name. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)
1690
1691
1692
/* V_OFFSn(dt): an n-element vector of type dt##n holding the values 0, 1, ..., n-1. */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): the s-element offset vector of element type dt. The extra
 * VEC_OFFS_STR level lets s be macro-expanded before it is pasted onto the V_OFFS
 * prefix. Supported s: 1, 2, 3, 4, 8, 16. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)
1704
1705
/* VLOAD(size): name of the vector load function for the given width, i.e. vload<size>.
 * The extra VLOAD_STR level lets size be macro-expanded before it is pasted. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)
1708
1709
/* VLOAD_PARTIAL(size, load_size): name of the partial load helper
 * vload_partial_<size>_<load_size>. The extra VLOAD_PARTIAL_STR level lets both
 * arguments be macro-expanded before they are pasted. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder for size combinations that load nothing: expands to an empty block and
 * discards all three arguments. */
#define NO_LOAD(data, offs, ptr) { }
1716
1717
/* Dispatch table vload_partial_<size>_<load_size>, for size in {1, 2, 3, 4, 8, 16} and
 * load_size in 0 .. 16.
 *
 * load_size == 0 and every load_size greater than size map to NO_LOAD. The remaining
 * entries map to the helper that loads load_size elements.
 *
 * NOTE(review): vload_partial_1_1 maps to vload1, which takes (OFFSET, PTR), while
 * NO_LOAD and the vload_partial_<n> helpers take (DATA, OFFSET, PTR). Confirm whether a
 * size-1 partial load is ever requested through VLOAD_PARTIAL.
 */

/* size == 1 */
#define vload_partial_1_0   NO_LOAD
#define vload_partial_1_1   vload1
#define vload_partial_1_2   NO_LOAD
#define vload_partial_1_3   NO_LOAD
#define vload_partial_1_4   NO_LOAD
#define vload_partial_1_5   NO_LOAD
#define vload_partial_1_6   NO_LOAD
#define vload_partial_1_7   NO_LOAD
#define vload_partial_1_8   NO_LOAD
#define vload_partial_1_9   NO_LOAD
#define vload_partial_1_10  NO_LOAD
#define vload_partial_1_11  NO_LOAD
#define vload_partial_1_12  NO_LOAD
#define vload_partial_1_13  NO_LOAD
#define vload_partial_1_14  NO_LOAD
#define vload_partial_1_15  NO_LOAD
#define vload_partial_1_16  NO_LOAD

/* size == 2 */
#define vload_partial_2_0   NO_LOAD
#define vload_partial_2_1   vload_partial_1
#define vload_partial_2_2   vload_partial_2
#define vload_partial_2_3   NO_LOAD
#define vload_partial_2_4   NO_LOAD
#define vload_partial_2_5   NO_LOAD
#define vload_partial_2_6   NO_LOAD
#define vload_partial_2_7   NO_LOAD
#define vload_partial_2_8   NO_LOAD
#define vload_partial_2_9   NO_LOAD
#define vload_partial_2_10  NO_LOAD
#define vload_partial_2_11  NO_LOAD
#define vload_partial_2_12  NO_LOAD
#define vload_partial_2_13  NO_LOAD
#define vload_partial_2_14  NO_LOAD
#define vload_partial_2_15  NO_LOAD
#define vload_partial_2_16  NO_LOAD

/* size == 3 */
#define vload_partial_3_0   NO_LOAD
#define vload_partial_3_1   vload_partial_1
#define vload_partial_3_2   vload_partial_2
#define vload_partial_3_3   vload_partial_3
#define vload_partial_3_4   NO_LOAD
#define vload_partial_3_5   NO_LOAD
#define vload_partial_3_6   NO_LOAD
#define vload_partial_3_7   NO_LOAD
#define vload_partial_3_8   NO_LOAD
#define vload_partial_3_9   NO_LOAD
#define vload_partial_3_10  NO_LOAD
#define vload_partial_3_11  NO_LOAD
#define vload_partial_3_12  NO_LOAD
#define vload_partial_3_13  NO_LOAD
#define vload_partial_3_14  NO_LOAD
#define vload_partial_3_15  NO_LOAD
#define vload_partial_3_16  NO_LOAD

/* size == 4 */
#define vload_partial_4_0   NO_LOAD
#define vload_partial_4_1   vload_partial_1
#define vload_partial_4_2   vload_partial_2
#define vload_partial_4_3   vload_partial_3
#define vload_partial_4_4   vload_partial_4
#define vload_partial_4_5   NO_LOAD
#define vload_partial_4_6   NO_LOAD
#define vload_partial_4_7   NO_LOAD
#define vload_partial_4_8   NO_LOAD
#define vload_partial_4_9   NO_LOAD
#define vload_partial_4_10  NO_LOAD
#define vload_partial_4_11  NO_LOAD
#define vload_partial_4_12  NO_LOAD
#define vload_partial_4_13  NO_LOAD
#define vload_partial_4_14  NO_LOAD
#define vload_partial_4_15  NO_LOAD
#define vload_partial_4_16  NO_LOAD

/* size == 8 */
#define vload_partial_8_0   NO_LOAD
#define vload_partial_8_1   vload_partial_1
#define vload_partial_8_2   vload_partial_2
#define vload_partial_8_3   vload_partial_3
#define vload_partial_8_4   vload_partial_4
#define vload_partial_8_5   vload_partial_5
#define vload_partial_8_6   vload_partial_6
#define vload_partial_8_7   vload_partial_7
#define vload_partial_8_8   vload_partial_8
#define vload_partial_8_9   NO_LOAD
#define vload_partial_8_10  NO_LOAD
#define vload_partial_8_11  NO_LOAD
#define vload_partial_8_12  NO_LOAD
#define vload_partial_8_13  NO_LOAD
#define vload_partial_8_14  NO_LOAD
#define vload_partial_8_15  NO_LOAD
#define vload_partial_8_16  NO_LOAD

/* size == 16 */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
1825
1826
/* vload_partial_<n>(DATA, OFFSET, PTR): load n elements from PTR into the lowest n
 * components of DATA.
 *
 * Counts 1, 2, 3, 4, 8 and 16 use a single vload<n>. Every other count is split into two
 * loads: a 4- or 8-element load for the low part, followed by a load of the remainder
 * from PTR + 4 or PTR + 8.
 *
 * For 13, 14 and 15 the remainder helper (5, 6 or 7 elements) receives the whole upper
 * half DATA.s89ABCDEF and fills only its lowest components.
 *
 * Each expansion already ends with a semicolon.
 */
#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
1884
1885
1886
/* Number of pixel units that make up a vector of 4, 8 or 16 elements (one unit per four
 * elements). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): look up PIXEL_UNIT<vec_size>. The extra
 * _STR level lets vec_size be macro-expanded before it is pasted. Supported sizes:
 * 4, 8, 16. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1894
1895
/* read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally adjacent pixels
 * starting at (x_coord, y_coord) and concatenate them into one vector of 4 * n elements.
 * The float variants use read_imagef; the half variants use read_imageh and exist only
 * when FP16 support is enabled.
 *
 * Each expansion already ends with a semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif
1905
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write n horizontally adjacent
 * pixels starting at (x_coord, y_coord). values holds 4 * n elements; consecutive groups
 * of four components go to consecutive x positions. The float variants use write_imagef;
 * the half variants use write_imageh and exist only when FP16 support is enabled.
 *
 * Each expansion already ends with a semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif
1915
1916
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. The extra _STR level lets data_type and n0 be
 * macro-expanded before they are pasted into the helper name. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) \
    read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) \
    READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same two-level expansion. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
    write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
    WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1923
/* VSTORE(n) names the vstore<n> routine; the indirection lets `n` itself be
 * a macro that is expanded before pasting. */
#define VSTORE_STR(width) vstore##width
#define VSTORE(width) VSTORE_STR(width)
1926
/* Width-1 spellings of the scalar types, so a type name built by pasting a
 * vector width onto a base type still resolves when the width is 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
1938
/* Scalar stand-ins for vloadN/vstoreN so that width-1 dispatch such as
 * VSTORE(1) resolves to a plain element access at PTR + OFFSET.
 * Arguments are parenthesized, and vload1 is wrapped as a whole, so that
 * compound arguments and postfix uses of the result parse as intended. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) *((OFFSET) + (PTR)) = (DATA)
1941
1942
/* VSTORE_PARTIAL(size, store_size) names the routine that stores the first
 * `store_size` lanes of a `size`-wide vector. */
#define VSTORE_PARTIAL_STR(vec_width, lanes) vstore_partial_##vec_width##_##lanes
#define VSTORE_PARTIAL(vec_width, lanes) VSTORE_PARTIAL_STR(vec_width, lanes)

/* Placeholder store that writes nothing; expands to an empty block. */
#define NO_STORE(data, offs, ptr) { }
1949
1950
/* Partial stores for a 1-wide value: only a lane count of 1 stores anything;
 * every other count resolves to NO_STORE. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
1968
/* Partial stores for a 2-wide vector: lane counts 1..2 store, the rest
 * resolve to NO_STORE. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
1986
/* Partial stores for a 3-wide vector: lane counts 1..3 store, the rest
 * resolve to NO_STORE. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
2004
/* Partial stores for a 4-wide vector: lane counts 1..4 store, the rest
 * resolve to NO_STORE. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
2022
/* Partial stores for an 8-wide vector: lane counts 1..8 store, the rest
 * resolve to NO_STORE. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
2040
/* Partial stores for a 16-wide vector: every lane count 1..16 stores; only
 * a count of 0 resolves to NO_STORE. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2058
2059
/* vstore_partial_K(DATA, OFFSET, PTR): store the first K lanes of DATA at PTR.
 *
 * Lane counts with no native vstoreN (5-7, 9-15) are split into a wide store
 * followed by a tail store at PTR + 4 or PTR + 8. Those multi-statement
 * expansions are wrapped in do { } while(0) so that a guard such as
 * `if(cond) vstore_partial_5(...)` covers both stores. Every expansion still
 * ends in ';' exactly as before, so existing call sites are unaffected.
 * DATA and PTR are parenthesized so compound arguments parse as intended.
 *
 * NOTE(review): the native vector stores scale OFFSET by their width, while
 * the tail stores add the same OFFSET to an already advanced pointer. The
 * pieces therefore only line up when OFFSET is 0 - confirm callers never
 * pass anything else. */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1((DATA).s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2((DATA).s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3((DATA).s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4((DATA).s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)          \
    do                                               \
    {                                                \
        vstore_partial_4((DATA).s0123, OFFSET, PTR); \
        vstore1((DATA).s4, OFFSET, (PTR) + 4);       \
    } while(0);

#define vstore_partial_6(DATA, OFFSET, PTR)              \
    do                                                   \
    {                                                    \
        vstore_partial_4((DATA).s0123, OFFSET, PTR);     \
        vstore_partial_2((DATA).s45, OFFSET, (PTR) + 4); \
    } while(0);

#define vstore_partial_7(DATA, OFFSET, PTR)               \
    do                                                    \
    {                                                     \
        vstore_partial_4((DATA).s0123, OFFSET, PTR);      \
        vstore_partial_3((DATA).s456, OFFSET, (PTR) + 4); \
    } while(0);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8((DATA).s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)              \
    do                                                   \
    {                                                    \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
        vstore1((DATA).s8, OFFSET, (PTR) + 8);           \
    } while(0);

#define vstore_partial_10(DATA, OFFSET, PTR)             \
    do                                                   \
    {                                                    \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
        vstore_partial_2((DATA).s89, OFFSET, (PTR) + 8); \
    } while(0);

#define vstore_partial_11(DATA, OFFSET, PTR)              \
    do                                                    \
    {                                                     \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR);  \
        vstore_partial_3((DATA).s89a, OFFSET, (PTR) + 8); \
    } while(0);

#define vstore_partial_12(DATA, OFFSET, PTR)               \
    do                                                     \
    {                                                      \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR);   \
        vstore_partial_4((DATA).s89ab, OFFSET, (PTR) + 8); \
    } while(0);

/* For 13..15 the whole upper half is forwarded; the tail macro then picks
 * the lanes it needs from it. */
#define vstore_partial_13(DATA, OFFSET, PTR)                   \
    do                                                         \
    {                                                          \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR);       \
        vstore_partial_5((DATA).s89abcdef, OFFSET, (PTR) + 8); \
    } while(0);

#define vstore_partial_14(DATA, OFFSET, PTR)                   \
    do                                                         \
    {                                                          \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR);       \
        vstore_partial_6((DATA).s89abcdef, OFFSET, (PTR) + 8); \
    } while(0);

#define vstore_partial_15(DATA, OFFSET, PTR)                   \
    do                                                         \
    {                                                          \
        vstore_partial_8((DATA).s01234567, OFFSET, PTR);       \
        vstore_partial_7((DATA).s89abcdef, OFFSET, (PTR) + 8); \
    } while(0);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
2117
2118
2119
2120
2121
/* Saturating conversions to floating-point destinations.
 * OpenCL C provides the _sat modifier only for integer destinations, so the
 * float/half "_sat" names are mapped onto the plain conversions. That keeps
 * CONVERT_SAT usable with any destination type.
 * convert_half_sat previously mapped to convert_float, which yielded a float
 * instead of a half and disagreed with convert_half1_sat and the halfN
 * entries below. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
2136
/* Width-1 conversion names map onto the scalar convert_<type> routines. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
2148
/* Width-1 saturating conversion names map onto the scalar
 * convert_<type>_sat routines. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
/* The uchar vector entries expand to themselves; they are kept so the names
 * remain defined as macros. */
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
/* OpenCL C has no _sat modifier for floating-point destinations, so the
 * double entry maps onto the plain conversion, like the float/half "_sat"
 * names do. It previously pointed at convert_double_sat. */
#define convert_double1_sat convert_double
2163
/* VEC_DATA_TYPE(type, size) builds the vector type name <type><size>; the
 * two-level form lets both arguments be macros. */
#define VEC_DATA_TYPE_STR(base, width) base##width
#define VEC_DATA_TYPE(base, width) VEC_DATA_TYPE_STR(base, width)
2166
/* CONVERT(x, type): convert_<type>((x)). */
#define CONVERT_STR(value, dst) (convert_##dst((value)))
#define CONVERT(value, dst) CONVERT_STR(value, dst)

/* CONVERT_SAT(x, type): convert_<type>_sat((x)). */
#define CONVERT_SAT_STR(value, dst) (convert_##dst##_sat((value)))
#define CONVERT_SAT(value, dst) CONVERT_SAT_STR(value, dst)

/* CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>((x)). */
#define CONVERT_SAT_ROUND_STR(value, dst, mode) (convert_##dst##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst, mode) CONVERT_SAT_ROUND_STR(value, dst, mode)
2175
/* Integer vector type associated with each data type by the SELECT_* macros:
 * the unsigned and signed integer types map to themselves, half maps to
 * short and float maps to int. */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/* SELECT_VEC_DATA_TYPE(type, size) resolves the table above;
 * SELECT_DATA_TYPE(type) is the width-1 form. */
#define SELECT_VEC_DATA_TYPE_STR(base, n) select_vec_dt_##base(n)
#define SELECT_VEC_DATA_TYPE(base, n) SELECT_VEC_DATA_TYPE_STR(base, n)
#define SELECT_DATA_TYPE(base) SELECT_VEC_DATA_TYPE_STR(base, 1)
2190
/* Signed integer vector type associated with each data type: unsigned types
 * map to their signed counterpart, half maps to short, float maps to int. */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/* SIGNED_INT_VEC_DATA_TYPE(type, size) resolves the table above;
 * SIGNED_INT_DATA_TYPE(type) is the width-1 form. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(base, n) signed_int_vec_dt_##base(n)
#define SIGNED_INT_VEC_DATA_TYPE(base, n) SIGNED_INT_VEC_DATA_TYPE_STR(base, n)
#define SIGNED_INT_DATA_TYPE(base) SIGNED_INT_VEC_DATA_TYPE_STR(base, 1)
2205
/* Horizontal sum of the lanes of a vector.
 *
 * The per-width helpers expand to a flat chain of additions. They are left
 * as is so that the order in which lanes are added (which matters for
 * floating-point rounding) stays what it has always been. Prefer SUM_REDUCE
 * over calling them directly: on their own they are not safe inside a larger
 * expression. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): sum of the `size` lanes of x. The whole expansion is
 * parenthesized so the result can be used as an operand, for example
 * `scale * SUM_REDUCE(v, 4)`, without the surrounding operator binding to
 * just the first or last lane. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2215
/* Horizontal product of the lanes of a vector.
 *
 * The per-width helpers expand to a flat chain of multiplications and are
 * left as is so the evaluation order (relevant for floating-point rounding)
 * does not change. Prefer PROD_REDUCE over calling them directly: on their
 * own they are not safe inside a larger expression. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): product of the `size` lanes of x. The whole expansion
 * is parenthesized so that uses such as `n / PROD_REDUCE(v, 2)` divide by the
 * full product rather than by the first lane only. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2225
/* Horizontal maximum of the lanes of a vector, computed by pairwise max()
 * over halves of the vector. */
#define max_reduce_1(v)  (v)
#define max_reduce_2(v)  max(((v).s0), ((v).s1))
#define max_reduce_3(v)  max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v)  max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v)  max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/* MAX_REDUCE(x, size): maximum over the `size` lanes of x. */
#define MAX_REDUCE_STR(v, width) max_reduce_##width(v)
#define MAX_REDUCE(v, width) MAX_REDUCE_STR(v, width)
2235
/* Kernel parameter list for a 1D buffer called `name`: base pointer, stride
 * and step along x, and the byte offset of the first element. */
#define VECTOR_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,  \
    uint tensor##_stride_x,        \
    uint tensor##_step_x,          \
    uint tensor##_offset_first_element_in_bytes
2241
/* Kernel parameter list for a 2D buffer called `name`: base pointer, stride
 * and step along x and y, and the byte offset of the first element. */
#define IMAGE_DECLARATION(tensor) \
    __global uchar *tensor##_ptr, \
    uint tensor##_stride_x,       \
    uint tensor##_step_x,         \
    uint tensor##_stride_y,       \
    uint tensor##_step_y,         \
    uint tensor##_offset_first_element_in_bytes
2249
/* Kernel parameter list for a 3D buffer called `name`: base pointer, stride
 * and step along x, y and z, and the byte offset of the first element. */
#define TENSOR3D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_offset_first_element_in_bytes
2259
/* Kernel parameter list for a 4D buffer called `name`: base pointer, stride
 * and step along x, y, z and w, and the byte offset of the first element. */
#define TENSOR4D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_stride_w,          \
    uint tensor##_step_w,            \
    uint tensor##_offset_first_element_in_bytes
2271
/* Kernel parameter list for a 5D buffer called `name`: base pointer, stride
 * and step along x, y, z, w and v, and the byte offset of the first
 * element. */
#define TENSOR5D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_stride_w,          \
    uint tensor##_step_w,            \
    uint tensor##_stride_v,          \
    uint tensor##_step_v,            \
    uint tensor##_offset_first_element_in_bytes
2285
/* Build a Vector from the kernel arguments declared for `name`, with the
 * pointer advanced to the current work-item. */
#define CONVERT_TO_VECTOR_STRUCT(tensor)                                             \
    update_vector_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                               tensor##_stride_x, tensor##_step_x)

/* Same as above, but with a step of 0, so the pointer is advanced only by
 * the first-element offset. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(tensor)                                     \
    update_vector_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                               tensor##_stride_x, 0)
2291
/* Build an Image from the kernel arguments declared for `name`, with the
 * pointer advanced to the current work-item. */
#define CONVERT_TO_IMAGE_STRUCT(tensor)                                             \
    update_image_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                              tensor##_stride_x, tensor##_step_x,                   \
                              tensor##_stride_y, tensor##_step_y)

/* Same as above, but with steps of 0, so the pointer is advanced only by the
 * first-element offset. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(tensor)                                     \
    update_image_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                              tensor##_stride_x, 0,                                 \
                              tensor##_stride_y, 0)
2297
/* Build an Image from the kernel arguments declared for the 3D tensor
 * `name`, with the pointer advanced to the current work-item along x, y
 * and z. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same as above with the x and y steps forced to 0; the z step is still
 * applied. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2306
/* Build a Tensor3D from the kernel arguments declared for `name`, with the
 * pointer advanced to the current work-item. */
#define CONVERT_TO_TENSOR3D_STRUCT(tensor)                                             \
    update_tensor3D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                                 tensor##_stride_x, tensor##_step_x,                   \
                                 tensor##_stride_y, tensor##_step_y,                   \
                                 tensor##_stride_z, tensor##_step_z)

/* Same as above, but with steps of 0, so the pointer is advanced only by the
 * first-element offset. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(tensor)                                     \
    update_tensor3D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                                 tensor##_stride_x, 0,                                 \
                                 tensor##_stride_y, 0,                                 \
                                 tensor##_stride_z, 0)
2313
/* Build a Tensor4D from the kernel arguments declared for `name`, with the
 * pointer advanced to the current work-item. `mod_size` is forwarded to
 * update_tensor4D_workitem_ptr, which uses it to split the third global id
 * into its z and w parts. */
#define CONVERT_TO_TENSOR4D_STRUCT(tensor, mod_size)                                   \
    update_tensor4D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                                 tensor##_stride_x, tensor##_step_x,                   \
                                 tensor##_stride_y, tensor##_step_y,                   \
                                 tensor##_stride_z, tensor##_step_z,                   \
                                 tensor##_stride_w, tensor##_step_w, mod_size)

/* Same as above, but with steps of 0, so the pointer is advanced only by the
 * first-element offset. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(tensor, mod_size)                           \
    update_tensor4D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                                 tensor##_stride_x, 0,                                 \
                                 tensor##_stride_y, 0,                                 \
                                 tensor##_stride_z, 0,                                 \
                                 tensor##_stride_w, 0, mod_size)
2320
/* Build a Tensor3D from the kernel arguments declared for `name` via
 * tensor3D_ptr_no_update, which leaves the base pointer where it is. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(tensor)                         \
    tensor3D_ptr_no_update(tensor##_ptr, tensor##_offset_first_element_in_bytes, \
                           tensor##_stride_x, tensor##_step_x,                   \
                           tensor##_stride_y, tensor##_step_y,                   \
                           tensor##_stride_z, tensor##_step_z)
2324
2325
2326typedef struct Vector
2327{
2328    __global uchar *ptr;
2329    int             offset_first_element_in_bytes;
2330    int             stride_x;
2331} Vector;
2332
2333
2334typedef struct Image
2335{
2336    __global uchar *ptr;
2337    int             offset_first_element_in_bytes;
2338    int             stride_x;
2339    int             stride_y;
2340} Image;
2341
2342
2343typedef struct Tensor3D
2344{
2345    __global uchar *ptr;
2346    int             offset_first_element_in_bytes;
2347    int             stride_x;
2348    int             stride_y;
2349    int             stride_z;
2350} Tensor3D;
2351
2352
2353typedef struct Tensor4D
2354{
2355    __global uchar *ptr;
2356    int             offset_first_element_in_bytes;
2357    int             stride_x;
2358    int             stride_y;
2359    int             stride_z;
2360    int             stride_w;
2361} Tensor4D;
2362
2363
2364inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2365{
2366    Vector vector =
2367    {
2368        .ptr                           = ptr,
2369        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2370        .stride_x                      = stride_x,
2371    };
2372    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2373    return vector;
2374}
2375
2376
2377inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2378{
2379    Image img =
2380    {
2381        .ptr                           = ptr,
2382        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2383        .stride_x                      = stride_x,
2384        .stride_y                      = stride_y
2385    };
2386    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2387    return img;
2388}
2389
2390
2391inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2392{
2393    Image img =
2394    {
2395        .ptr                           = ptr,
2396        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2397        .stride_x                      = stride_x,
2398        .stride_y                      = stride_y
2399    };
2400    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2401    return img;
2402}
2403
2404
2405inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2406{
2407    Tensor3D tensor =
2408    {
2409        .ptr                           = ptr,
2410        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2411        .stride_x                      = stride_x,
2412        .stride_y                      = stride_y,
2413        .stride_z                      = stride_z
2414    };
2415    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2416    return tensor;
2417}
2418
2419
2420inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2421{
2422    Tensor3D tensor =
2423    {
2424        .ptr                           = ptr,
2425        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2426        .stride_x                      = stride_x,
2427        .stride_y                      = stride_y,
2428        .stride_z                      = stride_z
2429    };
2430    return tensor;
2431}
2432
2433inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2434                                             uint step_w,
2435                                             uint mod_size)
2436{
2437    Tensor4D tensor =
2438    {
2439        .ptr                           = ptr,
2440        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2441        .stride_x                      = stride_x,
2442        .stride_y                      = stride_y,
2443        .stride_z                      = stride_z,
2444        .stride_w                      = stride_w
2445    };
2446
2447    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2448    return tensor;
2449}
2450
2451
2452inline __global const uchar *vector_offset(const Vector *vec, int x)
2453{
2454    return vec->ptr + x * vec->stride_x;
2455}
2456
2457
2458inline __global uchar *offset(const Image *img, int x, int y)
2459{
2460    return img->ptr + x * img->stride_x + y * img->stride_y;
2461}
2462
2463
2464inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2465{
2466    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2467}
2468
2469
2470inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2471{
2472    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2473}
2474
2475
2476inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2477{
2478    uint num_elements = width * height;
2479
2480    const uint z = index / num_elements;
2481
2482    index %= num_elements;
2483
2484    const uint y = index / width;
2485
2486    index %= width;
2487
2488    const uint x = index;
2489
2490    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2491}
2492
2493#endif
2494
2495
2496
/* REPEAT_3_N(n, OP, a, b, c) expands OP_DEF(id, a, b, c) once per id, from
 * id n-1 down to 0, separated by ';'. Ids above 9 are spelled as the hex
 * digits A..F. The final expansion carries no trailing ';'. */
#define REPEAT_3_1(OP, ARG0, ARG1, ARG2) OP##_DEF(0, ARG0, ARG1, ARG2)
#define REPEAT_3_2(OP, ARG0, ARG1, ARG2) OP##_DEF(1, ARG0, ARG1, ARG2); REPEAT_3_1(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_3(OP, ARG0, ARG1, ARG2) OP##_DEF(2, ARG0, ARG1, ARG2); REPEAT_3_2(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_4(OP, ARG0, ARG1, ARG2) OP##_DEF(3, ARG0, ARG1, ARG2); REPEAT_3_3(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_5(OP, ARG0, ARG1, ARG2) OP##_DEF(4, ARG0, ARG1, ARG2); REPEAT_3_4(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_6(OP, ARG0, ARG1, ARG2) OP##_DEF(5, ARG0, ARG1, ARG2); REPEAT_3_5(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_7(OP, ARG0, ARG1, ARG2) OP##_DEF(6, ARG0, ARG1, ARG2); REPEAT_3_6(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_8(OP, ARG0, ARG1, ARG2) OP##_DEF(7, ARG0, ARG1, ARG2); REPEAT_3_7(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_9(OP, ARG0, ARG1, ARG2) OP##_DEF(8, ARG0, ARG1, ARG2); REPEAT_3_8(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_10(OP, ARG0, ARG1, ARG2) OP##_DEF(9, ARG0, ARG1, ARG2); REPEAT_3_9(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_11(OP, ARG0, ARG1, ARG2) OP##_DEF(A, ARG0, ARG1, ARG2); REPEAT_3_10(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_12(OP, ARG0, ARG1, ARG2) OP##_DEF(B, ARG0, ARG1, ARG2); REPEAT_3_11(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_13(OP, ARG0, ARG1, ARG2) OP##_DEF(C, ARG0, ARG1, ARG2); REPEAT_3_12(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_14(OP, ARG0, ARG1, ARG2) OP##_DEF(D, ARG0, ARG1, ARG2); REPEAT_3_13(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_15(OP, ARG0, ARG1, ARG2) OP##_DEF(E, ARG0, ARG1, ARG2); REPEAT_3_14(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_16(OP, ARG0, ARG1, ARG2) OP##_DEF(F, ARG0, ARG1, ARG2); REPEAT_3_15(OP, ARG0, ARG1, ARG2)

/* Two-level dispatch so that the repeat count may itself be a macro. */
#define REPEAT_DEF_3_N(COUNT, OP, ARG0, ARG1, ARG2) REPEAT_3_##COUNT(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_N(COUNT, OP, ARG0, ARG1, ARG2) REPEAT_DEF_3_N(COUNT, OP, ARG0, ARG1, ARG2)
2546
2547
/* REPEAT_4_N(n, OP, a, b, c, d) expands OP_DEF(id, a, b, c, d) once per id,
 * from id n-1 down to 0, separated by ';'. Ids above 9 are spelled as the
 * hex digits A..F. The final expansion carries no trailing ';'. */
#define REPEAT_4_1(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(0, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_2(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(1, ARG0, ARG1, ARG2, ARG3); REPEAT_4_1(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_3(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(2, ARG0, ARG1, ARG2, ARG3); REPEAT_4_2(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_4(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(3, ARG0, ARG1, ARG2, ARG3); REPEAT_4_3(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_5(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(4, ARG0, ARG1, ARG2, ARG3); REPEAT_4_4(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_6(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(5, ARG0, ARG1, ARG2, ARG3); REPEAT_4_5(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_7(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(6, ARG0, ARG1, ARG2, ARG3); REPEAT_4_6(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_8(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(7, ARG0, ARG1, ARG2, ARG3); REPEAT_4_7(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_9(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(8, ARG0, ARG1, ARG2, ARG3); REPEAT_4_8(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_10(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(9, ARG0, ARG1, ARG2, ARG3); REPEAT_4_9(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_11(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(A, ARG0, ARG1, ARG2, ARG3); REPEAT_4_10(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_12(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(B, ARG0, ARG1, ARG2, ARG3); REPEAT_4_11(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_13(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(C, ARG0, ARG1, ARG2, ARG3); REPEAT_4_12(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_14(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(D, ARG0, ARG1, ARG2, ARG3); REPEAT_4_13(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_15(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(E, ARG0, ARG1, ARG2, ARG3); REPEAT_4_14(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_16(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(F, ARG0, ARG1, ARG2, ARG3); REPEAT_4_15(OP, ARG0, ARG1, ARG2, ARG3)

/* Two-level dispatch so that the repeat count may itself be a macro. */
#define REPEAT_DEF_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3) REPEAT_4_##COUNT(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3) REPEAT_DEF_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3)
2597
2598
/* Declare <VAR><id> of type TYPE, initialised to VAL, for id = N-1 .. 0. */
#define VAR_INIT_TO_CONST_DEF(IDX, VTYPE, BASENAME, VALUE) VTYPE BASENAME##IDX = VALUE
#define REPEAT_VAR_INIT_TO_CONST(COUNT, VTYPE, BASENAME, VALUE) \
    REPEAT_3_N(COUNT, VAR_INIT_TO_CONST, VTYPE, BASENAME, VALUE)


/* Declare <VAR_OUT><id> of type TYPE_OUT, initialised with
 * CONVERT(<VAR_IN><id>, TYPE_OUT), for id = N-1 .. 0. */
#define VAR_INIT_CONVERT_DEF(IDX, DST_TYPE, SRC, DST) DST_TYPE DST##IDX = CONVERT(SRC##IDX, DST_TYPE)
#define REPEAT_VAR_INIT_CONVERT(COUNT, DST_TYPE, SRC, DST) \
    REPEAT_3_N(COUNT, VAR_INIT_CONVERT, DST_TYPE, SRC, DST)


/* Same as above, using CONVERT_SAT. */
#define VAR_INIT_CONVERT_SAT_DEF(IDX, DST_TYPE, SRC, DST) DST_TYPE DST##IDX = CONVERT_SAT(SRC##IDX, DST_TYPE)
#define REPEAT_VAR_INIT_CONVERT_SAT(COUNT, DST_TYPE, SRC, DST) \
    REPEAT_3_N(COUNT, VAR_INIT_CONVERT_SAT, DST_TYPE, SRC, DST)
2609
2610
/* Per-id arithmetic helpers applied to the variables <VAR><id>, for
 * id = N-1 .. 0. VAL is parenthesized wherever it is combined with a cast
 * or an operator, so compound arguments such as `a + b` are used whole. */

/* <VAR><id> += (TYPE)(VAL) */
#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)(VAL)
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)


/* <VAR_A><id> += <VAR_B><id> * (VAL) */
#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * (VAL)
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)


/* <VAR><id> += VEC; the TYPE slot is unused and filled with an empty string. */
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)


/* <VAR_A><id> += <VAR_B><id>; the TYPE slot is unused. */
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)


/* <VAR><id> = max(<VAR><id>, (TYPE)(VAL)) */
#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)(VAL))
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)


/* <VAR><id> = min(<VAR><id>, (TYPE)(VAL)) */
#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)(VAL))
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
2633
2634
/* <VAR><id> = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(<VAR><id>, RES_MUL, RES_SHIFT, SIZE),
 * for id = N-1 .. 0. */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(IDX, WIDTH, ACC, MULT, SHIFT) \
    ACC##IDX = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(ACC##IDX, MULT, SHIFT, WIDTH)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(COUNT, WIDTH, ACC, MULT, SHIFT) \
    REPEAT_4_N(COUNT, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, WIDTH, ACC, MULT, SHIFT)


/* <VAR><id> = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(<VAR><id>, RES_MUL, RES_SHIFT, SIZE),
 * for id = N-1 .. 0. */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(IDX, WIDTH, ACC, MULT, SHIFT) \
    ACC##IDX = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(ACC##IDX, MULT, SHIFT, WIDTH)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(COUNT, WIDTH, ACC, MULT, SHIFT) \
    REPEAT_4_N(COUNT, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, WIDTH, ACC, MULT, SHIFT)
2641
2642
/* Per-channel requantization of <VAR><id>: both the "greater than one" and
 * the "less than one" results are computed, and select() then picks per lane
 * based on the sign of RES_SHIFT (the less-than-one result where
 * RES_SHIFT >= 0).
 *
 * The temporaries are now named <VAR><id>_shift_lt0 / <VAR><id>_shift_gt0.
 * The previous spelling `VAR##ID_shift_lt0` pasted the literal identifier
 * `ID_shift_lt0`, so the id never made it into the name. The temporaries are
 * local to the statement expression, so this changes no behaviour.
 *
 * NOTE(review): SIZE is accepted but not used; the vector width comes from
 * the N0 compile-time definition instead. Confirm that callers always pass
 * N0 as SIZE before switching to it. */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                       \
    ({                                                                                                          \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
        VAR##ID             = select(VAR##ID##_shift_lt0, VAR##ID##_shift_gt0, (RES_SHIFT) >= 0);               \
    })
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
2652
2653#endif
2654
2655#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
2656#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
2657
2658
2659
2660
/* TILE_VECTOR_SIZE<W>: element count of the storage backing a tile row of logical width W (1..16).
 * Widths 1-4 map to themselves, 5-8 map to 8 and 9-16 map to 16.
 * Used by TILE_STR to size the scalar array member of each tile row. */
2661#define TILE_VECTOR_SIZE1 1
2662#define TILE_VECTOR_SIZE2 2
2663#define TILE_VECTOR_SIZE3 3
2664#define TILE_VECTOR_SIZE4 4
2665#define TILE_VECTOR_SIZE5 8
2666#define TILE_VECTOR_SIZE6 8
2667#define TILE_VECTOR_SIZE7 8
2668#define TILE_VECTOR_SIZE8 8
2669#define TILE_VECTOR_SIZE9 16
2670#define TILE_VECTOR_SIZE10 16
2671#define TILE_VECTOR_SIZE11 16
2672#define TILE_VECTOR_SIZE12 16
2673#define TILE_VECTOR_SIZE13 16
2674#define TILE_VECTOR_SIZE14 16
2675#define TILE_VECTOR_SIZE15 16
2676#define TILE_VECTOR_SIZE16 16
2677
/* TILE_VECTOR_TYPE<W>(DATA_TYPE): type name formed by pasting a width suffix onto DATA_TYPE.
 * The suffix follows the same rounding as TILE_VECTOR_SIZE<W>: 1-4 keep their width, 5-8 use 8, 9-16 use 16.
 * NOTE(review): width 1 produces DATA_TYPE##1 (e.g. int1); that name must be provided elsewhere - confirm. */
2678#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
2679#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
2680#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
2681#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
2682#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
2683#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
2684#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
2685#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
2686#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
2687#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
2688#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
2689#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
2690#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
2691#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
2692#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
2693#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
2694
2695
/* TILE(DATA_TYPE, H, W, BASENAME): declares BASENAME as an array of H unions, one per tile row.
 * Each union overlays
 *   s - a scalar array of TILE_VECTOR_SIZE<W> elements of DATA_TYPE, and
 *   v - a single vector of type TILE_VECTOR_TYPE<W>(DATA_TYPE),
 * so a row can be addressed element-wise through .s[] or as a whole through .v.
 * TILE forwards to TILE_STR so that macro arguments (W in particular) are fully expanded
 * before TILE_STR pastes them with ##.
 * The expansion has no trailing semicolon; the caller supplies it. */
2696#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
2697#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
2698    union {                                 \
2699        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];                  \
2700        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                     \
2701    } BASENAME[H]
2702
/* TENSOR4D_IMAGE(name): expands to a kernel parameter list describing one tensor, all names prefixed with `name`:
 * a read-only image2d_t handle (name_img), a __global uchar pointer (name_ptr), a stride/step pair of uints
 * for each of x, y, z and w, and name_offset_first_element_in_bytes.
 * The expansion has no trailing comma, so it can be the last entry of a parameter list. */
2703#define TENSOR4D_IMAGE(name)          \
2704    __read_only image2d_t name##_img, \
2705    __global uchar *name##_ptr,       \
2706    uint            name##_stride_x,  \
2707    uint            name##_step_x,    \
2708    uint            name##_stride_y,  \
2709    uint            name##_step_y,    \
2710    uint            name##_stride_z,  \
2711    uint            name##_step_z,    \
2712    uint            name##_stride_w,  \
2713    uint            name##_step_w,    \
2714    uint            name##_offset_first_element_in_bytes
2715
/* TENSOR4D_BUFFER(name): buffer-only counterpart of TENSOR4D_IMAGE. Expands to the same parameter list
 * minus the image handle: name_ptr, a stride/step pair of uints for each of x, y, z and w,
 * and name_offset_first_element_in_bytes. */
2716#define TENSOR4D_BUFFER(name)    \
2717    __global uchar *name##_ptr,  \
2718    uint        name##_stride_x, \
2719    uint        name##_step_x,   \
2720    uint        name##_stride_y, \
2721    uint        name##_step_y,   \
2722    uint        name##_stride_z, \
2723    uint        name##_step_z,   \
2724    uint        name##_stride_w, \
2725    uint        name##_step_w,   \
2726    uint        name##_offset_first_element_in_bytes
2727
/* TENSOR4D(name, type): selects TENSOR4D_<type>(name) by token pasting; with the macros visible here,
 * type is expected to expand to IMAGE or BUFFER.
 * The extra TENSOR4D -> TENSOR4D_STR hop lets `type` be a macro that is expanded before the paste. */
2728#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
2729#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
2730
/* TENSOR4D_T_IMAGE(name): parameter list for a 4D tensor in the compact "_T" layout, image flavour.
 * Declares a read-only image2d_t (name_img), a __global uchar pointer (name_ptr), uint strides for y, z and w
 * (no stride_x and no step_* entries), four uints name_c / name_w / name_h / name_n, and
 * name_offset_first_element_in_bytes.
 * NOTE(review): name_c, name_w, name_h, name_n look like the tensor dimensions - confirm against the host-side argument setup. */
2731#define TENSOR4D_T_IMAGE(name)          \
2732    __read_only image2d_t name##_img, \
2733    __global uchar *name##_ptr,       \
2734    uint        name##_stride_y, \
2735    uint        name##_stride_z, \
2736    uint        name##_stride_w, \
2737    uint        name##_c,   \
2738    uint        name##_w,   \
2739    uint        name##_h,   \
2740    uint        name##_n,   \
2741    uint        name##_offset_first_element_in_bytes
2742
/* TENSOR4D_T_BUFFER(name): buffer flavour of the compact "_T" 4D tensor parameter list.
 * Same entries as TENSOR4D_T_IMAGE without the image handle: name_ptr, strides for y, z and w,
 * the four uints name_c / name_w / name_h / name_n, and name_offset_first_element_in_bytes.
 * Also reused as the tail of the TENSOR4D_RO_T_* and TENSOR4D_WO_T_* macros. */
2743#define TENSOR4D_T_BUFFER(name)    \
2744    __global uchar *name##_ptr,  \
2745    uint        name##_stride_y, \
2746    uint        name##_stride_z, \
2747    uint        name##_stride_w, \
2748    uint        name##_c,   \
2749    uint        name##_w,   \
2750    uint        name##_h,   \
2751    uint        name##_n,   \
2752    uint        name##_offset_first_element_in_bytes
2753
/* TENSOR4D_T(name, type): selects TENSOR4D_T_<type>(name) by token pasting (IMAGE or BUFFER among the macros
 * visible here). The two-level definition lets `type` be a macro that is expanded before the paste. */
2754#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
2755
2756
2757#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
2758
/* Read-only flavour of the "_T" 4D tensor parameter list.
 * IMAGE:  a __read_only image2d_t handle followed by the TENSOR4D_T_BUFFER parameters.
 * BUFFER: exactly the TENSOR4D_T_BUFFER parameters (no extra qualifier is added).
 * TENSOR4D_RO_T(name, type) picks one of the two by pasting `type` after it has been macro-expanded. */
2759#define TENSOR4D_RO_T_IMAGE(name)          \
2760    __read_only image2d_t name##_img, \
2761    TENSOR4D_T_BUFFER(name)
2762
2763#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
2764
2765#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
2766
2767
2768#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
2769
/* Write-only flavour of the "_T" 4D tensor parameter list.
 * IMAGE:  a __write_only image2d_t handle followed by the TENSOR4D_T_BUFFER parameters.
 * BUFFER: exactly the TENSOR4D_T_BUFFER parameters (no extra qualifier is added).
 * TENSOR4D_WO_T(name, type) picks one of the two by pasting `type` after it has been macro-expanded. */
2770#define TENSOR4D_WO_T_IMAGE(name)          \
2771    __write_only image2d_t name##_img, \
2772    TENSOR4D_T_BUFFER(name)
2773
2774#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
2775
2776#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
2777
2778
2779#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
2780
/* TENSOR3D_T_IMAGE(name): parameter list for a 3D tensor in the compact "_T" layout, image flavour.
 * Declares a read-only image2d_t (name_img), a __global uchar pointer (name_ptr), uint strides for y and z,
 * three uints name_w / name_h / name_n, and name_offset_first_element_in_bytes.
 * Compared with TENSOR4D_T_IMAGE there is no stride_w and no name_c entry. */
2781#define TENSOR3D_T_IMAGE(name)          \
2782    __read_only image2d_t name##_img, \
2783    __global uchar *name##_ptr,       \
2784    uint        name##_stride_y, \
2785    uint        name##_stride_z, \
2786    uint        name##_w,   \
2787    uint        name##_h,   \
2788    uint        name##_n,   \
2789    uint        name##_offset_first_element_in_bytes
2790
/* TENSOR3D_T_BUFFER(name): buffer flavour of the compact "_T" 3D tensor parameter list.
 * Same entries as TENSOR3D_T_IMAGE without the image handle: name_ptr, strides for y and z,
 * the three uints name_w / name_h / name_n, and name_offset_first_element_in_bytes. */
2791#define TENSOR3D_T_BUFFER(name)    \
2792    __global uchar *name##_ptr,  \
2793    uint        name##_stride_y, \
2794    uint        name##_stride_z, \
2795    uint        name##_w,   \
2796    uint        name##_h,   \
2797    uint        name##_n,   \
2798    uint        name##_offset_first_element_in_bytes
2799
/* TENSOR3D_T(name, type): selects TENSOR3D_T_<type>(name) by token pasting (IMAGE or BUFFER among the macros
 * visible here). The two-level definition lets `type` be a macro that is expanded before the paste. */
2800#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
2801#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
2802
2803#if !defined(UNROLL_WITH_PRAGMA)
/* Manually expanded loop unrolling; this section is compiled only when UNROLL_WITH_PRAGMA is not defined.
 *
 * UNROLL_INCR(idx, step, macro) advances idx by step and then evaluates macro once. It expands to two
 * statements that are NOT wrapped in braces, so it is only safe inside a statement list.
 *
 * LOOP_UNROLLING_<n>(idx, step, macro) evaluates macro n times in total: once for the current value of idx
 * (LOOP_UNROLLING_1) and n - 1 more times through UNROLL_INCR, each preceded by idx += step.
 * Each level is defined in terms of the previous one; expansions exist for n = 1..128 only, so a larger
 * count has no matching macro. */
2804#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
2805
2806#define LOOP_UNROLLING_1(idx, step, macro) (macro)
2807#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
2808#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
2809#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
2810#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
2811#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
2812#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
2813#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
2814#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
2815#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
2816#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
2817#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
2818#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
2819#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
2820#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
2821#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
2822#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
2823#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
2824#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
2825#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
2826#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
2827#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
2828#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
2829#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
2830#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
2831#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
2832#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
2833#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
2834#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
2835#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
2836#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
2837#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
2838#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
2839#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
2840#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
2841#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
2842#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
2843#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
2844#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
2845#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
2846#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
2847#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
2848#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
2849#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
2850#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
2851#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
2852#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
2853#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
2854#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
2855#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
2856#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
2857#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
2858#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
2859#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
2860#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
2861#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
2862#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
2863#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
2864#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
2865#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
2866#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
2867#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
2868#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
2869#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
2870#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
2871#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
2872#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
2873#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
2874#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
2875#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
2876#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
2877#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
2878#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
2879#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
2880#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
2881#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
2882#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
2883#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
2884#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
2885#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
2886#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
2887#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
2888#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
2889#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
2890#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
2891#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
2892#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
2893#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
2894#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
2895#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
2896#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
2897#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
2898#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
2899#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
2900#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
2901#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
2902#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
2903#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
2904#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
2905#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
2906#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
2907#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
2908#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
2909#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
2910#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
2911#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
2912#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
2913#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
2914#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
2915#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
2916#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
2917#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
2918#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
2919#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
2920#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
2921#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
2922#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
2923#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
2924#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
2925#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
2926#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
2927#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
2928#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
2929#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
2930#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
2931#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
2932#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
2933#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
2934
/* LOOP_UNROLLING_STR, manually expanded variant (UNROLL_WITH_PRAGMA not defined).
 * Opens a block, declares `type idx = start`, and expands LOOP_UNROLLING_<num>, so `macro` is evaluated
 * num times with idx = start, start + step, ..., start + (num - 1) * step.
 * num is pasted with ##, so it must already be a literal 1..128 when this macro is expanded;
 * LOOP_UNROLLING provides the extra expansion level for that. */
2935#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
2936    {                                                          \
2937        type idx = start;                                      \
2938        LOOP_UNROLLING_##num(idx, step, macro);                \
2939    }
2940#else
/* LOOP_UNROLLING_STR, pragma variant (UNROLL_WITH_PRAGMA defined).
 * Emits a real for-loop marked with an "unroll" pragma and evaluates `macro` once per iteration.
 *
 *   type   - type of the loop index
 *   idx    - name of the loop index, visible inside `macro`
 *   start  - first index value
 *   step   - index increment (expected to be positive)
 *   num    - number of iterations
 *   macro  - statement block evaluated on every iteration
 *
 * The upper bound is start + num * step so that exactly `num` iterations run for any `start`,
 * matching the manually expanded variant. The previous bound (num * step) was only correct for start == 0.
 * Arguments are parenthesised so that expression arguments keep their intended precedence. */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro)                       \
    {                                                                                \
        _Pragma("unroll")                                                            \
        for(type idx = (start); idx < ((start) + (num) * (step)); idx += (step))     \
        {                                                                            \
            (macro);                                                                 \
        }                                                                            \
    }
2949#endif
/* LOOP_UNROLLING(type, idx, start, step, num, macro): public entry point for both unrolling variants.
 * It only forwards to LOOP_UNROLLING_STR; the extra level makes the preprocessor expand the arguments
 * (notably num) before LOOP_UNROLLING_STR uses them. */
2950#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
2951
2952
/* GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0): start coordinate of the current work-item along dimension IDX.
 * Computes get_global_id(IDX) * N0, subtracts (N0 - PARTIAL_N0) % N0, casts the result to int and clamps it
 * at 0 with max.
 * NOTE(review): N0 and PARTIAL_N0 are used without parentheses, so pass simple values rather than expressions. */
2953#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
2954
2955
/* DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c): accumulates the dot product of
 * a and b over K0 elements into c by dispatching to DOT_PRODUCT<K0>_INTEGER8.
 * The _STR level exists so that K0 is macro-expanded before it is pasted into the macro name. */
2956#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
2957#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
/* K0 = 1: scalar multiply-accumulate; both operands are cast to C_DATA_TYPE before the multiplication. */
2958#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2959    ({                                                \
2960        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
2961    })
/* DOT_PRODUCT2/3/4_INTEGER8: dot products over 2, 3 and 4 elements, accumulated into c.
 * One of four implementations is selected at preprocessing time:
 *   1. ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_khr_integer_dot_product: c += dot(a, b)
 *   2. ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED and cl_arm_integer_dot_product_accumulate_int8: c = arm_dot_acc(a, b, c)
 *   3. ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_arm_integer_dot_product_int8: c += arm_dot(a, b)
 *   4. otherwise: element-wise multiply-accumulate with both operands cast to C_DATA_TYPE.
 * In branches 1-3 the 2- and 3-element inputs are padded with zeros to 4-element vectors before the call.
 * Branches 1-3 expand to a statement that already ends in a semicolon; branch 4 expands to a
 * statement expression. */
2962#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
2963#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
2964#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
2965#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
2966#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2967#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
2968#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
2969#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
2970#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2971#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
2972#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
2973#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
2974#else
2975#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
2976    ({                                                  \
2977        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
2978        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
2979    })
2980#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
2981    ({                                                  \
2982        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
2983        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
2984    })
2985#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
2986    ({                                                    \
2987        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
2988        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
2989        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
2990        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
2991    })
2992#endif
/* DOT_PRODUCT5..12_INTEGER8: dot products for K0 = 5..12, built from the smaller helpers.
 *   5, 6, 7     = DOT_PRODUCT4 on components 0-3 plus DOT_PRODUCT1/2/3 on the remaining components
 *   8           = DOT_PRODUCT4 on .lo plus DOT_PRODUCT4 on .hi
 *   9, 10, 11, 12 = DOT_PRODUCT8 on components 0-7 plus DOT_PRODUCT1/2/3/4 on the remaining components
 * Every macro accumulates into c; a and b are only read through component selections. */
2993#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2994    ({                                                \
2995        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
2996        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
2997    })
2998#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
2999    ({                                                \
3000        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
3001        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
3002    })
3003#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3004    ({                                                \
3005        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
3006        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
3007    })
3008#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3009    ({                                                \
3010        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
3011        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
3012    })
3013#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3014    ({                                                \
3015        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
3016        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
3017    })
3018#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3019    ({                                                \
3020        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
3021        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
3022    })
3023#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3024    ({                                                \
3025        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
3026        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
3027    })
3028#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3029    ({                                                \
3030        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
3031        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
3032    })
/* DOT_PRODUCT13/14/15_INTEGER8: dot products for K0 = 13, 14 and 15, accumulated into c.
 *
 * Each one is split as 8 + 4 + r with r = 1, 2, 3:
 *   components 0-7  -> DOT_PRODUCT8_INTEGER8
 *   components 8-B  -> DOT_PRODUCT4_INTEGER8
 *   components C..  -> DOT_PRODUCT1/2/3_INTEGER8
 *
 * This performs the same multiply-accumulate steps, in the same order, as routing the tail through
 * DOT_PRODUCT5/6/7_INTEGER8, but every component selection now has 1, 2, 3, 4 or 8 elements.
 * The previous form selected 5, 6 and 7 components at once (.s89ABC, .s89ABCD, .s89ABCDE), which are
 * not valid OpenCL vector widths. */
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c);               \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c);             \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)                             \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c);           \
    })
/* DOT_PRODUCT16_INTEGER8: 16-element dot product accumulated into c, computed as DOT_PRODUCT8_INTEGER8 on
 * the .lo halves of a and b followed by DOT_PRODUCT8_INTEGER8 on the .hi halves. */
3048#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
3049    ({                                                 \
3050        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
3051        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
3052    })
3053
3054
/* REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c): accumulates the sum of the K0 elements
 * of a into c. Implemented as DOT_PRODUCT_INTEGER8 of a with the literal 1 cast to
 * TILE_VECTOR_TYPE<K0>(B_DATA_TYPE), i.e. a dot product against a vector of ones.
 * The _STR level exists so that K0 is macro-expanded before it is pasted into TILE_VECTOR_TYPE##K0. */
3055#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
3056#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
3057
3058
/* V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y): loads one vector of WIDTH elements from TENSOR.
 * TENSOR_TYPE is pasted into the macro name (after expansion through the _STR level) to select
 * V_LOAD_BUFFER or V_LOAD_IMAGE.
 *
 * V_LOAD_BUFFER: VLOAD(WIDTH) from the byte address
 *     TENSOR_ptr + TENSOR_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y,
 *   so X counts elements and Y counts rows of STRIDE_Y bytes.
 * V_LOAD_IMAGE: READ_IMAGE2D on TENSOR_img at coordinates (X / 4, Y); STRIDE_Y is not used.
 *   NOTE(review): the division by 4 presumably converts an element index into a 4-channel pixel index - confirm
 *   against READ_IMAGE2D and CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT, which are defined outside this chunk. */
3059#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
3060#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
3061#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
3062    VLOAD(WIDTH)                                                \
3063    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
3064#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
3065
3066
/* V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES): stores the WIDTH-element vector
 * VALUES into TENSOR. TENSOR_TYPE is pasted into the macro name (after expansion through the _STR level)
 * to select V_STORE_BUFFER or V_STORE_IMAGE.
 *
 * V_STORE_BUFFER: VSTORE(WIDTH) to the byte address
 *     TENSOR_ptr + TENSOR_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y.
 * V_STORE_IMAGE: WRITE_IMAGE2D on TENSOR_img at coordinates (X / 4, Y); STRIDE_Y is not used.
 * Neither expansion ends in a semicolon; the caller supplies it. */
3067#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
3068#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
3069#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
3070    VSTORE(WIDTH)                                                \
3071    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
3072#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
3073
3074
/* T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst):
 * fills the first HEIGHT rows of the tile dst from TENSOR.
 * For _i = 0 .. HEIGHT - 1 (unrolled through LOOP_UNROLLING) it assigns
 *     dst[_i].v = V_LOAD(..., X, Y + _i * (int)YI_MULTIPLIER, STRIDE_Y),
 * so every row starts at the same X and consecutive rows are YI_MULTIPLIER apart in Y.
 * The loop index is named _i; arguments must not rely on a different variable of that name. */
3075#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
3076    ({                                                                                                                 \
3077        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
3078        {                                                                                                              \
3079            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
3080        })                                                                                                             \
3081    })
3082
3083
/** Load a HEIGHT x WIDTH tile whose row positions come from an index tile.
 *
 * Row _i of the tile is read from tensor row indirect_y[_i].v and written to dst[_i].v.
 *
 * @param[in]  DATA_TYPE   Element data type
 * @param[in]  HEIGHT      Number of tile rows
 * @param[in]  WIDTH       Number of elements per row
 * @param[in]  TENSOR_TYPE BUFFER or IMAGE (forwarded to V_LOAD)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting x coordinate, in elements
 * @param[in]  STRIDE_Y    Row stride in bytes (forwarded to V_LOAD)
 * @param[in]  indirect_y  Index tile; indirect_y[_i].v is the tensor row for tile row _i
 * @param[out] dst         Destination tile; dst[_i].v receives row _i
 */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })
3091
3092
/** Load a tile through an index tile, choosing between a partial and a full row width.
 *
 * When WIDTH1_CONDITION is true each row is read with VLOAD_PARTIAL(WIDTH0, WIDTH1)
 * directly from the buffer address (TENSOR_TYPE is not consulted on that path);
 * otherwise each row is read with V_LOAD at width WIDTH0.
 * Rows are visited from the last (HEIGHT - 1) down to the first.
 *
 * STRIDE_Y is parenthesised in the partial-load address so that an expression
 * argument multiplies as a whole, matching V_LOAD_BUFFER.
 *
 * @param[in]  DATA_TYPE        Element data type
 * @param[in]  HEIGHT           Number of tile rows
 * @param[in]  WIDTH0           Full row width, in elements
 * @param[in]  WIDTH1           Partial row width, in elements
 * @param[in]  TENSOR_TYPE      BUFFER or IMAGE (used by the full-width path only)
 * @param[in]  TENSOR           Tensor basename
 * @param[in]  X                Starting x coordinate, in elements
 * @param[in]  STRIDE_Y         Row stride in bytes
 * @param[in]  WIDTH1_CONDITION Condition selecting the partial-width path
 * @param[out] dst              Destination tile
 * @param[in]  indirect_y       Index tile; indirect_y[r].v is the tensor row for tile row r
 */
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            }) \
        } \
    })
3111
/** Load a TILE_HEIGHT x TILE_WIDTH spatial tile of TILE_CHANNELS-wide vectors.
 *
 * For every tile position (_xk, _yk) the flattened row index is
 *   (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT
 * and the vector at channel offset C of that row is stored in dst[_xk + _yk * TILE_WIDTH].v.
 * Positions with X + _xk outside [0, TENSOR_WIDTH) or Y + _yk outside [0, TENSOR_HEIGHT)
 * are skipped, so the matching dst entry keeps whatever value it had before the call.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  TILE_HEIGHT   Number of tile rows (y direction)
 * @param[in]  TILE_WIDTH    Number of tile columns (x direction)
 * @param[in]  TILE_CHANNELS Number of channels loaded per position
 * @param[in]  TENSOR_TYPE   BUFFER or IMAGE (forwarded to V_LOAD)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Batch index
 * @param[in]  Y             Starting y coordinate
 * @param[in]  X             Starting x coordinate
 * @param[in]  C             Starting channel, in elements
 * @param[in]  TENSOR_WIDTH  Tensor extent along x
 * @param[in]  TENSOR_HEIGHT Tensor extent along y
 * @param[in]  STRIDE_Y      Stride in bytes of the flattened row index (forwarded to V_LOAD)
 * @param[out] dst           Destination tile of TILE_WIDTH * TILE_HEIGHT vectors
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })
3128
3129
/** Load a dilated TILE_HEIGHT x TILE_WIDTH spatial tile of TILE_CHANNELS-wide vectors.
 *
 * Tile position (_xk, _yk) reads the tensor at x = X + _xk * DILATION_X,
 * y = Y + _yk * DILATION_Y, batch B and channel offset C, using the tensor's own
 * _stride_y, _stride_z and _stride_w symbols. The load always goes through VLOAD on the
 * buffer pointer; TENSOR_TYPE is not consulted.
 *
 * When BOUNDARY_CHECK is false every position is loaded unconditionally. When it is true,
 * positions whose x or y falls outside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are skipped and
 * the matching dst entry keeps its previous value.
 *
 * @param[in]  DATA_TYPE      Element data type
 * @param[in]  TILE_HEIGHT    Number of tile rows (y direction)
 * @param[in]  TILE_WIDTH     Number of tile columns (x direction)
 * @param[in]  TILE_CHANNELS  Number of channels loaded per position
 * @param[in]  TENSOR_TYPE    Unused by this macro
 * @param[in]  TENSOR         Tensor basename
 * @param[in]  B              Batch index
 * @param[in]  Y              Starting y coordinate
 * @param[in]  X              Starting x coordinate
 * @param[in]  C              Starting channel, in elements
 * @param[in]  TENSOR_WIDTH   Tensor extent along x
 * @param[in]  TENSOR_HEIGHT  Tensor extent along y
 * @param[in]  DILATION_X     Step between tile columns
 * @param[in]  DILATION_Y     Step between tile rows
 * @param[in]  BOUNDARY_CHECK Non-zero to skip out-of-bounds positions
 * @param[out] dst            Destination tile of TILE_WIDTH * TILE_HEIGHT vectors
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w    = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })
3156
3157
/** Load TILE_AREA vectors from spatial positions given by per-element x/y offset tiles.
 *
 * Element _i reads the position x = X + xi[_i].v, y = Y + yi[_i].v of batch B. Its flattened
 * row index is x + y * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT and the vector at channel
 * offset C of that row is stored in dst[_i].v. Positions outside
 * [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are skipped, leaving dst[_i] unchanged.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  TILE_AREA     Number of positions to load
 * @param[in]  TILE_CHANNELS Number of channels loaded per position
 * @param[in]  TENSOR_TYPE   BUFFER or IMAGE (forwarded to V_LOAD)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Batch index
 * @param[in]  Y             Base y coordinate
 * @param[in]  X             Base x coordinate
 * @param[in]  C             Starting channel, in elements
 * @param[in]  TENSOR_WIDTH  Tensor extent along x
 * @param[in]  TENSOR_HEIGHT Tensor extent along y
 * @param[in]  STRIDE_Y      Stride in bytes of the flattened row index (forwarded to V_LOAD)
 * @param[in]  xi            Tile of x offsets, one per position
 * @param[in]  yi            Tile of y offsets, one per position
 * @param[out] dst           Destination tile of TILE_AREA vectors
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })
3171
3172
/** Load TILE_AREA vectors whose row indices are stored in the first entry of an index tile.
 *
 * T_LOAD2D_INDIRECT expands its arguments, then T_LOAD2D_INDIRECT_STR pastes TENSOR_TYPE onto
 * T_LOAD2D_INDIRECT_, so TENSOR_TYPE must expand to BUFFER or IMAGE.
 *
 * - BUFFER: element _i is loaded from row yi[0].s[_i] only when that index is >= 0; a negative
 *   index leaves dst[_i] unchanged.
 * - IMAGE: element _i is always loaded from row yi[0].s[_i], with no sign check.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  TILE_AREA     Number of rows to load
 * @param[in]  TILE_CHANNELS Number of elements loaded per row
 * @param[in]  TENSOR_TYPE   BUFFER or IMAGE
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  C             Starting channel, in elements
 * @param[in]  STRIDE_Y      Row stride in bytes (forwarded to V_LOAD)
 * @param[in]  yi            Index tile; yi[0].s[_i] is the row index of element _i
 * @param[out] dst           Destination tile of TILE_AREA vectors
 */
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })
3193
3194
/** Load TILE_AREA vectors from 3D positions given by per-element x/y/z offset tiles.
 *
 * Element _i reads the position x = X + xi[_i].v, y = Y + yi[_i].v, z = Z + zi[_i].v of
 * batch B. Its flattened row index is
 *   x + y * TENSOR_WIDTH + z * (TENSOR_WIDTH * TENSOR_HEIGHT)
 *     + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH
 * and the vector at channel offset C of that row is stored in dst[_i].v. Positions outside
 * the tensor extents in x, y or z are skipped, leaving dst[_i] unchanged.
 *
 * TENSOR_WIDTH and TENSOR_HEIGHT are parenthesised individually in the plane stride so that
 * expression arguments (for example a sum) multiply as a whole.
 *
 * @param[in]  DATA_TYPE     Element data type
 * @param[in]  TILE_AREA     Number of positions to load
 * @param[in]  TILE_CHANNELS Number of channels loaded per position
 * @param[in]  TENSOR_TYPE   BUFFER or IMAGE (forwarded to V_LOAD)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Batch index
 * @param[in]  Z             Base z coordinate
 * @param[in]  Y             Base y coordinate
 * @param[in]  X             Base x coordinate
 * @param[in]  C             Starting channel, in elements
 * @param[in]  TENSOR_WIDTH  Tensor extent along x
 * @param[in]  TENSOR_HEIGHT Tensor extent along y
 * @param[in]  TENSOR_DEPTH  Tensor extent along z
 * @param[in]  STRIDE_Y      Stride in bytes of the flattened row index (forwarded to V_LOAD)
 * @param[in]  xi            Tile of x offsets, one per position
 * @param[in]  yi            Tile of y offsets, one per position
 * @param[in]  zi            Tile of z offsets, one per position
 * @param[out] dst           Destination tile of TILE_AREA vectors
 */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * ((TENSOR_WIDTH) * (TENSOR_HEIGHT)); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })
3209
3210
/** Store a tile through an index tile, choosing between a partial and a full row width.
 *
 * Each row is converted to VEC_DATA_TYPE(DATA_TYPE, WIDTH0) and written to the buffer row
 * indirect_y[r].v. When WIDTH1_CONDITION is true the write uses VSTORE_PARTIAL(WIDTH0, WIDTH1),
 * otherwise VSTORE(WIDTH0). Both paths address the buffer pointer directly; TENSOR_TYPE is not
 * consulted. Rows are visited from the last (HEIGHT - 1) down to the first.
 *
 * STRIDE_Y is parenthesised in the address so that an expression argument multiplies as a
 * whole, matching V_STORE_BUFFER.
 *
 * @param[in] DATA_TYPE        Element data type of the destination
 * @param[in] HEIGHT           Number of tile rows
 * @param[in] WIDTH0           Full row width, in elements
 * @param[in] WIDTH1           Partial row width, in elements
 * @param[in] TENSOR_TYPE      Unused by this macro
 * @param[in] TENSOR           Tensor basename
 * @param[in] X                Starting x coordinate, in elements
 * @param[in] STRIDE_Y         Row stride in bytes
 * @param[in] WIDTH1_CONDITION Condition selecting the partial-width path
 * @param[in] src              Source tile
 * @param[in] indirect_y       Index tile; indirect_y[r].v is the tensor row for tile row r
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
    })
3230
3231
/** Add the quantization offset-correction terms to an accumulator tile.
 *
 * For every row _m0 and column _n0 this adds to dst[_m0].s[_n0]:
 *   WEI_OFFSET * sum over k of lhs[_m0].s[k]   (computed once per row in _tm)
 * + SRC_OFFSET * sum over k of rhs[_n0].s[k]
 *
 * WEI_OFFSET and SRC_OFFSET are parenthesised before the cast so that an expression
 * argument is multiplied as a whole.
 *
 * @param[in]      ACC_DATA_TYPE Accumulator data type
 * @param[in]      M0            Number of rows of @p lhs and @p dst
 * @param[in]      N0            Number of rows of @p rhs and of columns of @p dst
 * @param[in]      K0            Number of columns of @p lhs and @p rhs
 * @param[in]      SRC_OFFSET    Offset multiplied with the rhs row sums
 * @param[in]      WEI_OFFSET    Offset multiplied with the lhs row sums
 * @param[in]      lhs           Left-hand tile, accessed as lhs[m].s[k]
 * @param[in]      rhs           Right-hand tile, accessed as rhs[n].s[k]
 * @param[in, out] dst           Accumulator tile, accessed as dst[m].s[n]
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })
3251
3252
/** Quantize a tile, selecting the implementation from QUANTIZATION_TYPE.
 *
 * T_QUANTIZE8 expands its arguments, then T_QUANTIZE8_STR pastes QUANTIZATION_TYPE onto
 * T_QUANTIZE8_ and forwards the remaining eleven arguments. T_QUANTIZE8_PER_TENSOR and
 * T_QUANTIZE8_PER_CHANNEL accept that argument list.
 *
 * @param[in]  SRC_DATA_TYPE     Source element data type
 * @param[in]  DST_DATA_TYPE     Destination element data type
 * @param[in]  QUANTIZATION_TYPE Suffix of the implementation to call (for example PER_TENSOR or PER_CHANNEL)
 * @param[in]  M0                Number of tile rows
 * @param[in]  N0                Number of tile columns
 * @param[in]  DST_OFFSET        Offset added after requantization
 * @param[in]  DST_SHIFT         Per-tensor shift
 * @param[in]  DST_MULTIPLIER    Per-tensor multiplier
 * @param[in]  src               Source tile
 * @param[in]  dst_multipliers   Tile of per-channel multipliers
 * @param[in]  dst_shifts        Tile of per-channel shifts
 * @param[out] dst               Destination tile
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
3255
3256
/** Requantize every element of a tile with one multiplier/shift pair for the whole tensor.
 *
 * Per element:
 *  -# If DST_SHIFT is negative, the input is first multiplied by 1 << (-DST_SHIFT).
 *  -# The input is multiplied by DST_MULTIPLIER in 64 bits, a rounding nudge is added
 *     (1 << 30 for a non-negative product, 1 - (1 << 30) otherwise) and the sum is divided
 *     by 2^31. The result is replaced by INT_MAX when both operands equal INT_MIN.
 *  -# If DST_SHIFT is non-negative, the value is shifted right by DST_SHIFT and 1 is added
 *     when the discarded bits exceed the threshold (mask >> 1, plus 1 for negative values).
 *  -# DST_OFFSET is added and the value is converted to DST_DATA_TYPE with saturation.
 *
 * @p dst_multipliers and @p dst_shifts are accepted for signature compatibility with the
 * T_QUANTIZE8 dispatcher but are not read here.
 *
 * @param[in]  SRC_DATA_TYPE   Source element data type
 * @param[in]  DST_DATA_TYPE   Destination element data type
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of tile columns
 * @param[in]  DST_OFFSET      Offset added after requantization
 * @param[in]  DST_SHIFT       Shift; negative values pre-scale the input instead
 * @param[in]  DST_MULTIPLIER  Fixed-point multiplier
 * @param[in]  src             Source tile, accessed as src[m].s[n]
 * @param[in]  dst_multipliers Unused
 * @param[in]  dst_shifts      Unused
 * @param[out] dst             Destination tile, accessed as dst[m].s[n]
 */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* Negative shift: pre-scale the input by 2^(-DST_SHIFT) */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                /* 64-bit product, rounded and divided by 2^31 */ \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                /* Non-negative shift: rounding right shift */ \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
3287
3288
/** Requantize every element of a tile with a multiplier/shift pair per column.
 *
 * Column _n0 uses dst_multipliers[0].s[_n0] and dst_shifts[0].s[_n0]. Per element:
 *  -# If the shift is negative, the input is first multiplied by 1 << (-shift).
 *  -# The input is multiplied by the multiplier in 64 bits, a rounding nudge is added
 *     (1 << 30 for a non-negative product, 1 - (1 << 30) otherwise) and the sum is divided
 *     by 2^31. The result is replaced by INT_MAX when both operands equal INT_MIN.
 *  -# The rounding right shift is always computed into _tmp2 and then selected only when
 *     the shift is non-negative, so the code stays branch-free. The rounding threshold is
 *     (mask >> 1) + any(_tmp); the OpenCL any() built-in tests the most significant bit,
 *     so it contributes 1 for negative values.
 *  -# DST_OFFSET is added and the value is converted to DST_DATA_TYPE with saturation.
 *
 * DST_SHIFT and DST_MULTIPLIER are accepted for signature compatibility with the
 * T_QUANTIZE8 dispatcher but are not read here.
 *
 * @param[in]  SRC_DATA_TYPE   Source element data type
 * @param[in]  DST_DATA_TYPE   Destination element data type
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of tile columns
 * @param[in]  DST_OFFSET      Offset added after requantization
 * @param[in]  DST_SHIFT       Unused
 * @param[in]  DST_MULTIPLIER  Unused
 * @param[in]  src             Source tile, accessed as src[m].s[n]
 * @param[in]  dst_multipliers Tile whose first row holds one multiplier per column
 * @param[in]  dst_shifts      Tile whose first row holds one shift per column
 * @param[out] dst             Destination tile, accessed as dst[m].s[n]
 */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                /* Negative shift: pre-scale the input by 2^(-shift) */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                /* 64-bit product, rounded and divided by 2^31 */ \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                /* Rounding right shift, kept only when the shift is non-negative */ \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
3321
3322
/** Requantize every element of a tile with one multiplier/shift pair (nine-argument form).
 *
 * Performs the same per-element steps as T_QUANTIZE8_PER_TENSOR but takes no
 * dst_multipliers / dst_shifts arguments. Because it has nine parameters it cannot be reached
 * through the T_QUANTIZE8 dispatcher, which always forwards eleven; call it directly.
 *
 * Per element:
 *  -# If DST_SHIFT is negative, the input is first multiplied by 1 << (-DST_SHIFT).
 *  -# The input is multiplied by DST_MULTIPLIER in 64 bits, a rounding nudge is added
 *     (1 << 30 for a non-negative product, 1 - (1 << 30) otherwise) and the sum is divided
 *     by 2^31. The result is replaced by INT_MAX when both operands equal INT_MIN.
 *  -# If DST_SHIFT is non-negative, the value is shifted right by DST_SHIFT and 1 is added
 *     when the discarded bits exceed the threshold (mask >> 1, plus 1 for negative values).
 *  -# DST_OFFSET is added and the value is converted to DST_DATA_TYPE with saturation.
 *
 * @param[in]  SRC_DATA_TYPE  Source element data type
 * @param[in]  DST_DATA_TYPE  Destination element data type
 * @param[in]  M0             Number of tile rows
 * @param[in]  N0             Number of tile columns
 * @param[in]  DST_OFFSET     Offset added after requantization
 * @param[in]  DST_SHIFT      Shift; negative values pre-scale the input instead
 * @param[in]  DST_MULTIPLIER Fixed-point multiplier
 * @param[in]  src            Source tile, accessed as src[m].s[n]
 * @param[out] dst            Destination tile, accessed as dst[m].s[n]
 */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* Negative shift: pre-scale the input by 2^(-DST_SHIFT) */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                /* 64-bit product, rounded and divided by 2^31 */ \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                /* Non-negative shift: rounding right shift */ \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
3353
3354
/** Overwrite whole rows of a tile with a constant where a per-row mask is zero.
 *
 * For every element of row _m0, a[_m0].s[_n0] becomes VALUE_TO_SET when mask[_m0].v equals
 * (DATA_TYPE)0 and is left unchanged otherwise. The choice is made with select(), so no
 * branch is emitted.
 *
 * @param[in]      DATA_TYPE    Element data type of @p a
 * @param[in]      M0           Number of tile rows
 * @param[in]      N0           Number of tile columns
 * @param[in]      VALUE_TO_SET Value written into masked rows
 * @param[in, out] a            Tile to update, accessed as a[m].s[n]
 * @param[in]      mask         Per-row mask, accessed as mask[m].v
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })
3365
3366
/** Apply an activation function to every row of a tile.
 *
 * Row _m0 of @p dst is set to ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL);
 * ACTIVATION is defined elsewhere in this file.
 *
 * @param[in]  DATA_TYPE       Element data type
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of elements per row
 * @param[in]  ACTIVATION_TYPE Activation selector forwarded to ACTIVATION
 * @param[in]  A_VAL           First activation parameter
 * @param[in]  B_VAL           Second activation parameter
 * @param[in]  src             Source tile
 * @param[out] dst             Destination tile
 */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })
3374
3375
/** Activation expressions used by ACTIVATION_QUANTIZED.
 *
 * Every op takes the same argument list (DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) so
 * that ACT_OP_QUANTIZED can select one by pasting the op name onto _op_quantized. Arguments an
 * op does not need are ignored. Each argument is parenthesised where it is used so that
 * expression arguments keep their meaning after expansion.
 *
 * - relu:       max(ZERO_VALUE, x)
 * - brelu:      min(A_VAL, max(ZERO_VALUE, x))
 * - lu_brelu:   min(max(x, B_VAL), A_VAL)
 * - hard_swish: x * (min(max(x + 3, 0), 6) * 0.166666667)
 * - identity:   x
 */
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)(ZERO_VALUE), (x)))

#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)(ZERO_VALUE), (x))))

#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ((x) * ((min(max((DATA_TYPE)((x) + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

/** Select and apply a quantized activation.
 *
 * ACTIVATION_QUANTIZED expands its arguments first; ACT_OP_QUANTIZED then pastes @p op onto
 * _op_quantized, so @p op must be one of relu, brelu, lu_brelu, hard_swish or identity.
 *
 * @param[in] op         Activation name
 * @param[in] DATA_TYPE  Data type of @p x
 * @param[in] VEC_SIZE   Vector size (not used by the ops above)
 * @param[in] ZERO_VALUE Lower clamp used by relu and brelu
 * @param[in] A_VAL      Upper clamp used by brelu and lu_brelu
 * @param[in] B_VAL      Lower clamp used by lu_brelu
 * @param[in] x          Input value
 *
 * @return the activated value
 */
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
3388
/** Element-wise binary operators, usable as the operator argument of the
 *  T_ELTWISE_BROADCAST_* macros. Both operands are parenthesised before the operator is applied.
 *
 * @param[in] A_VAL Left operand
 * @param[in] B_VAL Right operand
 *
 * @return A_VAL combined with B_VAL using +, -, / or * respectively
 */
#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
3393
3394
/** Apply a quantized activation function to every row of a tile.
 *
 * Row _m0 of @p dst is set to
 * ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v).
 *
 * @param[in]  DATA_TYPE       Element data type
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of elements per row
 * @param[in]  ACTIVATION_TYPE Activation name understood by ACTIVATION_QUANTIZED
 * @param[in]  ZERO_VALUE      Zero point forwarded to the activation
 * @param[in]  A_VAL           First activation parameter
 * @param[in]  B_VAL           Second activation parameter
 * @param[in]  src             Source tile
 * @param[out] dst             Destination tile
 */
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })
3402
3403
/** Add two tiles row by row: dst[_m0].v = lhs[_m0].v + rhs[_m0].v.
 *
 * @param[in]  DATA_TYPE Element data type (not used in the expansion)
 * @param[in]  M0        Number of tile rows
 * @param[in]  N0        Number of elements per row (not used in the expansion)
 * @param[in]  lhs       Left-hand tile
 * @param[in]  rhs       Right-hand tile
 * @param[out] dst       Destination tile
 */
#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        }) \
    })
3411
3412
/** Add a constant to every row of a tile: dst[_m0].v = lhs[_m0].v + (DATA_TYPE)(rhs_constant).
 *
 * @p rhs_constant is parenthesised before the cast so that an expression argument is
 * converted and added as a whole.
 *
 * @param[in]  DATA_TYPE    Data type the constant is cast to
 * @param[in]  M0           Number of tile rows
 * @param[in]  N0           Number of elements per row (not used in the expansion)
 * @param[in]  lhs          Source tile
 * @param[in]  rhs_constant Constant to add
 * @param[out] dst          Destination tile
 */
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)(rhs_constant); \
        }) \
    })
3420
/* Named shorthands for the broadcast element-wise macros. The *_LHS_X_* forms
 * forward to T_ELTWISE_BROADCAST_LHS_X (row 0 of lhs is reused for every row);
 * all other forms forward to T_ELTWISE_BROADCAST_X (row 0 of rhs is reused).
 * T_ELTWISE_BROADCAST_ADD_X and T_ELTWISE_BROADCAST_RHS_X_ADD expand to the
 * same thing.
 */
3421#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3422#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3423#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3424
/* Subtraction: the result is always lhs - rhs; only the broadcast side differs. */
3425#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3426#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3427
/* Division with the rhs row broadcast. */
3428#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3429
/* Multiplication with either side broadcast. */
3430#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3431#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3432
3433
/* Multiplies every row of a tile by a scalar constant:
 *   dst[i].v = lhs[i].v * (DATA_TYPE)(rhs_constant)   for i in [0, M0).
 *
 * DATA_TYPE     scalar type the constant is cast to before the multiplication
 * M0            number of rows to process
 * N0            accepted for signature symmetry; not used by the expansion
 * lhs, dst      tiles indexed as tile[i].v
 * rhs_constant  constant expression; parenthesised inside the cast so that a
 *               compound argument such as `a + b` scales the row by the whole
 *               sum rather than expanding to `lhs * (T)a + b`
 *
 * Relies on LOOP_UNROLLING, which is defined outside this section.
 */
#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({                                                              \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                          \
        {                                                           \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)(rhs_constant);    \
        })                                                          \
    })
3441
3442
/* Element-wise binary operation with the right-hand side broadcast along rows:
 *   dst[i].v = T_ELWISE_OP(convert(lhs[i].v), convert(rhs[0].v))  for i in [0, M0).
 * Both operands are converted to an N0-wide vector of DST_DATA_TYPE before the
 * operator is applied. Only row 0 of rhs is ever read.
 */
3443#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3444    ({                                                      \
3445        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3446        {                                                   \
3447            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3448        })                                                  \
3449    })
3450
3451
/* Element-wise binary operation with the left-hand side broadcast along rows:
 *   dst[i].v = T_ELWISE_OP(convert(lhs[0].v), convert(rhs[i].v))  for i in [0, M0).
 * Operand order is preserved (lhs stays the first operand), which matters for
 * non-commutative operators. Only row 0 of lhs is ever read.
 */
3452#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3453    ({                                                      \
3454        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3455        {                                                   \
3456            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3457        })                                                  \
3458    })
3459
/* Named shorthands that bind T_ELTWISE to one of the V_* operators. */
3460#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3461#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3462#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3463#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
3464
3465
/* Element-wise binary operation on two tiles with matching rows:
 *   dst[i].v = T_ELWISE_OP(convert(lhs[i].v), convert(rhs[i].v))  for i in [0, M0).
 * Both operands are converted to an N0-wide vector of DST_DATA_TYPE first.
 */
3466#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
3467    ({                                                      \
3468        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3469        {                                                   \
3470            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3471        })                                                  \
3472    })
3473
3474
/* Row-wise floor: each of the M0 rows of `src` is converted to an N0-wide
 * vector of DST_DATA_TYPE, passed through floor(), and stored in `dst`.
 */
3475#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
3476    ({                                                      \
3477        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
3478        {                                                   \
3479            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
3480        })                                                  \
3481    })
3482
3483
/* Tile matrix-multiply dispatch. T_MMUL pastes the two layout tokens to select
 * T_MMUL_<LHS_LAYOUT>_<RHS_LAYOUT>; only the NT/T combination is defined in this
 * section. T_MMUL_NT_T then pastes the three data-type tokens to select a
 * type-specific entry, each of which forwards to either the floating-point or
 * the 8-bit integer implementation. A type combination without an entry below
 * expands to an undefined macro name.
 */
3484#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3485#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3486#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3487#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3488#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3489#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3490#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
3491#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/* Floating-point tile multiply-accumulate:
 *   dst[m].s[n] = fma(lhs[m].s[k], rhs[n].s[k], dst[m].s[n])
 * for every m in [0, M0), n in [0, N0), k in [0, K0). Both tiles are indexed by
 * k in their second dimension, so row m of lhs is combined with row n of rhs.
 * Operands are cast to DST_DATA_TYPE before the fma. The macro accumulates into
 * the existing contents of dst, so dst must hold a defined value on entry.
 * Unlike the neighbouring tile macros, the expansion is a plain brace block
 * rather than a parenthesised statement expression.
 */
3492#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
3493    {                                                                                     \
3494        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
3495        {                                                                                 \
3496            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
3497            {                                                                             \
3498                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
3499                {                                                                         \
3500                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
3501                })                                                                        \
3502            })                                                                            \
3503        })                                                                                \
3504    }
3505
/* 8-bit integer tile multiply. For every (m, n) pair with m in [0, M0) and n in
 * [0, N0) it invokes DOT_PRODUCT_INTEGER8 on the K0-wide vectors lhs[m].v and
 * rhs[n].v, passing dst[m].s[n] as the final argument.
 * NOTE(review): DOT_PRODUCT_INTEGER8 is defined outside this section; it
 * presumably accumulates the dot product into its last argument - confirm
 * against its definition before relying on accumulate-vs-overwrite behaviour.
 */
3506#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
3507    ({ \
3508        LOOP_UNROLLING(int, _m, 0, 1, M0, \
3509        { \
3510            LOOP_UNROLLING(int, _n, 0, 1, N0, \
3511            { \
3512                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
3513            })                                                                                             \
3514        })                                                                                             \
3515    })
3516
3517#endif
3518
/* POOL_OP combines the running result with one more sample: a sum for average
 * and L2 pooling, fmax for every other configuration (max pooling).
 */
3519#if defined(POOL_AVG) || defined(POOL_L2)
3520#define POOL_OP(x, y) ((x) + (y))
3521#else
3522#define POOL_OP(x, y) (fmax((x), (y)))
3523#endif
3524
/* POW2_OP squares its operand for L2 pooling and is the identity otherwise.
 * The vec_size argument is not used by either definition.
 */
3525#if defined(POOL_L2)
3526#define POW2_OP(x, vec_size) ((x) * (x))
3527#else
3528#define POW2_OP(x, vec_size) (x)
3529#endif
3530
/* DIV_OP(x, y): x divided by y, computed as a multiplication by the reciprocal
 * 1.f / y. Both arguments are parenthesised so that compound expressions such
 * as DIV_OP(a + b, c) or DIV_OP(a, b + c) divide the whole numerator by the
 * whole denominator.
 */
#define DIV_OP(x, y) ((x) * (1.f / (y)))
/* SQRT_OP(x): square root of x via the sqrt built-in. */
#define SQRT_OP(x) sqrt((x))
3533
3534#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
3535
3536#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
3537
/**
 * Pooling over a POOL_SIZE_X x POOL_SIZE_Y window.
 *
 * Each work-item produces one vector of VEC_SIZE channels for a single output
 * position. Global id 0 selects the channel block, global id 1 the output x
 * coordinate, and global id 2 the output y coordinate (combined with the batch
 * index when DST_BATCH_SIZE != 1).
 *
 * The pooling flavour is chosen at build time: POOL_AVG and POOL_L2 accumulate
 * a sum (of squares for L2) and divide by the filter size, with L2 also taking
 * the square root; otherwise POOL_OP is fmax. Accumulation is done in
 * ACC_DATA_TYPE.
 *
 * Build-time symbols read here, beyond those tested by the enclosing #if:
 * DATA_TYPE, INITIAL_VALUE, STRIDE_X, STRIDE_Y, PAD_X, PAD_Y, and optionally
 * EXCLUDE_PADDING and FP_MIXED_PRECISION.
 *
 * @param input   source tensor (pointer, strides and offset are expanded by
 *                TENSOR4D_DECLARATION, defined outside this section)
 * @param output  destination tensor
 */
3538__kernel void pooling_layer_MxN_nhwc(
3539    TENSOR4D_DECLARATION(input),
3540    TENSOR4D_DECLARATION(output))
3541{
3542
3543
    // Output coordinates of this work-item.
    // NOTE(review): GET_SPATIAL_IDX is defined outside this section; presumably it
    // scales the global id by its second argument and compensates for a leftover block.
3544    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
3545    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
3546#if DST_BATCH_SIZE != 1
3547
    // Dimension 2 carries height and batch together: split them again.
3548    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
3549    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
3550#else
3551    int idx_out_h   = GET_SPATIAL_IDX(2, 1, 0);
3552    int idx_out_n   = 0;
3553#endif
3554
    // Input base address for this channel block and batch; the x/y offsets are added per sample.
3555    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
3556
    // Output address of the vector written by this work-item.
3557    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
3558                                           output_stride_w;
3559
    // Accumulator, seeded with the build-time INITIAL_VALUE.
3560    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
3561    res0 = INITIAL_VALUE;
3562
    // Top-left input coordinate of the window; negative when it starts inside the padding.
3563    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
3564    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
3565
    // Window-relative [start, end) ranges clipped so that every sample lies inside the source.
3566    int pool_x_s = max((int)0, -idx_in_w);
3567    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
3568    int pool_y_s = max((int)0, -idx_in_h);
3569    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
3570
    // Divisor for average/L2 pooling: the clipped area when padding is excluded,
    // otherwise the full window area.
3571#if defined(EXCLUDE_PADDING)
3572    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
3573#else
3574    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
3575#endif
3576
    // The two branches below only select the loop headers; the loop body after the
    // #endif is shared. When the window equals the whole source plane and there is
    // no padding, the loop bounds are compile-time constants.
3577#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
3578
3579    for(int y = 0; y < POOL_SIZE_Y; ++y)
3580    {
3581#pragma unroll 8
3582        for(int x = 0; x < POOL_SIZE_X; ++x)
3583        {
3584#else
3585    for(int y = pool_y_s; y < pool_y_e; ++y)
3586    {
3587#pragma unroll 8
3588        for(int x = pool_x_s; x < pool_x_e; ++x)
3589        {
3590#endif
3591            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
3592            data0;
3593#if defined(FP_MIXED_PRECISION)
3594
            // Load in DATA_TYPE and widen to the accumulator type.
3595            data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
3596#else
3597            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
3598#endif
3599
3600#if defined(POOL_L2)
3601
            // L2 pooling accumulates squared samples.
3602            data0 *= data0;
3603#endif
3604            res0 = POOL_OP(res0, data0);
3605        }
3606    }
3607
    // Turn the sum into a mean for average and L2 pooling.
3608#if defined(POOL_AVG) || defined(POOL_L2)
3609    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
3610#endif
3611
3612#if defined(POOL_L2)
3613
    // L2 pooling: square root of the mean of squares.
3614    res0 = SQRT_OP(res0);
3615#endif
3616
3617
    // Store the result. In mixed precision the accumulator is first narrowed back to DATA_TYPE.
    // NOTE(review): STORE_VECTOR_SELECT is defined outside this section; presumably it
    // appends 0 to the base name and writes only VEC_SIZE_LEFTOVER lanes when the
    // last argument is true - confirm against its definition.
3618#if defined(FP_MIXED_PRECISION)
3619    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
3620    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
3621    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
3622#else
3623    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
3624#endif
3625}
3626#endif
3627
/* Vector type used for the condition operand of select() on accumulator
 * vectors. SELECT_VEC_DATA_TYPE is defined outside this section.
 */
3628#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
3629
3630
/**
 * Pooling specialised for a 2x2 window.
 *
 * Each work-item produces one vector of VEC_SIZE channels for a single output
 * position by loading the four window taps explicitly instead of looping.
 * The pooling flavour is chosen at build time exactly as for POOL_OP: sum-based
 * for POOL_AVG / POOL_L2, fmax otherwise.
 *
 * When both EXTRACT_MAX_INDEX and POOL_MAX are defined the kernel takes a third
 * tensor and additionally writes, for every output element, the index of the
 * input element that supplied the maximum.
 *
 * @param input    source tensor
 * @param output   destination tensor
 * @param indices  (only with EXTRACT_MAX_INDEX && POOL_MAX) uint tensor receiving
 *                 the index of the selected input element
 */
3631__kernel void pooling_layer_2x2_nhwc(
3632    TENSOR4D_DECLARATION(input),
3633    TENSOR4D_DECLARATION(output)
3634#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
3635    ,
3636    TENSOR4D_DECLARATION(indices)
3637#endif
3638)
3639{
3640
3641
    // First channel handled by this work-item: global id 0 times VEC_SIZE, shifted
    // back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped at 0.
3642    int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
3643    int idx_out_w = get_global_id(1);
3644#if DST_BATCH_SIZE != 1
3645
    // Dimension 2 carries height and batch together: split them again.
3646    int idx_out_h = get_global_id(2) % DST_HEIGHT;
3647    int idx_out_n = get_global_id(2) / DST_HEIGHT;
3648#else
3649    int idx_out_h = get_global_id(2);
3650    int idx_out_n = 0;
3651#endif
3652
    // Top-left input coordinate of the window; negative when it starts inside the padding.
3653    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
3654    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
3655
    // Input base address for this channel block and batch.
3656    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
3657
    // Output address of the vector written by this work-item.
3658    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
3659                                           output_stride_w;
3660
    // Window-relative [start, end) ranges clipped to the source extent.
3661    int pool_x_s = max((int)0, -idx_in_w);
3662    int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
3663    int pool_y_s = max((int)0, -idx_in_h);
3664    int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
3665
    // Number of window taps that fall inside the source (4 when nothing is clipped).
3666    int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
3667
    // Absolute coordinates of the first and last clipped taps on each axis. When
    // one side of the window is clipped, x0 == x1 (or y0 == y1), so the four loads
    // below re-read a valid element instead of touching the padding.
3668    int x0 = pool_x_s + idx_in_w;
3669    int y0 = pool_y_s + idx_in_h;
3670    int x1 = pool_x_e - 1 + idx_in_w;
3671    int y1 = pool_y_e - 1 + idx_in_h;
3672
    // NOTE(review): REPEAT_VAR_INIT_TO_CONST is defined outside this section;
    // presumably it declares data0..data3 initialised to 0.
3673    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
3674
    // Load the four taps: data0 = (x0, y0), data1 = (x1, y0), data2 = (x0, y1), data3 = (x1, y1).
3675#if defined(FP_MIXED_PRECISION)
3676
3677    data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
3678    data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
3679    data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
3680    data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
3681#else
3682    data0         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
3683    data1         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
3684    data2         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
3685    data3         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
3686#endif
3687
    // For sum-based pooling a duplicated tap would be counted twice, so taps that
    // lie in the padding are replaced by INITIAL_VALUE. Max pooling skips this
    // because a duplicated element cannot change the maximum.
3688#if !defined(POOL_MAX)
3689    if(filter_size != 4)
3690    {
        // *_s: the window starts before the source on that axis (first tap is padding).
        // *_e: the window's second tap lies past the last source element on that axis.
3691        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
3692        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
3693        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
3694        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
3695
3696
3697        data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
3698        data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
3699        data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
3700        data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
3701    }
3702#endif
3703
3704#if defined(POOL_L2)
3705
    // L2 pooling accumulates squared samples.
3706    data0 *= data0;
3707    data1 *= data1;
3708    data2 *= data2;
3709    data3 *= data3;
3710#endif
3711
    // Reduce the four taps with the configured pooling operator.
3712    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
3713    res0 = data0;
3714    res0 = POOL_OP(res0, data1);
3715    res0 = POOL_OP(res0, data2);
3716    res0 = POOL_OP(res0, data3);
3717
    // Mean for average/L2 pooling: divide by the in-bounds tap count when padding
    // is excluded, otherwise by the full window size of 4.
3718#if defined(POOL_AVG) || defined(POOL_L2)
3719#if defined(EXCLUDE_PADDING)
3720    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
3721#else
3722    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
3723#endif
3724#endif
3725
3726#if defined(POOL_L2)
3727
    // L2 pooling: square root of the mean of squares.
3728    res0 = SQRT_OP(res0);
3729#endif
3730
3731
    // Store the pooled vector. In mixed precision the accumulator is first narrowed to DATA_TYPE.
    // NOTE(review): STORE_VECTOR_SELECT is defined outside this section; presumably it
    // writes only VEC_SIZE_LEFTOVER lanes when its last argument is true.
3732#if defined(FP_MIXED_PRECISION)
3733    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
3734    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
3735    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
3736#else
3737    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
3738#endif
3739
3740#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
3741
3742
3743
3744
3745
    // Per-lane channel index: idx_out_c plus a per-lane offset.
    // NOTE(review): VEC_OFFS is defined outside this section; presumably it yields
    // the lane numbers 0 .. VEC_SIZE-1.
3746    VEC_DATA_TYPE(uint, VEC_SIZE)
3747    base_index = (uint)idx_out_c;
3748
3749    base_index += VEC_OFFS(uint, VEC_SIZE);
3750
    // Linear index of each tap: channel + x * DST_CHANNELS + y * (DST_CHANNELS * SRC_WIDTH).
    // The batch index idx_out_n does not enter this computation.
3751    VEC_DATA_TYPE(uint, VEC_SIZE)
3752    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
3753    VEC_DATA_TYPE(uint, VEC_SIZE)
3754    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
3755    VEC_DATA_TYPE(uint, VEC_SIZE)
3756    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
3757    VEC_DATA_TYPE(uint, VEC_SIZE)
3758    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
3759
    // Two-stage arg-max: winner of the top row into index0, winner of the bottom
    // row into index1, then the overall winner into index0. The comparisons are
    // greater-or-equal, so ties resolve to the lower-numbered tap.
3760    index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
3761    index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
3762    index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
3763
    // Address in the indices tensor, laid out like the output but with uint elements.
3764    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
3765                                           indices_stride_w;
3766
3767
3768    STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
3769#endif
3770}
3771#endif  )"