/*
 * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS

// *INDENT-OFF*
// clang-format off

#define TILE_VECTOR_SIZE1 1
#define TILE_VECTOR_SIZE2 2
#define TILE_VECTOR_SIZE3 3
#define TILE_VECTOR_SIZE4 4
#define TILE_VECTOR_SIZE5 8
#define TILE_VECTOR_SIZE6 8
#define TILE_VECTOR_SIZE7 8
#define TILE_VECTOR_SIZE8 8
#define TILE_VECTOR_SIZE9 16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16

#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16

/** Tile object
 *  A tile object is a 2D memory block and can be accessed using the following syntax:
 *  -# a[m0].v       = access the vector at row "m0" (OpenCL vector)
 *  -# a[m0].s[n0]   = access the scalar element at row "m0" and column "n0" (scalar access)
 *
 * @param[in] DATA_TYPE Data type of the tile
 * @param[in] H         Number of tile rows
 * @param[in] W         Number of tile columns
 * @param[in] BASENAME  Tile's name
 */
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME)                     \
    union {                                                     \
        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];  \
        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                       \
    } BASENAME[H]
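
/* Illustrative usage sketch (not part of the original header; the tile name "acc" is hypothetical):
 *
 *     TILE(float, 4, 3, acc);        // 4 rows, each stored as a float3 vector
 *     LOOP_UNROLLING(int, m0, 0, 1, 4,
 *     {
 *         acc[m0].v = (float3)0.0f;  // vector access to row m0
 *     })
 *     acc[2].s[1] = 1.0f;            // scalar access to row 2, column 1
 */
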
#define TENSOR4D_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint            name##_stride_x,  \
    uint            name##_step_x,    \
    uint            name##_stride_y,  \
    uint            name##_step_y,    \
    uint            name##_stride_z,  \
    uint            name##_step_z,    \
    uint            name##_stride_w,  \
    uint            name##_step_w,    \
    uint            name##_offset_first_element_in_bytes

#define TENSOR4D_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

#define TENSOR4D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_T_BUFFER(name)  \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)

/** Legacy tensor 4D arguments
 *
 * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
 * @param[in] type Tensor type (BUFFER or IMAGE)
 */
#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
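
/* Illustrative usage sketch (not part of the original header; the kernel and tensor names are hypothetical):
 *
 *     __kernel void my_kernel(TENSOR4D_T(src, BUFFER))
 *     {
 *         // src_ptr, src_stride_y/z/w, src_c, src_w, src_h, src_n and
 *         // src_offset_first_element_in_bytes are now available as kernel arguments
 *     }
 */
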
#define TENSOR4D_RO_T_IMAGE(name)     \
    __read_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)

/** Read-Only (RO) tensor 4D.
 *
 * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
 * @param[in] type Tensor type (BUFFER or IMAGE)
 */
#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)

#define TENSOR4D_WO_T_IMAGE(name)      \
    __write_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)

/** Write-Only (WO) tensor 4D.
 *
 * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
 * @param[in] type Tensor type (BUFFER or IMAGE)
 */
#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)

#define TENSOR3D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_T_BUFFER(name)  \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)

#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

#define LOOP_UNROLLING_1(idx, step, macro) (macro)
#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        type idx = start;                                      \
        LOOP_UNROLLING_##num(idx, step, macro);                \
    }
#else // !defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        _Pragma("unroll")                                      \
        for(type idx = start; idx < (num * step); idx += step) \
        {                                                      \
            (macro);                                           \
        }                                                      \
    }
#endif // !defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
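
/* Illustrative usage sketch (not part of the original header):
 *
 *     // Compile-time unrolled equivalent of: for(int i = 0; i < 4; ++i) { sum += i; }
 *     int sum = 0;
 *     LOOP_UNROLLING(int, i, 0, 1, 4,
 *     {
 *         sum += i;
 *     })
 *     // sum == 6
 */
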
/** Get the get_global_id with partial N0. This function is useful when the dimension is not a multiple of N0 and we need to use a partial N0
 *  to avoid out-of-bound read/write
 *
 * @note PARTIAL_N0 is used for get_global_id(IDX) = 0.
 *
 * @param[in] IDX        get_global_id index (0, 1 and 2 only)
 * @param[in] N0         Number of elements read/written in the IDX direction
 * @param[in] PARTIAL_N0 Number of elements read/written in the IDX direction for get_global_id(IDX) = 0. If zero,
 *                       the number of elements read/written in the IDX direction for get_global_id(IDX) = 0 is N0
 */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
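
/* Illustrative usage sketch (not part of the original header):
 *
 *     // N0 = 4 elements per work-item on a dimension of size 10, so PARTIAL_N0 = 10 % 4 = 2.
 *     // Work-item 0 starts at column 0, the following ones start at 2, 6, ... so the last
 *     // block of 4 never reads/writes past column 9.
 *     const int x = GET_SPATIAL_IDX(0, 4, 2);
 */
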
/** Dot product integer 8-bit function
 *
 *  @note Performs: c += dot(a, b)
 *
 * @param[in] A_DATA_TYPE A (lhs) data type
 * @param[in] B_DATA_TYPE B (rhs) data type
 * @param[in] C_DATA_TYPE C (accumulator) data type
 * @param[in] K0          Number of accumulations
 * @param[in] a           OpenCL vector a
 * @param[in] b           OpenCL vector b
 * @param[in] c           Scalar variable c
 */
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                            \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
    })
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) //  defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                  \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                         \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2;                        \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \
    ({                                                    \
        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
    })
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);       \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                             \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);   \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                       \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);               \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);             \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);           \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);         \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);       \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                                     \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);   \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                                                       \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
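
/* Illustrative usage sketch (not part of the original header):
 *
 *     char8 a = (char8)(1);
 *     char8 b = (char8)(2);
 *     int   c = 0;
 *     DOT_PRODUCT_INTEGER8(char, char, int, 8, a, b, c);   // c += a.s0*b.s0 + ... + a.s7*b.s7 -> c == 16
 */
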
/** Reduction (sum) of an integer 8-bit vector
 *
 *  @note Performs: c += reduce(a), i.e. the sum of the elements of vector a.
 *        Implemented as a dot product between a and a vector of ones.
 *
 * @param[in] A_DATA_TYPE A (lhs) data type
 * @param[in] B_DATA_TYPE B (rhs) data type
 * @param[in] C_DATA_TYPE C (accumulator) data type
 * @param[in] K0          Number of accumulations
 * @param[in] a           OpenCL vector a
 * @param[in] c           Scalar variable c
 */
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
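
/* Illustrative usage sketch (not part of the original header):
 *
 *     uchar16 a = (uchar16)(3);
 *     int     c = 0;
 *     REDUCE_INTEGER8(uchar, uchar, int, 16, a, c);   // c += sum of the 16 elements of a -> c == 48
 */
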
/** Load a vector from global memory (tensor)
 *
 * @param[in] DATA_TYPE   Data type
 * @param[in] WIDTH       Number of dst columns
 * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                        In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in] TENSOR      Tensor basename
 * @param[in] X           Starting X position
 * @param[in] Y           Starting Y position
 * @param[in] STRIDE_Y    Stride Y (in bytes)
 */
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH)                                                \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
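
/* Illustrative usage sketch (not part of the original header; "src" is a hypothetical tensor passed as TENSOR4D_T(src, BUFFER)):
 *
 *     // Load 4 floats starting at column 0 of row 3
 *     float4 row = V_LOAD(float, 4, BUFFER, src, 0, 3, src_stride_y);
 */
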
/** Store a vector in global memory (tensor)
 *
 * @param[in] DATA_TYPE   Data type
 * @param[in] WIDTH       Number of dst columns
 * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                        In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in] TENSOR      Tensor basename
 * @param[in] X           Starting X position
 * @param[in] Y           Starting Y position
 * @param[in] STRIDE_Y    Stride Y (in bytes)
 * @param[in] VALUES      Values to store in memory
 */
#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH)                                                        \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
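
/* Illustrative usage sketch (not part of the original header; "dst" is a hypothetical tensor passed as TENSOR4D_T(dst, BUFFER)):
 *
 *     // Store 4 floats starting at column 8 of row 0
 *     float4 vals = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
 *     V_STORE(float, 4, BUFFER, dst, 8, 0, dst_stride_y, vals);
 */
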
/** Load a tile from global memory (tensor)
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  HEIGHT        Number of dst rows
 * @param[in]  WIDTH         Number of dst columns
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                           In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  X             Starting X position
 * @param[in]  Y             Starting Y position
 * @param[in]  YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
 *                           In common cases it should be 1, but it becomes useful when we want to load rows that are multiples of STRIDE_Y apart (e.g. loading the weights of a convolution layer).
 *                           In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y
 * @param[in]  STRIDE_Y      Stride Y (in bytes) used to load each row.
 * @param[out] dst           Output tile
 */
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
    ({                                                                                                                 \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
        {                                                                                                              \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        })                                                                                                             \
    })
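
/* Illustrative usage sketch (not part of the original header; "lhs" is a hypothetical tensor passed as
 * TENSOR4D_T(lhs, BUFFER) and x/y are precomputed starting coordinates):
 *
 *     // Load a 4x8 half-precision block whose top-left element is at (x, y)
 *     TILE(half, 4, 8, in);
 *     T_LOAD(half, 4, 8, BUFFER, lhs, x, y, 1, lhs_stride_y, in);
 */
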
/** Load a tile from global memory (tensor) using an indirect Y index tile
 *
 * @param[in]  DATA_TYPE   Data type
 * @param[in]  HEIGHT      Number of dst rows
 * @param[in]  WIDTH       Number of dst columns
 * @param[in]  TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                         In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting X position
 * @param[in]  STRIDE_Y    Stride Y (in bytes)
 * @param[in]  indirect_y  Indirect Y index tile
 * @param[out] dst         Output tile
 */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
    ({                                                                                                  \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
        {                                                                                               \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        })                                                                                              \
    })

/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
 *
 * @note If WIDTH1_CONDITION is true, the load uses the WIDTH1 length
 * @note The vectors are loaded in reverse order so the invalid rows are overwritten by the valid ones
 *
 * @param[in]  DATA_TYPE        Data type
 * @param[in]  HEIGHT           Number of dst rows
 * @param[in]  WIDTH0           Load width to use if WIDTH1_CONDITION = false
 * @param[in]  WIDTH1           Load width to use if WIDTH1_CONDITION = true
 * @param[in]  TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                              In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR           Tensor basename
 * @param[in]  X                Starting X position
 * @param[in]  STRIDE_Y         Stride Y (in bytes) used to load each row.
 * @param[in]  WIDTH1_CONDITION Condition to select the WIDTH1 load
 * @param[out] dst              Output tile
 * @param[in]  indirect_y       Indirect Y index tile
 */
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                       \
    ({                                                                                                                                                                                             \
        if(WIDTH1_CONDITION)                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                      \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
        else                                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y);                                                             \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
    })
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_HEIGHT   Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH    Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Number of elements of the tensor in the X (width) dimension
 * @param[in]  TENSOR_HEIGHT Number of elements of the tensor in the Y (height) dimension
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
    ({                                                                                                                                                \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
        {                                                                                                                                             \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
            {                                                                                                                                         \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0)                                                                                                                 \
                {                                                                                                                                     \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
                }                                                                                                                                     \
            })                                                                                                                                        \
        })                                                                                                                                            \
    })
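
/* Illustrative usage sketch (not part of the original header; "src" is a hypothetical NHWC tensor passed as
 * TENSOR4D_T(src, BUFFER) and b/y0/x0 are precomputed starting indices):
 *
 *     // Load a 3x3 spatial window of 4 channels; positions outside the tensor are skipped,
 *     // so the tile is zero-initialised first
 *     TILE(float, 9, 4, in);
 *     LOOP_UNROLLING(int, i, 0, 1, 9,
 *     {
 *         in[i].v = (float4)0.0f;
 *     })
 *     T_LOAD_NHWC(float, 3, 3, 4, BUFFER, src, b, y0, x0, 0, src_w, src_h, src_stride_y, in);
 */
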
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
 *
 * @param[in]  DATA_TYPE      Data type
 * @param[in]  TILE_HEIGHT    Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH     Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS  Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE    Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                            In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR         Tensor basename
 * @param[in]  B              Starting batch index
 * @param[in]  Y              Starting Y index
 * @param[in]  X              Starting X index
 * @param[in]  C              Starting C index
 * @param[in]  TENSOR_WIDTH   Number of elements of the tensor in the X (width) dimension
 * @param[in]  TENSOR_HEIGHT  Number of elements of the tensor in the Y (height) dimension
 * @param[in]  DILATION_X     Dilation for the X increment
 * @param[in]  DILATION_Y     Dilation for the Y increment
 * @param[in]  BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads
 * @param[out] dst            Output tile
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Number of elements of the tensor in the X (width) dimension
 * @param[in]  TENSOR_HEIGHT Number of elements of the tensor in the Y (height) dimension
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
    ({                                                                                                                                                                \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
        {                                                                                                                                                             \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0)                                                                                                                                     \
            {                                                                                                                                                         \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
            }                                                                                                                                                         \
        })                                                                                                                                                            \
    })

/** Load a tile from global memory (tensor) using an indirect buffer for the Y coordinates
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                           When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  C             Starting C index
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate.
 *                           16 is the maximum indirect buffer size.
 * @param[out] dst           Output tile
 */
726 #define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
727 #define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
728 #define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
729     ({ \
730         LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
731         { \
732             if(yi[0].s[_i] >= 0) \
733             { \
734                 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
735             } \
736         }) \
737     })
738 
739 #define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
740     ({ \
741         LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
742         { \
743             dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
744         }) \
745     })
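
// Illustrative usage sketch (hypothetical kernel code): load up to 8 rows of 4 channels each
// through a precomputed indirect Y index tile. With the BUFFER variant, rows whose index is
// negative are skipped and keep the previous content of the destination tile; the IMAGE
// variant performs no such check. Names below are assumptions for the example.
//
//     TILE(int,   1, 8, my_indirect_y);  // index tile, e.g. -1 marks an invalid row
//     TILE(float, 8, 4, my_in);          // destination tile
//     T_LOAD2D_INDIRECT(float, 8, 4, BUFFER, src, c, src_stride_y, my_indirect_y, my_in);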
746 
747 /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
748  *
749  * @param[in]  DATA_TYPE     Data type
750  * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
751  * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
752  * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
753  *                           In case of cl_image, only TILE_CHANNELS values that are multiples of 4 are supported (4, 8, 16)
754  * @param[in]  TENSOR        Tensor basename
755  * @param[in]  B             Starting batch index
756  * @param[in]  Z             Starting Z index
757  * @param[in]  Y             Starting Y index
758  * @param[in]  X             Starting X index
759  * @param[in]  C             Starting C index
760  * @param[in]  TENSOR_WIDTH  Number of elements in the X (width) dimension of the tensor
761  * @param[in]  TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
762  * @param[in]  TENSOR_DEPTH  Number of elements in the Z (depth) dimension of the tensor
763  * @param[in]  STRIDE_Y      Stride Y (in bytes)
764  * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
765  * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
766  * @param[in]  zi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate
767  * @param[out] dst           Output tile
768  */
769 #define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
770     ({                                                                                                                                                                \
771         LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
772         {                                                                                                                                                             \
773             int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                      \
774             _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
775             int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
776                              && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
777             if(_src_valid_y != 0)                                                                                                                                     \
778             {                                                                                                                                                         \
779                 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
780             }                                                                                                                                                         \
781         })                                                                                                                                                            \
782     })
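
// Illustrative usage sketch (hypothetical kernel code): gather a 3x3x3 volume of 4 channels
// from an NDHWC tensor named "src"; out-of-bound positions are skipped and keep the previous
// content of the destination tile. Names below are assumptions for the example.
//
//     TILE(int,   27, 1, my_xi);  // indirect X offsets
//     TILE(int,   27, 1, my_yi);  // indirect Y offsets
//     TILE(int,   27, 1, my_zi);  // indirect Z offsets
//     TILE(float, 27, 4, my_in);  // destination tile: 27 positions x 4 channels
//     T_LOAD_NDHWC_INDIRECT(float, 27, 4, BUFFER, src, b, z, y, x, c, SRC_WIDTH, SRC_HEIGHT,
//                           SRC_DEPTH, src_stride_y, my_xi, my_yi, my_zi, my_in);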
783 
784 /** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
785  *
786  * @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store
787  * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
788  *
789  * @param[in] DATA_TYPE        Data type
790  * @param[in] HEIGHT           Number of src rows
791  * @param[in] WIDTH0           Store width to use if WIDTH1_CONDITION = false
792  * @param[in] WIDTH1           Store width to use if WIDTH1_CONDITION = true
793  * @param[in] TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
794  *                             (cl_image is not supported).
795  * @param[in] TENSOR           Tensor basename
796  * @param[in] X                Starting X position
797  * @param[in] STRIDE_Y         Stride Y (in bytes)
798  * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
799  * @param[in] src              Input tile
800  * @param[in] indirect_y       Indirect Y index tile
801  */
802 #define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
803     ({                                                                                                                                                                                             \
804         if(WIDTH1_CONDITION)                                                                                                                                                                       \
805         {                                                                                                                                                                                          \
806             LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
807             {                                                                                                                                                                                      \
808                 VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
809                 (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
810             })                                                                                                                                                                                     \
811         }                                                                                                                                                                                          \
812         else                                                                                                                                                                                       \
813         {                                                                                                                                                                                          \
814             LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
815             {                                                                                                                                                                                      \
816                 VSTORE(WIDTH0)                                                                                                                                                                     \
817                 (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
818             })                                                                                                                                                                                     \
819         }                                                                                                                                                                                          \
820     })
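
// Illustrative usage sketch (hypothetical kernel code): store a 4x4 float tile to a tensor
// named "dst", writing only PARTIAL_N0 columns when the block is the right-most one.
// PARTIAL_N0, N and x0 are assumed compile-time/kernel values, not defined by this header.
//
//     TILE(int,   4, 1, my_indirect_y);  // per-row destination Y indices
//     TILE(float, 4, 4, my_out);         // source tile
//     T_STORE_INDIRECT_WIDTH_SELECT(float, 4, 4, PARTIAL_N0, BUFFER, dst, x0, dst_stride_y,
//                                   (x0 + 4 > N), my_out, my_indirect_y);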
821 
822 /** Offset correction for the QASYMM8 computation
823  *
824  * @param[in]  ACC_DATA_TYPE Accumulator data type
825  * @param[in]  M0            Number of src/dst rows
826  * @param[in]  N0            Number of src/dst columns
827  * @param[in]  K0            Number of src columns
828  * @param[in]  SRC_OFFSET    Source quantization offset
829  * @param[in]  WEI_OFFSET    Weights quantization offset
830  * @param[in]  lhs           LHS tile
831  * @param[in]  rhs           RHS tile
832  * @param[out] dst           DST tile
833  */
834 #define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
835     ({                                                                                               \
836         LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
837         {                                                                                            \
838             ACC_DATA_TYPE _tm = 0;                                                                   \
839             LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
840             {                                                                                        \
841                 _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
842             })                                                                                       \
843             LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
844             {                                                                                        \
845                 dst[_m0].s[_n0] += _tm;                                                              \
846                 LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
847                 {                                                                                    \
848                     dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
849                 })                                                                                   \
850             })                                                                                       \
851         })                                                                                          \
852     })
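
// Informal reading of the block above: for every accumulator element it adds the two cross
// terms of the quantized product expansion,
//     dst[m][n] += WEI_OFFSET * sum_k(lhs[m][k]) + SRC_OFFSET * sum_k(rhs[n][k])
// The remaining constant term (K0 * SRC_OFFSET * WEI_OFFSET) is typically folded into the
// bias on the host side; that is an assumption about how callers use this macro rather than
// something this header enforces.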
853 
854 /** 8-bit quantization with fixed-point scale
855  *
856  * @param[in]  SRC_DATA_TYPE     SRC data type
857  * @param[in]  DST_DATA_TYPE     DST data type
858  * @param[in]  QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
859  * @param[in]  M0                Number of src/dst rows
860  * @param[in]  N0                Number of src/dst columns
861  * @param[in]  DST_OFFSET        Quantization offset used for both the per-tensor and per-channel quantization
862  * @param[in]  DST_SHIFT         Quantization shift for the per-tensor quantization
863  * @param[in]  DST_MULTIPLIER    Quantization multiplier for the per-tensor quantization
864  * @param[in]  src               Input tile
865  * @param[in]  dst_multipliers   Output multipliers tile for the per-channel quantization
866  * @param[in]  dst_shifts        Output shift tile for the per-channel quantization
867  * @param[out] dst               Output tile
868  */
869 #define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
870 #define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
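
// Illustrative usage sketch (hypothetical kernel code): requantize an int32 accumulator tile
// to 8 bits with a single per-tensor scale. The multiplier/shift tiles are unused by the
// PER_TENSOR variant but must still be passed. Names below are assumptions for the example.
//
//     TILE(uchar, M0, N0, my_out);  // output tile
//     T_QUANTIZE8(int, uchar, PER_TENSOR, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER,
//                 my_acc, my_multipliers, my_shifts, my_out);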
871 
872 /** 8-bit per-tensor quantization with fixed-point scale
873  *
874  * @param[in]  SRC_DATA_TYPE   SRC data type
875  * @param[in]  DST_DATA_TYPE   DST data type
876  * @param[in]  M0              Number of src/dst rows
877  * @param[in]  N0              Number of src/dst columns
878  * @param[in]  DST_OFFSET      Quantization offset
879  * @param[in]  DST_SHIFT       Quantization shift for the per-tensor quantization
880  * @param[in]  DST_MULTIPLIER  Quantization multiplier for the per-tensor quantization
881  * @param[in]  src             Input tile
882  * @param[in]  dst_multipliers (unused)
883  * @param[in]  dst_shifts      (unused)
884  * @param[out] dst             Output tile
885  */
886 #define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
887     ({ \
888         LOOP_UNROLLING(int, _m0, 0, 1, M0, \
889         { \
890             LOOP_UNROLLING(int, _n0, 0, 1, N0, \
891             { \
892                 SRC_DATA_TYPE _tmp = 0; \
893                 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
894                 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
895                 SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
896                 long a_64 = (long)(_src); \
897                 long b_64 = (long)(DST_MULTIPLIER); \
898                 long ab_64 = a_64 * b_64; \
899                 long mask1 = 1 << 30; \
900                 long mask2 = 1 - (1 << 30); \
901                 long is_positive_or_zero = ab_64 >= 0; \
902                 long nudge = select(mask2, mask1, is_positive_or_zero); \
903                 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
904                 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
905                 if(DST_SHIFT >= 0) \
906                 { \
907                     long mask = ((((int)1) << DST_SHIFT) - (long)1); \
908                     long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
909                     _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
910                 } \
911                 _tmp += DST_OFFSET; \
912                 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
913             })                                                                                                                                          \
914         })                                                                                                                                          \
915     })
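
// Informal reading of the arithmetic above (an interpretation, not a normative statement):
// the multiply/nudge/shift sequence is the usual gemmlowp-style fixed-point rescale, i.e.
// roughly dst = convert_sat(round(src * DST_MULTIPLIER / 2^31 / 2^DST_SHIFT) + DST_OFFSET),
// where DST_MULTIPLIER encodes the real scale as a Q0.31 value, a non-negative DST_SHIFT is
// applied as a rounding right shift, and a negative DST_SHIFT is applied up front as a left
// shift on src.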
916 
917 /** 8-bit per-channel quantization with fixed-point scale
918  *
919  * @param[in]  SRC_DATA_TYPE   SRC data type
920  * @param[in]  DST_DATA_TYPE   DST data type
921  * @param[in]  M0              Number of src/dst rows
922  * @param[in]  N0              Number of src/dst columns
923  * @param[in]  DST_OFFSET      Quantization offset
924  * @param[in]  DST_SHIFT       (unused)
925  * @param[in]  DST_MULTIPLIER  (unused)
926  * @param[in]  src             Input tile
927  * @param[in]  dst_multipliers Output multipliers tile for the per-channel quantization
928  * @param[in]  dst_shifts      Output shift tile for the per-channel quantization
929  * @param[out] dst             Output tile
930  */
931 #define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
932     ({ \
933         LOOP_UNROLLING(int, _m0, 0, 1, M0, \
934         { \
935             LOOP_UNROLLING(int, _n0, 0, 1, N0, \
936             { \
937                 SRC_DATA_TYPE _tmp = 0; \
938                 SRC_DATA_TYPE _tmp2 = 0; \
939                 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
940                 SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
941                 SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
942                 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
943                 SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
944                 long a_64 = (long)(_src); \
945                 long b_64 = (long)(_dst_multiplier); \
946                 long ab_64 = a_64 * b_64; \
947                 long mask1 = 1 << 30; \
948                 long mask2 = 1 - (1 << 30); \
949                 long is_positive_or_zero = ab_64 >= 0; \
950                 long nudge = select(mask2, mask1, is_positive_or_zero); \
951                 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
952                 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
953                 long mask = ((((int)1) << _dst_shift) - (int)1); \
954                 long threshold = (mask >> 1) + any(_tmp); \
955                 _tmp2 = _tmp >> _dst_shift; \
956                 _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
957                 _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
958                 _tmp += DST_OFFSET; \
959                 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
960             })                                                                                                                                          \
961         })                                                                                                                                         \
962     })
963 
964 /** 8-bit asymmetric quantization with fixed-point scale
965  *
966  * @param[in]  SRC_DATA_TYPE  SRC data type
967  * @param[in]  DST_DATA_TYPE  DST data type
968  * @param[in]  M0             Number of src/dst rows
969  * @param[in]  N0             Number of src/dst columns
970  * @param[in]  DST_OFFSET     Quantization offset used for both the per-tensor and per-channel quantization
971  * @param[in]  DST_SHIFT      Quantization shift for the per-tensor quantization
972  * @param[in]  DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
973  * @param[in]  src            Input tile
974  * @param[out] dst            Output tile
975  */
976 #define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
977     ({ \
978         LOOP_UNROLLING(int, _m0, 0, 1, M0, \
979         { \
980             LOOP_UNROLLING(int, _n0, 0, 1, N0, \
981             { \
982                 SRC_DATA_TYPE _tmp = 0; \
983                 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
984                 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
985                 SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
986                 long a_64 = (long)(_src); \
987                 long b_64 = (long)(DST_MULTIPLIER); \
988                 long ab_64 = a_64 * b_64; \
989                 long mask1 = 1 << 30; \
990                 long mask2 = 1 - (1 << 30); \
991                 long is_positive_or_zero = ab_64 >= 0; \
992                 long nudge = select(mask2, mask1, is_positive_or_zero); \
993                 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
994                 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
995                 if(DST_SHIFT >= 0) \
996                 { \
997                     long mask = ((((int)1) << DST_SHIFT) - (int)1); \
998                     long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
999                     _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
1000                 } \
1001                 _tmp += DST_OFFSET; \
1002                 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
1003             })                                                                                                                                          \
1004         })                                                                                                                                          \
1005     })
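
// Illustrative usage sketch (hypothetical kernel code): same per-tensor rescale as above but
// without the (unused) per-channel tiles in the signature. Names are assumptions.
//
//     TILE(uchar, M0, N0, my_out);  // output tile
//     T_QUANTIZE8_ASYMMETRIC(int, uchar, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, my_acc, my_out);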
1006 
1007 /** Conditional rowset (memset by row)
1008  *
1009  * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
1010  *
1011  * @param[in]      DATA_TYPE    Data type
1012  * @param[in]      M0           Number of LHS rows
1013  * @param[in]      N0           Number of LHS columns
1014  * @param[in]      VALUE_TO_SET Value to set the row
1015  * @param[in, out] a            Input/output tile
1016  * @param[in]      mask         Mask to check for setting the row to VALUE_TO_SET
1017  */
1018 #define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
1019     ({                                                                                                                                                     \
1020         LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
1021         {                                                                                                                                                  \
1022             LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
1023             {                                                                                                                                              \
1024                 a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
1025             })                                                                                                                                             \
1026         })                                                                                                                                                 \
1027     })
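
// Illustrative usage sketch (hypothetical kernel code): zero out the rows of a tile whose
// mask entry is 0, e.g. rows that fall outside the valid output region. Names are assumptions.
//
//     TILE(half, M0, N0, my_acc);   // accumulator tile
//     TILE(half, M0, 1,  my_mask);  // per-row mask, 0 = row to overwrite
//     T_ROWSET_MASK(half, M0, N0, 0, my_acc, my_mask);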
1028 
1029 /** Element-wise activation for floating point types
1030  *
1031  * @note Performs: activation(SRC) = DST
1032  *
1033  * @param[in]  DATA_TYPE       SRC/DST data type
1034  * @param[in]  M0              Number of SRC/DST rows
1035  * @param[in]  N0              Number of SRC/DST columns
1036  * @param[in]  ACTIVATION_TYPE Activation type
1037  * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
1038  * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
1039  * @param[in]  src             SRC tile
1040  * @param[out] dst             DST tile
1041  */
1042 #define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
1043     ({                                                                                         \
1044         LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
1045         {                                                                                      \
1046             dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
1047         })                                                                                     \
1048     })
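
// Illustrative usage sketch (hypothetical kernel code): apply an activation in place on a
// float tile. ACTIVATION_TYPE, A_VAL and B_VAL are assumed to be passed as build options by
// the host side (e.g. -DACTIVATION_TYPE=relu); that is an assumption about typical usage.
//
//     TILE(float, M0, N0, my_acc);  // tile to activate
//     T_ACTIVATION(float, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, my_acc, my_acc);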
1049 
1050 // RELU Activation
1051 #define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
1052 // Bounded RELU Activation
1053 #define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
1054 // Lower Upper Bounded RELU Activation
1055 #define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1056 // Hard Swish Activation
1057 #define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
1058 // Identity Activation
1059 #define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)
1060 
1061 #define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
1062 #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
1063 
1064 #define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
1065 #define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
1066 #define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
1067 #define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
1068 
1069 /** Element-wise activation for quantized types
1070  *
1071  * @note Performs: activation(SRC) = DST
1072  *
1073  * @param[in]  DATA_TYPE       SRC/DST data type
1074  * @param[in]  M0              Number of SRC/DST rows
1075  * @param[in]  N0              Number of SRC/DST columns
1076  * @param[in]  ACTIVATION_TYPE Activation type
1077  * @param[in]  ZERO_VALUE      The zero value to consider in the computation
1078  * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
1079  * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
1080  * @param[in]  src             SRC tile
1081  * @param[out] dst             DST tile
1082  */
1083 #define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
1084     ({ \
1085         LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1086         { \
1087             dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
1088         })                                                                                          \
1089     })
1090 
1091 /** Element-wise addition between two tiles
1092  *
1093  * @note Performs: LHS + RHS = DST
1094  *
1095  * @param[in]  DATA_TYPE LHS/RHS/DST data type
1096  * @param[in]  M0        Number of LHS rows
1097  * @param[in]  N0        Number of LHS columns
1098  * @param[in]  lhs       LHS tile
1099  * @param[in]  rhs       RHS tile
1100  * @param[out] dst       DST tile
1101  */
1102 #define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
1103     ({                                                            \
1104         LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
1105         {                                                         \
1106             dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
1107         })                                                        \
1108     })
1109 
1110 /** Element-wise addition with a constant value
1111  *
1112  * @note Performs: LHS + constant = DST
1113  *
1114  * @param[in]  DATA_TYPE    LHS/RHS/DST data type
1115  * @param[in]  M0           Number of LHS rows
1116  * @param[in]  N0           Number of LHS columns
1117  * @param[in]  lhs          LHS tile
1118  * @param[in]  rhs_constant Constant value
1119  * @param[out] dst          DST tile
1120  */
1121 #define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
1122     ({                                                            \
1123         LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
1124         {                                                         \
1125             dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
1126         })                                                        \
1127     })
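
// Illustrative usage sketch (hypothetical kernel code): element-wise addition of two tiles
// and of a scalar constant. Names are assumptions for the example.
//
//     TILE(float, M0, N0, my_a);
//     TILE(float, M0, N0, my_b);
//     T_ADD(float, M0, N0, my_a, my_b, my_a);           // my_a += my_b, row by row
//     T_ADD_CONSTANT(float, M0, N0, my_a, 1.0f, my_a);  // my_a += 1.0f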
1128 
1129 #define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1130 #define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1131 #define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1132 
1133 #define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1134 #define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1135 
1136 #define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1137 
1138 #define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1139 #define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1140 
1141 /** Element-wise scale with a constant value
1142  *
1143  * @note Performs: LHS * constant = DST
1144  *
1145  * @param[in]  DATA_TYPE    LHS/RHS/DST data type
1146  * @param[in]  M0           Number of LHS rows
1147  * @param[in]  N0           Number of LHS columns
1148  * @param[in]  lhs          LHS tile
1149  * @param[in]  rhs_constant Constant value
1150  * @param[out] dst          DST tile
1151  */
1152 #define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
1153     ({                                                            \
1154         LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
1155         {                                                         \
1156             dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
1157         })                                                        \
1158     })
1159 
1160 /** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
1161  *
1162  * @note Performs: LHS OP RHS[broadcasted] = DST
1163  * @note Both tiles must have the same data type
1164  *
1165  * @param[in]  T_ELWISE_OP   Elementwise operator to perform
1166  * @param[in]  DST_DATA_TYPE DST data type
1167  * @param[in]  M0            Number of LHS rows
1168  * @param[in]  N0            Number of LHS columns
1169  * @param[in]  lhs           LHS tile
1170  * @param[in]  rhs           RHS tile
1171  * @param[out] dst           DST tile
1172  */
1173 #define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
1174     ({                                                      \
1175         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
1176         {                                                   \
1177             dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
1178         })                                                  \
1179     })
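
// Illustrative usage sketch (hypothetical kernel code): add a single-row bias tile to every
// row of an accumulator, the classic bias add after a matrix multiplication. Names are
// assumptions for the example.
//
//     TILE(float, M0, N0, my_acc);   // accumulator
//     TILE(float, 1,  N0, my_bias);  // single-row bias tile, broadcast over the M0 rows
//     T_ELTWISE_BROADCAST_ADD_X(float, M0, N0, my_acc, my_bias, my_acc);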
1180 
1181 /** Element-wise operation with LHS broadcasted (LHS has the X dimension only)
1182  *
1183  * @note Performs: LHS[broadcasted] OP RHS = DST
1184  * @note Both tiles must have the same data type
1185  *
1186  * @param[in]  T_ELWISE_OP   Elementwise operator to perform
1187  * @param[in]  DST_DATA_TYPE DST data type
1188  * @param[in]  M0            Number of RHS rows
1189  * @param[in]  N0            Number of RHS columns
1190  * @param[in]  lhs           LHS tile
1191  * @param[in]  rhs           RHS tile
1192  * @param[out] dst           DST tile
1193  */
1194 #define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
1195     ({                                                      \
1196         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
1197         {                                                   \
1198             dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
1199         })                                                  \
1200     })
1201 
1202 #define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1203 #define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1204 #define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1205 #define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1206 
1207 /** Element-wise operation between two tiles (LHS and RHS)
1208  *
1209  * @note Performs: LHS OP RHS = DST
1210  * @note Both tiles must have the same data type
1211  *
1212  * @param[in]  T_ELWISE_OP   Elementwise operator to perform
1213  * @param[in]  DST_DATA_TYPE DST data type
1214  * @param[in]  M0            Number of LHS rows
1215  * @param[in]  N0            Number of LHS columns
1216  * @param[in]  lhs           LHS tile
1217  * @param[in]  rhs           RHS tile
1218  * @param[out] dst           DST tile
1219  */
1220 #define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
1221     ({                                                      \
1222         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
1223         {                                                   \
1224             dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
1225         })                                                  \
1226     })
1227 
1228 /** Floor operation on a tile
1229  *
1230  * @note Performs: floor(SRC) = DST
1231  * @note Both tiles must have the same data type
1232  *
1233  * @param[in]  DST_DATA_TYPE DST data type
1234  * @param[in]  M0            Number of SRC rows
1235  * @param[in]  N0            Number of SRC columns
1236  * @param[in]  src           LHS tile
1237  * @param[out] dst           DST tile
1238  */
1239 #define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
1240     ({                                                      \
1241         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
1242         {                                                   \
1243             dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
1244         })                                                  \
1245     })
1246 
1247 /** Matrix multiplication
1248  *
1249  * @note Performs: LHS X RHS + DST = DST
1250  *
1251  * @param[in]      LHS_DATA_TYPE LHS tile data type
1252  * @param[in]      RHS_DATA_TYPE RHS tile data type
1253  * @param[in]      DST_DATA_TYPE DST tile data type
1254  * @param[in]      M0            Number of LHS rows
1255  * @param[in]      N0            Number of RHS columns
1256  * @param[in]      K0            Number of LHS columns
1257  * @param[in]      LHS_LAYOUT    LHS layout (T= transposed, NT= not transposed)
1258  * @param[in]      RHS_LAYOUT    RHS layout (T= transposed, NT= not transposed)
1259  * @param[in]      lhs           LHS tile
1260  * @param[in]      rhs           RHS tile
1261  * @param[in, out] dst           DST tile
1262  */
1263 #define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1264 #define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1265 #define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1266 #define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1267 #define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1268 #define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1269 #define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1270 #define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1271 #define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
1272     {                                                                                     \
1273         LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
1274         {                                                                                 \
1275             LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
1276             {                                                                             \
1277                 LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
1278                 {                                                                         \
1279                     dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
1280                 })                                                                        \
1281             })                                                                            \
1282         })                                                                                \
1283     }
1284 
1285 #define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
1286     ({ \
1287         LOOP_UNROLLING(int, _m, 0, 1, M0, \
1288         { \
1289             LOOP_UNROLLING(int, _n, 0, 1, N0, \
1290             { \
1291                 DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
1292             })                                                                                             \
1293         })                                                                                             \
1294     })
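
// Illustrative usage sketch (hypothetical kernel code): accumulate lhs (not transposed) times
// rhs (transposed) into a float tile; the destination tile is assumed to be initialised
// (e.g. to zero or to a bias) before the call. Names are assumptions for the example.
//
//     TILE(float, M0, K0, my_lhs);  // M0 rows of K0 values
//     TILE(float, N0, K0, my_rhs);  // RHS is stored transposed: N0 rows of K0 values
//     TILE(float, M0, N0, my_acc);  // accumulator
//     T_MMUL(float, float, float, M0, N0, K0, NT, T, my_lhs, my_rhs, my_acc);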
1295 
1296 #endif /* SRC_CORE_CL_CL_KERNELS_TILE_HELPERS */
1297