xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/gemm_helpers.h (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "activation_float_helpers.h"
25 #include "helpers.h"
26 
27 /** Utility macro to access a vector with the scalar positions
28  *
29  * Supported cases are: offset and n0 must match one of the scalar_access_<offset>_<n0> macros defined below (offsets 0, 1, 2, 3, 4, 8, 12 and 16)
30  *
31  * @param[in] offset The offset within the vector. Must be an offset for which a scalar_access_<offset>_<n0> macro is defined below
32  * @param[in] n0     The number of consecutive columns to access. n0 + offset must be <= 16
33  * @param[in] x      Vector to access
34  * @{
35  */
// SCALAR_ACCESS_STR pastes offset and n0 into the name of one of the
// scalar_access_<offset>_<n0> macros below and applies it to x.
// SCALAR_ACCESS adds one level of indirection so that offset and n0 are
// macro-expanded before they reach the ## operators.
36 #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
37 #define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
38 
// Each scalar_access_<offset>_<n0>(x) selects n0 consecutive components of x,
// starting at component <offset>, using the .s0 ... .sF component suffixes.
// Only the combinations listed here exist; any other (offset, n0) pair
// expands to an undefined macro name.
39 // offset == 0
40 #define scalar_access_0_1(x) ((x).s0)
41 #define scalar_access_0_2(x) ((x).s01)
42 #define scalar_access_0_3(x) ((x).s012)
43 #define scalar_access_0_4(x) ((x).s0123)
44 #define scalar_access_0_8(x) ((x).s01234567)
45 #define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
46 
47 // offset == 1
48 #define scalar_access_1_1(x) ((x).s1)
49 #define scalar_access_1_2(x) ((x).s12)
50 #define scalar_access_1_3(x) ((x).s123)
51 #define scalar_access_1_4(x) ((x).s1234)
52 #define scalar_access_1_8(x) ((x).s12345678)
53 
54 // offset == 2
55 #define scalar_access_2_1(x) ((x).s2)
56 #define scalar_access_2_2(x) ((x).s23)
57 #define scalar_access_2_3(x) ((x).s234)
58 #define scalar_access_2_4(x) ((x).s2345)
59 #define scalar_access_2_8(x) ((x).s23456789)
60 
61 // offset == 3
62 #define scalar_access_3_1(x) ((x).s3)
63 #define scalar_access_3_2(x) ((x).s34)
64 #define scalar_access_3_3(x) ((x).s345)
65 #define scalar_access_3_4(x) ((x).s3456)
66 #define scalar_access_3_8(x) ((x).s3456789A)
67 
68 // offset == 4
69 #define scalar_access_4_1(x) ((x).s4)
70 #define scalar_access_4_2(x) ((x).s45)
71 #define scalar_access_4_3(x) ((x).s456)
72 #define scalar_access_4_4(x) ((x).s4567)
73 #define scalar_access_4_8(x) ((x).s456789AB)
74 
75 // offset == 8
76 #define scalar_access_8_1(x) ((x).s8)
77 #define scalar_access_8_2(x) ((x).s89)
78 #define scalar_access_8_3(x) ((x).s89A)
79 #define scalar_access_8_4(x) ((x).s89AB)
80 #define scalar_access_8_8(x) ((x).s89ABCDEF)
81 
82 // offset == 12
83 #define scalar_access_12_1(x) ((x).sC)
84 #define scalar_access_12_2(x) ((x).sCD)
85 #define scalar_access_12_3(x) ((x).sCDE)
86 #define scalar_access_12_4(x) ((x).sCDEF)
87 
88 // offset == 16
// NOTE(review): this entry is named for offset 16 but selects .sF, which is
// component index 15 - confirm whether offset 15 was intended.
89 #define scalar_access_16_1(x) ((x).sF)
90 
91 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
92  * @name LOAD_TENSOR_ROW_n
93  *
94  * @param[in] N0         The number of columns to load
95  * @param[in] DATA_TYPE  The data type of variables
96  * @param[in] BASENAME   The basename of the destination variables for the loaded rows
97  * @param[in] PTR        The base pointer
98  * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
99  * @param[in] STRIDE_Y   The stride value in y-axis direction
100  * @param[in] Z          The z-axis offset vector
101  * @{
102  */
// Zero rows: nothing to load, the macro expands to an empty statement
// expression.
103 #define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
104     ({})
105 
// For n >= 1, LOAD_TENSOR_ROW_n first expands LOAD_TENSOR_ROW_(n-1) and then
// handles row n-1: the result of VLOAD(N0) at PTR + (n-1) * STRIDE_Y + Z<n-1>
// is assigned to the N0 components of BASENAME<n-1> selected by
// SCALAR_ACCESS(COL_OFFSET, N0, ...). BASENAME<n-1> is only assigned, not
// declared, so it must already exist at the point of use.
106 #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
107     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
108 
109 #define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
110     LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
111     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
112 
113 #define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
114     LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
115     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
116 
117 #define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
118     LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
119     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
120 
121 #define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
122     LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
123     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
124 
125 #define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
126     LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
127     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
128 
129 #define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
130     LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
131     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
132 
133 #define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
134     LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
135     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
136 
137 #define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
138     LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
139     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
140 
141 #define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
142     LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
143     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
144 
// From here on the row index is pasted as a hexadecimal digit: rows 10-15 use
// the suffixes A-F on both BASENAME and Z, while the stride multiplier stays
// decimal.
145 #define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
146     LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
147     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
148 
149 #define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
150     LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
151     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
152 
153 #define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
154     LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
155     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156 
157 #define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
158     LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
159     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
160 
161 #define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
162     LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
163     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
164 
165 #define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
166     LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
167     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168 /** @}*/ // end of group LOAD_TENSOR_ROW_n
169 
170 /** Load tensor (consecutive rows and columns) with Z offset.
171  * @name LOAD_TENSOR
172  *
173  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
174  * The data to load is expected to have consecutive names for each row.
175  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
176  * The Z offset is expected to have consecutive names.
177  * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
178  *
179  * @param[in] M0         The number of consecutive rows
180  * @param[in] N0         The number of consecutive columns
181  * @param[in] DATA_TYPE  The data type of the target
182  * @param[in] BASENAME   The basename of the result variables
183  * @param[in] PTR        The base pointer for the data
184  * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
185  * @param[in] STRIDE_Y   The stride in y-axis direction
186  * @param[in] Z          The z-axis offset vector
187  * @{
188  */
// LOAD_TENSOR_STR pastes M0 onto LOAD_TENSOR_ROW_ to pick the row-count
// variant. LOAD_TENSOR is the entry point: the extra level of indirection lets
// an M0 that is itself a macro be expanded before it is pasted.
189 #define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
190 #define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
191 /** @} */ // end of group LOAD_TENSOR
192 
193 /** Load 2D tensor (consecutive rows and columns) with Z offset.
194  * @name LOAD_TENSOR_M0Xn
195  *
196  * @param[in] M0        The number of rows to load [0-16]
197  * @param[in] N0        The number of columns to load [0-16]
198  * @param[in] DATA_TYPE The data type of variables
199  * @param[in] BASENAME  The basename of the destination variables for the loaded rows
200  * @param[in] PTR       The base pointer
201  * @param[in] STRIDE_Y  The stride value in y-axis direction
202  * @param[in] Z         The z-axis offset vector
203  * @{
204  */
// Zero columns: nothing to load, the macro expands to an empty statement
// expression.
205 #define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
206     ({})
207 
// Widths 1, 2, 3, 4 and 8 are forwarded unchanged to a single LOAD_TENSOR call
// at column offset 0.
208 #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
209     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
210 
211 #define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
212     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
213 
214 #define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
215     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
216 
217 #define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
218     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
219 
// Widths 5, 6 and 7 are split in two: a 4-wide load at column offset 0, then a
// 1-, 2- or 3-wide load at column offset 4 from input_ptr advanced by
// 4 * sizeof(DATA_TYPE). The N0 argument itself is not used in these bodies.
// NOTE(review): the sizeof-based advance presumably means input_ptr is a
// byte pointer - confirm against callers.
220 #define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
221     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
222     LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
223 
224 #define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
225     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
226     LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
227 
228 #define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
229     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
230     LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
231 
232 #define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
233     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
234 
/** Load M0 rows of 9 columns into the existing row variables named by @p a.
 *
 * The row is filled in two steps: an 8-wide LOAD_TENSOR at column offset 0,
 * then a 1-wide LOAD_TENSOR at column offset 8, reading from @p input_ptr
 * advanced by 8 * sizeof(DATA_TYPE). @p N0 is accepted for signature
 * uniformity with the other LOAD_TENSOR_M0Xn macros and is not used.
 *
 * @param[in] M0           The number of rows to load
 * @param[in] N0           The number of columns to load (9)
 * @param[in] DATA_TYPE    The data type of variables
 * @param[in] a            The basename of the destination variables
 * @param[in] input_ptr    The base pointer
 * @param[in] src_stride_y The stride value in y-axis direction
 * @param[in] zin          The z-axis offset vector
 */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
238 
// Widths 10, 11 and 12 are split in two: an 8-wide load at column offset 0,
// then a 2-, 3- or 4-wide load at column offset 8 from input_ptr advanced by
// 8 * sizeof(DATA_TYPE). The N0 argument itself is not used in these bodies.
239 #define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
240     LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
241     LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
242 
243 #define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
244     LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
245     LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
246 
247 #define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
248     LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
249     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
250 
// Width 13 needs three loads: 8 wide at column offset 0, 4 wide at column
// offset 8, and 1 wide at column offset 12.
251 #define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
252     LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
253     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
254     LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
255 
/** Load M0 rows of 14 columns into the existing row variables named by @p a.
 *
 * The row is filled in three steps: an 8-wide LOAD_TENSOR at column offset 0,
 * a 4-wide LOAD_TENSOR at column offset 8 and a 2-wide LOAD_TENSOR at column
 * offset 12, each reading from @p input_ptr advanced by the matching number
 * of elements times sizeof(DATA_TYPE). @p N0 is accepted for signature
 * uniformity with the other LOAD_TENSOR_M0Xn macros and is not used.
 *
 * @param[in] M0           The number of rows to load
 * @param[in] N0           The number of columns to load (14)
 * @param[in] DATA_TYPE    The data type of variables
 * @param[in] a            The basename of the destination variables
 * @param[in] input_ptr    The base pointer
 * @param[in] src_stride_y The stride value in y-axis direction
 * @param[in] zin          The z-axis offset vector
 */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
260 
// Width 15 needs three loads: 8 wide at column offset 0, 4 wide at column
// offset 8, and 3 wide at column offset 12. The N0 argument is not used here.
261 #define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
262     LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
263     LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
264     LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
265 
// Width 16 is forwarded unchanged to a single LOAD_TENSOR call at column
// offset 0.
266 #define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
267     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
268 /** @}*/ // end of group LOAD_TENSOR_M0Xn
269 
270 /** Load 2D tensor (consecutive rows and columns) with Z offset.
271  * @name LOAD_TENSOR_M0XN0
272  *
273  * @param[in] M0        The number of consecutive rows [0-16]
274  * @param[in] N0        The number of consecutive columns [0-16]
275  * @param[in] DATA_TYPE The data type of the target
276  * @param[in] BASENAME  The basename of the result variables
277  * @param[in] PTR       The base pointer for the data
278  * @param[in] STRIDE_Y  The stride in y-axis direction
279  * @param[in] Z         The z-axis offset vector
280  * @{
281  */
// LOAD_TENSOR_M0XN0_STR pastes N0 onto LOAD_TENSOR_M0X to pick the
// column-count variant. LOAD_TENSOR_M0XN0 is the entry point: the extra level
// of indirection lets an N0 that is itself a macro be expanded before it is
// pasted.
282 #define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
283 #define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
284 
285 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
286  * @name LOAD_ROW_n
287  *
288  * @param[in] N0        The number of columns to load
289  * @param[in] DATA_TYPE The data type of variables
290  * @param[in] BASENAME  The basename of the destination variables for the loaded rows
291  * @param[in] PTR       The base pointer
292  * @param[in] OFFSET    The offset within a row
293  * @param[in] STRIDE_Y  The stride value in y-axis direction
294  * @param[in] Z         The z-axis offset vector
295  * @{
296  */
// LOAD_ROW_n first expands LOAD_ROW_(n-1) and then handles row n-1: it emits a
// declaration of BASENAME<n-1> with type VEC_DATA_TYPE(DATA_TYPE, N0),
// initialised with the result of VLOAD(N0) at
// PTR + OFFSET + (n-1) * STRIDE_Y + Z<n-1>.
// Because each row is a declaration, the BASENAME<i> names must not already
// exist in the enclosing scope.
297 #define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
298     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
299     BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
300 
301 #define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
302     LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
303     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
304     BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
305 
306 #define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
307     LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
308     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
309     BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
310 
311 #define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
312     LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
313     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
314     BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
315 
316 #define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
317     LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
318     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
319     BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
320 
321 #define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
322     LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
323     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
324     BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
325 
326 #define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
327     LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
328     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
329     BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
330 
331 #define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
332     LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
333     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
334     BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
335 
336 #define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
337     LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
338     VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
339     BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
340 
341 #define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
342     LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
343     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
344     BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
345 
// Rows 10-15 use the hexadecimal suffixes A-F on both BASENAME and Z, while
// the stride multiplier stays decimal.
346 #define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
347     LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
348     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
349     BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
350 
351 #define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
352     LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
353     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
354     BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
355 
356 #define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
357     LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
358     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
359     BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
360 
361 #define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
362     LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
363     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
364     BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
365 
366 #define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
367     LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
368     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
369     BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
370 
371 #define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
372     LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
373     VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
374     BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
375 
376 /** @}*/ // end of group LOAD_ROW_n
377 
378 /** Load Blocks (consecutive rows and columns) with Z offset.
379  * @name LOAD_BLOCK
380  *
381  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
382  * The data to load is expected to have consecutive names for each row.
383  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
384  * The Z offset is expected to have consecutive names.
385  * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
386  *
387  * @param[in] M0        The number of consecutive rows
388  * @param[in] N0        The number of consecutive columns
389  * @param[in] DATA_TYPE The data type of the target
390  * @param[in] BASENAME  The basename of the result variables
391  * @param[in] PTR       The base pointer for the data
392  * @param[in] OFFSET    The offset within a row
393  * @param[in] STRIDE_Y  The stride in y-axis direction
394  * @param[in] Z         The z-axis offset vector
395  * @{
396  */
// LOAD_BLOCK_STR pastes M0 onto LOAD_ROW_ to pick the row-count variant.
// LOAD_BLOCK is the entry point: the extra level of indirection lets an M0
// that is itself a macro be expanded before it is pasted.
397 #define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
398 #define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
399 /** @} */ // end of group LOAD_BLOCK
400 
401 /** Partially load the 0 to (n-1)th rows of the given variables
402  * @name LOAD_ROW_PARTIAL_n
403  * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
404  *
405  * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
406  *
407  * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
408  * @param[in] LOAD_N0   The **lower** size of the vectors to load. Supported: 1-16 and <= @p N0
409  * @param[in] DATA_TYPE The data type of the vectors
410  * @param[in] BASENAME  The basename of the variables
411  * @param[in] PTR       The base pointer
412  * @param[in] OFFSET    The offset within a row
413  * @param[in] STRIDE_Y  The stride value in y-axis direction
414  * @param[in] Z         The offset in z-axis direction
415  * @{
416  */
// LOAD_ROW_PARTIAL_n first expands LOAD_ROW_PARTIAL_(n-1) and then handles row
// n-1 by invoking VLOAD_PARTIAL(N0, LOAD_N0) with the arguments
// (BASENAME<n-1>, 0, pointer), where the pointer is
// PTR + OFFSET + (n-1) * STRIDE_Y + Z<n-1> cast to __global DATA_TYPE *.
// BASENAME<n-1> is passed as an argument rather than declared here.
417 #define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
418     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
419     (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
420 
421 #define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
422     LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
423     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
424     (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
425 
426 #define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
427     LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
428     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
429     (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
430 
431 #define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
432     LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
433     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
434     (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
435 
436 #define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
437     LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
438     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
439     (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
440 
441 #define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
442     LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
443     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
444     (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
445 
446 #define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
447     LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
448     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
449     (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
450 
451 #define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
452     LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
453     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
454     (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
455 
456 #define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
457     LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
458     VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
459     (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
460 
461 #define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
462     LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
463     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
464     (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
465 
// Rows 10-15 use the hexadecimal suffixes A-F on both BASENAME and Z, while
// the stride multiplier stays decimal.
466 #define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
467     LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
468     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
469     (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
470 
471 #define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
472     LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
473     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
474     (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
475 
476 #define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
477     LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
478     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
479     (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
480 
481 #define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
482     LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
483     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
484     (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
485 
486 #define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
487     LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
488     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
489     (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
490 
491 #define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
492     LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
493     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
494     (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
495 /** @} */ // end of group LOAD_ROW_PARTIAL_n
496 
497 /** Partially load a block of the given size LOAD_M0xLOAD_N0
498  * @name LOAD_BLOCK_PARTIAL
499  *
500  * @note The vector width @p N0 is also required for correct partial loading behaviour.
501  * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
502  *
503  * The data to load is expected to have consecutive names for each row.
504  * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
505  * The Z offset is expected to have consecutive names.
506  * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
507  *
508  * @param[in] LOAD_M0   The number of rows to load. Supported: 1-16
509  * @param[in] LOAD_N0   The lower number of elements of vectors to load. Supported: 1-16 and <= @p N0
510  * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
511  * @param[in] DATA_TYPE The data type of the vectors
512  * @param[in] BASENAME  The basename of the variables
513  * @param[in] PTR       The base pointer
514  * @param[in] OFFSET    The offset within a row
515  * @param[in] STRIDE_Y  The stride value in y-axis direction
516  * @param[in] Z         The offset in z-axis direction
517  * @{
518  */
// LOAD_BLOCK_PARTIAL_STR pastes LOAD_M0 onto LOAD_ROW_PARTIAL_ to pick the
// row-count variant; note that it passes N0 before LOAD_N0, the reverse of its
// own parameter order. LOAD_BLOCK_PARTIAL is the entry point: the extra level
// of indirection lets a LOAD_M0 that is itself a macro be expanded before it
// is pasted.
519 #define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
520 #define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
521 /** Load a block that can be partial in the x dimension, the y dimension, both, or neither
522  *
523  * @note In case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
524  *
525  * The data to load is expected to have consecutive names for each row.
526  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
527  * The Z offset is expected to have consecutive names.
528  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
529  *
530  * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
531  * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
532  * @param[in] DATA_TYPE        The data type of the vectors
533  * @param[in] BASENAME         The basename of the variables
534  * @param[in] PTR              The base pointer
535  * @param[in] OFFSET           The offset within a row
536  * @param[in] STRIDE_Y         The stride value in y-axis direction
537  * @param[in] Z                The offset in z-axis direction
538  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
539  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
540  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
541  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
542  */
543 #define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
544     if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
545     {                                                                                                                                                            \
546         LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
547     }                                                                                                                                                            \
548     else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
549     {                                                                                                                                                            \
550         LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
551     }                                                                                                                                                            \
552     else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
553     {                                                                                                                                                            \
554         LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
555     }                                                                                                                                                            \
556     else                                                                                                                                                         \
557     {                                                                                                                                                            \
558         LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
559     }
560 /** Load a block that can only be partial in x but not y.
561  *
562  * @note In case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
563  *
564  * The data to load is expected to have consecutive names for each row.
565  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
566  * The Z offset is expected to have consecutive names.
567  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
568  *
569  * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
570  * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
571  * @param[in] DATA_TYPE        The data type of the vectors
572  * @param[in] BASENAME         The basename of the variables
573  * @param[in] PTR              The base pointer
574  * @param[in] OFFSET           The offset within a row
575  * @param[in] STRIDE_Y         The stride value in y-axis direction
576  * @param[in] Z                The offset in z-axis direction
577  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
578  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
579  */
580 #define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
581     if(!(PARTIAL_COND_X))                                                                                                \
582     {                                                                                                                    \
583         LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
584     }                                                                                                                    \
585     else                                                                                                                 \
586     {                                                                                                                    \
587         LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
588     }
589 /** Load a block that can only be partial in y but not x.
590  *
591  * @note In case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
592  *
593  * The data to load is expected to have consecutive names for each row.
594  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
595  * The Z offset is expected to have consecutive names.
596  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
597  *
598  * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
599  * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
600  * @param[in] DATA_TYPE        The data type of the vectors
601  * @param[in] BASENAME         The basename of the variables
602  * @param[in] PTR              The base pointer
603  * @param[in] OFFSET           The offset within a row
604  * @param[in] STRIDE_Y         The stride value in y-axis direction
605  * @param[in] Z                The offset in z-axis direction
606  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
607  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
608  */
609 #define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
610     if(!(PARTIAL_COND_Y))                                                                                                \
611     {                                                                                                                    \
612         LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
613     }                                                                                                                    \
614     else                                                                                                                 \
615     {                                                                                                                    \
616         LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
617     }
618 /** @} */ // end of group LOAD_BLOCK_PARTIAL
619 /** Boundary-aware GeMM block load
620  * @name LOAD_BLOCK_BOUNDARY_AWARE
621  * This macro assumes the following schemes to achieve boundary-awareness:
622  *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
623  *  - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
624  *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
625  * The macro then ensures that the src tensor can be loaded without any paddings in both x and y dim.
626  *
627  * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
628  * blocks **at the end**.
629  * Say, the src tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
630  * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
631  *
632  *  *--x-->                         x == 0                        x == 1
633  *  |                  |<------------------------------N-------------------------->|
634  *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
635  *  |     -------------#############################################################
636  *  *     |          | |...............................|...........................|
637  * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
638  *        |          | |...............................|...........................|
639  *        M          --#############################################################
640  *        |          | |                               |...........................|
641  * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
642  *        |          | |                               |...........................|
643  *        |------------#############################################################
644  *
645  * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
646  *
647  * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
648  *
649  * It automatically detects whether a given M, N, M0, N0 combination can yield partial blocks in the X and/or Y dimension,
650  * and selects the corresponding load method so that the boundary detection logic is only added when needed.
651  *
652  * The data to load is expected to have consecutive names for each row.
653  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
654  * The Z offset is expected to have consecutive names.
655  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
656  *
657  * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
658  * type @p DATA_TYPE containing values partially loaded from the specified
659  * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
660  * filled with zeros.
661  *
662  * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
663  * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
664  * @param[in] DATA_TYPE        The data type of the vectors
665  * @param[in] BASENAME         The basename of the variables
666  * @param[in] PTR              The base pointer
667  * @param[in] OFFSET           The offset within a row
668  * @param[in] STRIDE_Y         The stride value in y-axis direction
669  * @param[in] Z                The offset in z-axis direction
670  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
671  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
672  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
673  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
674  * @{
675  */
676 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
677 // Case1: No partial blocks in either x or y
678 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
679     LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
680 
681 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
682 // Case2: Partial blocks in y only
683 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
684     REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
685     LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
686 
687 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
688 // Case3: Partial blocks in x only
689 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
690     REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
691     LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
692 
693 #else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
694 // Case4: Partial blocks in both x and y
695 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
696     REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
697     LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
698 
699 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
700 
701 /** Loads the rows from 0 to n-1 into the given variables (BASENAME0 to BASENAMEn-1; rows 10 to 15 use the suffixes A to F).
702  * @name LOAD_TEXTURE2D_ROW_n
703  *
704  * @param[in] N0         The number of pixels to read
705  * @param[in] DATA_TYPE  The data type of variables
706  * @param[in] BASENAME   The basename of the destination variables for the loaded rows (these macros assign to them without declaring them)
707  * @param[in] IMG        The 2D OpenCL image object
708  * @param[in] X_COORD    The x coordinate for the top-left pixel
709  * @param[in] Y_COORD    The y coordinate for the top-left pixel
710  * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
711  * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
712  * @{
713  */
714 #define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
715     BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
716 
717 #define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
718     LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
719     BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
720 
721 #define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
722     LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
723     BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
724 
725 #define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
726     LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
727     BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
728 
729 #define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
730     LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
731     BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
732 
733 #define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
734     LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
735     BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
736 
737 #define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
738     LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
739     BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
740 
741 #define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
742     LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
743     BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
744 
745 #define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
746     LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
747     BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
748 
749 #define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
750     LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
751     BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
752 
753 #define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
754     LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
755     BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
756 
757 #define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
758     LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
759     BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
760 
761 #define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
762     LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
763     BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
764 
765 #define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
766     LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
767     BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
768 
769 #define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
770     LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
771     BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
772 
773 #define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
774     LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
775     BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
776 /** @} */ // end of group LOAD_TEXTURE2D_ROW_n
777 
778 /** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values
779  * @name LOAD_TEXTURE2D
780  *
781  * Supported cases are M0=1,2,3,...,16 and N0=1,2,4
782  * The data to load is expected to have consecutive names for each row.
783  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
784  *
785  * @param[in] M0         The number of consecutive rows
786  * @param[in] N0         The number of consecutive pixels. Only 1, 2 and 4 are supported
787  * @param[in] DATA_TYPE  The data type of the target
788  * @param[in] BASENAME   The basename of the result variables
789  * @param[in] IMG        The 2D OpenCL image object
790  * @param[in] X_COORD    The x coordinate for the top-left pixel
791  * @param[in] Y_COORD    The y coordinate for the top-left pixel
792  * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
793  * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
794  * @{
795  */
796 #define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
797 #define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
798 /** @} */ // end of group LOAD_TEXTURE2D
799 
800 /** Declares and loads the rows from 0 to n-1 into the given variables (BASENAME0 to BASENAMEn-1; rows 10 to 15 use the suffixes A to F), using a per-row Y index.
801  * @name LOAD_ROW_INDIRECT_n
802  *
803  * @param[in] N0        The number of columns to load
804  * @param[in] DATA_TYPE The data type of variables
805  * @param[in] BASENAME  The basename of the destination variables for the loaded rows
806  * @param[in] PTR       The base pointer
807  * @param[in] OFFSET    The offset within a row
808  * @param[in] STRIDE_Y  The stride value in y-axis direction
809  * @param[in] Y         The basename of the per-row y-axis index variables (Y0, Y1, ...); each index is multiplied by @p STRIDE_Y
810  * @param[in] Y_MASK    The basename of the per-row mask variables (Y_MASK0, Y_MASK1, ...). If a mask is 0, the corresponding BASENAMEn is set to 0 instead of being loaded
811  * @{
812  */
813 #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
814     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
815     BASENAME##0;                                                                            \
816     if(Y_MASK##0 != 0)                                                                      \
817         BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
818     else                                                                                    \
819         BASENAME##0 = 0;
820 
821 #define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
822     LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
823     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
824     BASENAME##1;                                                                            \
825     if(Y_MASK##1 != 0)                                                                      \
826         BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
827     else                                                                                    \
828         BASENAME##1 = 0;
829 
830 #define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
831     LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
832     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
833     BASENAME##2;                                                                            \
834     if(Y_MASK##2 != 0)                                                                      \
835         BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
836     else                                                                                    \
837         BASENAME##2 = 0;
838 
839 #define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
840     LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
841     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
842     BASENAME##3;                                                                            \
843     if(Y_MASK##3 != 0)                                                                      \
844         BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
845     else                                                                                    \
846         BASENAME##3 = 0;
847 
848 #define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
849     LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
850     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
851     BASENAME##4;                                                                            \
852     if(Y_MASK##4 != 0)                                                                      \
853         BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
854     else                                                                                    \
855         BASENAME##4 = 0;
856 
857 #define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
858     LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
859     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
860     BASENAME##5;                                                                            \
861     if(Y_MASK##5 != 0)                                                                      \
862         BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
863     else                                                                                    \
864         BASENAME##5 = 0;
865 
866 #define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
867     LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
868     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
869     BASENAME##6;                                                                            \
870     if(Y_MASK##6 != 0)                                                                      \
871         BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
872     else                                                                                    \
873         BASENAME##6 = 0;
874 
875 #define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
876     LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
877     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
878     BASENAME##7;                                                                            \
879     if(Y_MASK##7 != 0)                                                                      \
880         BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
881     else                                                                                    \
882         BASENAME##7 = 0;
883 
884 #define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
885     LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
886     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
887     BASENAME##8;                                                                            \
888     if(Y_MASK##8 != 0)                                                                      \
889         BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
890     else                                                                                    \
891         BASENAME##8 = 0;
892 
893 #define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
894     LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
895     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
896     BASENAME##9;                                                                            \
897     if(Y_MASK##9 != 0)                                                                      \
898         BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
899     else                                                                                    \
900         BASENAME##9 = 0;
901 
902 #define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
903     LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
904     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
905     BASENAME##A;                                                                            \
906     if(Y_MASK##A != 0)                                                                      \
907         BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
908     else                                                                                    \
909         BASENAME##A = 0;
910 
911 #define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
912     LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
913     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
914     BASENAME##B;                                                                            \
915     if(Y_MASK##B != 0)                                                                      \
916         BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
917     else                                                                                    \
918         BASENAME##B = 0;
919 
920 #define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
921     LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
922     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
923     BASENAME##C;                                                                            \
924     if(Y_MASK##C != 0)                                                                      \
925         BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
926     else                                                                                    \
927         BASENAME##C = 0;
928 
929 #define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
930     LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
931     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
932     BASENAME##D;                                                                            \
933     if(Y_MASK##D != 0)                                                                      \
934         BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
935     else                                                                                    \
936         BASENAME##D = 0;
937 
938 #define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
939     LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
940     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
941     BASENAME##E;                                                                            \
942     if(Y_MASK##E != 0)                                                                      \
943         BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
944     else                                                                                    \
945         BASENAME##E = 0;
946 
947 #define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
948     LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
949     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
950     BASENAME##F;                                                                            \
951     if(Y_MASK##F != 0)                                                                      \
952         BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
953     else                                                                                    \
954         BASENAME##F = 0;
955 
/** Load blocks (consecutive rows and columns) with Y offset.
 * @name LOAD_BLOCK_INDIRECT
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Y offsets and the Y masks are expected to have consecutive names as well.
 * E.g., for M0=3, Y=yi and Y_MASK=ym, the expected Y offsets are yi0, yi1 and yi2
 * and the expected masks are ym0, ym1 and ym2.
 *
 * The macro dispatches to LOAD_ROW_INDIRECT_<M0>; the extra _STR level lets M0 be
 * macro-expanded before it is pasted into the name.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Y         The y-axis offset vector
 * @param[in] Y_MASK    The y-axis mask vector. If 0, forces BASENAMEn to 0
 * @{
 */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
/** @} */ // end of group LOAD_BLOCK_INDIRECT
978 
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
 * @name LOAD_ELEMENT_n
 *
 * For each row i in [0, n), a single DATA_TYPE value is read from PTR + OFFSET + i * STRIDE_Y
 * and used to initialise a newly declared variable BASENAMEi of type VEC_DATA_TYPE(DATA_TYPE, N0).
 * Row suffixes are single hexadecimal digits (0-9, A-F). Each LOAD_ELEMENT_n expands
 * LOAD_ELEMENT_(n-1) first and then loads row n-1.
 *
 * @note PTR, OFFSET and STRIDE_Y are parenthesized in the expansion, so compound
 *       expressions (e.g. a + b) are evaluated as a whole before being combined.
 *
 * @param[in] N0        The vector size of the destination variables
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @{
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##0 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y)));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##1 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y)));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##2 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y)));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##3 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y)));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##4 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y)));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##5 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y)));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##6 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y)));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##7 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y)));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##8 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y)));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##9 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y)));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##A = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y)));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##B = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y)));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##C = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y)));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##D = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y)));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##E = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y)));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##F = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y)));

/** @}*/ // end of group LOAD_ELEMENT_n
1070 
/** Load Scalar as Vector (consecutive elements).
 * @name LOAD_SCALAR_AS_VECTOR
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 *
 * The macro dispatches to LOAD_ELEMENT_<M0>: for every row a single DATA_TYPE value is read
 * and assigned to a variable of type VEC_DATA_TYPE(DATA_TYPE, N0). The extra _STR level lets
 * M0 be macro-expanded before it is pasted into the name.
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @{
 */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
1090 
/** Basic macros to calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET_n
 *
 * For each i in [0, n) the expansion computes
 *   Zi = min(DEPTH_GEMM3D - 1, (i + Y) / HEIGHT_GEMM3D) * (CROSS_PLANE_PAD * STRIDE_Y)
 * with all operands cast to DATA_TYPE. The Zi variables must already be declared by the caller.
 * Each CALCULATE_Z_OFFSET_n expands CALCULATE_Z_OFFSET_(n-1) first and then handles Z(n-1).
 *
 * @note HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD and STRIDE_Y are parenthesized in the
 *       expansion, so compound expressions (e.g. a + b) are evaluated as a whole.
 *
 * @param[in] M0              The number of offset values to calculate (not used by the expansion itself)
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 *
 * @{
 */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0); \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1); \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2); \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3); \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4); \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5); \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6); \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##7 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##7); \
    Z##7 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

/** @} */ // end of group CALCULATE_Z_OFFSET_n
1153 
/** Calculate Z offset values from Z0 to Zn-1
 * @name CALCULATE_Z_OFFSET
 *
 * The Z offsets are expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin0, zin1, zin2.
 * Note that, CROSS_PLANE_PAD (cross plane padding) is required to take into account
 * the possible cross plane paddings in case of the plane changes across the z-dimension.
 *
 * The macro dispatches to CALCULATE_Z_OFFSET_<M0> (variants for M0=1,...,8 are defined above);
 * the extra _STR level lets M0 be macro-expanded before it is pasted into the name.
 *
 * <!--
 * |                  |
 * |      plane0      |
 * |                  |
 * |__________________|
 * |******************|
 * |  cross_plane_pad |
 * |******************|
 * |                  |
 * |      plane1      |
 * |                  |
 * |__________________|
 * -->
 *
 * @param[in] M0              The number of offset values to calculate
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 * @{
 */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
1189 
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
 * @name SCALE_ROW_n
 *
 * Every row BASENAMEi, i in [0, n), is multiplied in place by SCALE converted to DATA_TYPE.
 * Row suffixes are single hexadecimal digits (0-9, A-F). Each SCALE_ROW_n expands
 * SCALE_ROW_(n-1) first and then scales row n-1.
 *
 * @note SCALE is parenthesized before the cast, so a compound expression (e.g. a + b)
 *       is converted to DATA_TYPE as a whole rather than only its first operand.
 *
 * @param[in] DATA_TYPE The data type of the variables
 * @param[in] BASENAME  The basename of the variables
 * @param[in] SCALE     The scale factor
 * @{
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##1 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##2 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##3 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##4 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##5 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##6 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##7 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##8 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
    BASENAME##9 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##A *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##B *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##C *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##D *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##E *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##F *= (DATA_TYPE)(SCALE);
/** @} */ // end of group SCALE_ROW_n
1261 
/** Scale elements stored in a block (BASENAME)
 * @name SCALE_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * The macro dispatches to SCALE_ROW_<N>, which multiplies each of the N rows in place by SCALE.
 * The extra _STR level lets N be macro-expanded before it is pasted into the name.
 *
 * @param[in] N         The number of rows in the block
 * @param[in] DATA_TYPE The data type of the block
 * @param[in] BASENAME  The basename of the block
 * @param[in] SCALE     The scale factor
 * @{
 */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
1276 
/** Gather one component (column) out of a set of source vectors into a new vector
 * @name COLUMN_VECTORn
 *
 * COLUMN_VECTORn declares BASENAME<IDX_COL> with n elements, where element i is
 * component IDX_COL (accessed as .s<IDX_COL>) of the source vector X<i>.
 * Source suffixes are single hexadecimal digits (0-9, A-F).
 *
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL);
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
        (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
        (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
        (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
        (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
1304 
/** Pack a set of scalar sources into a new vector. Utility macros for transposing a column-vector
 * @name COLUMN_VECTOR_SCALARn
 *
 * COLUMN_VECTOR_SCALARn declares BASENAME<IDX_COL> with n elements, where element i is the
 * source X<i> taken as is (no component selection). Source suffixes are single hexadecimal
 * digits (0-9, A-F).
 *
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))( \
        (X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))( \
        (X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))( \
        (X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
        (X##0), (X##1), (X##2), (X##3), \
        (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
        (X##0), (X##1), (X##2), (X##3), \
        (X##4), (X##5), (X##6), (X##7), \
        (X##8), (X##9), (X##A), (X##B), \
        (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
1332 
/** Create transposed vectors of the given vectors
 * @name TRANSPOSE_K0Xn
 *
 * TRANSPOSE_K0Xn declares n destination vectors (BASENAME0 to BASENAMEn-1, hexadecimal suffixes).
 * Destination vector i is built by COLUMN_VECTOR(K0, i, ...), i.e. it gathers component i of each
 * of the K0 source vectors BS0 to BS(K0-1). For n=1 the sources are taken as scalars
 * (COLUMN_VECTOR_SCALAR). Larger variants expand the smaller ones first.
 *
 * @param[in] K0       The number of source vectors (and the size of each transposed vector)
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 * @{
 */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/** @} */ // end of group TRANSPOSE_K0Xn
1371 
/** Create column vectors to contain the values at the given index for a set of given vectors
 *
 * Dispatches to COLUMN_VECTOR<K0> by pasting K0 onto the macro name with CONCAT;
 * variants are defined for K0=1,2,3,4,8,16.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);
1383 
/** Create column vectors to contain the values at the given index. Utility macro for transposing a column-vector
 *
 * Dispatches to COLUMN_VECTOR_SCALAR<K0> by pasting K0 onto the macro name with CONCAT;
 * variants are defined for K0=1,2,3,4,8,16. The sources are used as scalars.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);
1395 
/** Create transposed vectors from the given source vectors
 *
 * Dispatches to TRANSPOSE_K0X<N0> by pasting N0 onto the macro name with CONCAT;
 * variants are defined for N0=1,2,3,4,8,16. The result is N0 vectors of K0 elements each.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] N0       The size of the source vectors (number of transposed vectors created)
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 *
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)                       \
    (K0, BASENAME, BS, TYPE);
1408 
/** Accumulate the variables BIAS0 to BIASn-1 into BASENAME0 to BASENAMEn-1, row by row
 * @name ADD_ROW_n
 *
 * Each ADD_ROW_n expands ADD_ROW_(n-1) first and then adds row n-1.
 * Row suffixes are single hexadecimal digits (0-9, A-F).
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
#define ADD_ROW_2(BASENAME, BIAS) ADD_ROW_1(BASENAME, BIAS) BASENAME##1 += BIAS##1;
#define ADD_ROW_3(BASENAME, BIAS) ADD_ROW_2(BASENAME, BIAS) BASENAME##2 += BIAS##2;
#define ADD_ROW_4(BASENAME, BIAS) ADD_ROW_3(BASENAME, BIAS) BASENAME##3 += BIAS##3;
#define ADD_ROW_5(BASENAME, BIAS) ADD_ROW_4(BASENAME, BIAS) BASENAME##4 += BIAS##4;
#define ADD_ROW_6(BASENAME, BIAS) ADD_ROW_5(BASENAME, BIAS) BASENAME##5 += BIAS##5;
#define ADD_ROW_7(BASENAME, BIAS) ADD_ROW_6(BASENAME, BIAS) BASENAME##6 += BIAS##6;
#define ADD_ROW_8(BASENAME, BIAS) ADD_ROW_7(BASENAME, BIAS) BASENAME##7 += BIAS##7;
#define ADD_ROW_9(BASENAME, BIAS) ADD_ROW_8(BASENAME, BIAS) BASENAME##8 += BIAS##8;
#define ADD_ROW_10(BASENAME, BIAS) ADD_ROW_9(BASENAME, BIAS) BASENAME##9 += BIAS##9;
#define ADD_ROW_11(BASENAME, BIAS) ADD_ROW_10(BASENAME, BIAS) BASENAME##A += BIAS##A;
#define ADD_ROW_12(BASENAME, BIAS) ADD_ROW_11(BASENAME, BIAS) BASENAME##B += BIAS##B;
#define ADD_ROW_13(BASENAME, BIAS) ADD_ROW_12(BASENAME, BIAS) BASENAME##C += BIAS##C;
#define ADD_ROW_14(BASENAME, BIAS) ADD_ROW_13(BASENAME, BIAS) BASENAME##D += BIAS##D;
#define ADD_ROW_15(BASENAME, BIAS) ADD_ROW_14(BASENAME, BIAS) BASENAME##E += BIAS##E;
#define ADD_ROW_16(BASENAME, BIAS) ADD_ROW_15(BASENAME, BIAS) BASENAME##F += BIAS##F;

/** @} */ // end of group ADD_ROW_n
1480 
/** Add the block (BIAS) to another block (BASENAME)
 * @name ADD_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * The macro dispatches to ADD_ROW_<N>, which adds BIASi to BASENAMEi for each of the N rows.
 * The extra _STR level lets N be macro-expanded before it is pasted into the name.
 *
 * @param[in] N        The number of vectors in the block
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
1494 
/** Add one and the same value to each of the destination variables (BASENAME0 to BASENAMEn-1)
 * @name ADD_ROW_BROADCAST_n
 *
 * Each ADD_ROW_BROADCAST_n expands ADD_ROW_BROADCAST_(n-1) first and then adds BIAS to row n-1.
 * Row suffixes are single hexadecimal digits (0-9, A-F).
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##1 += BIAS;
#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) ADD_ROW_BROADCAST_2(BASENAME, BIAS) BASENAME##2 += BIAS;
#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) ADD_ROW_BROADCAST_3(BASENAME, BIAS) BASENAME##3 += BIAS;
#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) ADD_ROW_BROADCAST_4(BASENAME, BIAS) BASENAME##4 += BIAS;
#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) ADD_ROW_BROADCAST_5(BASENAME, BIAS) BASENAME##5 += BIAS;
#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) ADD_ROW_BROADCAST_6(BASENAME, BIAS) BASENAME##6 += BIAS;
#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) ADD_ROW_BROADCAST_7(BASENAME, BIAS) BASENAME##7 += BIAS;
#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) ADD_ROW_BROADCAST_8(BASENAME, BIAS) BASENAME##8 += BIAS;
#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) ADD_ROW_BROADCAST_9(BASENAME, BIAS) BASENAME##9 += BIAS;
#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) ADD_ROW_BROADCAST_10(BASENAME, BIAS) BASENAME##A += BIAS;
#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) ADD_ROW_BROADCAST_11(BASENAME, BIAS) BASENAME##B += BIAS;
#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) ADD_ROW_BROADCAST_12(BASENAME, BIAS) BASENAME##C += BIAS;
#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) ADD_ROW_BROADCAST_13(BASENAME, BIAS) BASENAME##D += BIAS;
#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) ADD_ROW_BROADCAST_14(BASENAME, BIAS) BASENAME##E += BIAS;
#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) ADD_ROW_BROADCAST_15(BASENAME, BIAS) BASENAME##F += BIAS;
/** @} */ // end of group ADD_ROW_BROADCAST_n
1564 
1565 /** Broadcast (add a value) to each element of the destination block (BASENAME)
1566  * @name ADD_BLOCK_BROADCAST
1567  *
1568  * Supported cases are N=1,2,3,...,16.
 *
 * The expansion is ADD_ROW_BROADCAST_N(BASENAME, BIAS).
 * ADD_BLOCK_BROADCAST forwards to ADD_BLOCK_BROADCAST_STR so that N is macro-expanded before it is
 * pasted into ADD_ROW_BROADCAST_##N; call ADD_BLOCK_BROADCAST when N is itself a macro.
1569  *
1570  * @param[in] N        The number of vectors in the block
1571  * @param[in] BASENAME The basename of the destination variables
1572  * @param[in] BIAS     The variable containing the value to add
1573  * @{
1574  */
1575 #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
1576 #define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
1577 /** @} */ // end of group ADD_BLOCK_BROADCAST
1578 
1579 /** Apply activation to the given variables
1580  * @name ACTIVATION_ROW_n
1581  *
 * ACTIVATION_ROW_n overwrites each of the first n variables (BASENAME followed by the suffix 0-9, then A-F)
 * with the result of ACTIVATION applied to that same variable. Each macro expands the previous one and
 * appends one more assignment; every statement already ends with ';'.
 *
1582  * @param[in] ACTIVATION_TYPE The type of the activation
1583  * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        Forwarded unchanged to ACTIVATION (presumably the number of elements in each vector - confirm against ACTIVATION)
1584  * @param[in] BASENAME        The basename of the variables
1585  * @param[in] A_VAL           Additional value required by the activation
1586  * @param[in] B_VAL           Additional value required by the activation
1587  * @{
1588  */
1589 #define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1590     BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
1591 
1592 #define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1593     ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1594     BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
1595 
1596 #define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1597     ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1598     BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
1599 
1600 #define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1601     ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1602     BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
1603 
1604 #define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1605     ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1606     BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
1607 
1608 #define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1609     ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1610     BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
1611 
1612 #define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1613     ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1614     BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
1615 
1616 #define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1617     ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1618     BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
1619 
1620 #define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1621     ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1622     BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
1623 
1624 #define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1625     ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
1626     BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
1627 
1628 #define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1629     ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1630     BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
1631 
1632 #define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1633     ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1634     BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
1635 
1636 #define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1637     ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1638     BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
1639 
1640 #define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1641     ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1642     BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
1643 
1644 #define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1645     ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1646     BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
1647 
1648 #define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
1649     ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
1650     BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
1651 /** @} */ // end of group ACTIVATION_ROW_n
1652 
1653 /** Apply activation to a block (BASENAME)
1654  * @name ACTIVATION_BLOCK
1655  *
1656  * Supported cases are N=1,2,3,...,16.
 *
 * The expansion is ACTIVATION_ROW_N(...) with all remaining arguments passed through unchanged.
 * ACTIVATION_BLOCK forwards to ACTIVATION_BLOCK_STR so that N is macro-expanded before it is pasted
 * into ACTIVATION_ROW_##N; call ACTIVATION_BLOCK when N is itself a macro.
1657  *
1658  * @param[in] N               The number of vectors in the block
1659  * @param[in] ACTIVATION_TYPE The type of the activation
1660  * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        Forwarded unchanged to ACTIVATION_ROW_N (presumably the number of elements in each vector - confirm against ACTIVATION)
1661  * @param[in] BASENAME        The basename of the variables
1662  * @param[in] A_VAL           Additional value required by the activation
1663  * @param[in] B_VAL           Additional value required by the activation
1664  * @{
1665  */
1666 #define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
1667 #define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
1668 /** @} */ // end of group ACTIVATION_BLOCK
1669 
1670 /** Apply convert_<data_type> to the given variables
1671  * @name CONVERT_ROW_n
1672  *
 * CONVERT_ROW_n emits n statements of the form
 * "VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DST<suffix> = CONVERT(BASENAME_SRC<suffix>, VEC_DATA_TYPE(DATA_TYPE, N));"
 * with the suffixes 0-9, then A-F. Each statement starts with a type, so the destination variables are
 * declared by the macro itself. Each macro expands the previous one and appends one more statement.
 *
1673  * @param[in] N            The size of the vectors
1674  * @param[in] DATA_TYPE    The data type of the vectors
1675  * @param[in] BASENAME_SRC The basename of the source variables
1676  * @param[in] BASENAME_DST The basename of the destination variables
 * @{
1677  */
1678 #define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1679     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1680     BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
1681 
1682 #define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1683     CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1684     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1685     BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
1686 
1687 #define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1688     CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1689     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1690     BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
1691 
1692 #define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1693     CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1694     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1695     BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
1696 
1697 #define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1698     CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1699     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1700     BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
1701 
1702 #define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1703     CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1704     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1705     BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
1706 
1707 #define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1708     CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1709     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1710     BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
1711 
1712 #define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1713     CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1714     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1715     BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
1716 
1717 #define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1718     CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1719     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
1720     BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
1721 
1722 #define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1723     CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
1724     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1725     BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
1726 
1727 #define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1728     CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1729     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1730     BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
1731 
1732 #define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1733     CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1734     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1735     BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
1736 
1737 #define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1738     CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1739     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1740     BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
1741 
1742 #define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1743     CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1744     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1745     BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
1746 
1747 #define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1748     CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1749     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1750     BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
1751 
1752 #define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
1753     CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
1754     VEC_DATA_TYPE(DATA_TYPE, N)                                  \
1755     BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
1756 /** @} */ // end of group CONVERT_ROW_n
1757 
1758 /** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
1759  * @name CONVERT_BLOCK
1760  *
1761  * Supported cases are M=1,2,3,...,16.
 *
 * The expansion is CONVERT_ROW_M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST): M selects the macro, N is passed through.
 * CONVERT_BLOCK forwards to CONVERT_BLOCK_STR so that M is macro-expanded before it is pasted into
 * CONVERT_ROW_##M; call CONVERT_BLOCK when M is itself a macro.
1762  *
1763  * @param[in] M            The number of vectors to convert
1764  * @param[in] N            The size of the vectors
1765  * @param[in] DATA_TYPE    The data type of the vectors
1766  * @param[in] BASENAME_SRC The basename of the source variables
1767  * @param[in] BASENAME_DST The basename of the destination variables
 * @{
1768  */
1769 #define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
1770 #define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
1771 /** @} */ // end of group CONVERT_BLOCK