xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/common/gemm_utils.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2017-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "gemm_helpers.h"
25#include "helpers.h"
26#include "repeat.h"
27#include "tile_helpers.h"
28
29#if defined(RESHAPE_LHS_NT)
30/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
31 *  the output matrix unrolling the values.
32 *
33 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
34 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
35 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
36 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
37 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
38 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
39 * @note Only the following values for M0, K0 and V0 are supported:
40 *                                      M0: 2,3,4,5,6,7,8
41 *                                      K0: 2,3,4,8,16
42 *                                      V0: greater than 0
43 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
44 *
45 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
46 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
47 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
48 * @param[in] src_w                             The size of the width dimension of the source tensor
49 * @param[in] src_h                             The size of the height dimension of the source tensor
50 * @param[in] src_n                             The size of the depth dimension of the source tensor
51 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
52 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
53 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
54 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
55 * @param[in] dst_w                             The size of the width dimension of the destination tensor
56 * @param[in] dst_h                             The size of the height dimension of the destination tensor
57 * @param[in] dst_n                             The size of the depth dimension of the destination tensor
58 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
59 * @param[in] M                                 The size of height dimension of the source tensor, affected by reinterpret_input_as_3d
60 * @param[in] V0                                The number of blocks to place on the same row. It must be greater than 0.
61 */
62__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
63                                         TENSOR3D_T(dst, BUFFER),
64                                         const int M,
65                                         const int V0)
66{
67    // Block size
68#define BLOCK_SIZE ((M0) * (K0))
69
70    // Output offset X
71#if defined(INTERLEAVE)
72#define OUTPUT_OFFSET_X (K0)
73#else // defined(INTERLEAVE)
74#define OUTPUT_OFFSET_X (BLOCK_SIZE)
75#endif // defined(INTERLEAVE)
76
77    // Output step X
78#if defined(INTERLEAVE)
79#define OUTPUT_STEP_X (K0) * (V0)
80#else // Do not interleave
81#define OUTPUT_STEP_X (K0)
82#endif // defined(INTERLEAVE)
83
84    const int x = GET_SPATIAL_IDX(0, 1, 0); // K
85    const int y = GET_SPATIAL_IDX(1, 1, 0); // M
86    const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
87
88    const int xi = x * K0;
89    const int yi = y * M0;
90
91    const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
92    const int yo = (y / V0);
93
94    // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
95    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
96    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
97
98    TILE(DATA_TYPE, M0, K0, in);
99
100    // Initialize the input tile to zero
101    LOOP_UNROLLING(int, _i, 0, 1, M0,
102    {
103        in[_i].v = 0;
104    });
105
106    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
107    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
108    // Load input tile
109    TILE(uint, M0, 1, in_indirect_y);
110    LOOP_UNROLLING(int, _i, 0, 1, M0,
111    {
112        in_indirect_y[_i].v = _i;
113
114    });
115#if PARTIAL_M0 != 0
116    if(y_cond)
117    {
118        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
119    }
120    else
121#endif // PARTIAL_M0 != 0
122    {
123        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
124    }
125
126    // Store output tile
127    TILE(uint, M0, 1, dst_indirect_y);
128    LOOP_UNROLLING(int, _i, 0, 1, M0,
129    {
130        dst_indirect_y[_i].v = _i;
131    });
132
133    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
134#undef BLOCK_SIZE
135#undef OUTPUT_OFFSET_X
136#undef OUTPUT_STEP_X
137}
138#endif // defined(RESHAPE_LHS_NT)
139
140#if defined(RESHAPE_LHS_T)
141/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
142 *  the output matrix unrolling the values.
143 *
144 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
145 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
146 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
147 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
148 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
149 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
150 * @note Only the following values for M0, K0 and V0 are supported:
151 *                                      M0: 2,3,4,8,16
152 *                                      K0: 2,3,4,8,16
153 *                                      V0: greater than 0
154 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
155 *
156 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
157 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
158 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
159 * @param[in] src_w                             The size of the width dimension of the source tensor
160 * @param[in] src_h                             The size of the height dimension of the source tensor
161 * @param[in] src_n                             The size of the depth dimension of the source tensor
162 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
163 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
164 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
165 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
166 * @param[in] dst_w                             The size of the width dimension of the destination tensor
167 * @param[in] dst_h                             The size of the height dimension of the destination tensor
168 * @param[in] dst_n                             The size of the depth dimension of the destination tensor
169 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
170 * @param[in] M                                 The size of height dimension of the source tensor, affected by reinterpret_input_as_3d
171 * @param[in] V0                                The number of blocks to place on the same row. It must be greater than 0
172 */
173__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
174                                        TENSOR3D_T(dst, BUFFER),
175                                        const int M,
176                                        const int V0)
177{
178    // Block size
179#define BLOCK_SIZE ((M0) * (K0))
180
181    // Output offset X
182#if defined(INTERLEAVE)
183#define OUTPUT_OFFSET_X (M0)
184#else // defined(INTERLEAVE)
185#define OUTPUT_OFFSET_X (BLOCK_SIZE)
186#endif // defined(INTERLEAVE)
187
188    // Output step X
189#if defined(INTERLEAVE)
190#define OUTPUT_STEP_X (M0) * (V0)
191#else // Do not interleave
192#define OUTPUT_STEP_X (M0)
193#endif // defined(INTERLEAVE)
194
195    const int x = GET_SPATIAL_IDX(0, 1, 0); // K
196    const int y = GET_SPATIAL_IDX(1, 1, 0); // M
197    const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
198
199    const int xi = x * K0;
200    const int yi = y * M0;
201
202    const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
203    const int yo = (y / V0);
204
205    // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
206    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
207    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
208
209    TILE(DATA_TYPE, M0, K0, in);
210    TILE(DATA_TYPE, K0, M0, in_tr);
211
212    // Initialize the tile to zero
213    LOOP_UNROLLING(int, _i, 0, 1, M0,
214    {
215        in[_i].v = 0;
216    });
217
218    // Load input tile
219    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
220    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
221
222    TILE(uint, M0, 1, in_indirect_y);
223    LOOP_UNROLLING(int, _i, 0, 1, M0,
224    {
225        in_indirect_y[_i].v = _i;
226
227    });
228#if PARTIAL_M0 != 0
229    if(y_cond)
230    {
231        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
232    }
233    else
234#endif // PARTIAL_M0 != 0
235    {
236        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
237    }
238    // Transpose input tile
239    LOOP_UNROLLING(int, m0, 0, 1, M0,
240    {
241        LOOP_UNROLLING(int, k0, 0, 1, K0,
242        {
243            in_tr[k0].s[m0] = in[m0].s[k0];
244        })
245    });
246
247    TILE(uint, K0, 1, dst_indirect_y);
248    LOOP_UNROLLING(int, _i, 0, 1, K0,
249    {
250        dst_indirect_y[_i].v = _i;
251    });
252
253    // Store output tile
254    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
255
256#undef BLOCK_SIZE
257#undef OUTPUT_OFFSET_X
258#undef OUTPUT_STEP_X
259}
260#endif // defined(RESHAPE_LHS_T)
261
262#if defined(RESHAPE_RHS_NT)
263/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
264 *  the output matrix unrolling the values.
265 *
266 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
267 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
268 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
269 * @note Only the following values for K0, N0 and H0 are supported:
270 *                                      N0: 2,3,4,8,16
271 *                                      K0: 1,2,3,4,8,16
272 *                                      H0: greater than 0
273 *
274 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
275 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
276 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
277 * @param[in] src_w                             The size of the width dimension of the source tensor
278 * @param[in] src_h                             The size of the height dimension of the source tensor
279 * @param[in] src_n                             The size of the depth dimension of the source tensor
280 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
281 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
282 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
283 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
284 * @param[in] dst_w                             The size of the width dimension of the destination tensor
285 * @param[in] dst_h                             The size of the height dimension of the destination tensor
286 * @param[in] dst_n                             The size of the depth dimension of the destination tensor
287 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
288 * @param[in] H0                                The number of blocks to place on the same row. It must be greater than 0
289 */
290__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
291                                         TENSOR3D_T(dst, BUFFER),
292                                         const int H0)
293{
294    // Block size
295#define BLOCK_SIZE ((K0) * (N0))
296
297    // Output offset X
298#if defined(INTERLEAVE)
299#define OUTPUT_OFFSET_X (N0)
300#else // defined(INTERLEAVE)
301#define OUTPUT_OFFSET_X (BLOCK_SIZE)
302#endif // defined(INTERLEAVE)
303
304    // Output step X
305#if defined(INTERLEAVE)
306#define OUTPUT_STEP_X (N0) * (H0)
307#else // Do not interleave
308#define OUTPUT_STEP_X (N0)
309#endif // defined(INTERLEAVE)
310
311    const int x = GET_SPATIAL_IDX(0, 1, 0);
312    const int y = GET_SPATIAL_IDX(1, 1, 0);
313    const int z = GET_SPATIAL_IDX(2, 1, 0);
314
315    const int xi = x * N0;
316    const int yi = y * K0;
317
318    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
319    const int yo = (x / H0);
320
321    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
322    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
323
324    TILE(DATA_TYPE, K0, N0, in);
325
326    // Initialize the tile to zero
327    for(int i = 0; i < K0; ++i)
328    {
329        in[i].v = 0;
330    }
331
332    // Load input tile
333    for(int i = 0; i < K0; ++i)
334    {
335        if(yi + i < src_h)
336        {
337            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
338        }
339    }
340
341    TILE(uint, K0, 1, dst_indirect_y);
342    for(int i = 0; i < K0; ++i)
343    {
344        dst_indirect_y[i].v = i;
345    }
346
347    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
348
349#undef BLOCK_SIZE
350#undef OUTPUT_OFFSET_X
351#undef OUTPUT_STEP_X
352}
353#endif // defined(RESHAPE_RHS_NT)
354
355#if defined(RESHAPE_RHS_T)
356/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
357 *  the output matrix unrolling the values.
358 *
359 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
360 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
361 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
362 * @note The option -DTRANSPOSE must passed at compile time.
363 * @note Only the following values for K0, N0 and H0 are supported:
364 *                                      N0: 2,3,4,8,16
365 *                                      K0: 2,3,4,8,16
366 *                                      H0: greater than 0
367 *
368 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
369 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
370 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
371 * @param[in] src_w                             The size of the width dimension of the source tensor
372 * @param[in] src_h                             The size of the height dimension of the source tensor
373 * @param[in] src_n                             The size of the depth dimension of the source tensor
374 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
375 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
376 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
377 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
378 * @param[in] dst_w                             The size of the width dimension of the destination tensor
379 * @param[in] dst_h                             The size of the height dimension of the destination tensor
380 * @param[in] dst_n                             The size of the depth dimension of the destination tensor
381 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
382 * @param[in] H0                                The number of blocks to place on the same row. It must be greater than 0.
383 */
384__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
385                                        TENSOR3D_T(dst, BUFFER),
386                                        const int H0)
387{
388    // Block size
389#define BLOCK_SIZE ((K0) * (N0))
390
391    // Output offset X
392#if defined(INTERLEAVE)
393#define OUTPUT_OFFSET_X (K0)
394#else // defined(INTERLEAVE)
395#define OUTPUT_OFFSET_X (BLOCK_SIZE)
396#endif // defined(INTERLEAVE)
397
398    // Output step X
399#if defined(INTERLEAVE)
400#define OUTPUT_STEP_X (K0) * (H0)
401#else // Do not interleave
402#define OUTPUT_STEP_X (K0)
403#endif // defined(INTERLEAVE)
404
405    const int x = GET_SPATIAL_IDX(0, 1, 0);
406    const int y = GET_SPATIAL_IDX(1, 1, 0);
407    const int z = GET_SPATIAL_IDX(2, 1, 0);
408
409    const int xi = x * N0;
410    const int yi = y * K0;
411
412    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
413    const int yo = (x / H0);
414
415    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
416    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
417
418    TILE(DATA_TYPE, K0, N0, in);
419    TILE(DATA_TYPE, N0, K0, in_tr);
420
421    // Initialize the tile to zero
422    for(int i = 0; i < K0; ++i)
423    {
424        in[i].v = 0;
425    }
426
427    // Load input tile
428    for(int i = 0; i < K0; ++i)
429    {
430        if(yi + i < src_h)
431        {
432            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
433        }
434    }
435
436    // Transpose input tile
437    for(int k0 = 0; k0 < K0; ++k0)
438    {
439        for(int n0 = 0; n0 < N0; ++n0)
440        {
441            in_tr[n0].s[k0] = in[k0].s[n0];
442        }
443    }
444
445    TILE(uint, N0, 1, dst_indirect_y);
446    for(int i = 0; i < N0; ++i)
447    {
448        dst_indirect_y[i].v = i;
449    }
450
451    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
452
453#undef BLOCK_SIZE
454#undef OUTPUT_OFFSET_X
455#undef OUTPUT_STEP_X
456}
457
458#endif // defined(RESHAPE_RHS_T)