xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2018-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "tile_helpers.h"
26
27#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact)                     \
28    ({                                                              \
29        comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6;            \
30        comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5;            \
31        comm_fact.s2 = 2.5f * tmp.s3;                               \
32        comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
33        comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6;    \
34        comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4;        \
35        comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
36        \
37        out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
38        out.s1 = comm_fact.s0 + comm_fact.s1;                       \
39        out.s2 = comm_fact.s0 - comm_fact.s1;                       \
40        out.s3 = comm_fact.s3 + comm_fact.s4;                       \
41        out.s4 = comm_fact.s4 - comm_fact.s3;                       \
42        out.s5 = comm_fact.s5 + comm_fact.s6;                       \
43        out.s6 = comm_fact.s5 - comm_fact.s6;                       \
44        out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
45    })
46
47#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact)                                                    \
48    ({                                                                                             \
49        comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6;                                   \
50        comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5;                            \
51        comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6;                                    \
52        comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5;                            \
53        comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6;                                     \
54        comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5;                            \
55        out.s0       = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6;                \
56        out.s1       = comm_fact.s0 - comm_fact.s1;                                                \
57        out.s2       = comm_fact.s0 + comm_fact.s1;                                                \
58        out.s3       = comm_fact.s2 - comm_fact.s3;                                                \
59        out.s4       = comm_fact.s2 + comm_fact.s3;                                                \
60        out.s5       = comm_fact.s4 - comm_fact.s5;                                                \
61        out.s6       = comm_fact.s4 + comm_fact.s5;                                                \
62        out.s7       = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
63    })
64
65#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
66
67#if defined(NHWC)
68#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
69//! @cond Doxygen_Suppress
70/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
71 *
72 * @note Data layout supported: NHWC
73 * @note Data type supported: F32/F16
74 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
75 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
76 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
77 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
78 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
79 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
80 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
81 *
82 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
83 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
84 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
85 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
86 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
87 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
88 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
89 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
90 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
91 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
92 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
93 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
94 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
96 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
98 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
99 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
100 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
101 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
102 * @param[in] _ISRC_WIDTH                       The src tensor's width
103 * @param[in] _ISRC_HEIGHT                      The src tensor's height
104 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
105 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
106 */
107//! @endcond
108__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
109    TENSOR4D(src, BUFFER),
110    TENSOR4D(dst, BUFFER),
111    const int _ISRC_WIDTH,
112    const int _ISRC_HEIGHT,
113    const int _INUM_TILES_X,
114    const int _INUM_TILES_Y)
115{
116    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
117    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
118    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
119
120    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
121    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
122    x -= PAD_LEFT;
123    y -= PAD_TOP;
124
125#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
126
127    TILE(DATA_TYPE, 6, 1, in);
128    TILE(DATA_TYPE, 6, 1, out);
129
130    // Initialize the input tile
131    LOOP_UNROLLING(int, i, 0, 1, 6,
132    {
133        in[i].v = 0;
134    })
135
136#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
137    T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
138#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
139    T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
140#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
141
142    TILE(DATA_TYPE, 6, 1, com);
143
144    LOOP_UNROLLING(int, i, 0, 1, 6,
145    {
146        in[i].v *= 4.0f;
147    })
148
149    com[0].v = in[2].v - 4.f * in[0].v;
150    com[1].v = in[3].v - 4.f * in[1].v;
151    com[2].v = in[4].v - 4.f * in[2].v;
152    com[3].v = in[5].v - 4.f * in[3].v;
153    com[4].v = in[3].v - in[1].v;
154    com[4].v = com[4].v + com[4].v;
155    com[5].v = in[4].v - in[2].v;
156
157    out[0].v = com[2].v - com[0].v;
158    out[1].v = com[2].v + com[1].v;
159    out[2].v = com[2].v - com[1].v;
160    out[3].v = com[5].v + com[4].v;
161    out[4].v = com[5].v - com[4].v;
162    out[5].v = com[3].v - com[1].v;
163
164    TILE(uint, 6, 1, dst_indirect_y);
165
166    LOOP_UNROLLING(int, i, 0, 1, 6,
167    {
168        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
169        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
170    })
171
172    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
173
174#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
175
176    TILE(DATA_TYPE, 36, 1, in);
177
178    // Initialize the input tile
179    LOOP_UNROLLING(int, i, 0, 1, 36,
180    {
181        in[i].v = 0;
182    })
183
184    // Load the tile from a NHWC tensor
185    T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
186
187    TILE(DATA_TYPE, 6, 1, com);
188    TILE(DATA_TYPE, 36, 1, tmp);
189
190    LOOP_UNROLLING(int, i, 0, 1, 6,
191    {
192        com[0].v         = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
193        com[1].v         = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
194        com[2].v         = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
195        com[3].v         = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
196        com[4].v         = in[3 * 6 + i].v - in[1 * 6 + i].v;
197        com[4].v         = com[4].v + com[4].v;
198        com[5].v         = in[4 * 6 + i].v - in[2 * 6 + i].v;
199        tmp[i + 0 * 6].v = com[2].v - com[0].v;
200        tmp[i + 1 * 6].v = com[2].v + com[1].v;
201        tmp[i + 2 * 6].v = com[2].v - com[1].v;
202        tmp[i + 3 * 6].v = com[5].v + com[4].v;
203        tmp[i + 4 * 6].v = com[5].v - com[4].v;
204        tmp[i + 5 * 6].v = com[3].v - com[1].v;
205    })
206
207    TILE(DATA_TYPE, 36, 1, out);
208
209    LOOP_UNROLLING(int, i, 0, 1, 6,
210    {
211        com[0].v         = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;
212        com[1].v         = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;
213        com[2].v         = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;
214        com[3].v         = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;
215        com[4].v         = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
216        com[4].v         = com[4].v + com[4].v;
217        com[5].v         = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
218        out[i * 6 + 0].v = com[2].v - com[0].v;
219        out[i * 6 + 1].v = com[2].v + com[1].v;
220        out[i * 6 + 2].v = com[2].v - com[1].v;
221        out[i * 6 + 3].v = com[5].v + com[4].v;
222        out[i * 6 + 4].v = com[5].v - com[4].v;
223        out[i * 6 + 5].v = com[3].v - com[1].v;
224    })
225
226    // Compute destination address
227    TILE(uint, 36, 1, dst_indirect_y);
228
229    LOOP_UNROLLING(int, i, 0, 1, 36,
230    {
231        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
232        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
233    })
234
235    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
236#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
237}
238#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
239
240#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
241//! @cond Doxygen_Suppress
242/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
243 *
244 * @note Data layout supported: NHWC
245 * @note Data type supported: F32/F16
246 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
247 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
248 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
249 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
250 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
251 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
252 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
253 *
254 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
255 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
256 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
257 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
258 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
259 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
260 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
261 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
262 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
263 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
264 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
265 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
266 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
267 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
268 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
269 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
270 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
271 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
272 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
273 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
274 * @param[in] _ISRC_WIDTH                       The src tensor's width
275 * @param[in] _ISRC_HEIGHT                      The src tensor's height
276 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
277 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
278 */
279//! @endcond
280__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
281    TENSOR4D(src, BUFFER),
282    TENSOR4D(dst, BUFFER),
283    const int _ISRC_WIDTH,
284    const int _ISRC_HEIGHT,
285    const int _INUM_TILES_X,
286    const int _INUM_TILES_Y)
287{
288    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
289    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
290    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
291
292    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
293    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
294    x -= PAD_LEFT;
295    y -= PAD_TOP;
296
297#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
298
299    TILE(DATA_TYPE, 8, 1, in);
300    TILE(DATA_TYPE, 8, 1, out);
301
302    // Initialize the input tile
303    LOOP_UNROLLING(int, i, 0, 1, 8,
304    {
305        in[i].v = 0;
306    })
307
308#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
309    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
310#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
311    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
312#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
313
314    TILE(DATA_TYPE, 1, 8, com);
315
316    com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
317    com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
318    com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
319    com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
320    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
321    com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
322    out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
323    out[1].s[0] = com[0].s[0] + com[0].s[1];
324    out[2].s[0] = com[0].s[0] - com[0].s[1];
325    out[3].s[0] = com[0].s[3] + com[0].s[2];
326    out[4].s[0] = com[0].s[3] - com[0].s[2];
327    out[5].s[0] = com[0].s[4] + com[0].s[5];
328    out[6].s[0] = com[0].s[4] - com[0].s[5];
329    out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
330
331    TILE(uint, 8, 1, dst_indirect_y);
332
333    LOOP_UNROLLING(int, i, 0, 1, 8,
334    {
335        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
336        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
337    })
338
339    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
340
341#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
342
343    TILE(DATA_TYPE, 64, 1, in);
344    TILE(DATA_TYPE, 64, 1, out);
345
346    // Initialize the input tile
347    LOOP_UNROLLING(int, i, 0, 1, 64,
348    {
349        in[i].v = 0;
350    })
351
352    // Load the tile from a NHWC tensor
353    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
354
355    TILE(DATA_TYPE, 8, 8, com);
356
357    LOOP_UNROLLING(int, i, 0, 1, 8,
358    {
359        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                                    // x
360        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];                                    // x
361        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                 // x
362        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
363        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
364        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
365        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
366        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
367    })
368
369    TILE(DATA_TYPE, 8, 8, tmp);
370    tmp[0].v = com[6].v;
371    tmp[1].v = com[0].v + com[1].v;
372    tmp[2].v = com[0].v - com[1].v;
373    tmp[3].v = com[2].v + com[3].v;
374    tmp[4].v = com[2].v - com[3].v;
375    tmp[5].v = com[4].v + com[5].v;
376    tmp[6].v = com[4].v - com[5].v;
377    tmp[7].v = com[7].v;
378
379    LOOP_UNROLLING(int, i, 0, 1, 8,
380    {
381        com[0].s[0]         = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
382        com[0].s[1]         = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
383        com[0].s[2]         = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
384        com[0].s[3]         = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
385        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
386        com[0].s[5]         = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
387        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
388        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
389        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
390        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
391        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
392        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
393        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
394        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
395    })
396
397    TILE(uint, 64, 1, dst_indirect_y);
398
399    LOOP_UNROLLING(int, i, 0, 1, 64,
400    {
401        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
402        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
403    })
404
405    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
406
407#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
408}
409#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
410
411#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
412//! @cond Doxygen_Suppress
413/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
414 *
415 * @note Data layout supported: NHWC
416 * @note Data type supported: F32/F16
417 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
418 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
419 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
420 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
421 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
422 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
423 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
424 *
425 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
426 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
427 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
429 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
430 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
431 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
432 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
433 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
434 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
435 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
436 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
437 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
438 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
439 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
440 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
441 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
442 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
443 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
444 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
445 * @param[in] _ISRC_WIDTH                       The src tensor's width
446 * @param[in] _ISRC_HEIGHT                      The src tensor's height
447 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
448 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
449 */
450//! @endcond
451__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
452    TENSOR4D(src, BUFFER),
453    TENSOR4D(dst, BUFFER),
454    const int _ISRC_WIDTH,
455    const int _ISRC_HEIGHT,
456    const int _INUM_TILES_X,
457    const int _INUM_TILES_Y)
458{
459    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
460    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
461    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
462
463    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
464    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
465    x -= PAD_LEFT;
466    y -= PAD_TOP;
467
468#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
469
470    TILE(DATA_TYPE, 8, 1, in);
471    TILE(DATA_TYPE, 8, 1, out);
472
473    // Initialize the input tile
474    LOOP_UNROLLING(int, i, 0, 1, 8,
475    {
476        in[i].v = 0;
477    })
478
479#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
480    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
481#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
482    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
483#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
484
485    LOOP_UNROLLING(int, i, 0, 1, 8,
486    {
487        in[i].v *= (DATA_TYPE) - 36.0f;
488    })
489
490    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
491
492    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
493    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
494    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
495    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
496    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
497    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
498    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
499    out[1].s[0] = com[0].s[0] - com[0].s[1];
500    out[2].s[0] = com[0].s[0] + com[0].s[1];
501    out[3].s[0] = com[0].s[2] - com[0].s[3];
502    out[4].s[0] = com[0].s[2] + com[0].s[3];
503    out[5].s[0] = com[0].s[4] - com[0].s[5];
504    out[6].s[0] = com[0].s[4] + com[0].s[5];
505    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
506
507    TILE(uint, 8, 1, dst_indirect_y);
508
509    LOOP_UNROLLING(int, i, 0, 1, 8,
510    {
511        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
512        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
513    })
514
515    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
516
517#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
518
519    TILE(DATA_TYPE, 64, 1, in);
520    TILE(DATA_TYPE, 64, 1, out);
521
522    // Initialize the input tile
523    LOOP_UNROLLING(int, i, 0, 1, 64,
524    {
525        in[i].v = 0;
526    })
527
528    // Load the tile from a NHWC tensor
529    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
530
531    TILE(DATA_TYPE, 8, 8, com);
532
533    LOOP_UNROLLING(int, i, 0, 1, 8,
534    {
535        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
536        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
537        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
538        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
539        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
540        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
541        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
542        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
543    })
544
545    TILE(DATA_TYPE, 8, 8, tmp);
546    tmp[0].v = com[6].v;
547    tmp[1].v = com[0].v - com[1].v;
548    tmp[2].v = com[0].v + com[1].v;
549    tmp[3].v = com[2].v - com[3].v;
550    tmp[4].v = com[2].v + com[3].v;
551    tmp[5].v = com[4].v - com[5].v;
552    tmp[6].v = com[4].v + com[5].v;
553    tmp[7].v = com[7].v;
554
555    LOOP_UNROLLING(int, i, 0, 1, 8,
556    {
557        com[0].s[0]         = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
558        com[0].s[1]         = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
559        com[0].s[2]         = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
560        com[0].s[3]         = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
561        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
562        com[0].s[5]         = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
563        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
564        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
565        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
566        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
567        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
568        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
569        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
570        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
571    })
572
573    TILE(uint, 64, 1, dst_indirect_y);
574
575    LOOP_UNROLLING(int, i, 0, 1, 64,
576    {
577        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
578        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
579    })
580
581    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
582
583#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
584}
585#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
586
587#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
588//! @cond Doxygen_Suppress
589/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
590 *
591 * @note Data layout supported: NHWC
592 * @note Data type supported: F32/F16
593 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
594 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
595 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
596 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
597 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
598 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
599 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
600 *
601 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
602 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
603 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
605 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
607 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
608 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
609 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
610 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
611 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
612 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
613 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
614 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
615 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
616 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
617 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
618 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
619 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
620 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
621 * @param[in] _ISRC_WIDTH                       The src tensor's width
622 * @param[in] _ISRC_HEIGHT                      The src tensor's height
623 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
624 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
625 */
626//! @endcond
627__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
628    TENSOR4D(src, BUFFER),
629    TENSOR4D(dst, BUFFER),
630    const int _ISRC_WIDTH,
631    const int _ISRC_HEIGHT,
632    const int _INUM_TILES_X,
633    const int _INUM_TILES_Y)
634{
635    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
636                                                 src_stride_x,
637                                                 src_step_x,
638                                                 src_stride_y,
639                                                 src_step_y,
640                                                 src_stride_z,
641                                                 src_step_z,
642                                                 src_stride_w,
643                                                 src_step_w,
644                                                 src_offset_first_element_in_bytes,
645                                                 dst_ptr,
646                                                 dst_stride_x,
647                                                 dst_step_x,
648                                                 dst_stride_y,
649                                                 dst_step_y,
650                                                 dst_stride_z,
651                                                 dst_step_z,
652                                                 dst_stride_w,
653                                                 dst_step_w,
654                                                 dst_offset_first_element_in_bytes,
655                                                 _ISRC_WIDTH,
656                                                 _ISRC_HEIGHT,
657                                                 _INUM_TILES_X,
658                                                 _INUM_TILES_Y);
659}
660#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
661
662#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
663//! @cond Doxygen_Suppress
664/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
665 *
666 * @note Data layout supported: NHWC
667 * @note Data type supported: F32/F16
668 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
669 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
670 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
671 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
672 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
673 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
674 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
675 *
676 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
677 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
678 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
679 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
680 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
681 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
682 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
683 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
684 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
685 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
686 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
687 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
688 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
689 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
690 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
691 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
692 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
693 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
694 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
695 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
696 * @param[in] _ISRC_WIDTH                       The src tensor's width
697 * @param[in] _ISRC_HEIGHT                      The src tensor's height
698 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
699 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
700 */
701//! @endcond
702__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
703    TENSOR4D(src, BUFFER),
704    TENSOR4D(dst, BUFFER),
705    const int _ISRC_WIDTH,
706    const int _ISRC_HEIGHT,
707    const int _INUM_TILES_X,
708    const int _INUM_TILES_Y)
709{
710    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
711                                                 src_stride_x,
712                                                 src_step_x,
713                                                 src_stride_y,
714                                                 src_step_y,
715                                                 src_stride_z,
716                                                 src_step_z,
717                                                 src_stride_w,
718                                                 src_step_w,
719                                                 src_offset_first_element_in_bytes,
720                                                 dst_ptr,
721                                                 dst_stride_x,
722                                                 dst_step_x,
723                                                 dst_stride_y,
724                                                 dst_step_y,
725                                                 dst_stride_z,
726                                                 dst_step_z,
727                                                 dst_stride_w,
728                                                 dst_step_w,
729                                                 dst_offset_first_element_in_bytes,
730                                                 _ISRC_WIDTH,
731                                                 _ISRC_HEIGHT,
732                                                 _INUM_TILES_X,
733                                                 _INUM_TILES_Y);
734}
735#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
736
737#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
738//! @cond Doxygen_Suppress
739/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
740 *
741 * @note Data layout supported: NHWC
742 * @note Data type supported: F32/F16
743 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
744 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
745 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
746 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
747 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
748 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
749 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
750 *
751 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
752 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
753 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
754 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
755 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
756 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
757 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
758 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
759 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
760 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
761 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
762 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
763 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
764 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
765 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
766 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
767 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
768 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
769 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
770 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
771 * @param[in] _ISRC_WIDTH                       The src tensor's width
772 * @param[in] _ISRC_HEIGHT                      The src tensor's height
773 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
774 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
775 */
776//! @endcond
777__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
778    TENSOR4D(src, BUFFER),
779    TENSOR4D(dst, BUFFER),
780    const int _ISRC_WIDTH,
781    const int _ISRC_HEIGHT,
782    const int _INUM_TILES_X,
783    const int _INUM_TILES_Y)
784{
785    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
786                                                 src_stride_x,
787                                                 src_step_x,
788                                                 src_stride_y,
789                                                 src_step_y,
790                                                 src_stride_z,
791                                                 src_step_z,
792                                                 src_stride_w,
793                                                 src_step_w,
794                                                 src_offset_first_element_in_bytes,
795                                                 dst_ptr,
796                                                 dst_stride_x,
797                                                 dst_step_x,
798                                                 dst_stride_y,
799                                                 dst_step_y,
800                                                 dst_stride_z,
801                                                 dst_step_z,
802                                                 dst_stride_w,
803                                                 dst_step_w,
804                                                 dst_offset_first_element_in_bytes,
805                                                 _ISRC_WIDTH,
806                                                 _ISRC_HEIGHT,
807                                                 _INUM_TILES_X,
808                                                 _INUM_TILES_Y);
809}
810#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
811
812#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
813//! @cond Doxygen_Suppress
814/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
815 *
816 * @note Data layout supported: NHWC
817 * @note Data type supported: F32/F16
818 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
819 *
820 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
821 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
822 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
823 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
824 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
825 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
826 *
827 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
828 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
829 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
830 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
831 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
832 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
833 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
834 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
835 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
836 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
837 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
838 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
839 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
840 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
841 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
842 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
843 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
844 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
845 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
846 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
847 * @param[in] _ISRC_WIDTH                       The src tensor's width
848 * @param[in] _ISRC_HEIGHT                      The src tensor's height
849 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
850 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
851 */
852//! @endcond
853__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
854    TENSOR4D(src, BUFFER),
855    TENSOR4D(dst, BUFFER),
856    const int _ISRC_WIDTH,
857    const int _ISRC_HEIGHT,
858    const int _INUM_TILES_X,
859    const int _INUM_TILES_Y)
860{
861    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
862                                                 src_stride_x,
863                                                 src_step_x,
864                                                 src_stride_y,
865                                                 src_step_y,
866                                                 src_stride_z,
867                                                 src_step_z,
868                                                 src_stride_w,
869                                                 src_step_w,
870                                                 src_offset_first_element_in_bytes,
871                                                 dst_ptr,
872                                                 dst_stride_x,
873                                                 dst_step_x,
874                                                 dst_stride_y,
875                                                 dst_step_y,
876                                                 dst_stride_z,
877                                                 dst_step_z,
878                                                 dst_stride_w,
879                                                 dst_step_w,
880                                                 dst_offset_first_element_in_bytes,
881                                                 _ISRC_WIDTH,
882                                                 _ISRC_HEIGHT,
883                                                 _INUM_TILES_X,
884                                                 _INUM_TILES_Y);
885}
886#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
887
888#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
889//! @cond Doxygen_Suppress
890/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
891 *
892 * @note Data layout supported: NHWC
893 * @note Data type supported: F32/F16
894 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
895 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
896 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
897 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
898 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
899 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
900 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
901 *
902 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
903 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
904 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
905 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
906 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
907 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
908 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
909 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
910 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
911 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
912 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
913 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
914 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
915 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
916 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
917 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
918 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
919 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
920 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
921 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
922 * @param[in] _ISRC_WIDTH                       The src tensor's width
923 * @param[in] _ISRC_HEIGHT                      The src tensor's height
924 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
925 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
926 */
927//! @endcond
928__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
929    TENSOR4D(src, BUFFER),
930    TENSOR4D(dst, BUFFER),
931    const int _ISRC_WIDTH,
932    const int _ISRC_HEIGHT,
933    const int _INUM_TILES_X,
934    const int _INUM_TILES_Y)
935{
936    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
937                                                 src_stride_x,
938                                                 src_step_x,
939                                                 src_stride_y,
940                                                 src_step_y,
941                                                 src_stride_z,
942                                                 src_step_z,
943                                                 src_stride_w,
944                                                 src_step_w,
945                                                 src_offset_first_element_in_bytes,
946                                                 dst_ptr,
947                                                 dst_stride_x,
948                                                 dst_step_x,
949                                                 dst_stride_y,
950                                                 dst_step_y,
951                                                 dst_stride_z,
952                                                 dst_step_z,
953                                                 dst_stride_w,
954                                                 dst_step_w,
955                                                 dst_offset_first_element_in_bytes,
956                                                 _ISRC_WIDTH,
957                                                 _ISRC_HEIGHT,
958                                                 _INUM_TILES_X,
959                                                 _INUM_TILES_Y);
960}
961#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
962
963#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
964//! @cond Doxygen_Suppress
965/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
966 *
967 * @note Data layout supported: NHWC
968 * @note Data type supported: F32/F16
969 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
970 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
971 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
972 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
973 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
974 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
975 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
976 *
977 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
978 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
979 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
980 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
981 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
982 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
983 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
984 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
985 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
986 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
987 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
988 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
989 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
990 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
991 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
992 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
993 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
994 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
995 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
996 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
997 * @param[in] _ISRC_WIDTH                       The src tensor's width
998 * @param[in] _ISRC_HEIGHT                      The src tensor's height
999 * @param[in] _INUM_TILES_X                     The number of tiles in the X dimension
1000 * @param[in] _INUM_TILES_Y                     The number of tiles in the Y dimension
1001 */
1002//! @endcond
1003__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
1004    TENSOR4D(src, BUFFER),
1005    TENSOR4D(dst, BUFFER),
1006    const int _ISRC_WIDTH,
1007    const int _ISRC_HEIGHT,
1008    const int _INUM_TILES_X,
1009    const int _INUM_TILES_Y)
1010{
1011    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
1012                                                 src_stride_x,
1013                                                 src_step_x,
1014                                                 src_stride_y,
1015                                                 src_step_y,
1016                                                 src_stride_z,
1017                                                 src_step_z,
1018                                                 src_stride_w,
1019                                                 src_step_w,
1020                                                 src_offset_first_element_in_bytes,
1021                                                 dst_ptr,
1022                                                 dst_stride_x,
1023                                                 dst_step_x,
1024                                                 dst_stride_y,
1025                                                 dst_step_y,
1026                                                 dst_stride_z,
1027                                                 dst_step_z,
1028                                                 dst_stride_w,
1029                                                 dst_step_w,
1030                                                 dst_offset_first_element_in_bytes,
1031                                                 _ISRC_WIDTH,
1032                                                 _ISRC_HEIGHT,
1033                                                 _INUM_TILES_X,
1034                                                 _INUM_TILES_Y);
1035}
1036#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
1037#endif // defined(NHWC)
1038#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
1039