1/* 2 * Copyright (c) 2018-2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "helpers.h" 25#include "tile_helpers.h" 26 27#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ 28 ({ \ 29 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ 30 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ 31 comm_fact.s2 = 2.5f * tmp.s3; \ 32 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ 33 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ 34 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ 35 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ 36 \ 37 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ 38 out.s1 = comm_fact.s0 + comm_fact.s1; \ 39 out.s2 = comm_fact.s0 - comm_fact.s1; \ 40 out.s3 = comm_fact.s3 + comm_fact.s4; \ 41 out.s4 = comm_fact.s4 - comm_fact.s3; \ 42 out.s5 = comm_fact.s5 + comm_fact.s6; \ 43 out.s6 = comm_fact.s5 - comm_fact.s6; \ 44 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ 45 }) 46 47#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ 48 ({ \ 49 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ 50 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ 51 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ 52 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ 53 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ 54 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ 55 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ 56 out.s1 = comm_fact.s0 - comm_fact.s1; \ 57 out.s2 = comm_fact.s0 + comm_fact.s1; \ 58 out.s3 = comm_fact.s2 - comm_fact.s3; \ 59 out.s4 = comm_fact.s2 + comm_fact.s3; \ 60 out.s5 = comm_fact.s4 - comm_fact.s5; \ 61 out.s6 = comm_fact.s4 + comm_fact.s5; \ 62 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ 63 }) 64 65#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) 66 67#if defined(NHWC) 68#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC) 69//! @cond Doxygen_Suppress 70/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC 71 * 72 * @note Data layout supported: NHWC 73 * @note Data type supported: F32/F16 74 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 75 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 76 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 77 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 78 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 79 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 80 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 81 * 82 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 83 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 84 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 85 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 86 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 87 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 88 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 89 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 90 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 91 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 92 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 93 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 94 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 95 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 96 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 97 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 98 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 99 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 100 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 101 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 102 * @param[in] _ISRC_WIDTH The src tensor's width 103 * @param[in] _ISRC_HEIGHT The src tensor's height 104 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 105 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 106 */ 107//! @endcond 108__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( 109 TENSOR4D(src, BUFFER), 110 TENSOR4D(dst, BUFFER), 111 const int _ISRC_WIDTH, 112 const int _ISRC_HEIGHT, 113 const int _INUM_TILES_X, 114 const int _INUM_TILES_Y) 115{ 116 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM 117 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y 118 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 119 120 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; 121 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; 122 x -= PAD_LEFT; 123 y -= PAD_TOP; 124 125#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 126 127 TILE(DATA_TYPE, 6, 1, in); 128 TILE(DATA_TYPE, 6, 1, out); 129 130 // Initialize the input tile 131 LOOP_UNROLLING(int, i, 0, 1, 6, 132 { 133 in[i].v = 0; 134 }) 135 136#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 137 T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 138#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 139 T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 140#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 141 142 TILE(DATA_TYPE, 6, 1, com); 143 144 LOOP_UNROLLING(int, i, 0, 1, 6, 145 { 146 in[i].v *= 4.0f; 147 }) 148 149 com[0].v = in[2].v - 4.f * in[0].v; 150 com[1].v = in[3].v - 4.f * in[1].v; 151 com[2].v = in[4].v - 4.f * in[2].v; 152 com[3].v = in[5].v - 4.f * in[3].v; 153 com[4].v = in[3].v - in[1].v; 154 com[4].v = com[4].v + com[4].v; 155 com[5].v = in[4].v - in[2].v; 156 157 out[0].v = com[2].v - com[0].v; 158 out[1].v = com[2].v + com[1].v; 159 out[2].v = com[2].v - com[1].v; 160 out[3].v = com[5].v + com[4].v; 161 out[4].v = com[5].v - com[4].v; 162 out[5].v = com[3].v - com[1].v; 163 164 TILE(uint, 6, 1, dst_indirect_y); 165 166 LOOP_UNROLLING(int, i, 0, 1, 6, 167 { 168 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 169 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6; 170 }) 171 172 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 173 174#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 175 176 TILE(DATA_TYPE, 36, 1, in); 177 178 // Initialize the input tile 179 LOOP_UNROLLING(int, i, 0, 1, 36, 180 { 181 in[i].v = 0; 182 }) 183 184 // Load the tile from a NHWC tensor 185 T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 186 187 TILE(DATA_TYPE, 6, 1, com); 188 TILE(DATA_TYPE, 36, 1, tmp); 189 190 LOOP_UNROLLING(int, i, 0, 1, 6, 191 { 192 com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v; 193 com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v; 194 com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v; 195 com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v; 196 com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v; 197 com[4].v = com[4].v + com[4].v; 198 com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v; 199 tmp[i + 0 * 6].v = com[2].v - com[0].v; 200 tmp[i + 1 * 6].v = com[2].v + com[1].v; 201 tmp[i + 2 * 6].v = com[2].v - com[1].v; 202 tmp[i + 3 * 6].v = com[5].v + com[4].v; 203 tmp[i + 4 * 6].v = com[5].v - com[4].v; 204 tmp[i + 5 * 6].v = com[3].v - com[1].v; 205 }) 206 207 TILE(DATA_TYPE, 36, 1, out); 208 209 LOOP_UNROLLING(int, i, 0, 1, 6, 210 { 211 com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v; 212 com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v; 213 com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v; 214 com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v; 215 com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v; 216 com[4].v = com[4].v + com[4].v; 217 com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v; 218 out[i * 6 + 0].v = com[2].v - com[0].v; 219 out[i * 6 + 1].v = com[2].v + com[1].v; 220 out[i * 6 + 2].v = com[2].v - com[1].v; 221 out[i * 6 + 3].v = com[5].v + com[4].v; 222 out[i * 6 + 4].v = com[5].v - com[4].v; 223 out[i * 6 + 5].v = com[3].v - com[1].v; 224 }) 225 226 // Compute destination address 227 TILE(uint, 36, 1, dst_indirect_y); 228 229 LOOP_UNROLLING(int, i, 0, 1, 36, 230 { 231 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 232 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36; 233 }) 234 235 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 236#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 237} 238#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC) 239 240#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC) 241//! @cond Doxygen_Suppress 242/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC 243 * 244 * @note Data layout supported: NHWC 245 * @note Data type supported: F32/F16 246 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 247 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 248 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 249 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 250 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 251 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 252 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 253 * 254 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 255 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 256 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 257 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 258 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 259 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 260 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 261 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 262 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 263 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 264 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 265 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 266 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 267 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 268 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 269 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 270 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 271 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 272 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 273 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 274 * @param[in] _ISRC_WIDTH The src tensor's width 275 * @param[in] _ISRC_HEIGHT The src tensor's height 276 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 277 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 278 */ 279//! @endcond 280__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( 281 TENSOR4D(src, BUFFER), 282 TENSOR4D(dst, BUFFER), 283 const int _ISRC_WIDTH, 284 const int _ISRC_HEIGHT, 285 const int _INUM_TILES_X, 286 const int _INUM_TILES_Y) 287{ 288 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM 289 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y 290 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 291 292 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; 293 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; 294 x -= PAD_LEFT; 295 y -= PAD_TOP; 296 297#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 298 299 TILE(DATA_TYPE, 8, 1, in); 300 TILE(DATA_TYPE, 8, 1, out); 301 302 // Initialize the input tile 303 LOOP_UNROLLING(int, i, 0, 1, 8, 304 { 305 in[i].v = 0; 306 }) 307 308#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 309 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 310#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 311 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 312#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 313 314 TILE(DATA_TYPE, 1, 8, com); 315 316 com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v; 317 com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v; 318 com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v; 319 com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v; 320 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; 321 com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v; 322 out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v; 323 out[1].s[0] = com[0].s[0] + com[0].s[1]; 324 out[2].s[0] = com[0].s[0] - com[0].s[1]; 325 out[3].s[0] = com[0].s[3] + com[0].s[2]; 326 out[4].s[0] = com[0].s[3] - com[0].s[2]; 327 out[5].s[0] = com[0].s[4] + com[0].s[5]; 328 out[6].s[0] = com[0].s[4] - com[0].s[5]; 329 out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v; 330 331 TILE(uint, 8, 1, dst_indirect_y); 332 333 LOOP_UNROLLING(int, i, 0, 1, 8, 334 { 335 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 336 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8; 337 }) 338 339 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 340 341#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 342 343 TILE(DATA_TYPE, 64, 1, in); 344 TILE(DATA_TYPE, 64, 1, out); 345 346 // Initialize the input tile 347 LOOP_UNROLLING(int, i, 0, 1, 64, 348 { 349 in[i].v = 0; 350 }) 351 352 // Load the tile from a NHWC tensor 353 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 354 355 TILE(DATA_TYPE, 8, 8, com); 356 357 LOOP_UNROLLING(int, i, 0, 1, 8, 358 { 359 com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x 360 com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x 361 com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x 362 com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x 363 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; 364 com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0]; 365 com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0]; 366 com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0]; 367 }) 368 369 TILE(DATA_TYPE, 8, 8, tmp); 370 tmp[0].v = com[6].v; 371 tmp[1].v = com[0].v + com[1].v; 372 tmp[2].v = com[0].v - com[1].v; 373 tmp[3].v = com[2].v + com[3].v; 374 tmp[4].v = com[2].v - com[3].v; 375 tmp[5].v = com[4].v + com[5].v; 376 tmp[6].v = com[4].v - com[5].v; 377 tmp[7].v = com[7].v; 378 379 LOOP_UNROLLING(int, i, 0, 1, 8, 380 { 381 com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6]; 382 com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5]; 383 com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; 384 com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6]; 385 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; 386 com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5]; 387 out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6]; 388 out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1]; 389 out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1]; 390 out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2]; 391 out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2]; 392 out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5]; 393 out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5]; 394 out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7]; 395 }) 396 397 TILE(uint, 64, 1, dst_indirect_y); 398 399 LOOP_UNROLLING(int, i, 0, 1, 64, 400 { 401 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 402 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64; 403 }) 404 405 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 406 407#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 408} 409#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC) 410 411#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC) 412//! @cond Doxygen_Suppress 413/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC 414 * 415 * @note Data layout supported: NHWC 416 * @note Data type supported: F32/F16 417 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 418 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 419 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 420 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 421 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 422 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 423 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 424 * 425 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 426 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 427 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 428 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 429 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 430 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 431 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 432 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 433 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 434 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 435 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 436 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 437 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 438 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 439 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 440 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 441 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 442 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 443 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 444 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 445 * @param[in] _ISRC_WIDTH The src tensor's width 446 * @param[in] _ISRC_HEIGHT The src tensor's height 447 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 448 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 449 */ 450//! @endcond 451__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( 452 TENSOR4D(src, BUFFER), 453 TENSOR4D(dst, BUFFER), 454 const int _ISRC_WIDTH, 455 const int _ISRC_HEIGHT, 456 const int _INUM_TILES_X, 457 const int _INUM_TILES_Y) 458{ 459 const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM 460 const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y 461 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 462 463 int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; 464 int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; 465 x -= PAD_LEFT; 466 y -= PAD_TOP; 467 468#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 469 470 TILE(DATA_TYPE, 8, 1, in); 471 TILE(DATA_TYPE, 8, 1, out); 472 473 // Initialize the input tile 474 LOOP_UNROLLING(int, i, 0, 1, 8, 475 { 476 in[i].v = 0; 477 }) 478 479#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 480 T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 481#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 482 T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 483#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 484 485 LOOP_UNROLLING(int, i, 0, 1, 8, 486 { 487 in[i].v *= (DATA_TYPE) - 36.0f; 488 }) 489 490 TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } }; 491 492 com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v; 493 com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v; 494 com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v; 495 com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v; 496 com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; 497 com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v; 498 out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v; 499 out[1].s[0] = com[0].s[0] - com[0].s[1]; 500 out[2].s[0] = com[0].s[0] + com[0].s[1]; 501 out[3].s[0] = com[0].s[2] - com[0].s[3]; 502 out[4].s[0] = com[0].s[2] + com[0].s[3]; 503 out[5].s[0] = com[0].s[4] - com[0].s[5]; 504 out[6].s[0] = com[0].s[4] + com[0].s[5]; 505 out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v; 506 507 TILE(uint, 8, 1, dst_indirect_y); 508 509 LOOP_UNROLLING(int, i, 0, 1, 8, 510 { 511 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 512 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8; 513 }) 514 515 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 516 517#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 518 519 TILE(DATA_TYPE, 64, 1, in); 520 TILE(DATA_TYPE, 64, 1, out); 521 522 // Initialize the input tile 523 LOOP_UNROLLING(int, i, 0, 1, 64, 524 { 525 in[i].v = 0; 526 }) 527 528 // Load the tile from a NHWC tensor 529 T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); 530 531 TILE(DATA_TYPE, 8, 8, com); 532 533 LOOP_UNROLLING(int, i, 0, 1, 8, 534 { 535 com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; 536 com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; 537 com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; 538 com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; 539 com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; 540 com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0]; 541 com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0]; 542 com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0]; 543 }) 544 545 TILE(DATA_TYPE, 8, 8, tmp); 546 tmp[0].v = com[6].v; 547 tmp[1].v = com[0].v - com[1].v; 548 tmp[2].v = com[0].v + com[1].v; 549 tmp[3].v = com[2].v - com[3].v; 550 tmp[4].v = com[2].v + com[3].v; 551 tmp[5].v = com[4].v - com[5].v; 552 tmp[6].v = com[4].v + com[5].v; 553 tmp[7].v = com[7].v; 554 555 LOOP_UNROLLING(int, i, 0, 1, 8, 556 { 557 com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6]; 558 com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5]; 559 com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6]; 560 com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; 561 com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; 562 com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5]; 563 out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6]; 564 out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1]; 565 out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1]; 566 out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3]; 567 out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3]; 568 out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5]; 569 out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5]; 570 out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7]; 571 }) 572 573 TILE(uint, 64, 1, dst_indirect_y); 574 575 LOOP_UNROLLING(int, i, 0, 1, 64, 576 { 577 dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; 578 dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64; 579 }) 580 581 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 582 583#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 584} 585#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC) 586 587#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) 588//! @cond Doxygen_Suppress 589/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC 590 * 591 * @note Data layout supported: NHWC 592 * @note Data type supported: F32/F16 593 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 594 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 595 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 596 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 597 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 598 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 599 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 600 * 601 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 602 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 603 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 604 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 605 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 606 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 607 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 608 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 609 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 610 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 611 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 612 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 613 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 614 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 615 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 616 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 617 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 618 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 619 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 620 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 621 * @param[in] _ISRC_WIDTH The src tensor's width 622 * @param[in] _ISRC_HEIGHT The src tensor's height 623 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 624 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 625 */ 626//! @endcond 627__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc( 628 TENSOR4D(src, BUFFER), 629 TENSOR4D(dst, BUFFER), 630 const int _ISRC_WIDTH, 631 const int _ISRC_HEIGHT, 632 const int _INUM_TILES_X, 633 const int _INUM_TILES_Y) 634{ 635 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, 636 src_stride_x, 637 src_step_x, 638 src_stride_y, 639 src_step_y, 640 src_stride_z, 641 src_step_z, 642 src_stride_w, 643 src_step_w, 644 src_offset_first_element_in_bytes, 645 dst_ptr, 646 dst_stride_x, 647 dst_step_x, 648 dst_stride_y, 649 dst_step_y, 650 dst_stride_z, 651 dst_step_z, 652 dst_stride_w, 653 dst_step_w, 654 dst_offset_first_element_in_bytes, 655 _ISRC_WIDTH, 656 _ISRC_HEIGHT, 657 _INUM_TILES_X, 658 _INUM_TILES_Y); 659} 660#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) 661 662#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) 663//! @cond Doxygen_Suppress 664/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC 665 * 666 * @note Data layout supported: NHWC 667 * @note Data type supported: F32/F16 668 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 669 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 670 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 671 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 672 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 673 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 674 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 675 * 676 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 677 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 678 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 679 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 680 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 681 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 682 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 683 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 684 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 685 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 686 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 687 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 688 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 689 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 690 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 691 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 692 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 693 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 694 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 695 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 696 * @param[in] _ISRC_WIDTH The src tensor's width 697 * @param[in] _ISRC_HEIGHT The src tensor's height 698 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 699 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 700 */ 701//! @endcond 702__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc( 703 TENSOR4D(src, BUFFER), 704 TENSOR4D(dst, BUFFER), 705 const int _ISRC_WIDTH, 706 const int _ISRC_HEIGHT, 707 const int _INUM_TILES_X, 708 const int _INUM_TILES_Y) 709{ 710 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, 711 src_stride_x, 712 src_step_x, 713 src_stride_y, 714 src_step_y, 715 src_stride_z, 716 src_step_z, 717 src_stride_w, 718 src_step_w, 719 src_offset_first_element_in_bytes, 720 dst_ptr, 721 dst_stride_x, 722 dst_step_x, 723 dst_stride_y, 724 dst_step_y, 725 dst_stride_z, 726 dst_step_z, 727 dst_stride_w, 728 dst_step_w, 729 dst_offset_first_element_in_bytes, 730 _ISRC_WIDTH, 731 _ISRC_HEIGHT, 732 _INUM_TILES_X, 733 _INUM_TILES_Y); 734} 735#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) 736 737#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) 738//! @cond Doxygen_Suppress 739/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC 740 * 741 * @note Data layout supported: NHWC 742 * @note Data type supported: F32/F16 743 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 744 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 745 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 746 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 747 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 748 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 749 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 750 * 751 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 752 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 753 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 754 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 755 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 756 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 757 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 758 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 759 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 760 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 761 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 762 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 763 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 764 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 765 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 766 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 767 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 768 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 769 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 770 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 771 * @param[in] _ISRC_WIDTH The src tensor's width 772 * @param[in] _ISRC_HEIGHT The src tensor's height 773 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 774 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 775 */ 776//! @endcond 777__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc( 778 TENSOR4D(src, BUFFER), 779 TENSOR4D(dst, BUFFER), 780 const int _ISRC_WIDTH, 781 const int _ISRC_HEIGHT, 782 const int _INUM_TILES_X, 783 const int _INUM_TILES_Y) 784{ 785 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, 786 src_stride_x, 787 src_step_x, 788 src_stride_y, 789 src_step_y, 790 src_stride_z, 791 src_step_z, 792 src_stride_w, 793 src_step_w, 794 src_offset_first_element_in_bytes, 795 dst_ptr, 796 dst_stride_x, 797 dst_step_x, 798 dst_stride_y, 799 dst_step_y, 800 dst_stride_z, 801 dst_step_z, 802 dst_stride_w, 803 dst_step_w, 804 dst_offset_first_element_in_bytes, 805 _ISRC_WIDTH, 806 _ISRC_HEIGHT, 807 _INUM_TILES_X, 808 _INUM_TILES_Y); 809} 810#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) 811 812#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC) 813//! @cond Doxygen_Suppress 814/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC 815 * 816 * @note Data layout supported: NHWC 817 * @note Data type supported: F32/F16 818 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 819 * 820 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 821 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 822 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 823 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 824 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 825 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 826 * 827 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 828 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 829 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 830 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 831 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 832 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 833 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 834 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 835 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 836 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 837 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 838 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 839 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 840 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 841 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 842 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 843 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 844 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 845 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 846 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 847 * @param[in] _ISRC_WIDTH The src tensor's width 848 * @param[in] _ISRC_HEIGHT The src tensor's height 849 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 850 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 851 */ 852//! @endcond 853__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( 854 TENSOR4D(src, BUFFER), 855 TENSOR4D(dst, BUFFER), 856 const int _ISRC_WIDTH, 857 const int _ISRC_HEIGHT, 858 const int _INUM_TILES_X, 859 const int _INUM_TILES_Y) 860{ 861 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, 862 src_stride_x, 863 src_step_x, 864 src_stride_y, 865 src_step_y, 866 src_stride_z, 867 src_step_z, 868 src_stride_w, 869 src_step_w, 870 src_offset_first_element_in_bytes, 871 dst_ptr, 872 dst_stride_x, 873 dst_step_x, 874 dst_stride_y, 875 dst_step_y, 876 dst_stride_z, 877 dst_step_z, 878 dst_stride_w, 879 dst_step_w, 880 dst_offset_first_element_in_bytes, 881 _ISRC_WIDTH, 882 _ISRC_HEIGHT, 883 _INUM_TILES_X, 884 _INUM_TILES_Y); 885} 886#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC) 887 888#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC) 889//! @cond Doxygen_Suppress 890/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC 891 * 892 * @note Data layout supported: NHWC 893 * @note Data type supported: F32/F16 894 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 895 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 896 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 897 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 898 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 899 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 900 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 901 * 902 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 903 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 904 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 905 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 906 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 907 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 908 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 909 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 910 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 911 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 912 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 913 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 914 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 915 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 916 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 917 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 918 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 919 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 920 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 921 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 922 * @param[in] _ISRC_WIDTH The src tensor's width 923 * @param[in] _ISRC_HEIGHT The src tensor's height 924 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 925 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 926 */ 927//! @endcond 928__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( 929 TENSOR4D(src, BUFFER), 930 TENSOR4D(dst, BUFFER), 931 const int _ISRC_WIDTH, 932 const int _ISRC_HEIGHT, 933 const int _INUM_TILES_X, 934 const int _INUM_TILES_Y) 935{ 936 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, 937 src_stride_x, 938 src_step_x, 939 src_stride_y, 940 src_step_y, 941 src_stride_z, 942 src_step_z, 943 src_stride_w, 944 src_step_w, 945 src_offset_first_element_in_bytes, 946 dst_ptr, 947 dst_stride_x, 948 dst_step_x, 949 dst_stride_y, 950 dst_step_y, 951 dst_stride_z, 952 dst_step_z, 953 dst_stride_w, 954 dst_step_w, 955 dst_offset_first_element_in_bytes, 956 _ISRC_WIDTH, 957 _ISRC_HEIGHT, 958 _INUM_TILES_X, 959 _INUM_TILES_Y); 960} 961#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC) 962 963#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC) 964//! @cond Doxygen_Suppress 965/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC 966 * 967 * @note Data layout supported: NHWC 968 * @note Data type supported: F32/F16 969 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) 970 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) 971 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) 972 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 973 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 974 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 975 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time 976 * 977 * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 978 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) 979 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 980 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) 981 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 982 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image 983 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 984 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 985 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 986 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 987 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr 988 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 989 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 990 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 991 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 992 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 993 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 994 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 995 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 996 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 997 * @param[in] _ISRC_WIDTH The src tensor's width 998 * @param[in] _ISRC_HEIGHT The src tensor's height 999 * @param[in] _INUM_TILES_X The number of tiles in the X dimension 1000 * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension 1001 */ 1002//! @endcond 1003__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( 1004 TENSOR4D(src, BUFFER), 1005 TENSOR4D(dst, BUFFER), 1006 const int _ISRC_WIDTH, 1007 const int _ISRC_HEIGHT, 1008 const int _INUM_TILES_X, 1009 const int _INUM_TILES_Y) 1010{ 1011 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, 1012 src_stride_x, 1013 src_step_x, 1014 src_stride_y, 1015 src_step_y, 1016 src_stride_z, 1017 src_step_z, 1018 src_stride_w, 1019 src_step_w, 1020 src_offset_first_element_in_bytes, 1021 dst_ptr, 1022 dst_stride_x, 1023 dst_step_x, 1024 dst_stride_y, 1025 dst_step_y, 1026 dst_stride_z, 1027 dst_step_z, 1028 dst_stride_w, 1029 dst_step_w, 1030 dst_offset_first_element_in_bytes, 1031 _ISRC_WIDTH, 1032 _ISRC_HEIGHT, 1033 _INUM_TILES_X, 1034 _INUM_TILES_Y); 1035} 1036#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC) 1037#endif // defined(NHWC) 1038#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) 1039