1/* 2 * Copyright (c) 2018-2021 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "helpers.h" 25#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) 26 27#if ELEMENT_SIZE == 1 28#define COND_DATA_TYPE char 29#elif ELEMENT_SIZE == 2 30#define COND_DATA_TYPE short 31#elif ELEMENT_SIZE == 4 32#define COND_DATA_TYPE int 33#else // ELEMENT_SIZE 34#error "Element size not support" 35#endif // ELEMENT_SIZE 36 37#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) 38/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW 39 * 40 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float 41 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. 
-DCONVOLVED_WIDTH=34 42 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 43 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 44 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 45 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 46 * 47 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 48 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 49 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 50 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 51 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 52 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 53 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 54 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 55 * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col1x1_stridex1_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    // Each work-item emits 4 consecutive convolution positions (kernel 1x1, stride_x == 1).
    const uint xc    = get_global_id(0) * 4;         // first x coordinate in the convolved tensor
    const uint yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const uint ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const uint batch = get_global_id(2) / SRC_DEPTH; // batch index

    // Candidate x coordinates of the 4 lanes; lanes past SRC_WIDTH are clamped
    // back to "xc", which is guaranteed to be valid.
    uint4 xs = xc + (uint4)(0, 1, 2, 3);

    // Per-lane validity mask. COND_DATA_TYPE has the same element size as
    // DATA_TYPE so the mask can drive select() on the loaded data below.
    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) valid = CONVERT((xs < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

    xs = select((uint4)xc, xs, convert_int4(valid));

    // Input coordinates
    const uint xi = xc;
    const uint yi = yc * STRIDE_Y;

    // Output coordinates
#if defined(NUM_GROUPS)
    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const uint xo = ch;
#endif // defined(NUM_GROUPS)
    const uint4 yo = xs + yc * CONVOLVED_WIDTH; // output row of each convolution position

    // Resolve the source and destination base addresses
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    VEC_DATA_TYPE(DATA_TYPE, 4)
    data = vload4(0, (__global DATA_TYPE *)src_addr);

    // Out-of-bound lanes are overwritten with the first (always valid) element
    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, valid);

    *(__global DATA_TYPE *)(dst_addr + yo.s0 * dst_stride_y) = data.s0;
    *(__global DATA_TYPE *)(dst_addr + yo.s1 * dst_stride_y) = data.s1;
    *(__global DATA_TYPE *)(dst_addr + yo.s2 * dst_stride_y) = data.s2;
    *(__global DATA_TYPE *)(dst_addr + yo.s3 * dst_stride_y) = data.s3;

#ifdef HAS_BIAS
#if defined(NUM_GROUPS)
    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        // Append the bias column (constant 1) at the end of each output row
        *((__global DATA_TYPE *)(dst_addr + yo.s0 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(dst_addr + yo.s1 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(dst_addr + yo.s2 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(dst_addr + yo.s3 * dst_stride_y) + 1) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH)
&& defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
#if defined(DILATION_X) && defined(DILATION_Y)
/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH
 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM
 * @note The value stored for out-of-bounds loads must be passed at compile time using -DPAD_VALUE
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y
 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col_generic_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch index

    // Top-left corner of the receptive field in the input (may be negative with padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Output coordinates: each channel contributes KERNEL_WIDTH * KERNEL_HEIGHT columns
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // index of the convolution position

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global DATA_TYPE *dst_elem = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
#else  // defined(NUM_GROUPS)
    __global DATA_TYPE *dst_elem = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
#endif // defined(NUM_GROUPS)

    // Walk the kernel window row by row and linearize it into the output row
    for(int ky = 0; ky < KERNEL_HEIGHT; ++ky)
    {
        const int y = yi + ky * DILATION_Y;
        for(int kx = 0; kx < KERNEL_WIDTH; ++kx, ++dst_elem)
        {
            const int x = xi + kx * DILATION_X;
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            *dst_elem = *((__global DATA_TYPE *)(src_addr + x * src_stride_x + y * src_stride_y));
#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // In-bounds -> copy the input element; out-of-bounds -> emit the pad value
            const bool inside = (x >= 0) && (x < SRC_WIDTH) && (y >= 0) && (y < SRC_HEIGHT);
            *dst_elem = inside ? *((__global DATA_TYPE *)(src_addr + x * src_stride_x + y * src_stride_y)) : (DATA_TYPE)PAD_VALUE;
#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
        }
    }

#ifdef HAS_BIAS
#if defined(NUM_GROUPS)
    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        // dst_elem now points one past the last kernel element: the bias column
        *dst_elem = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(DILATION_X) && defined(DILATION_Y)

/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g.
-DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col3x3_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch index

    // Top-left corner of the 3x3 receptive field (may be negative with padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Output coordinates: 9 columns per input channel
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 9; // 3x3
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // index of the convolution position

    // The (int) casts on the x/y strides are required: xi and yi can be negative
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Load the three 3-element rows of the receptive field
    VEC_DATA_TYPE(DATA_TYPE, 3)
    top = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    mid = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    bot = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Replace every out-of-bound lane with PAD_VALUE
    const int3 x = (int3)xi + (int3)(0, 1, 2);
    const int3 y = (int3)yi + (int3)(0, 1, 2);

    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    mask_top = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    mask_mid = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    mask_bot = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

    top = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, top, mask_top);
    mid = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, mid, mask_mid);
    bot = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, bot, mask_bot);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Linearize the 9 values: 8 with a vector store plus one trailing scalar
    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(top.s012, mid.s012, bot.s01), 0, (__global DATA_TYPE *)dst_addr);
    *((__global DATA_TYPE *)dst_addr + 8) = bot.s2;

#ifdef HAS_BIAS
#if defined(NUM_GROUPS)
    if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        // Append the bias column (constant 1) after the 9 kernel elements
        *((__global DATA_TYPE *)dst_addr + 9) = 1.0f;
    }
#endif // HAS_BIAS
}

/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g.
-DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col5x5_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices (top-left corner of the 5x5 window; can be negative with padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices: 25 columns per input channel
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else // defined(NUM_GROUPS)
    const int xo = ch * 25; // 5x5
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Put 0 if the value is out-of-bound
    // x0/y0 cover window offsets 0..3; x1/y1 cover the 5th offset (4)
    int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
    int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
    int x1 = xi + 4;
    int y1 = yi + 4;

    // Check if we could have out-of-bounds elements in the x direction
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
    COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Get input and output address (the (int) casts matter: xi/yi may be negative)
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Rows 0 and 1 of the window (each row: 4 vector elements + 1 scalar)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x; // advance past the 10 elements just written
    }

    // Rows 2 and 3 of the window
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x;
    }

    // Row 4 (last row) of the window; uses y1_condition
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
        *((__global DATA_TYPE *)output_ptr + 4) = row01;

        output_ptr += 5 * dst_stride_x; // output_ptr now points at element 25 (bias slot)
    }

#ifdef HAS_BIAS
#if defined(NUM_GROUPS)
    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
#else // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        // Append the bias column (constant 1) after the 25 kernel elements
        *((__global DATA_TYPE *)(output_ptr)) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
/** This
opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW 544 * 545 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float 546 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 547 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 548 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 549 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 550 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 551 * 552 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 553 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 554 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 555 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 556 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 557 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 558 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 559 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 560 * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col11x11_padx0_pady0_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices
    const int xi = xc * STRIDE_X;
    const int yi = yc * STRIDE_Y;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 121; // 11x11
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Linearize one 11-element row of the 11x11 receptive field per iteration:
    // load 8 + 3 contiguous elements from the current input row and append them
    // to the output row. The original manually unrolled these 11 identical
    // copies; a counted loop with an unroll hint produces the same code while
    // removing the duplication.
#pragma unroll
    for(int i = 0; i < 11; ++i)
    {
        VEC_DATA_TYPE(DATA_TYPE, 8)
        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
        VEC_DATA_TYPE(DATA_TYPE, 3)
        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

        vstore8(row00, 0, (__global DATA_TYPE *)output_ptr);
        vstore3(row01, 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        // NOTE(review): the original advanced the output pointer with
        // src_stride_x; dst_stride_x is the semantically correct stride and is
        // what the other NCHW im2col kernels in this file use. Both strides
        // equal ELEMENT_SIZE for valid src/dst configurations, so the generated
        // addresses are unchanged.
        output_ptr += 11 * dst_stride_x;
    }

#ifdef HAS_BIAS
    // Append the bias multiplier (1) at the end of the row for the last input channel.
#if defined(NUM_GROUPS)
    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // 
defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) 761 762#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 763/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW 764 * 765 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. 766 * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. 767 * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. 768 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 769 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 770 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 771 * 772 * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 773 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 774 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 775 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 776 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 777 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 778 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 779 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 780 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 781 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 782 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 783 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 784 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 785 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 786 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 787 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 788 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 789 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
790 */ 791__kernel void im2col_generic_padx0_pady0_nchw( 792 TENSOR3D_DECLARATION(src), 793#if defined(NUM_GROUPS) 794 TENSOR3D_DECLARATION(dst), 795#else // defined(NUM_GROUPS) 796 IMAGE_DECLARATION(dst), 797#endif // defined(NUM_GROUPS) 798 uint src_stride_w, 799 uint dst_stride_w) 800{ 801 const int xc = get_global_id(0); // x coordinate in the convolved tensor 802 const int yc = get_global_id(1); // y coordinate in the convolved tensor 803 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map 804 const int batch = get_global_id(2) / SRC_DEPTH; // batch size 805 806 // Calculate input indices 807 const int xi = xc * STRIDE_X; 808 const int yi = yc * STRIDE_Y; 809 810 // Calculate output indices 811#if defined(NUM_GROUPS) 812 const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; 813 const int zo = ch / (SRC_DEPTH / NUM_GROUPS); 814#else // defined(NUM_GROUPS) 815 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; 816#endif // defined(NUM_GROUPS) 817 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution 818 819 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; 820#if defined(NUM_GROUPS) 821 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; 822#else // defined(NUM_GROUPS) 823 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; 824#endif // defined(NUM_GROUPS) 825 826 // Linearize convolution elements 827 for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) 828 { 829 int last_x = 0; 830 for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) 831 { 832 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) 833 row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * 
src_stride_y)); 834 VSTORE(VECTOR_SIZE) 835 (row, 0, output_ptr); 836 last_x = x; 837 } 838 // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). 839 // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. 840#if WIDTH_MOD_VECTOR_SIZE == 1 841 *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); 842#elif WIDTH_MOD_VECTOR_SIZE > 1 843 VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE) 844 row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); 845 VSTORE(WIDTH_MOD_VECTOR_SIZE) 846 (row, 0, output_ptr); 847#endif /* WIDTH_MOD_VECTOR_SIZE */ 848 output_ptr += WIDTH_MOD_VECTOR_SIZE; 849 } /* End of loop over KERNEL_HEIGHT */ 850 851#ifdef HAS_BIAS 852#if defined(NUM_GROUPS) 853 if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) 854#else // defined(NUM_GROUPS) 855 if(ch == (SRC_DEPTH - 1)) 856#endif // defined(NUM_GROUPS) 857 { 858 *output_ptr = 1.0f; 859 } 860#endif // HAS_BIAS 861} 862#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 863#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)