xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/CpuIm2ColKernel.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2017-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/cpu/kernels/CpuIm2ColKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/ITensor.h"
29 #include "arm_compute/core/Size2D.h"
30 #include "arm_compute/core/TensorInfo.h"
31 #include "arm_compute/core/Types.h"
32 #include "arm_compute/core/Validate.h"
33 #include "src/core/CPP/Validate.h"
34 #include "src/core/helpers/AutoConfiguration.h"
35 #include "src/core/helpers/WindowHelpers.h"
36 
37 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
38 
39 #include <arm_neon.h>
40 #include <cstddef>
41 #include <cstdint>
42 #include <cstring>
43 #include <tuple>
44 
45 namespace arm_compute
46 {
47 using namespace misc::shape_calculator;
48 namespace cpu
49 {
50 namespace kernels
51 {
52 namespace
53 {
validate_arguments(const ITensorInfo * input,const ITensorInfo * output,const Size2D & kernel_dims,const PadStrideInfo & conv_info,bool has_bias,const Size2D & dilation,unsigned int num_groups)54 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
55                           bool has_bias, const Size2D &dilation, unsigned int num_groups)
56 {
57     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
58     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
59     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
60     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
61     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
62     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon");
63 
64     // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
65     const unsigned int width_idx    = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
66     const unsigned int height_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
67     const unsigned     total_width  = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
68     const unsigned     total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
69     ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
70 
71     if(output->total_size() > 0)
72     {
73         TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
74         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
75         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
76         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
77     }
78 
79     return Status{};
80 }
81 
// Copies one convolution window of an NCHW input volume into a contiguous (linearized)
// output column, as required by im2col. Out-of-bounds positions are filled with pad_value
// when has_pads is true. If has_bias is true, a trailing 1 is appended for the bias term.
//
// @tparam T        Element data type.
// @tparam has_pads True when the window may touch padded (out-of-bounds) positions.
template <typename T, bool has_pads>
inline void linearize_volume_nchw(const uint8_t *const in_ptr,     // Base pointer of the input tensor (byte-addressed)
                                  T                   *out_ptr,    // Destination for the linearized window
                                  bool                 has_bias,   // Append a trailing 1 for the bias if true
                                  int                  top_left_x, // Window origin (may be negative with padding)
                                  int                  top_left_y,
                                  int                  kernel_width,
                                  int                  kernel_height,
                                  int                  kernel_depth, // Number of input channels to linearize
                                  int                  input_w,
                                  int                  input_h,
                                  int                  input_stride_x, // Strides in bytes
                                  int                  input_stride_y,
                                  int                  input_stride_z,
                                  int                  pad_value, // Quantization offset (0 when not quantized)
                                  int                  dilation_x,
                                  int                  dilation_y)
{
    // Number of output elements produced per input channel (one kernel window).
    const int kernel_size2 = kernel_width * kernel_height;
    // One-past-the-end coordinates of the (dilated) window.
    const int x_e          = top_left_x + kernel_width * dilation_x;
    const int y_e          = top_left_y + kernel_height * dilation_y;

    // Linearize volume
    int d = 0;
    // This for loop linearize a volume with 3 slices. This allows:
    // 1) to reduce the iterations of the outer for loop "d"
    // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
    for(; d <= (kernel_depth - 3); d += 3)
    {
        for(int y = top_left_y; y < y_e; y += dilation_y)
        {
            if((y < 0 || y >= input_h) && has_pads)
            {
                // All the values will be the offset (will be zeros when not quantized)
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    // The three channel slices land kernel_size2 elements apart in the output.
                    *(out_ptr + 0 * kernel_size2) = pad_value;
                    *(out_ptr + 1 * kernel_size2) = pad_value;
                    *(out_ptr + 2 * kernel_size2) = pad_value;
                }
            }
            else
            {
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    if((x < 0 || x >= input_w) && has_pads)
                    {
                        *(out_ptr + 0 * kernel_size2) = pad_value;
                        *(out_ptr + 1 * kernel_size2) = pad_value;
                        *(out_ptr + 2 * kernel_size2) = pad_value;
                    }
                    else
                    {
                        // Gather the same (x, y) position from channels d, d+1, d+2.
                        *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                        *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                        *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                    }
                }
            }
        }
        // out_ptr was advanced through slice d element-by-element above; skip over the two
        // slices that were written via the +1/+2*kernel_size2 offsets.
        out_ptr += 2 * kernel_size2;
    }

    // Left over
    for(; d < kernel_depth; d++)
    {
        for(int y = top_left_y; y < y_e; y += dilation_y)
        {
            if((y < 0 || y >= input_h) && has_pads)
            {
                // All the values will be the offset (will be zeros when not quantized)
                // NOTE(review): memset fills byte-wise; correct for 1-byte quantized types and
                // for pad_value == 0 (the non-quantized case) — matches the callers' usage.
                memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
                out_ptr += kernel_width;
            }
            else
            {
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    if((x < 0 || x >= input_w) && has_pads)
                    {
                        *out_ptr = pad_value;
                    }
                    else
                    {
                        *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
                    }
                }
            }
        }
    }

    // Append 1 if the convolution layer has biases
    if(has_bias)
    {
        *out_ptr = static_cast<T>(1);
    }
}
179 
// Copies one convolution window of an NHWC input volume into a contiguous (linearized)
// output row. Rows fully inside the input with densely-packed channels are bulk-copied via
// memcpy; padded regions are filled with pad_value. If has_bias is true, a trailing 1 is
// appended for the bias term.
//
// @tparam T        Element data type.
// @tparam has_pads True when the window may touch padded (out-of-bounds) positions.
template <typename T, bool has_pads>
inline void linearize_volume_nhwc(const uint8_t *const in_ptr,  // Base pointer of the input tensor (byte-addressed)
                                  T                   *out_ptr, // Destination for the linearized window
                                  bool                 has_bias,
                                  int                  start_x, // Window origin (may be negative with padding)
                                  int                  start_y,
                                  int                  kernel_width,
                                  int                  kernel_height,
                                  int                  input_w,
                                  int                  input_h,
                                  int                  input_c,
                                  int                  input_stride_y, // Strides in bytes
                                  int                  input_stride_z,
                                  int                  pad_value, // Quantization offset (0 when not quantized)
                                  int                  dilation_x,
                                  int                  dilation_y)
{
    // One-past-the-end coordinates of the (dilated) window.
    const int end_x        = start_x + kernel_width * dilation_x;
    const int end_y        = start_y + kernel_height * dilation_y;
    // Elements produced per kernel row (all channels of kernel_width pixels).
    const int pad_quant    = kernel_width * input_c;
    const int element_size = static_cast<int>(sizeof(T));
    // Fast path: the whole window lies inside the input, x-dilation is 1, and channels are
    // densely packed, so every kernel row is one contiguous memcpy.
    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
    {
        for(int y = start_y; y < end_y; y += dilation_y)
        {
            //optimized for no dilation and no boundary pixels
            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
            out_ptr += input_c * kernel_width;
        }
    }
    else
    {
        for(int y = start_y; y < end_y; y += dilation_y)
        {
            if(y < 0 || y >= input_h)
            {
                // Entire kernel row is out of bounds: fill it with the padding value.
                // NOTE(review): memset fills byte-wise; correct for 1-byte quantized types and
                // for pad_value == 0 (the non-quantized case) — matches the callers' usage.
                memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
                out_ptr += pad_quant;
            }
            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
            {
                // General path: copy pixel-by-pixel with per-pixel boundary checks.
                for(int x = start_x; x < end_x; x += dilation_x)
                {
                    if(x < 0 || x >= input_w)
                    {
                        memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
                        out_ptr += input_c;
                    }
                    else
                    {
                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
                        out_ptr += input_c;
                    }
                }
            }
            else
            {
                //optimized for no dilation and no boundary pixels
                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
                out_ptr += input_c * kernel_width;
            }
        }
    }
    // Append 1 if the convolution layer has biases
    if(has_bias)
    {
        *out_ptr = static_cast<T>(1);
    }
}
249 } // namespace
250 
/** Templated im2col worker: linearizes every convolution window of @p src into @p dst.
 *
 * The window iterates one step per convolution output position; the inner linearize_volume_*
 * helpers write one full column/row of the im2col matrix per step.
 *
 * @tparam T        Element data type.
 * @tparam has_pads True when the convolution uses padding (boundary checks enabled).
 * @tparam is_nchw  True for NCHW input layout, false for NHWC.
 */
template <typename T, bool has_pads, bool is_nchw>
void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);

    const int input_w        = src->info()->dimension(width_idx);
    const int input_h        = src->info()->dimension(height_idx);
    const int input_c        = src->info()->dimension(channel_idx);
    const int input_stride_x = src->info()->strides_in_bytes().x();
    const int input_stride_y = src->info()->strides_in_bytes().y();
    const int input_stride_z = src->info()->strides_in_bytes().z();
    const int pad_left       = _conv_info.pad_left();
    const int pad_top        = _conv_info.pad_top();
    const int stride_x       = _conv_info.stride().first;
    const int stride_y       = _conv_info.stride().second;
    // Quantized tensors pad with the zero-point offset so padding dequantizes to zero.
    const int pad_value      = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;

    Window window_in_out(window);
    // The first three dimensions of the input and output are increased by the inner loops
    window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
    window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // Create iterators
    Iterator in(src, window_in_out);
    Iterator out(dst, window_in_out);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        // Top-left corner of the current convolution window in input coordinates
        // (may be negative when padding is in use).
        const int start_w = id[width_idx] * stride_x - pad_left;
        const int start_h = id[height_idx] * stride_y - pad_top;

        // Get pointers
        const uint8_t *const input_ptr  = in.ptr();
        // Each convolution position (flattened as w + h * convolved_width) maps to one row
        // of the im2col output.
        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());

        // Linearize volume
        if(is_nchw)
        {
            linearize_volume_nchw<T, has_pads>(input_ptr,
                                               output_ptr,
                                               _has_bias,
                                               start_w,
                                               start_h,
                                               _kernel_width,
                                               _kernel_height,
                                               input_c,
                                               input_w,
                                               input_h,
                                               input_stride_x,
                                               input_stride_y,
                                               input_stride_z,
                                               pad_value,
                                               _dilation.x(),
                                               _dilation.y());
        }
        else
        {
            linearize_volume_nhwc<T, has_pads>(input_ptr,
                                               output_ptr,
                                               _has_bias,
                                               start_w,
                                               start_h,
                                               _kernel_width,
                                               _kernel_height,
                                               input_w,
                                               input_h,
                                               input_c,
                                               input_stride_y,
                                               input_stride_z,
                                               pad_value,
                                               _dilation.x(),
                                               _dilation.y());
        }
    },
    in, out);
}
333 
configure(const ITensorInfo * src,ITensorInfo * dst,const Size2D & kernel_dims,const PadStrideInfo & conv_info,bool has_bias,const Size2D & dilation,unsigned int num_groups)334 void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
335                                 bool has_bias, const Size2D &dilation, unsigned int num_groups)
336 {
337     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
338     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
339     ARM_COMPUTE_UNUSED(num_groups);
340 
341     _data_layout                   = src->data_layout();
342     const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
343     const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
344     const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
345 
346     _conv_info      = conv_info;
347     _kernel_width   = kernel_dims.width;
348     _kernel_height  = kernel_dims.height;
349     _dilation       = dilation;
350     _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx),
351                                         _kernel_width, _kernel_height,
352                                         _conv_info, _dilation);
353     _has_bias = has_bias;
354 
355     if(_data_layout == DataLayout::NCHW)
356     {
357         switch(src->data_type())
358         {
359             case DataType::F32:
360                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
361                 break;
362 #if defined(ARM_COMPUTE_ENABLE_BF16)
363             case DataType::BFLOAT16:
364                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
365                 break;
366 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
367 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
368             case DataType::F16:
369                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
370                 break;
371 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
372             case DataType::QASYMM8_SIGNED:
373             case DataType::QASYMM8:
374                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
375                 break;
376             default:
377                 ARM_COMPUTE_ERROR("Data type not supported");
378                 break;
379         }
380     }
381     else
382     {
383         switch(src->data_type())
384         {
385             case DataType::F32:
386                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
387                 break;
388 #if defined(ARM_COMPUTE_ENABLE_BF16)
389             case DataType::BFLOAT16:
390                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
391                 break;
392 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
393 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
394             case DataType::F16:
395                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
396                 break;
397 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
398             case DataType::QASYMM8:
399                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
400                 break;
401             case DataType::QASYMM8_SIGNED:
402                 _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
403                 break;
404             default:
405                 ARM_COMPUTE_ERROR("Data type not supported");
406                 break;
407         }
408     }
409 
410     // Output tensor auto initialization if not yet initialized
411     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false)));
412 
413     std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
414                                                                              kernel_dims.width, kernel_dims.height,
415                                                                              conv_info, dilation);
416 
417     Window win = calculate_max_window(*src, Steps());
418     win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
419     win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
420     win.set(channel_idx, Window::Dimension(0, 1, 1));
421     // Configure kernel window
422     ICpuKernel::configure(win);
423 }
424 
validate(const ITensorInfo * src,const ITensorInfo * dst,const Size2D & kernel_dims,const PadStrideInfo & conv_info,bool has_bias,const Size2D & dilation,unsigned int num_groups)425 Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
426                                  bool has_bias, const Size2D &dilation, unsigned int num_groups)
427 {
428     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
429     return Status{};
430 }
431 
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)432 void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
433 {
434     ARM_COMPUTE_UNUSED(info);
435     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
436     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
437 
438     auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
439     auto dst = tensors.get_tensor(TensorType::ACL_DST);
440     (this->*_func)(src, dst, window);
441 }
// Returns the kernel's identifier string.
const char *CpuIm2ColKernel::name() const
{
    return "CpuIm2ColKernel";
}
446 
get_mws(const CPUInfo & platform,size_t thread_count) const447 size_t CpuIm2ColKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
448 {
449     ARM_COMPUTE_UNUSED(thread_count);
450     ARM_COMPUTE_UNUSED(platform);
451 
452     return ICPPKernel::default_mws;
453 }
454 } // namespace kernels
455 } // namespace cpu
456 } // namespace arm_compute
457