xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/pool2d/neon/nchw/all.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
#include <cmath>
#include <limits>
32 
33 #ifdef ENABLE_NCHW_KERNELS
34 namespace arm_compute
35 {
36 namespace cpu
37 {
/* Boundary-aware fp32 loads.
 *
 * All macros share the same argument list:
 *   height, width      - extent of the valid (unpadded) region
 *   pad_left, pad_top  - padding in front of the valid region
 *   x, y               - coordinates of the first lane, expressed in padded space
 *   ptr                - address that corresponds to (x, y)
 *   fval               - value substituted for lanes that fall outside the valid region
 *
 * Every argument and every replacement list is parenthesized so that the macros
 * expand correctly whatever expression is passed in and wherever the result is used.
 */

// Lane 0 is always taken from ptr[0]. Lane 1 is taken from ptr[1] unless x is the last valid
// column (width + pad_left - 1), in which case lane 1 is fval and ptr[1] is not read.
#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    (((x) == ((width) + (pad_left) - 1)) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr))

// When x is the column just before the first valid one (pad_left - 1), lane 0 is fval and
// lane 1 is ptr[1]; otherwise defer to the right-boundary variant.
#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    (((x) == ((pad_left) - 1)) ? vset_lane_f32(*(1 + (ptr)), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval))

// Returns a vector of fval when both lanes are outside the valid region (row out of range,
// x < pad_left - 1, or x > width + pad_left - 1); otherwise performs a partial or full load.
#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    ((((y) < (pad_top)) || ((x) < ((pad_left) - 1)) || ((y) >= ((height) + (pad_top))) || ((x) > ((width) + (pad_left) - 1))) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval))

// Four lanes built from two boundary-aware two-lane loads at x and x + 2.
#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)           \
    vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
                 READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, ((x) + 2), y, ((ptr) + 2), fval))
48 
read_8_boundary_aware(int height,int width,int pad_left,int pad_top,int x,int y,const float * ptr,float fval)49 float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
50 {
51     float32x4x2_t vec;
52     vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
53     vec.val[1] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 4), y, (ptr + 4), fval);
54     return vec;
55 }
56 
57 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
58 
read_4_boundary_aware_fp16(int srcw,int srch,int pad_l,int pad_t,int x,int y,const float16_t * ptr,float16_t fval)59 float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
60 {
61     float16_t  vec[4];
62     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
63     for(int i = 0; i < 4; i++)
64     {
65         if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
66         {
67             vec[i] = *(ptr + i);
68         }
69         else
70         {
71             vec[i] = fval;
72         }
73     }
74     return wrapper::vload(vec);
75 }
76 
pooling3_fp16_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)77 void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
78 {
79     ARM_COMPUTE_UNUSED(dst1);
80 
81     Iterator in(src, window_src);
82     Iterator out(dst0, window);
83 
84     constexpr const int pool_size       = 3;
85     const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
86     const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
87     const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
88     const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
89     int                 pool_stride_x   = 0;
90     int                 pool_stride_y   = 0;
91     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
92     const int                  src_w          = src->info()->dimension(0);
93     const int                  src_h          = src->info()->dimension(1);
94     const int                  upper_bound_w  = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
95     const int                  upper_bound_h  = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
96     const float16_t            fp16_min       = -std::numeric_limits<half_float::half>::infinity();
97     const float16_t            fill_value     = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
98     const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
99     const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
100     const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
101 
102     execute_window_loop(window, [&](const Coordinates & id)
103     {
104         const auto  x_val    = id.x() * pool_stride_x;
105         const auto  y_val_0  = id.y() * pool_stride_y;
106         const auto  y_val_1  = (id.y() * pool_stride_y) + 1;
107         const auto  y_val_2  = (id.y() * pool_stride_y) + 2;
108         float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
109                                                           x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
110         float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
111                                                              x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
112         float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
113                                                              x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
114         float16x4_t res = {};
115 
116         // Get power of 2 in case of l2 pooling
117         if(pool_info.pool_type == PoolingType::L2)
118         {
119             top_data    = vmul_f16(top_data, top_data);
120             middle_data = vmul_f16(middle_data, middle_data);
121             bottom_data = vmul_f16(bottom_data, bottom_data);
122         }
123 
124         if(pool_info.pool_type != PoolingType::MAX)
125         {
126             // Calculate scale
127             const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
128                                                     pool_stride_y);
129             const float16x4_t scale_v = vdup_n_f16(scale);
130             // Perform pooling
131             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
132             res                        = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
133             res                        = vmul_f16(vpadd_f16(res, res), scale_v);
134         }
135         else
136         {
137             const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
138             res                        = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
139             res                        = vpmax_f16(res, res);
140         }
141 
142         // Calculate square-root in case of l2 pooling
143         if(pool_info.pool_type == PoolingType::L2)
144         {
145             res = vsqrt_f16(res);
146         }
147 
148         *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
149     },
150     in, out);
151 }
152 
153 template <typename T>
154 inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t in)155 f16_to_f32(float16x4_t in)
156 {
157     float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
158     return out;
159 }
160 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
161 
162 template <typename T>
163 inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
f16_to_f32(float32x2_t in)164 f16_to_f32(float32x2_t in)
165 {
166     return in;
167 }
168 
169 template <typename T>
read_2_boundary_aware(int srcw,int srch,int pad_l,int pad_t,int x,int y,const T * ptr,T fval)170 auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval)
171 {
172     T          vec[2];
173     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
174     for(int i = 0; i < 2; i++)
175     {
176         if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
177         {
178             vec[i] = *(ptr + i);
179         }
180         else
181         {
182             vec[i] = fval;
183         }
184     }
185     return wrapper::vload(vec);
186 }
187 
188 template <typename T>
pooling2_nchw_maxpool_indices(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)189 void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
190 {
191     Iterator  in(src, window_src);
192     Iterator  out(dst0, window);
193     Iterator  indices(dst1, window);
194     const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
195     const int pool_pad_left = pool_info.pad_stride_info.pad_left();
196     int       pool_stride_x = 0;
197     int       pool_stride_y = 0;
198     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
199     const int            src_w          = src->info()->dimension(0);
200     const int            src_h          = src->info()->dimension(1);
201     const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
202     const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
203     const int            pad_left       = src->info()->padding().left;
204     const int            pad_right      = src->info()->padding().right;
205     const int            in_stride_y    = static_cast<int>(src->info()->strides_in_bytes().y());
206     constexpr T          float_min      = -std::numeric_limits<float>::infinity();
207     const T              fill_value     = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
208 
209     execute_window_loop(window, [&](const Coordinates & id)
210     {
211         const auto x_val    = id.x() * pool_stride_x;
212         const auto y_val_0  = id.y() * pool_stride_y;
213         const auto y_val_1  = (id.y() * pool_stride_y) + 1;
214         auto       top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
215                                                     x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
216         auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
217                                                  x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
218         float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
219         float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
220 
221         // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
222         const float32x2_t max_data_top      = vpmax_f32(top_data_f32, top_data_f32);
223         const float32x2_t max_data_bottom   = vpmax_f32(bottom_data_f32, bottom_data_f32);
224         const float32x2_t max_data          = vmax_f32(max_data_top, max_data_bottom);
225         *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
226 
227         // Calculate max data indice, which will be used in max unpool.
228         const uint32_t   offset_base              = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
229         const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
230         const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
231         const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
232         const uint32x2_t voffset_bottom           = { offset_bottom, offset_bottom + 1u };
233         const uint32x2_t tmp_indices_top          = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
234         const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
235         *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
236     },
237     in, out, indices);
238 }
239 
240 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
pooling2_fp16_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)241 void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
242 {
243     if(pool_info.pool_type == PoolingType::MAX && dst1)
244     {
245         pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
246     }
247     else
248     {
249         Iterator      in(src, window_src);
250         Iterator      out(dst0, window);
251         constexpr int pool_size       = 2;
252         const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
253         const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
254         const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
255         const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
256         int           pool_stride_x, pool_stride_y = 0;
257         std::tie(pool_stride_x, pool_stride_y)     = pool_info.pad_stride_info.stride();
258         const int       src_w         = src->info()->dimension(0);
259         const int       src_h         = src->info()->dimension(1);
260         const int       upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
261         const int       upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
262         const float16_t fp16_min      = -std::numeric_limits<half_float::half>::infinity();
263         const float16_t fill_value    = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
264 
265         const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
266         const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
267 
268         execute_window_loop(window, [&](const Coordinates & id)
269         {
270             const auto in_top_ptr    = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
271             const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
272 
273             const auto  x_val    = id.x() * pool_stride_x;
274             const auto  y_val_0  = id.y() * pool_stride_y;
275             const auto  y_val_1  = (id.y() * pool_stride_y) + 1;
276             float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
277                                                               x_val, y_val_0, in_top_ptr, fill_value);
278             float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
279                                                                  x_val, y_val_1, in_bottom_ptr, fill_value);
280             float16x4_t res = {};
281 
282             // Get power of 2 in case of l2 pooling
283             if(pool_info.pool_type == PoolingType::L2)
284             {
285                 top_data    = vmul_f16(top_data, top_data);
286                 bottom_data = vmul_f16(bottom_data, bottom_data);
287             }
288 
289             if(pool_info.pool_type != PoolingType::MAX)
290             {
291                 const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
292                                                         pool_stride_y);
293                 const float16x4_t scale_v = vdup_n_f16(scale);
294 
295                 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
296                 res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
297             }
298             else
299             {
300                 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
301                 res                        = vpmax_f16(max_data, max_data);
302             }
303 
304             // Calculate square-root in case of l2 pooling
305             if(pool_info.pool_type == PoolingType::L2)
306             {
307                 res = vsqrt_f16(res);
308             }
309 
310             // Store result
311             *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
312         },
313         in, out);
314     }
315 }
316 
poolingMxN_fp16_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)317 void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
318 {
319     ARM_COMPUTE_UNUSED(dst1);
320     Iterator in(src, window_src);
321     Iterator out(dst0, window);
322 
323     const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
324     const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
325     const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
326     const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
327     const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
328     const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
329     int       pool_stride_x   = 0;
330     int       pool_stride_y   = 0;
331     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
332     const int       src_w         = src->info()->dimension(0);
333     const int       src_h         = src->info()->dimension(1);
334     const int       upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
335     const int       upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
336     const float16_t fp16_min      = -std::numeric_limits<half_float::half>::infinity();
337     const float16_t fill_value    = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
338 
339     execute_window_loop(window, [&](const Coordinates & id)
340     {
341         float16_t res = 0.0f;
342 
343         if(pool_info.pool_type != PoolingType::MAX)
344         {
345             // Calculate scale
346             const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
347                                                         pool_stride_y);
348 
349             // Perform pooling
350             for(int y = 0; y < pool_size_y; ++y)
351             {
352                 for(int x = 0; x < pool_size_x; ++x)
353                 {
354                     const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
355                                                                          + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
356 
357                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
358                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
359                     float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
360 
361                     if(pool_info.pool_type == PoolingType::L2)
362                     {
363                         data *= data;
364                     }
365 
366                     res += data;
367                 }
368             }
369 
370             // Divide by scale
371             res *= scale;
372         }
373         else // if max pooling
374         {
375             res = fp16_min;
376 
377             for(int y = 0; y < pool_size_y; ++y)
378             {
379                 for(int x = 0; x < pool_size_x; ++x)
380                 {
381                     const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
382                                                                          + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
383 
384                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
385                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
386                     float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
387                     res            = std::max(res, data);
388                 }
389             }
390         }
391 
392         // Calculate square-root in case of l2 pooling
393         if(pool_info.pool_type == PoolingType::L2)
394         {
395             res = std::sqrt(res);
396         }
397 
398         // Store result
399         *(reinterpret_cast<float16_t *>(out.ptr())) = res;
400     },
401     in, out);
402 }
403 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
404 
poolingMxN_fp32_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)405 void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
406 {
407     ARM_COMPUTE_UNUSED(dst1);
408     Iterator in(src, window_src);
409     Iterator out(dst0, window);
410 
411     const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
412     const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
413     const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
414     const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
415     const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
416     const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
417     int       pool_stride_x   = 0;
418     int       pool_stride_y   = 0;
419     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
420     const int   src_w         = src->info()->dimension(0);
421     const int   src_h         = src->info()->dimension(1);
422     const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
423     const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
424     const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
425 
426     execute_window_loop(window, [&](const Coordinates & id)
427     {
428         float res = 0.0f;
429 
430         if(pool_info.pool_type != PoolingType::MAX)
431         {
432             // Calculate scale
433             const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
434                                                     pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
435 
436             // Perform pooling
437             for(int y = 0; y < pool_size_y; ++y)
438             {
439                 for(int x = 0; x < pool_size_x; ++x)
440                 {
441                     const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
442                                                                      + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
443 
444                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
445                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
446                     float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
447 
448                     if(pool_info.pool_type == PoolingType::L2)
449                     {
450                         data *= data;
451                     }
452 
453                     res += data;
454                 }
455             }
456 
457             // Divide by scale
458             res *= scale;
459         }
460         else // if max pooling
461         {
462             res = -std::numeric_limits<float>::infinity();
463 
464             for(int y = 0; y < pool_size_y; ++y)
465             {
466                 for(int x = 0; x < pool_size_x; ++x)
467                 {
468                     const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
469                                                                      + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
470 
471                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
472                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
473                     float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
474                     res            = std::max(res, data);
475                 }
476             }
477         }
478 
479         // Calculate square-root in case of l2 pooling
480         if(pool_info.pool_type == PoolingType::L2)
481         {
482             res = std::sqrt(res);
483         }
484 
485         // Store result
486         *(reinterpret_cast<float *>(out.ptr())) = res;
487     },
488     in, out);
489 }
490 
pooling2_fp32_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)491 void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
492 {
493     if(pool_info.pool_type == PoolingType::MAX && dst1)
494     {
495         pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
496     }
497     else
498     {
499         Iterator      in(src, window_src);
500         Iterator      out(dst0, window);
501         constexpr int pool_size       = 2;
502         const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
503         const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
504         const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
505         const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
506         int           pool_stride_x   = 0;
507         int           pool_stride_y   = 0;
508         std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
509         const int   src_w         = src->info()->dimension(0);
510         const int   src_h         = src->info()->dimension(1);
511         const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
512         const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
513         const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
514 
515         const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
516         const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
517 
518         execute_window_loop(window, [&](const Coordinates & id)
519         {
520             const auto in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
521             const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
522 
523             const auto  x_val       = id.x() * pool_stride_x;
524             const auto  y_val_0     = id.y() * pool_stride_y;
525             const auto  y_val_1     = (id.y() * pool_stride_y) + 1;
526             auto        top_data    = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
527             auto        bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value);
528             float32x2_t res         = {};
529             float       final_res   = 0;
530 
531             // Get power of 2 in case of l2 pooling
532             if(pool_info.pool_type == PoolingType::L2)
533             {
534                 top_data    = vmul_f32(top_data, top_data);
535                 bottom_data = vmul_f32(bottom_data, bottom_data);
536             }
537 
538             if(pool_info.pool_type != PoolingType::MAX)
539             {
540                 // Calculate scale
541                 float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
542                                                   pool_stride_y);
543                 const float32x2_t scale_v = vdup_n_f32(scale);
544 
545                 // Perform pooling
546                 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
547                 res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
548             }
549             else
550             {
551                 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
552                 res                        = vpmax_f32(max_data, max_data);
553             }
554             final_res = vget_lane_f32(res, 0);
555 
556             // Calculate square-root in case of l2 pooling
557             if(pool_info.pool_type == PoolingType::L2)
558             {
559                 final_res = sqrt(final_res);
560             }
561 
562             // Store result
563             *(reinterpret_cast<float *>(out.ptr())) = final_res;
564         },
565         in, out);
566     }
567 }
568 
pooling3_fp32_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)569 void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
570 {
571     ARM_COMPUTE_UNUSED(dst1);
572     Iterator in(src, window_src);
573     Iterator out(dst0, window);
574 
575     constexpr const int pool_size       = 3;
576     const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
577     const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
578     const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
579     const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
580     int                 pool_stride_x   = 0;
581     int                 pool_stride_y   = 0;
582     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
583     const int   src_w         = src->info()->dimension(0);
584     const int   src_h         = src->info()->dimension(1);
585     const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
586     const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
587     const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
588 
589     const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
590     const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
591     const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
592 
593     execute_window_loop(window, [&](const Coordinates & id)
594     {
595         const auto in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
596         const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
597         const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
598 
599         const auto x_val       = id.x() * pool_stride_x;
600         const auto y_val_0     = id.y() * pool_stride_y;
601         const auto y_val_1     = (id.y() * pool_stride_y) + 1;
602         const auto y_val_2     = (id.y() * pool_stride_y) + 2;
603         auto       top_data    = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
604         auto       middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value);
605         auto       bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value);
606 
607         float32x2_t res       = {};
608         float       final_res = 0;
609 
610         // Get power of 2 in case of l2 pooling
611         if(pool_info.pool_type == PoolingType::L2)
612         {
613             top_data    = vmulq_f32(top_data, top_data);
614             middle_data = vmulq_f32(middle_data, middle_data);
615             bottom_data = vmulq_f32(bottom_data, bottom_data);
616         }
617 
618         if(pool_info.pool_type != PoolingType::MAX)
619         {
620             // Calculate scale
621             float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
622                                               pool_stride_y);
623             const float32x2_t scale_v = vdup_n_f32(scale);
624 
625             // Perform pooling
626             const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
627             res                        = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
628             res                        = vmul_f32(vpadd_f32(res, res), scale_v);
629         }
630         else
631         {
632             const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
633             res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), max_data, 3)), vget_low_f32(max_data));
634             res                        = vpmax_f32(res, res);
635         }
636         final_res = vget_lane_f32(res, 0);
637 
638         // Calculate square-root in case of l2 pooling
639         if(pool_info.pool_type == PoolingType::L2)
640         {
641             final_res = sqrt(final_res);
642         }
643 
644         // Store result
645         *(reinterpret_cast<float *>(out.ptr())) = final_res;
646     },
647     in, out);
648 }
649 
pooling7_fp32_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)650 void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
651 {
652     ARM_COMPUTE_UNUSED(dst1);
653     Iterator in(src, window_src);
654     Iterator out(dst0, window);
655 
656     constexpr const int pool_size       = 7;
657     const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
658     const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
659     const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
660     const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
661     int                 pool_stride_x   = 0;
662     int                 pool_stride_y   = 0;
663     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
664     const int   src_w         = src->info()->dimension(0);
665     const int   src_h         = src->info()->dimension(1);
666     const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
667     const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
668     const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
669 
670     std::array<const uint8_t *, pool_size> src_ptrs{ {} };
671     for(int i = 0; i < pool_size; ++i)
672     {
673         src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
674     }
675 
676     execute_window_loop(window, [&](const Coordinates & id)
677     {
678         auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
679 
680         auto          x_val = id.x() * pool_stride_x;
681         auto          y_val = id.y() * pool_stride_y;
682         float32x4x2_t data  = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
683 
684         float32x2_t res       = {};
685         float       final_res = 0.f;
686 
687         if(pool_info.pool_type != PoolingType::MAX)
688         {
689             // Calculate scale
690             float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
691                                               pool_stride_y);
692             const float32x2_t scale_v = vdup_n_f32(scale);
693 
694             // Get power of 2 in case of l2 pooling
695             if(pool_info.pool_type == PoolingType::L2)
696             {
697                 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
698                 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
699             }
700             float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
701             for(int i = 1; i < pool_size; ++i)
702             {
703                 in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
704 
705                 x_val = id.x() * pool_stride_x;
706                 y_val = (id.y() * pool_stride_y) + i;
707                 data  = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
708                 // Get power of 2 in case of l2 pooling
709                 if(pool_info.pool_type == PoolingType::L2)
710                 {
711                     data.val[0] = vmulq_f32(data.val[0], data.val[0]);
712                     data.val[1] = vmulq_f32(data.val[1], data.val[1]);
713                 }
714                 sum_data = vaddq_f32(sum_data, data.val[0]);
715                 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
716             }
717             res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
718             res = vmul_f32(vpadd_f32(res, res), scale_v);
719         }
720         else
721         {
722             for(int i = 1; i < pool_size; ++i)
723             {
724                 in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
725 
726                 x_val              = id.x() * pool_stride_x;
727                 y_val              = (id.y() * pool_stride_y) + i;
728                 float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
729                 data               = vmax2q_f32(data, temp);
730             }
731             res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), data.val[1], 3)), vget_low_f32(data.val[1]));
732             res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
733             res = vpmax_f32(res, res);
734         }
735         final_res = vget_lane_f32(res, 0);
736 
737         // Calculate square-root in case of l2 pooling
738         if(pool_info.pool_type == PoolingType::L2)
739         {
740             final_res = sqrt(final_res);
741         }
742 
743         // Store result
744         *(reinterpret_cast<float *>(out.ptr())) = final_res;
745     },
746     in, out);
747 }
748 } // namespace cpu
749 } // namespace arm_compute
750 
751 #endif // ENABLE_NCHW_KERNELS