/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
#include <limits>

#ifdef ENABLE_NCHW_KERNELS
namespace arm_compute
{
namespace cpu
{
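// Boundary-aware fp32 loads: the READ_2/READ_4 macros (and the read_8_boundary_aware helper
// below) load consecutive fp32 elements starting at (x, y) in padded input coordinates and
// substitute fval for any lane that falls outside the valid (unpadded) input region.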
#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
    ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)

#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)           \
    vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
                 READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval))

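// Loads 8 consecutive fp32 values starting at (x, y), replacing out-of-bounds lanes with fval.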
float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
{
    float32x4x2_t vec;
    vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
    vec.val[1] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 4), y, (ptr + 4), fval);
    return vec;
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

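// Loads 4 fp16 values starting at (x, y), lane by lane, replacing out-of-bounds lanes with fval.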
float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
{
    float16_t vec[4];
    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
    for(int i = 0; i < 4; i++)
    {
        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
        {
            vec[i] = *(ptr + i);
        }
        else
        {
            vec[i] = fval;
        }
    }
    return wrapper::vload(vec);
}

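// 3x3 fp16 pooling (MAX, AVG or L2) for NCHW: reads three rows of four fp16 values per output
// element, reduces them with NEON pairwise operations and writes a single fp16 result.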
void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);

    Iterator in(src, window_src);
    Iterator out(dst0, window);

    constexpr int pool_size = 3;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
    const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
    const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        const auto y_val_2 = (id.y() * pool_stride_y) + 2;
        float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
                                                          x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
        float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
                                                             x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
        float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
                                                             x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
        float16x4_t res = {};

        // Square the input values in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            top_data = vmul_f16(top_data, top_data);
            middle_data = vmul_f16(middle_data, middle_data);
            bottom_data = vmul_f16(bottom_data, bottom_data);
        }

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                           pool_stride_x, pool_stride_y);
            const float16x4_t scale_v = vdup_n_f16(scale);
            // Perform pooling
            const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
            res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
            res = vmul_f16(vpadd_f16(res, res), scale_v);
        }
        else
        {
            const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
            res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
            res = vpmax_f16(res, res);
        }

        // Calculate square-root in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = vsqrt_f16(res);
        }

        *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
    },
    in, out);
}

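// Widening helpers used by pooling2_nchw_maxpool_indices: convert the two lowest lanes to fp32
// so the index computation can share a single fp32 code path for both float and float16_t.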
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t in)
{
    float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
    return out;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <typename T>
inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
f16_to_f32(float32x2_t in)
{
    return in;
}

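// Loads 2 values of type T starting at (x, y), replacing out-of-bounds lanes with fval.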
template <typename T>
auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval)
{
    T vec[2];
    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
    for(int i = 0; i < 2; i++)
    {
        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
        {
            vec[i] = *(ptr + i);
        }
        else
        {
            vec[i] = fval;
        }
    }
    return wrapper::vload(vec);
}

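// 2x2 max pooling for NCHW that also writes the flat index of the maximum element (dst1), as
// required by max unpooling. Works on float and float16_t via the f16_to_f32 helpers above.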
template <typename T>
void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    Iterator in(src, window_src);
    Iterator out(dst0, window);
    Iterator indices(dst1, window);
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const int pad_left = src->info()->padding().left;
    const int pad_right = src->info()->padding().right;
    const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
    constexpr T float_min = -std::numeric_limits<float>::infinity();
    const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
        float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
        float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);

        // Calculate the max value, comparing the top row first and then the bottom row, to make sure the first maximum is recorded.
        const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
        const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
        const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
        *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));

        // Calculate the index of the max element, which will be used by max unpooling.
        const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
        const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
        const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
        const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
        const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
        const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
        const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
        *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
    },
    in, out, indices);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
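// 2x2 fp16 pooling for NCHW. With MAX pooling and an indices tensor it defers to
// pooling2_nchw_maxpool_indices; otherwise it reduces two rows of fp16 data directly.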
void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    if(pool_info.pool_type == PoolingType::MAX && dst1)
    {
        pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
    }
    else
    {
        Iterator in(src, window_src);
        Iterator out(dst0, window);
        constexpr int pool_size = 2;
        const int pool_pad_right = pool_info.pad_stride_info.pad_right();
        const int pool_pad_top = pool_info.pad_stride_info.pad_top();
        const int pool_pad_left = pool_info.pad_stride_info.pad_left();
        const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
        int pool_stride_x = 0;
        int pool_stride_y = 0;
        std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
        const int src_w = src->info()->dimension(0);
        const int src_h = src->info()->dimension(1);
        const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
        const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
        const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
        const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;

        const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
        const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));

        execute_window_loop(window, [&](const Coordinates & id)
        {
            const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
            const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());

            const auto x_val = id.x() * pool_stride_x;
            const auto y_val_0 = id.y() * pool_stride_y;
            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
            float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
                                                              x_val, y_val_0, in_top_ptr, fill_value);
            float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
                                                                 x_val, y_val_1, in_bottom_ptr, fill_value);
            float16x4_t res = {};

            // Square the input values in case of L2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                top_data = vmul_f16(top_data, top_data);
                bottom_data = vmul_f16(bottom_data, bottom_data);
            }

            if(pool_info.pool_type != PoolingType::MAX)
            {
                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                               pool_stride_x, pool_stride_y);
                const float16x4_t scale_v = vdup_n_f16(scale);

                const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
                res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
            }
            else
            {
                const float16x4_t max_data = vmax_f16(top_data, bottom_data);
                res = vpmax_f16(max_data, max_data);
            }

            // Calculate square-root in case of L2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                res = vsqrt_f16(res);
            }

            // Store result
            *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
        },
        in, out);
    }
}

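// Generic MxN fp16 pooling for NCHW: iterates over the full pooling window in scalar code,
// substituting fill_value for padded elements, and supports MAX, AVG and L2 pooling.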
void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const float16_t fp16_min = -std::numeric_limits<half_float::half>::infinity();
    const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;

    execute_window_loop(window, [&](const Coordinates & id)
    {
        float16_t res = 0.0f;

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                               pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                         + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;

                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }

            // Apply the averaging scale
            res *= scale;
        }
        else // if max pooling
        {
            res = fp16_min;

            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                         + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
                    res = std::max(res, data);
                }
            }
        }

        // Calculate square-root in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }

        // Store result
        *(reinterpret_cast<float16_t *>(out.ptr())) = res;
    },
    in, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

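// Generic MxN fp32 pooling for NCHW, scalar counterpart of the fp16 version above.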
void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;

    execute_window_loop(window, [&](const Coordinates & id)
    {
        float res = 0.0f;

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
                                                           pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                     + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;

                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }

            // Apply the averaging scale
            res *= scale;
        }
        else // if max pooling
        {
            res = -std::numeric_limits<float>::infinity();

            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                     + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
                    res = std::max(res, data);
                }
            }
        }

        // Calculate square-root in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }

        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = res;
    },
    in, out);
}

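// 2x2 fp32 pooling for NCHW. With MAX pooling and an indices tensor it defers to
// pooling2_nchw_maxpool_indices; otherwise it reduces two fp32 pairs with NEON.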
void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    if(pool_info.pool_type == PoolingType::MAX && dst1)
    {
        pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
    }
    else
    {
        Iterator in(src, window_src);
        Iterator out(dst0, window);
        constexpr int pool_size = 2;
        const int pool_pad_right = pool_info.pad_stride_info.pad_right();
        const int pool_pad_top = pool_info.pad_stride_info.pad_top();
        const int pool_pad_left = pool_info.pad_stride_info.pad_left();
        const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
        int pool_stride_x = 0;
        int pool_stride_y = 0;
        std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
        const int src_w = src->info()->dimension(0);
        const int src_h = src->info()->dimension(1);
        const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
        const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
        const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;

        const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
        const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));

        execute_window_loop(window, [&](const Coordinates & id)
        {
            const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
            const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());

            const auto x_val = id.x() * pool_stride_x;
            const auto y_val_0 = id.y() * pool_stride_y;
            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
            auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
            auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value);
            float32x2_t res = {};
            float final_res = 0;

            // Square the input values in case of L2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                top_data = vmul_f32(top_data, top_data);
                bottom_data = vmul_f32(bottom_data, bottom_data);
            }

            if(pool_info.pool_type != PoolingType::MAX)
            {
                // Calculate scale
                float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                         pool_stride_x, pool_stride_y);
                const float32x2_t scale_v = vdup_n_f32(scale);

                // Perform pooling
                const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
                res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
            }
            else
            {
                const float32x2_t max_data = vmax_f32(top_data, bottom_data);
                res = vpmax_f32(max_data, max_data);
            }
            final_res = vget_lane_f32(res, 0);

            // Calculate square-root in case of L2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                final_res = sqrt(final_res);
            }

            // Store result
            *(reinterpret_cast<float *>(out.ptr())) = final_res;
        },
        in, out);
    }
}

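// 3x3 fp32 pooling for NCHW: reads three rows of four fp32 values and reduces them with NEON
// pairwise add/max, masking out the unused fourth lane.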
void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    constexpr int pool_size = 3;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;

    const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
        const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
        const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());

        const auto x_val = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        const auto y_val_2 = (id.y() * pool_stride_y) + 2;
        auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
        auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value);
        auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value);

        float32x2_t res = {};
        float final_res = 0;

        // Square the input values in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            top_data = vmulq_f32(top_data, top_data);
            middle_data = vmulq_f32(middle_data, middle_data);
            bottom_data = vmulq_f32(bottom_data, bottom_data);
        }

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                     pool_stride_x, pool_stride_y);
            const float32x2_t scale_v = vdup_n_f32(scale);

            // Perform pooling
            const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
            res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
            res = vmul_f32(vpadd_f32(res, res), scale_v);
        }
        else
        {
            const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), max_data, 3)), vget_low_f32(max_data));
            res = vpmax_f32(res, res);
        }
        final_res = vget_lane_f32(res, 0);

        // Calculate square-root in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            final_res = sqrt(final_res);
        }

        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = final_res;
    },
    in, out);
}

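// 7x7 fp32 pooling for NCHW: reads seven rows of eight fp32 values per output element (the
// unused eighth lane is masked with 0 for sums or -inf for max) and reduces them with NEON.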
void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    constexpr int pool_size = 7;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;

    std::array<const uint8_t *, pool_size> src_ptrs{ {} };
    for(int i = 0; i < pool_size; ++i)
    {
        src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());

        auto x_val = id.x() * pool_stride_x;
        auto y_val = id.y() * pool_stride_y;
        float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);

        float32x2_t res = {};
        float final_res = 0.f;

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
                                                     pool_stride_x, pool_stride_y);
            const float32x2_t scale_v = vdup_n_f32(scale);

            // Square the input values in case of L2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
            }
            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
            for(int i = 1; i < pool_size; ++i)
            {
                in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());

                x_val = id.x() * pool_stride_x;
                y_val = (id.y() * pool_stride_y) + i;
                data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
                // Square the input values in case of L2 pooling
                if(pool_info.pool_type == PoolingType::L2)
                {
                    data.val[0] = vmulq_f32(data.val[0], data.val[0]);
                    data.val[1] = vmulq_f32(data.val[1], data.val[1]);
                }
                sum_data = vaddq_f32(sum_data, data.val[0]);
                sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
            }
            res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
            res = vmul_f32(vpadd_f32(res, res), scale_v);
        }
        else
        {
            for(int i = 1; i < pool_size; ++i)
            {
                in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());

                x_val = id.x() * pool_stride_x;
                y_val = (id.y() * pool_stride_y) + i;
                float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
                data = vmax2q_f32(data, temp);
            }
            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::infinity(), data.val[1], 3)), vget_low_f32(data.val[1]));
            res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
            res = vpmax_f32(res, res);
        }
        final_res = vget_lane_f32(res, 0);

        // Calculate square-root in case of L2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            final_res = sqrt(final_res);
        }

        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = final_res;
    },
    in, out);
}
} // namespace cpu
} // namespace arm_compute

#endif // ENABLE_NCHW_KERNELS