/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
namespace cpu
{
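/** Average pooling 3D (MxNxD) for quantized 8-bit tensors in NDHWC layout.
 *
 * Sums are accumulated in 32-bit integers and then rescaled, with a
 * requantization step if the source and destination quantization infos differ.
 *
 * @param[in]  src           Source tensor (quantized 8-bit, NDHWC).
 * @param[out] dst0          Destination tensor.
 * @param[in]  pool_info     Pooling layer parameters (pool size, strides, padding).
 * @param[in]  window_out    Output window to execute on.
 * @param[in]  window_step_x Number of channel elements processed per vectorized iteration.
 */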
template <typename T>
void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
                                    const int window_step_x)
{
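    // T is an 8-bit quantized type; promote it twice (8-bit -> 16-bit -> 32-bit)
    // so that sums over the pooling region cannot overflow.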
    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t   = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q32_t   = typename wrapper::traits::promote_t<q16_t>;
    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;

    int pool_stride_x = static_cast<int>(pool_info.stride.width);
    int pool_stride_y = static_cast<int>(pool_info.stride.height);
    int pool_stride_z = static_cast<int>(pool_info.stride.depth);

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;

    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);

    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);

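    // NDHWC layout: dimension 0 is C (contiguous in memory), 1 is W, 2 is H, 3 is D, 4 is N.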
    const int input_dim_c = src->info()->dimension(0);
    const int input_dim_w = src->info()->dimension(1);
    const int input_dim_h = src->info()->dimension(2);
    const int input_dim_d = src->info()->dimension(3);

    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);

    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();

    const int window_end_x   = input_dim_c;
    const int window_start_x = 0;

    Iterator out(dst0, window_out);

    const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
    const UniformQuantizationInfo src_qinfo    = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo    = dst0->info()->quantization_info().uniform();

    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
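    // Requantization maps src_q to dst_q so that both represent the same real value:
    //   dst_q = (src_q - src_qinfo.offset) * src_qinfo.scale / dst_qinfo.scale + dst_qinfo.offset
    //         = src_q / quant_rescale + new_offset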
    // "new_offset" does not need to account for "half_scale_v": the requantization
    // is performed in a single step, so no intermediate rounding error is introduced.
    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        // Computing the theoretical input starting/ending points
        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;

        const int pool_start_x = std::max(0, -in_idx_width);
        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
        const int pool_start_y = std::max(0, -in_idx_height);
        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);

        const int pool_start_z = std::max(0, -in_idx_depth);
        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);

        // Clamp the end coordinates so the region used in the calculation
        // excludes the right/bottom/back padding (PAD_X, PAD_Y, PAD_Z)
        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);

        // Calculate scale
        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
                                                       pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
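        // "scale" is the averaging factor applied to the accumulated sum, i.e. the
        // reciprocal of the number of contributing elements (padding counted or not
        // according to pool_info.exclude_padding)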

        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;

        int x_off = window_start_x;

        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
        {
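            // Four 32-bit accumulators hold the running sums for the 16 8-bit channel lanes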
            q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
            q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
            q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
            q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});

            // Perform pooling
            for(int z = pool_start_z; z < pool_end_z; ++z)
            {
                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);

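                        // Widen 8-bit -> 16-bit -> 32-bit before accumulating to avoid overflow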
                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
                        vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
                        vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
                        vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
                        vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
                    }
                }
            }

            if(src_qinfo != dst_qinfo)
            {
                const float32x4x4_t vres =
                {
                    {
                        vcvtq_f32_q32(vres1),
                        vcvtq_f32_q32(vres2),
                        vcvtq_f32_q32(vres3),
                        vcvtq_f32_q32(vres4),
                    }
                };
                const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
            }
            else
            {
                const float32x4_t scale_v = vdupq_n_f32(scale);
                // Multiply by "scale" (the reciprocal of the pool size) and add 0.5f to round to nearest instead of truncating towards zero
                vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
                vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
                vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
                vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));

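                // Narrow back down: 32-bit -> 16-bit, combine pairs, then 16-bit -> 8-bit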
                const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
                const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
            }
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            q32_t res = static_cast<q32_t>(0.f);

            // Perform pooling
            for(int z = pool_start_z; z < pool_end_z; ++z)
            {
                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
                        res += data;
                    }
                }
            }

            if(src_qinfo != dst_qinfo)
            {
                const float res_f           = static_cast<float>(res);
                const float new_scale       = quant_rescale / scale;
                const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));

                // Store result
                *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
            }
            else
            {
                // Multiply by "scale" (the reciprocal of the pool size) and add 0.5f to round to nearest instead of truncating towards zero
                res = static_cast<T>(0.5f + static_cast<float>(res) * scale);

                // Store result
                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
            }
        }
    },
    out);
}

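/** Max pooling 3D (MxNxD) for quantized 8-bit tensors in NDHWC layout.
 *
 * The running maximum is kept directly in the 8-bit domain; results are
 * requantized only if the source and destination quantization infos differ.
 *
 * @param[in]  src           Source tensor (quantized 8-bit, NDHWC).
 * @param[out] dst0          Destination tensor.
 * @param[in]  pool_info     Pooling layer parameters (pool size, strides, padding).
 * @param[in]  window_out    Output window to execute on.
 * @param[in]  window_step_x Number of channel elements processed per vectorized iteration.
 */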
template <typename T>
void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
                                    const int window_step_x)
{
    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;

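    // Leftover channels are handled 8 at a time (half a vector) before the scalar tail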
    const int window_half_step_x = window_step_x / 2;

    int pool_stride_x = static_cast<int>(pool_info.stride.width);
    int pool_stride_y = static_cast<int>(pool_info.stride.height);
    int pool_stride_z = static_cast<int>(pool_info.stride.depth);

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;

    const int pool_pad_top   = static_cast<int>(pool_info.padding.top);
    const int pool_pad_left  = static_cast<int>(pool_info.padding.left);
    const int pool_pad_front = static_cast<int>(pool_info.padding.front);

    const int input_dim_c = src->info()->dimension(0);
    const int input_dim_w = src->info()->dimension(1);
    const int input_dim_h = src->info()->dimension(2);
    const int input_dim_d = src->info()->dimension(3);

    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);

    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();

    const int window_end_x   = input_dim_c;
    const int window_start_x = 0;

    Iterator out(dst0, window_out);

    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();

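    // Single-step requantization, as derived in the average-pooling kernel above:
    //   dst_q = src_q / requant_scale + requant_offset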
    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        // Computing the theoretical input starting/ending points
        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;

        const int pool_start_x = std::max(0, -in_idx_width);
        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
        const int pool_start_y = std::max(0, -in_idx_height);
        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);

        const int pool_start_z = std::max(0, -in_idx_depth);
        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);

        // Clamp the end coordinates so the region used in the calculation
        // excludes the right/bottom/back padding (PAD_X, PAD_Y, PAD_Z)
        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);

        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;

        int x_off = window_start_x;

        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
        {
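            // Start from the smallest representable value so any loaded input replaces it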
            q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});

            // Perform pooling
            for(int z = pool_start_z; z < pool_end_z; ++z)
            {
                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);

                        vres = wrapper::vmax(vres, data);
                    }
                }
            }

            // Store result
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
                                                                                                                                      requant_qinfo) :
                                                                                                 vres);
        }

        // Leftovers using half the window step
        for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
        {
            q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});

            // Perform pooling
            for(int z = pool_start_z; z < pool_end_z; ++z)
            {
                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
                        const q8x8_t   data     = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);

                        vres = wrapper::vmax(vres, data);
                    }
                }
            }

            // Store result
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                            (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            T res = std::numeric_limits<T>::min();

            for(int z = pool_start_z; z < pool_end_z; ++z)
            {
                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);

                        res = std::max(res, data);
                    }
                }
            }

            // Store result
            if(src_qinfo != dst_qinfo)
            {
                const float res_f                           = static_cast<float>(res);
                *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
            }
            else
            {
                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
            }
        }
    },
    out);
}

} // namespace cpu
} // namespace arm_compute

#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H