1 /*
2 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 /* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_pool_q7_HWC.c
22 * Description: Pooling function implementations
23 *
24 * $Date: 17. January 2018
25 * $Revision: V.1.0.0
26 *
27 * Target Processor: Cortex-M cores
28 *
29 * -------------------------------------------------------------------- */
30
31 #include "arm_math.h"
32 #include "arm_nnfunctions.h"
33
34 #if defined (ARM_MATH_DSP)
35
36 /**
37 * @brief A few utility functions used by pooling functions
38 *
39 *
40 */
41
/* Convert accumulated q15 sums back to q7 by dividing each element by
 * the (positive) scale factor, writing the result to target. */
static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    q15_t *pSrc = buffer;
    q7_t  *pDst = target;
    uint16_t remaining = length;

    while (remaining > 0u)
    {
        *pDst++ = (q7_t) (*pSrc++ / scale);
        remaining--;
    }
}
51
/* In-place element-wise maximum: base[i] = max(base[i], target[i]) for
 * i in [0, length). Four q7 elements are processed per iteration via
 * 32-bit word accesses; a scalar tail loop then covers the remaining
 * 0-3 elements. */
static void compare_and_replace_if_larger_q7(q7_t * base, // base data
                                             q7_t * target, // compare target
                                             const uint16_t length // data length
                                            )
{
    q7_t *pIn = base;
    q7_t *pCom = target;
    union arm_nnword in;
    union arm_nnword com;
    uint16_t cnt = length >> 2;

    while (cnt > 0u)
    {
        /* load four q7 values from each buffer as one 32-bit word */
        in.word = *__SIMD32(pIn);
        com.word = *__SIMD32(pCom)++;

        /* per-byte compare-and-replace */
        if (com.bytes[0] > in.bytes[0])
            in.bytes[0] = com.bytes[0];
        if (com.bytes[1] > in.bytes[1])
            in.bytes[1] = com.bytes[1];
        if (com.bytes[2] > in.bytes[2])
            in.bytes[2] = com.bytes[2];
        if (com.bytes[3] > in.bytes[3])
            in.bytes[3] = com.bytes[3];

        *__SIMD32(pIn)++ = in.word;

        cnt--;
    }

    /* Fix: handle the trailing 0-3 elements that the 4-way loop above
     * skipped when length is not a multiple of 4 (mirrors the tail
     * handling already present in accumulate_q7_to_q15). */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        if (*pCom > *pIn)
        {
            *pIn = *pCom;
        }
        pIn++;
        pCom++;
        cnt--;
    }
}
83
/* Element-wise accumulate the q7 values in `target` into the q15
 * accumulator `base`: base[i] += target[i] for i in [0, length).
 * Note the parameter naming: `base` is the destination accumulator and
 * `target` is the q7 source being read. Four elements are widened and
 * added per iteration; a scalar tail loop handles the last 0-3. */
static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
{
    q15_t *pCnt = base;
    q7_t *pV = target;
    q31_t v1, v2, vo1, vo2;
    uint16_t cnt = length >> 2;
    q31_t in;

    while (cnt > 0u)
    {
        /* load 4 q7 values packed in one word: bytes [b0 b1 b2 b3] */
        q31_t value = *__SIMD32(pV)++;
        /* sign-extend the odd bytes (b1, b3) into two q15 halfwords */
        v1 = __SXTB16(__ROR(value, 8));
        /* sign-extend the even bytes (b0, b2) into two q15 halfwords */
        v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN

        /* re-interleave so vo1 holds elements {0,1} and vo2 holds {2,3}
         * in memory order for little-endian targets */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);

#else

        /* big-endian: the byte lanes are swapped, so the pack
         * operations are mirrored to preserve element order */
        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);

#endif

        /* saturating 2x16-bit add of elements {0,1} into the accumulator */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        /* saturating 2x16-bit add of elements {2,3} */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* scalar tail: remaining 0-3 elements when length % 4 != 0 */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
124
125 #endif // ARM_MATH_DSP
126
127 /**
128 * @ingroup groupNN
129 */
130
131 /**
132 * @addtogroup Pooling
133 * @{
134 */
135
136 /**
137 * @brief Q7 max pooling function
138 * @param[in, out] Im_in pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
140 * @param[in] ch_im_in number of input tensor channels
141 * @param[in] dim_kernel filter kernel size
142 * @param[in] padding padding sizes
143 * @param[in] stride convolution stride
144 * @param[in] dim_im_out output tensor dimension
145 * @param[in,out] bufferA pointer to buffer space for input
146 * @param[in,out] Im_out pointer to output tensor
147 * @return none.
148 *
149 * @details
150 *
151 * <b>Buffer size:</b>
152 *
153 * bufferA size: 0
154 *
155 * The pooling function is implemented as split x-pooling then
156 * y-pooling.
157 *
158 * This pooling function is input-destructive. Input data is undefined
159 * after calling this function.
160 *
161 */
162
void
arm_maxpool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* DSP path: separable max pooling. First pool along x, writing the
     * intermediate result back into Im_in (input-destructive, as the
     * Doxygen header states), then pool along y into Im_out.
     * bufferA is unused on this path (documented buffer size: 0). */
    int16_t i_x, i_y;

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* clamp the window start at the left image edge (padding region) */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* clamp the window end at the right image edge */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            /* memmove: source and destination windows may overlap since
             * both live in Im_in */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        /* fold each remaining kernel row into the output row via
         * element-wise max; only the first dim_im_out*ch_im_in values
         * of each Im_in row are valid after the x-pooling pass */
        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    /* Reference path: direct 4-level loop over channels and output
     * pixels; padding is handled by the bounds check inside the kernel
     * loops (out-of-range taps are simply skipped). */
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                /* -129 is below the q7 minimum (-128), so any in-range
                 * input replaces it */
                int max = -129;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif /* ARM_MATH_DSP */

}
288
289 /**
290 * @brief Q7 average pooling function
291 * @param[in,out] Im_in pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
293 * @param[in] ch_im_in number of input tensor channels
294 * @param[in] dim_kernel filter kernel size
295 * @param[in] padding padding sizes
296 * @param[in] stride convolution stride
297 * @param[in] dim_im_out output tensor dimension
298 * @param[in,out] bufferA pointer to buffer space for input
299 * @param[in,out] Im_out pointer to output tensor
300 * @return none.
301 *
302 * @details
303 *
304 * <b>Buffer size:</b>
305 *
306 * bufferA size: 2*dim_im_out*ch_im_in
307 *
308 * The pooling function is implemented as split x-pooling then
309 * y-pooling.
310 *
311 * This pooling function is input-destructive. Input data is undefined
312 * after calling this function.
313 *
314 */
315
void
arm_avepool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* DSP path: separable average pooling. Sums are accumulated in q15
     * (via bufferA, reinterpreted as q15_t — hence the documented size
     * of 2*dim_im_out*ch_im_in bytes), then divided by the number of
     * contributing taps. As with maxpool, the x-pass writes back into
     * Im_in, so the input is destroyed. */
    q15_t *buffer = (q15_t *) bufferA;
    int16_t i_x, i_y;
    int16_t count = 0;  /* number of kernel taps actually summed (edge windows are smaller) */

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* clamp the window start at the left image edge */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* clamp the window end at the right image edge */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* widen the first column of the window into the q15 buffer */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            /* average = sum / count, narrowed back to q7 in place */
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        /* accumulate each remaining kernel row; only the first
         * dim_im_out*ch_im_in values of each Im_in row are valid after
         * the x-pooling pass */
        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    /* Reference path: direct summation per output pixel; count tracks
     * how many taps fell inside the image so edge windows average over
     * the valid taps only. bufferA is unused on this path. */
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int sum = 0;
                int count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif /* ARM_MATH_DSP */

}
445
446 /**
447 * @} end of Pooling group
448 */
449