xref: /btstack/port/stm32-f4discovery-usb/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c (revision a8f7f3fcbcd51f8d2e92aca076b6a9f812db358c)
1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_pool_q7_HWC.c
22  * Description:  Pooling function implementations
23  *
24  * $Date:        17. January 2018
25  * $Revision:    V.1.0.0
26  *
27  * Target Processor:  Cortex-M cores
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_math.h"
32 #include "arm_nnfunctions.h"
33 
34 #if defined (ARM_MATH_DSP)
35 
36 /**
37  * @brief A few utility functions used by pooling functions
38  *
39  *
40  */
41 
/* Divide each q15 accumulator entry by 'scale' (the pooling window
 * population count) and narrow the quotient back into the q7 target. */
static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    uint16_t  idx;

    for (idx = 0; idx < length; idx++)
    {
        q15_t scaled = buffer[idx] / scale;
        target[idx] = (q7_t) scaled;
    }
}
51 
/* Element-wise maximum: base[i] = max(base[i], target[i]) for i in [0, length).
 * Four q7 lanes are processed per 32-bit word on the SIMD path, then a scalar
 * tail loop covers the remaining 1-3 elements. */
static void compare_and_replace_if_larger_q7(q7_t * base,   // base data
                                             q7_t * target, // compare target
                                             const uint16_t length  // data length
    )
{
    q7_t     *pIn = base;
    q7_t     *pCom = target;
    union arm_nnword in;
    union arm_nnword com;
    uint16_t  cnt = length >> 2;

    while (cnt > 0u)
    {
        in.word = *__SIMD32(pIn);
        com.word = *__SIMD32(pCom)++;

        // if version
        if (com.bytes[0] > in.bytes[0])
            in.bytes[0] = com.bytes[0];
        if (com.bytes[1] > in.bytes[1])
            in.bytes[1] = com.bytes[1];
        if (com.bytes[2] > in.bytes[2])
            in.bytes[2] = com.bytes[2];
        if (com.bytes[3] > in.bytes[3])
            in.bytes[3] = com.bytes[3];

        *__SIMD32(pIn)++ = in.word;

        cnt--;
    }

    /* Fix: process the trailing length % 4 elements, which the SIMD loop
     * above skips. Without this, pooling silently ignores the last channels
     * whenever ch_im_in is not a multiple of 4 (accumulate_q7_to_q15 below
     * already handles its tail the same way). */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        if (*pCom > *pIn)
            *pIn = *pCom;
        pCom++;
        pIn++;
        cnt--;
    }
}
83 
/* Accumulate the q7 vector 'target' into the q15 accumulator 'base':
 * base[i] += target[i]. The SIMD path widens four q7 bytes per load and
 * adds them pairwise with saturating q15 arithmetic (__QADD16); the scalar
 * tail handles length % 4 leftovers with a plain (non-saturating) add. */
static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
{
    q15_t    *pCnt = base;          /* running q15 accumulators */
    q7_t     *pV = target;          /* q7 values being added in */
    q31_t     v1, v2, vo1, vo2;
    uint16_t  cnt = length >> 2;    /* four q7 elements per 32-bit load */
    q31_t     in;

    while (cnt > 0u)
    {
        q31_t     value = *__SIMD32(pV)++;
        /* Sign-extend the four packed q7 bytes into two q15x2 words:
         * v1 <- bytes 1,3 (after rotating by 8), v2 <- bytes 0,2. */
        v1 = __SXTB16(__ROR(value, 8));
        v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN

        /* Re-interleave back into element order: vo1 = elements 0,1 and
         * vo2 = elements 2,3 (swapped for big-endian below). */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);

#else

        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);

#endif

        /* Saturating q15 add, two accumulator elements per __QADD16. */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* Scalar tail for the remaining 1-3 elements. */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
124 
125 #endif                          // ARM_MATH_DSP
126 
127 /**
128  *  @ingroup groupNN
129  */
130 
131 /**
132  * @addtogroup Pooling
133  * @{
134  */
135 
136   /**
137    * @brief Q7 max pooling function
138    * @param[in, out]  Im_in       pointer to input tensor
139  * @param[in]       dim_im_in   input tensor dimension
140    * @param[in]       ch_im_in    number of input tensor channels
141    * @param[in]       dim_kernel  filter kernel size
142    * @param[in]       padding     padding sizes
143    * @param[in]       stride      convolution stride
144    * @param[in]       dim_im_out  output tensor dimension
145    * @param[in,out]   bufferA     pointer to buffer space for input
146    * @param[in,out]   Im_out      pointer to output tensor
147    * @return none.
148    *
149    * @details
150    *
151    * <b>Buffer size:</b>
152    *
153    * bufferA size:  0
154    *
155    * The pooling function is implemented as split x-pooling then
156    * y-pooling.
157    *
158    * This pooling function is input-destructive. Input data is undefined
159    * after calling this function.
160    *
161    */
162 
void
arm_maxpool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* DSP path: pooling is split into an x pass (results written back into
     * Im_in, hence "input-destructive") followed by a y pass into Im_out.
     * bufferA is unused on this path (documented buffer size 0). */

    int16_t   i_x, i_y;

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            /* Result of x-pooling is stored in-place at column i_x of the
             * input row, overwriting input data already consumed. */
            q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t     *win_start;
            q7_t     *win_stop;
            /* Clamp the kernel window to the left image edge (padding). */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* Clamp the kernel window to the right image edge. */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            /* memmove: source and destination can overlap in-place. */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t     *row_start;
        q7_t     *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        /* NOTE(review): only the first dim_im_out*ch_im_in bytes of each
         * x-pooled row are consumed here — presumably the x pass left the
         * valid results packed at the start of each row; confirm against
         * the x-pass layout above. */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    /* Straightforward per-channel, per-output-pixel max over the kernel
     * window, skipping out-of-bounds (padding) positions. */
    int16_t   i_ch_in, i_x, i_y;
    int16_t   k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                /* -129 is below the q7 minimum (-128), so any in-range
                 * sample replaces it on the first comparison. */
                int       max = -129;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
288 
289   /**
290    * @brief Q7 average pooling function
291    * @param[in,out]   Im_in       pointer to input tensor
292  * @param[in]       dim_im_in   input tensor dimension
293    * @param[in]       ch_im_in    number of input tensor channels
294    * @param[in]       dim_kernel  filter kernel size
295    * @param[in]       padding     padding sizes
296    * @param[in]       stride      convolution stride
297    * @param[in]       dim_im_out  output tensor dimension
298    * @param[in,out]   bufferA     pointer to buffer space for input
299    * @param[in,out]   Im_out      pointer to output tensor
300    * @return none.
301    *
302    * @details
303    *
304    * <b>Buffer size:</b>
305    *
306    * bufferA size:  2*dim_im_out*ch_im_in
307    *
308    * The pooling function is implemented as split x-pooling then
309    * y-pooling.
310    *
311    * This pooling function is input-destructive. Input data is undefined
312    * after calling this function.
313    *
314    */
315 
void
arm_avepool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* DSP path: split x-then-y average pooling. bufferA is reinterpreted as
     * a q15 accumulator array; per the header doc it must hold
     * 2*dim_im_out*ch_im_in bytes. The x pass writes its averages back into
     * Im_in (input-destructive). */
    q15_t    *buffer = (q15_t *) bufferA;
    int16_t   i_x, i_y;
    int16_t   count = 0;    /* number of window positions accumulated (divisor) */

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t     *win_start;
            q7_t     *win_stop;
            /* Clamp the kernel window to the left image edge (padding). */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* Clamp the kernel window to the right image edge. */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* Widen the first window column into the q15 accumulator. */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            /* Average = accumulated sum / window population, stored in-place. */
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t     *row_start;
        q7_t     *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    /* Reference path: per-channel sum over valid (in-bounds) kernel
     * positions, divided by the count of positions actually visited, so
     * border averages exclude the zero padding. */
    int16_t   i_ch_in, i_x, i_y;
    int16_t   k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int       sum = 0;
                int       count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
445 
446 /**
447  * @} end of Pooling group
448  */
449