1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nnfunctions.h
22  * Description:  Public header file for CMSIS NN Library
23  *
24  * $Date:        13. July 2018
25  * $Revision:    V.1.0.0
26  *
27  * Target Processor:  Cortex-M cores
28  * -------------------------------------------------------------------- */
29 
30 /**
31    \mainpage CMSIS NN Software Library
32    *
33    * Introduction
34    * ------------
35    *
36    * This user manual describes the CMSIS NN software library,
37    * a collection of efficient neural network kernels developed to maximize the
38    * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39    *
40    * The library is divided into a number of functions each covering a specific category:
41    * - Neural Network Convolution Functions
42    * - Neural Network Activation Functions
43    * - Fully-connected Layer Functions
44    * - Neural Network Pooling Functions
45    * - Softmax Functions
46    * - Neural Network Support Functions
47    *
48    * The library has separate functions for operating on different weight and activation data
49    * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
50    * kernels is included in the function description. The implementation details are also
51    * described in this paper [1].
52    *
53    * Block Diagram
54    * --------
55    * \image html CMSIS-NN-OVERVIEW.PNG
56    *
57    * Examples
58    * --------
59    *
60    * The library ships with a number of examples which demonstrate how to use the library functions.
61    *
62    * Pre-processor Macros
63    * ------------
64    *
65    * Each library project has different pre-processor macros (an example configuration is sketched in the comment block following this page description).
66    *
67    * - ARM_MATH_DSP:
68    *
69    * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions.
70    *
71    * - ARM_MATH_BIG_ENDIAN:
72    *
73    * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. By default, the library builds for little-endian targets.
74    *
75    * - ARM_NN_TRUNCATE:
76    *
77    * Define the macro ARM_NN_TRUNCATE to use floor instead of round-to-nearest-int for the computation.
78    *
79    * Copyright Notice
80    * ------------
81    *
82    * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
83    *
84    * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
85    */
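/*
 * Example configuration (illustrative sketch, not part of the library): the macros
 * listed above are normally supplied project-wide, for instance as -D options on the
 * compiler command line or from a configuration header included before the CMSIS
 * headers. The header name "nn_config.h" below is hypothetical.
 *
 *   // nn_config.h
 *   #define ARM_MATH_DSP           // target silicon supports DSP instructions
 *   //#define ARM_MATH_BIG_ENDIAN  // only for big-endian targets
 *   //#define ARM_NN_TRUNCATE      // floor instead of round-to-nearest
 */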
86 
87 /**
88  * @defgroup groupNN Neural Network Functions
89  * These functions perform basic operations for neural network layers.
90  */
91 
92 #ifndef _ARM_NNFUNCTIONS_H
93 #define _ARM_NNFUNCTIONS_H
94 
95 #include "arm_nnsupportfunctions.h"
96 #include "arm_nn_tables.h"
97 
98 #define USE_INTRINSIC
99 
100 //#define ARM_NN_TRUNCATE /* This configures the rounding model: floor instead of round-to-nearest-int */
101 
102 #ifdef __cplusplus
103 extern    "C"
104 {
105 #endif
106 
107 /**
108  * @defgroup NNConv Neural Network Convolution Functions
109  *
110  * Perform the convolution layer
111  *
112  * The convolution is implemented in 2 steps: im2col and GEMM
113  *
114  * im2col is a process of converting each patch of image data into
115  * a column. After im2col, the convolution is computed as matrix-matrix
116  * multiplication.
117  *
118  * To reduce the memory footprint, the im2col is performed partially.
119  * In each iteration, only a few columns (i.e., patches) are generated and
120  * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
121  *
122  */
123 
124   /**
125    * @brief Basic Q7 convolution function
126    * @param[in]       Im_in       pointer to input tensor
127    * @param[in]       dim_im_in   input tensor dimension
128    * @param[in]       ch_im_in    number of input tensor channels
129    * @param[in]       wt          pointer to kernel weights
130    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
131    * @param[in]       dim_kernel  filter kernel size
132    * @param[in]       padding     padding sizes
133    * @param[in]       stride      convolution stride
134    * @param[in]       bias        pointer to bias
135    * @param[in]       bias_shift  amount of left-shift for bias
136    * @param[in]       out_shift   amount of right-shift for output
137    * @param[in,out]   Im_out      pointer to output tensor
138    * @param[in]       dim_im_out  output tensor dimension
139    * @param[in,out]   bufferA     pointer to buffer space for input
140    * @param[in,out]   bufferB     pointer to buffer space for output
141    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
142    *
143    */
144 
145     arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
146                                          const uint16_t dim_im_in,
147                                          const uint16_t ch_im_in,
148                                          const q7_t * wt,
149                                          const uint16_t ch_im_out,
150                                          const uint16_t dim_kernel,
151                                          const uint16_t padding,
152                                          const uint16_t stride,
153                                          const q7_t * bias,
154                                          const uint16_t bias_shift,
155                                          const uint16_t out_shift,
156                                          q7_t * Im_out,
157                                          const uint16_t dim_im_out,
158                                          q15_t * bufferA,
159                                          q7_t * bufferB);
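  /*
   * Usage sketch (illustrative, not part of the API): a 3x3 Q7 convolution over a
   * 32x32 input with 16 channels and 32 filters, padding 1 and stride 1, giving a
   * 32x32 output. The array names (input, weights, bias, output), the shift values
   * and the bufferA size of 2*ch_im_in*dim_kernel*dim_kernel q15_t entries are
   * assumptions to be checked against the implementation notes of the corresponding
   * source file; bufferB is passed as NULL on the assumption that it is unused, as
   * in the library examples.
   *
   *   q15_t col_buffer[2 * 16 * 3 * 3];
   *   arm_status status = arm_convolve_HWC_q7_basic(input, 32, 16, weights, 32, 3,
   *                                                 1, 1, bias, 0, 9, output, 32,
   *                                                 col_buffer, NULL);
   */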
160 
161   /**
162    * @brief Basic Q7 convolution function (non-square shape)
163    * @param[in]       Im_in        pointer to input tensor
164    * @param[in]       dim_im_in_x  input tensor dimension x
165    * @param[in]       dim_im_in_y  input tensor dimension y
166    * @param[in]       ch_im_in     number of input tensor channels
167    * @param[in]       wt           pointer to kernel weights
168    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
169    * @param[in]       dim_kernel_x filter kernel size x
170    * @param[in]       dim_kernel_y filter kernel size y
171    * @param[in]       padding_x    padding size x
172    * @param[in]       padding_y    padding size y
173    * @param[in]       stride_x     convolution stride x
174    * @param[in]       stride_y     convolution stride y
175    * @param[in]       bias         pointer to bias
176    * @param[in]       bias_shift   amount of left-shift for bias
177    * @param[in]       out_shift    amount of right-shift for output
178    * @param[in,out]   Im_out       pointer to output tensor
179    * @param[in]       dim_im_out_x output tensor dimension x
180    * @param[in]       dim_im_out_y output tensor dimension y
181    * @param[in,out]   bufferA      pointer to buffer space for input
182    * @param[in,out]   bufferB      pointer to buffer space for output
183    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
184    */
185 
186     arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
187                                                   const uint16_t dim_im_in_x,
188                                                   const uint16_t dim_im_in_y,
189                                                   const uint16_t ch_im_in,
190                                                   const q7_t * wt,
191                                                   const uint16_t ch_im_out,
192                                                   const uint16_t dim_kernel_x,
193                                                   const uint16_t dim_kernel_y,
194                                                   const uint16_t padding_x,
195                                                   const uint16_t padding_y,
196                                                   const uint16_t stride_x,
197                                                   const uint16_t stride_y,
198                                                   const q7_t * bias,
199                                                   const uint16_t bias_shift,
200                                                   const uint16_t out_shift,
201                                                   q7_t * Im_out,
202                                                   const uint16_t dim_im_out_x,
203                                                   const uint16_t dim_im_out_y,
204                                                   q15_t * bufferA,
205                                                   q7_t * bufferB);
206 
207   /**
208    * @brief Basic Q15 convolution function
209    * @param[in]       Im_in       pointer to input tensor
210    * @param[in]       dim_im_in   input tensor dimension
211    * @param[in]       ch_im_in    number of input tensor channels
212    * @param[in]       wt          pointer to kernel weights
213    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
214    * @param[in]       dim_kernel  filter kernel size
215    * @param[in]       padding     padding sizes
216    * @param[in]       stride      convolution stride
217    * @param[in]       bias        pointer to bias
218    * @param[in]       bias_shift  amount of left-shift for bias
219    * @param[in]       out_shift   amount of right-shift for output
220    * @param[in,out]   Im_out      pointer to output tensor
221    * @param[in]       dim_im_out  output tensor dimension
222    * @param[in,out]   bufferA     pointer to buffer space for input
223    * @param[in,out]   bufferB     pointer to buffer space for output
224    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
225    *
226    */
227 
228     arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
229                                           const uint16_t dim_im_in,
230                                           const uint16_t ch_im_in,
231                                           const q15_t * wt,
232                                           const uint16_t ch_im_out,
233                                           const uint16_t dim_kernel,
234                                           const uint16_t padding,
235                                           const uint16_t stride,
236                                           const q15_t * bias,
237                                           const uint16_t bias_shift,
238                                           const uint16_t out_shift,
239                                           q15_t * Im_out,
240                                           const uint16_t dim_im_out,
241                                           q15_t * bufferA,
242                                           q7_t * bufferB);
243 
244   /**
245    * @brief Fast Q7 convolution function
246    * @param[in]       Im_in       pointer to input tensor
247    * @param[in]       dim_im_in   input tensor dimension
248    * @param[in]       ch_im_in    number of input tensor channels
249    * @param[in]       wt          pointer to kernel weights
250    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
251    * @param[in]       dim_kernel  filter kernel size
252    * @param[in]       padding     padding sizes
253    * @param[in]       stride      convolution stride
254    * @param[in]       bias        pointer to bias
255    * @param[in]       bias_shift  amount of left-shift for bias
256    * @param[in]       out_shift   amount of right-shift for output
257    * @param[in,out]   Im_out      pointer to output tensor
258    * @param[in]       dim_im_out  output tensor dimension
259    * @param[in,out]   bufferA     pointer to buffer space for input
260    * @param[in,out]   bufferB     pointer to buffer space for output
261    * @return     The function returns either
262    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
263    *
264    * This function is the version with the full list of optimization tricks, but with
265    * some constraints:
266    *   ch_im_in is multiple of 4
267    *   ch_im_out is multiple of 2
268    */
269 
270     arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
271                                         const uint16_t dim_im_in,
272                                         const uint16_t ch_im_in,
273                                         const q7_t * wt,
274                                         const uint16_t ch_im_out,
275                                         const uint16_t dim_kernel,
276                                         const uint16_t padding,
277                                         const uint16_t stride,
278                                         const q7_t * bias,
279                                         const uint16_t bias_shift,
280                                         const uint16_t out_shift,
281                                         q7_t * Im_out,
282                                         const uint16_t dim_im_out,
283                                         q15_t * bufferA,
284                                         q7_t * bufferB);
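  /*
   * Dispatch sketch (illustrative): the fast kernel requires ch_im_in to be a
   * multiple of 4 and ch_im_out a multiple of 2 and reports ARM_MATH_SIZE_MISMATCH
   * otherwise, so a caller can fall back to the basic kernel when the constraints
   * do not hold. Dimensions, shift values and buffer sizing are the same made-up
   * example values as in the sketch after arm_convolve_HWC_q7_basic() above.
   *
   *   const uint16_t ch_im_in = 16, ch_im_out = 32;   // example channel counts
   *   q15_t col_buffer[2 * 16 * 3 * 3];
   *   arm_status status;
   *   if ((ch_im_in % 4 == 0) && (ch_im_out % 2 == 0)) {
   *       status = arm_convolve_HWC_q7_fast(input, 32, ch_im_in, weights, ch_im_out,
   *                                         3, 1, 1, bias, 0, 9, output, 32,
   *                                         col_buffer, NULL);
   *   } else {
   *       status = arm_convolve_HWC_q7_basic(input, 32, ch_im_in, weights, ch_im_out,
   *                                          3, 1, 1, bias, 0, 9, output, 32,
   *                                          col_buffer, NULL);
   *   }
   */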
285 
286   /**
287    * @brief Fast Q7 convolution function (non-square shape)
288    * @param[in]       Im_in        pointer to input tensor
289    * @param[in]       dim_im_in_x  input tensor dimension x
290    * @param[in]       dim_im_in_y  input tensor dimension y
291    * @param[in]       ch_im_in     number of input tensor channels
292    * @param[in]       wt           pointer to kernel weights
293    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
294    * @param[in]       dim_kernel_x filter kernel size x
295    * @param[in]       dim_kernel_y filter kernel size y
296    * @param[in]       padding_x    padding size x
297    * @param[in]       padding_y    padding size y
298    * @param[in]       stride_x     convolution stride x
299    * @param[in]       stride_y     convolution stride y
300    * @param[in]       bias         pointer to bias
301    * @param[in]       bias_shift   amount of left-shift for bias
302    * @param[in]       out_shift    amount of right-shift for output
303    * @param[in,out]   Im_out       pointer to output tensor
304    * @param[in]       dim_im_out_x output tensor dimension x
305    * @param[in]       dim_im_out_y output tensor dimension y
306    * @param[in,out]   bufferA      pointer to buffer space for input
307    * @param[in,out]   bufferB      pointer to buffer space for output
308    * @return     The function returns either
309    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
310    *
311    * This function is the version with the full list of optimization tricks, but with
312    * some constraints:
313    *   ch_im_in is multiple of 4
314    *   ch_im_out is multiple of 2
315    */
316 
317     arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
318                                                   const uint16_t dim_im_in_x,
319                                                   const uint16_t dim_im_in_y,
320                                                   const uint16_t ch_im_in,
321                                                   const q7_t * wt,
322                                                   const uint16_t ch_im_out,
323                                                   const uint16_t dim_kernel_x,
324                                                   const uint16_t dim_kernel_y,
325                                                   const uint16_t padding_x,
326                                                   const uint16_t padding_y,
327                                                   const uint16_t stride_x,
328                                                   const uint16_t stride_y,
329                                                   const q7_t * bias,
330                                                   const uint16_t bias_shift,
331                                                   const uint16_t out_shift,
332                                                   q7_t * Im_out,
333                                                   const uint16_t dim_im_out_x,
334                                                   const uint16_t dim_im_out_y,
335                                                   q15_t * bufferA,
336                                                   q7_t * bufferB);
337 
338   /**
339    * @brief Fast Q7 version of 1x1 convolution (non-square shape)
340    * @param[in]       Im_in        pointer to input tensor
341    * @param[in]       dim_im_in_x  input tensor dimension x
342    * @param[in]       dim_im_in_y  input tensor dimension y
343    * @param[in]       ch_im_in     number of input tensor channels
344    * @param[in]       wt           pointer to kernel weights
345    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
346    * @param[in]       dim_kernel_x filter kernel size x
347    * @param[in]       dim_kernel_y filter kernel size y
348    * @param[in]       padding_x    padding size x
349    * @param[in]       padding_y    padding size y
350    * @param[in]       stride_x     convolution stride x
351    * @param[in]       stride_y     convolution stride y
352    * @param[in]       bias         pointer to bias
353    * @param[in]       bias_shift   amount of left-shift for bias
354    * @param[in]       out_shift    amount of right-shift for output
355    * @param[in,out]   Im_out       pointer to output tensor
356    * @param[in]       dim_im_out_x output tensor dimension x
357    * @param[in]       dim_im_out_y output tensor dimension y
358    * @param[in,out]   bufferA      pointer to buffer space for input
359    * @param[in,out]   bufferB      pointer to buffer space for output
360    * @return     The function returns either
361    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
362    *
363    * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
364    * and dim_kernel_y=1). It can be used for the
365    * second half of MobileNets, after the depthwise separable convolution.
366    *
367    * This function is the version with the full list of optimization tricks, but with
368    * some constraints:
369    *   ch_im_in is multiple of 4
370    *   ch_im_out is multiple of 2
371    */
372     arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
373                                                       const uint16_t dim_im_in_x,
374                                                       const uint16_t dim_im_in_y,
375                                                       const uint16_t ch_im_in,
376                                                       const q7_t * wt,
377                                                       const uint16_t ch_im_out,
378                                                       const uint16_t dim_kernel_x,
379                                                       const uint16_t dim_kernel_y,
380                                                       const uint16_t padding_x,
381                                                       const uint16_t padding_y,
382                                                       const uint16_t stride_x,
383                                                       const uint16_t stride_y,
384                                                       const q7_t * bias,
385                                                       const uint16_t bias_shift,
386                                                       const uint16_t out_shift,
387                                                       q7_t * Im_out,
388                                                       const uint16_t dim_im_out_x,
389                                                       const uint16_t dim_im_out_y,
390                                                       q15_t * bufferA,
391                                                       q7_t * bufferB);
392 
393   /**
394    * @brief Q7 version of convolution for RGB image
395    * @param[in]       Im_in       pointer to input tensor
396    * @param[in]       dim_im_in   input tensor dimension
397    * @param[in]       ch_im_in    number of input tensor channels
398    * @param[in]       wt          pointer to kernel weights
399    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
400    * @param[in]       dim_kernel  filter kernel size
401    * @param[in]       padding     padding sizes
402    * @param[in]       stride      convolution stride
403    * @param[in]       bias        pointer to bias
404    * @param[in]       bias_shift  amount of left-shift for bias
405    * @param[in]       out_shift   amount of right-shift for output
406    * @param[in,out]   Im_out      pointer to output tensor
407    * @param[in]       dim_im_out  output tensor dimension
408    * @param[in,out]   bufferA     pointer to buffer space for input
409    * @param[in,out]   bufferB     pointer to buffer space for output
410    * @return     The function returns either
411    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
412    *
413    * This kernel is written exclusively for convolutions with ch_im_in
414    * equal to 3. This applies to the first layer of CNNs, which takes an
415    * RGB input image.
416    */
417 
418     arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
419                                        const uint16_t dim_im_in,
420                                        const uint16_t ch_im_in,
421                                        const q7_t * wt,
422                                        const uint16_t ch_im_out,
423                                        const uint16_t dim_kernel,
424                                        const uint16_t padding,
425                                        const uint16_t stride,
426                                        const q7_t * bias,
427                                        const uint16_t bias_shift,
428                                        const uint16_t out_shift,
429                                        q7_t * Im_out,
430                                        const uint16_t dim_im_out,
431                                        q15_t * bufferA,
432                                        q7_t * bufferB);
433 
434   /**
435    * @brief Fast Q15 convolution function
436    * @param[in]       Im_in       pointer to input tensor
437    * @param[in]       dim_im_in   input tensor dimension
438    * @param[in]       ch_im_in    number of input tensor channels
439    * @param[in]       wt          pointer to kernel weights
440    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
441    * @param[in]       dim_kernel  filter kernel size
442    * @param[in]       padding     padding sizes
443    * @param[in]       stride      convolution stride
444    * @param[in]       bias        pointer to bias
445    * @param[in]       bias_shift  amount of left-shift for bias
446    * @param[in]       out_shift   amount of right-shift for output
447    * @param[in,out]   Im_out      pointer to output tensor
448    * @param[in]       dim_im_out  output tensor dimension
449    * @param[in,out]   bufferA     pointer to buffer space for input
450    * @param[in,out]   bufferB     pointer to buffer space for output
451    * @return     The function returns either
452    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
453    *
454    * This function is the version with the full list of optimization tricks, but with
455    * some constraints:
456    *   ch_im_in is multiple of 2
457    *   ch_im_out is multiple of 2
458    */
459 
460     arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
461                                          const uint16_t dim_im_in,
462                                          const uint16_t ch_im_in,
463                                          const q15_t * wt,
464                                          const uint16_t ch_im_out,
465                                          const uint16_t dim_kernel,
466                                          const uint16_t padding,
467                                          const uint16_t stride,
468                                          const q15_t * bias,
469                                          const uint16_t bias_shift,
470                                          const uint16_t out_shift,
471                                          q15_t * Im_out,
472                                          const uint16_t dim_im_out,
473                                          q15_t * bufferA,
474                                          q7_t * bufferB);
475 
476   /**
477    * @brief Fast Q15 convolution function (non-square shape)
478    * @param[in]       Im_in        pointer to input tensor
479    * @param[in]       dim_im_in_x  input tensor dimension x
480    * @param[in]       dim_im_in_y  input tensor dimension y
481    * @param[in]       ch_im_in     number of input tensor channels
482    * @param[in]       wt           pointer to kernel weights
483    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
484    * @param[in]       dim_kernel_x filter kernel size x
485    * @param[in]       dim_kernel_y filter kernel size y
486    * @param[in]       padding_x    padding size x
487    * @param[in]       padding_y    padding size y
488    * @param[in]       stride_x     convolution stride x
489    * @param[in]       stride_y     convolution stride y
490    * @param[in]       bias         pointer to bias
491    * @param[in]       bias_shift   amount of left-shift for bias
492    * @param[in]       out_shift    amount of right-shift for output
493    * @param[in,out]   Im_out       pointer to output tensor
494    * @param[in]       dim_im_out_x output tensor dimension x
495    * @param[in]       dim_im_out_y output tensor dimension y
496    * @param[in,out]   bufferA      pointer to buffer space for input
497    * @param[in,out]   bufferB      pointer to buffer space for output
498    * @return     The function returns either
499    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
500    *
501    * @details
502    *
503    * <b>Buffer size:</b>
504    *
505    * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
506    *
507    * bufferB size: 0
508    *
509    * <b>Input dimension constraints:</b>
510    *
511    * ch_im_in is multiple of 2
512    *
513    * ch_im_out is multiple of 2
514    *
515    */
516 
517     arm_status
518     arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
519                               const uint16_t dim_im_in_x,
520                               const uint16_t dim_im_in_y,
521                               const uint16_t ch_im_in,
522                               const q15_t * wt,
523                               const uint16_t ch_im_out,
524                               const uint16_t dim_kernel_x,
525                               const uint16_t dim_kernel_y,
526                               const uint16_t padding_x,
527                               const uint16_t padding_y,
528                               const uint16_t stride_x,
529                               const uint16_t stride_y,
530                               const q15_t * bias,
531                               const uint16_t bias_shift,
532                               const uint16_t out_shift,
533                               q15_t * Im_out,
534                               const uint16_t dim_im_out_x,
535                               const uint16_t dim_im_out_y,
536                               q15_t * bufferA,
537                               q7_t * bufferB);
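  /*
   * Buffer sizing sketch (illustrative): following the buffer-size note above, a
   * 5x3 kernel over 32 input channels needs 2*32*5*3 = 960 q15_t entries in bufferA
   * and no bufferB. All dimensions, array names and shift values are made-up example
   * values (64x32 input, 32 filters, padding 2x1, stride 1, 64x32 output).
   *
   *   q15_t col_buffer[2 * 32 * 5 * 3];
   *   arm_status status = arm_convolve_HWC_q15_fast_nonsquare(input, 64, 32, 32,
   *                                                weights, 32, 5, 3, 2, 1, 1, 1,
   *                                                bias, 0, 10, output, 64, 32,
   *                                                col_buffer, NULL);
   */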
538 
539   /**
540    * @brief Q7 depthwise separable convolution function
541    * @param[in]       Im_in       pointer to input tensor
542    * @param[in]       dim_im_in   input tensor dimension
543    * @param[in]       ch_im_in    number of input tensor channels
544    * @param[in]       wt          pointer to kernel weights
545    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
546    * @param[in]       dim_kernel  filter kernel size
547    * @param[in]       padding     padding sizes
548    * @param[in]       stride      convolution stride
549    * @param[in]       bias        pointer to bias
550    * @param[in]       bias_shift  amount of left-shift for bias
551    * @param[in]       out_shift   amount of right-shift for output
552    * @param[in,out]   Im_out      pointer to output tensor
553    * @param[in]       dim_im_out  output tensor dimension
554    * @param[in,out]   bufferA     pointer to buffer space for input
555    * @param[in,out]   bufferB     pointer to buffer space for output
556    * @return     The function returns either
557    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
558    *
559    * This function is the version with the full list of optimization tricks, but with
560    * some constraints:
561    *   ch_im_in is multiple of 2
562    *   ch_im_out is multiple of 2
563    */
564 
565     arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
566                                                    const uint16_t dim_im_in,
567                                                    const uint16_t ch_im_in,
568                                                    const q7_t * wt,
569                                                    const uint16_t ch_im_out,
570                                                    const uint16_t dim_kernel,
571                                                    const uint16_t padding,
572                                                    const uint16_t stride,
573                                                    const q7_t * bias,
574                                                    const uint16_t bias_shift,
575                                                    const uint16_t out_shift,
576                                                    q7_t * Im_out,
577                                                    const uint16_t dim_im_out,
578                                                    q15_t * bufferA,
579                                                    q7_t * bufferB);
580 
581   /**
582    * @brief Q7 depthwise separable convolution function (non-square shape)
583    * @param[in]       Im_in         pointer to input tensor
584    * @param[in]       dim_im_in_x   input tensor dimension x
585    * @param[in]       dim_im_in_y   input tensor dimension y
586    * @param[in]       ch_im_in      number of input tensor channels
587    * @param[in]       wt            pointer to kernel weights
588    * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
589    * @param[in]       dim_kernel_x  filter kernel size x
590    * @param[in]       dim_kernel_y  filter kernel size y
591    * @param[in]       padding_x     padding sizes x
592    * @param[in]       padding_y     padding sizes y
593    * @param[in]       stride_x      convolution stride x
594    * @param[in]       stride_y      convolution stride y
595    * @param[in]       bias          pointer to bias
596    * @param[in]       bias_shift    amount of left-shift for bias
597    * @param[in]       out_shift     amount of right-shift for output
598    * @param[in,out]   Im_out        pointer to output tensor
599    * @param[in]       dim_im_out_x  output tensor dimension x
600    * @param[in]       dim_im_out_y  output tensor dimension y
601    * @param[in,out]   bufferA       pointer to buffer space for input
602    * @param[in,out]   bufferB       pointer to buffer space for output
603    * @return     The function returns either
604    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
605    *
606    * This function is the version with the full list of optimization tricks, but with
607    * some constraints:
608    *   ch_im_in is multiple of 2
609    *   ch_im_out is multiple of 2
610    */
611     arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
612                                                              const uint16_t dim_im_in_x,
613                                                              const uint16_t dim_im_in_y,
614                                                              const uint16_t ch_im_in,
615                                                              const q7_t * wt,
616                                                              const uint16_t ch_im_out,
617                                                              const uint16_t dim_kernel_x,
618                                                              const uint16_t dim_kernel_y,
619                                                              const uint16_t padding_x,
620                                                              const uint16_t padding_y,
621                                                              const uint16_t stride_x,
622                                                              const uint16_t stride_y,
623                                                              const q7_t * bias,
624                                                              const uint16_t bias_shift,
625                                                              const uint16_t out_shift,
626                                                              q7_t * Im_out,
627                                                              const uint16_t dim_im_out_x,
628                                                              const uint16_t dim_im_out_y,
629                                                              q15_t * bufferA,
630                                                              q7_t * bufferB);
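  /*
   * Composition sketch (illustrative): a MobileNet-style block as mentioned in the
   * description of arm_convolve_1x1_HWC_q7_fast_nonsquare() above, i.e. a 3x3
   * depthwise convolution followed by a 1x1 pointwise convolution. All array names,
   * dimensions, shift values and the shared buffer size are made-up examples to be
   * checked against the implementation; bufferB is passed as NULL on the assumption
   * that it is unused.
   *
   *   q15_t col_buffer[2 * 64 * 3 * 3];
   *   arm_depthwise_separable_conv_HWC_q7(in, 28, 64, dw_wt, 64, 3, 1, 1, dw_bias,
   *                                       0, 9, dw_out, 28, col_buffer, NULL);
   *   arm_convolve_1x1_HWC_q7_fast_nonsquare(dw_out, 28, 28, 64, pw_wt, 128, 1, 1,
   *                                          0, 0, 1, 1, pw_bias, 0, 9, pw_out,
   *                                          28, 28, col_buffer, NULL);
   */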
631 
632 
633 /**
634  * @defgroup FC Fully-connected Layer Functions
635  *
636  * Perform the fully-connected layer
637  *
638  * A fully-connected layer is basically a matrix-vector multiplication
639  * with bias. The matrix is the weights and the input/output vectors
640  * are the activation values. Supported {weight, activation} precisions
641  * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
642  *
643  * Here we have two types of kernel functions. The basic function
644  * implements the layer using a regular GEMV approach. The opt functions
645  * operate on weights in an interleaved format.
646  *
647  */
648 
649   /**
650    * @brief Q7 basic fully-connected layer function
651    * @param[in]       pV          pointer to input vector
652    * @param[in]       pM          pointer to matrix weights
653    * @param[in]       dim_vec     length of the vector
654    * @param[in]       num_of_rows number of rows in weight matrix
655    * @param[in]       bias_shift  amount of left-shift for bias
656    * @param[in]       out_shift   amount of right-shift for output
657    * @param[in]       bias        pointer to bias
658    * @param[in,out]   pOut        pointer to output vector
659    * @param[in,out]   vec_buffer  pointer to buffer space for input
660    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
661    *
662    */
663 
664     arm_status arm_fully_connected_q7(const q7_t * pV,
665                                       const q7_t * pM,
666                                       const uint16_t dim_vec,
667                                       const uint16_t num_of_rows,
668                                       const uint16_t bias_shift,
669                                       const uint16_t out_shift,
670                                       const q7_t * bias,
671                                       q7_t * pOut,
672                                       q15_t * vec_buffer);
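  /*
   * Usage sketch (illustrative): a fully-connected layer mapping a 256-element input
   * vector to 10 outputs. The array names, shift values and the vec_buffer size of
   * dim_vec q15_t entries are assumptions to be checked against the implementation.
   * Note that the _opt variants in this group expect the weight matrix in the
   * interleaved format mentioned above, so plain row-major weights belong with this
   * basic version.
   *
   *   q15_t vec_buffer[256];
   *   arm_status status = arm_fully_connected_q7(input_vec, weights, 256, 10, 0, 7,
   *                                              bias, output_vec, vec_buffer);
   */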
673 
674   /**
675    * @brief Q7 opt fully-connected layer function
676    * @param[in]       pV          pointer to input vector
677    * @param[in]       pM          pointer to matrix weights
678    * @param[in]       dim_vec     length of the vector
679    * @param[in]       num_of_rows number of rows in weight matrix
680    * @param[in]       bias_shift  amount of left-shift for bias
681    * @param[in]       out_shift   amount of right-shift for output
682    * @param[in]       bias        pointer to bias
683    * @param[in,out]   pOut        pointer to output vector
684    * @param[in,out]   vec_buffer  pointer to buffer space for input
685    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
686    *
687    */
688 
689     arm_status arm_fully_connected_q7_opt(const q7_t * pV,
690                                           const q7_t * pM,
691                                           const uint16_t dim_vec,
692                                           const uint16_t num_of_rows,
693                                           const uint16_t bias_shift,
694                                           const uint16_t out_shift,
695                                           const q7_t * bias,
696                                           q7_t * pOut,
697                                           q15_t * vec_buffer);
698 
699   /**
700    * @brief Q15 basic fully-connected layer function
701    * @param[in]       pV          pointer to input vector
702    * @param[in]       pM          pointer to matrix weights
703    * @param[in]       dim_vec     length of the vector
704    * @param[in]       num_of_rows number of rows in weight matrix
705    * @param[in]       bias_shift  amount of left-shift for bias
706    * @param[in]       out_shift   amount of right-shift for output
707    * @param[in]       bias        pointer to bias
708    * @param[in,out]   pOut        pointer to output vector
709    * @param[in,out]   vec_buffer  pointer to buffer space for input
710    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
711    *
712    */
713 
714     arm_status arm_fully_connected_q15(const q15_t * pV,
715                                        const q15_t * pM,
716                                        const uint16_t dim_vec,
717                                        const uint16_t num_of_rows,
718                                        const uint16_t bias_shift,
719                                        const uint16_t out_shift,
720                                        const q15_t * bias,
721                                        q15_t * pOut,
722                                        q15_t * vec_buffer);
723 
724   /**
725    * @brief Q15 opt fully-connected layer function
726    * @param[in]       pV          pointer to input vector
727    * @param[in]       pM          pointer to matrix weights
728    * @param[in]       dim_vec     length of the vector
729    * @param[in]       num_of_rows number of rows in weight matrix
730    * @param[in]       bias_shift  amount of left-shift for bias
731    * @param[in]       out_shift   amount of right-shift for output
732    * @param[in]       bias        pointer to bias
733    * @param[in,out]   pOut        pointer to output vector
734    * @param[in,out]   vec_buffer  pointer to buffer space for input
735    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
736    *
737    */
738 
739     arm_status arm_fully_connected_q15_opt(const q15_t * pV,
740                                            const q15_t * pM,
741                                            const uint16_t dim_vec,
742                                            const uint16_t num_of_rows,
743                                            const uint16_t bias_shift,
744                                            const uint16_t out_shift,
745                                            const q15_t * bias,
746                                            q15_t * pOut,
747                                            q15_t * vec_buffer);
748 
749   /**
750    * @brief Mixed Q15-Q7 fully-connected layer function
751    * @param[in]       pV          pointer to input vector
752    * @param[in]       pM          pointer to matrix weights
753    * @param[in]       dim_vec     length of the vector
754    * @param[in]       num_of_rows number of rows in weight matrix
755    * @param[in]       bias_shift  amount of left-shift for bias
756    * @param[in]       out_shift   amount of right-shift for output
757    * @param[in]       bias        pointer to bias
758    * @param[in,out]   pOut        pointer to output vector
759    * @param[in,out]   vec_buffer  pointer to buffer space for input
760    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
761    *
762    */
763 
764     arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
765                                                   const q7_t * pM,
766                                                   const uint16_t dim_vec,
767                                                   const uint16_t num_of_rows,
768                                                   const uint16_t bias_shift,
769                                                   const uint16_t out_shift,
770                                                   const q7_t * bias,
771                                                   q15_t * pOut,
772                                                   q15_t * vec_buffer);
773 
774   /**
775    * @brief Mixed Q15-Q7 opt fully-connected layer function
776    * @param[in]       pV          pointer to input vector
777    * @param[in]       pM          pointer to matrix weights
778    * @param[in]       dim_vec     length of the vector
779    * @param[in]       num_of_rows number of rows in weight matrix
780    * @param[in]       bias_shift  amount of left-shift for bias
781    * @param[in]       out_shift   amount of right-shift for output
782    * @param[in]       bias        pointer to bias
783    * @param[in,out]   pOut        pointer to output vector
784    * @param[in,out]   vec_buffer  pointer to buffer space for input
785    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
786    *
787    */
788 
789     arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
790                                                       const q7_t * pM,
791                                                       const uint16_t dim_vec,
792                                                       const uint16_t num_of_rows,
793                                                       const uint16_t bias_shift,
794                                                       const uint16_t out_shift,
795                                                       const q7_t * bias,
796                                                       q15_t * pOut,
797                                                       q15_t * vec_buffer);
798 
799 /**
800  * @brief Matrix-Multiplication Kernels for Convolution
801  *
802  * These functions are used within convolution layer functions for
803  * matrix multiplication.
804  *
805  * The implementation is similar to CMSIS-DSP arm_mat_mult functions
806  * with one Q7 and one Q15 operand. The Q15 operand is the im2col
807  * output, which always consists of 2 columns.
808  *
809  */
810 
811   /**
812    * @brief Matrix-multiplication function for convolution
813    * @param[in]       pA          pointer to operand A
814    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
815    * @param[in]       ch_im_out   numRow of A
816    * @param[in]       numCol_A    numCol of A
817    * @param[in]       bias_shift  amount of left-shift for bias
818    * @param[in]       out_shift   amount of right-shift for output
819    * @param[in]       bias        the bias
820    * @param[in,out]   pOut        pointer to output
821    * @return     The function returns the incremented output pointer
822    */
823 
824     q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
825                                             const q15_t * pInBuffer,
826                                             const uint16_t ch_im_out,
827                                             const uint16_t numCol_A,
828                                             const uint16_t bias_shift,
829                                             const uint16_t out_shift,
830                                             const q7_t * bias,
831                                             q7_t * pOut);
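  /*
   * Usage sketch (illustrative): inside a convolution loop, once the im2col buffer
   * holds two columns (i.e., two patches of ch_im_in*dim_kernel*dim_kernel q15_t
   * values each), one kernel call produces the outputs for both patches across all
   * ch_im_out channels and returns the advanced output pointer. Variable names are
   * hypothetical; numCol_A is assumed to equal ch_im_in*dim_kernel*dim_kernel.
   *
   *   pOut = arm_nn_mat_mult_kernel_q7_q15(weights, col_buffer, ch_im_out,
   *                                        ch_im_in * dim_kernel * dim_kernel,
   *                                        bias_shift, out_shift, bias, pOut);
   */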
832 
833   /**
834    * @brief Matrix-multiplication function for convolution with reordered columns
835    * @param[in]       pA          pointer to operand A
836    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
837    * @param[in]       ch_im_out   numRow of A
838    * @param[in]       numCol_A    numCol of A
839    * @param[in]       bias_shift  amount of left-shift for bias
840    * @param[in]       out_shift   amount of right-shift for output
841    * @param[in]       bias        the bias
842    * @param[in,out]   pOut        pointer to output
843    * @return     The function returns the incremented output pointer
844    */
845 
846     q7_t     *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
847                                                       const q15_t * pInBuffer,
848                                                       const uint16_t ch_im_out,
849                                                       const uint16_t numCol_A,
850                                                       const uint16_t bias_shift,
851                                                       const uint16_t out_shift,
852                                                       const q7_t * bias,
853                                                       q7_t * pOut);
854 
855 #ifdef __cplusplus
856 }
857 #endif
858 
859 /*
860  *  Other functions
861  *  These layers are typically not timing-critical.
862  *  A basic implementation is provided here.
863  */
864 
865 #ifdef __cplusplus
866 extern    "C"
867 {
868 #endif
869 
870 /**
871  * @defgroup Acti Neural Network Activation Functions
872  *
873  * Perform activation layers, including ReLU (Rectified Linear Unit),
874  * sigmoid and tanh
875  *
876  */
877 
878   /**
879    * @brief Q7 RELU function
880    * @param[in,out]   data        pointer to input
881    * @param[in]       size        number of elements
882    * @return none.
883    */
884 
885     void      arm_relu_q7(q7_t * data, uint16_t size);
886 
887   /**
888    * @brief Q15 RELU function
889    * @param[in,out]   data        pointer to input
890    * @param[in]       size        number of elements
891    * @return none.
892    */
893 
894     void      arm_relu_q15(q15_t * data, uint16_t size);
895 
896   /**
897    * @brief Q7 neural network activation function using direct table look-up
898    * @param[in,out]   data        pointer to input
899    * @param[in]       size        number of elements
900    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
901    * @param[in]       type        type of activation functions
902    * @return none.
903    */
904 
905     void      arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
906                                            arm_nn_activation_type type);
907 
908   /**
909    * @brief Q15 neural network activation function using direct table look-up
910    * @param[in,out]   data        pointer to input
911    * @param[in]       size        number of elements
912    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
913    * @param[in]       type        type of activation functions
914    * @return none.
915    */
916 
917     void      arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
918                                             arm_nn_activation_type type);
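  /*
   * Usage sketch (illustrative): applying ReLU in place on a 1024-element Q7 buffer,
   * then a sigmoid via the direct table look-up with int_width = 2 (within the
   * documented range). The buffer name is hypothetical and ARM_SIGMOID is assumed to
   * be one of the arm_nn_activation_type values from arm_nnsupportfunctions.h.
   *
   *   arm_relu_q7(act_buffer, 1024);
   *   arm_nn_activations_direct_q7(act_buffer, 1024, 2, ARM_SIGMOID);
   */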
919 
920 /**
921  * @defgroup Pooling Neural Network Pooling Functions
922  *
923  * Perform pooling functions, including max pooling and average pooling
924  *
925  */
926 
927   /**
928    * @brief Q7 max pooling function
929    * @param[in]       Im_in       pointer to input tensor
930    * @param[in]       dim_im_in   input tensor dimension
931    * @param[in]       ch_im_in    number of input tensor channels
932    * @param[in]       dim_kernel  filter kernel size
933    * @param[in]       padding     padding sizes
934    * @param[in]       stride      pooling stride
935    * @param[in]       dim_im_out  output tensor dimension
936    * @param[in,out]   bufferA     pointer to buffer space for input
937    * @param[in,out]   Im_out      pointer to output tensor
938    * @return none.
939    *
940    */
941 
942     void      arm_maxpool_q7_HWC(q7_t * Im_in,
943                                  const uint16_t dim_im_in,
944                                  const uint16_t ch_im_in,
945                                  const uint16_t dim_kernel,
946                                  const uint16_t padding,
947                                  const uint16_t stride,
948                                  const uint16_t dim_im_out,
949                                  q7_t * bufferA,
950                                  q7_t * Im_out);
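  /*
   * Usage sketch (illustrative): 2x2 max pooling with stride 2 on a 32x32x32 Q7
   * tensor, giving a 16x16x32 output. All dimensions and array names are made-up
   * examples; bufferA is passed as NULL on the assumption, as in the library's
   * CIFAR-10 example, that it is not used by this build of the kernel; verify
   * against the implementation before relying on it.
   *
   *   arm_maxpool_q7_HWC(conv_out, 32, 32, 2, 0, 2, 16, NULL, pool_out);
   */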
951 
952   /**
953    * @brief Q7 average pooling function
954    * @param[in]       Im_in       pointer to input tensor
955    * @param[in]       dim_im_in   input tensor dimension
956    * @param[in]       ch_im_in    number of input tensor channels
957    * @param[in]       dim_kernel  filter kernel size
958    * @param[in]       padding     padding sizes
959    * @param[in]       stride      pooling stride
960    * @param[in]       dim_im_out  output tensor dimension
961    * @param[in,out]   bufferA     pointer to buffer space for input
962    * @param[in,out]   Im_out      pointer to output tensor
963    * @return none.
964    *
965    */
966 
967     void      arm_avepool_q7_HWC(q7_t * Im_in,
968                                  const uint16_t dim_im_in,
969                                  const uint16_t ch_im_in,
970                                  const uint16_t dim_kernel,
971                                  const uint16_t padding,
972                                  const uint16_t stride,
973                                  const uint16_t dim_im_out,
974                                  q7_t * bufferA,
975                                  q7_t * Im_out);
976 
977 /**
978  * @defgroup Softmax Softmax Functions
979  *
980  * EXP(2) based softmax function
981  *
982  */
983 
984   /**
985    * @brief Q7 softmax function
986    * @param[in]       vec_in      pointer to input vector
987    * @param[in]       dim_vec     input vector dimension
988    * @param[out]      p_out       pointer to output vector
989    * @return none.
990    *
991    */
992 
993     void      arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
994 
995   /**
996    * @brief Q15 softmax function
997    * @param[in]       vec_in      pointer to input vector
998    * @param[in]       dim_vec     input vector dimension
999    * @param[out]      p_out       pointer to output vector
1000    * @return none.
1001    *
1002    */
1003 
1004     void      arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
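  /*
   * Usage sketch (illustrative): normalizing the 10 Q7 outputs of a final
   * fully-connected layer. The array names and the vector length are made-up
   * example values.
   *
   *   arm_softmax_q7(fc_out, 10, softmax_out);
   */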
1005 
1006 #ifdef __cplusplus
1007 }
1008 #endif
1009 
1010 #endif
1011