xref: /btstack/port/stm32-f4discovery-usb/Drivers/CMSIS/NN/Include/arm_nnfunctions.h (revision a8f7f3fcbcd51f8d2e92aca076b6a9f812db358c)
1*a8f7f3fcSMatthias Ringwald /*
2*a8f7f3fcSMatthias Ringwald  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3*a8f7f3fcSMatthias Ringwald  *
4*a8f7f3fcSMatthias Ringwald  * SPDX-License-Identifier: Apache-2.0
5*a8f7f3fcSMatthias Ringwald  *
6*a8f7f3fcSMatthias Ringwald  * Licensed under the Apache License, Version 2.0 (the License); you may
7*a8f7f3fcSMatthias Ringwald  * not use this file except in compliance with the License.
8*a8f7f3fcSMatthias Ringwald  * You may obtain a copy of the License at
9*a8f7f3fcSMatthias Ringwald  *
10*a8f7f3fcSMatthias Ringwald  * www.apache.org/licenses/LICENSE-2.0
11*a8f7f3fcSMatthias Ringwald  *
12*a8f7f3fcSMatthias Ringwald  * Unless required by applicable law or agreed to in writing, software
13*a8f7f3fcSMatthias Ringwald  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14*a8f7f3fcSMatthias Ringwald  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15*a8f7f3fcSMatthias Ringwald  * See the License for the specific language governing permissions and
16*a8f7f3fcSMatthias Ringwald  * limitations under the License.
17*a8f7f3fcSMatthias Ringwald  */
18*a8f7f3fcSMatthias Ringwald 
19*a8f7f3fcSMatthias Ringwald /* ----------------------------------------------------------------------
20*a8f7f3fcSMatthias Ringwald  * Project:      CMSIS NN Library
21*a8f7f3fcSMatthias Ringwald  * Title:        arm_nnfunctions.h
22*a8f7f3fcSMatthias Ringwald  * Description:  Public header file for CMSIS NN Library
23*a8f7f3fcSMatthias Ringwald  *
24*a8f7f3fcSMatthias Ringwald  * $Date:        13. July 2018
25*a8f7f3fcSMatthias Ringwald  * $Revision:    V.1.0.0
26*a8f7f3fcSMatthias Ringwald  *
27*a8f7f3fcSMatthias Ringwald  * Target Processor:  Cortex-M cores
28*a8f7f3fcSMatthias Ringwald  * -------------------------------------------------------------------- */
29*a8f7f3fcSMatthias Ringwald 
30*a8f7f3fcSMatthias Ringwald /**
31*a8f7f3fcSMatthias Ringwald    \mainpage CMSIS NN Software Library
32*a8f7f3fcSMatthias Ringwald    *
33*a8f7f3fcSMatthias Ringwald    * Introduction
34*a8f7f3fcSMatthias Ringwald    * ------------
35*a8f7f3fcSMatthias Ringwald    *
36*a8f7f3fcSMatthias Ringwald    * This user manual describes the CMSIS NN software library,
37*a8f7f3fcSMatthias Ringwald    * a collection of efficient neural network kernels developed to maximize the
38*a8f7f3fcSMatthias Ringwald    * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39*a8f7f3fcSMatthias Ringwald    *
40*a8f7f3fcSMatthias Ringwald    * The library is divided into a number of functions each covering a specific category:
41*a8f7f3fcSMatthias Ringwald    * - Neural Network Convolution Functions
42*a8f7f3fcSMatthias Ringwald    * - Neural Network Activation Functions
43*a8f7f3fcSMatthias Ringwald    * - Fully-connected Layer Functions
44*a8f7f3fcSMatthias Ringwald    * - Neural Network Pooling Functions
45*a8f7f3fcSMatthias Ringwald    * - Softmax Functions
46*a8f7f3fcSMatthias Ringwald    * - Neural Network Support Functions
47*a8f7f3fcSMatthias Ringwald    *
48*a8f7f3fcSMatthias Ringwald    * The library has separate functions for operating on different weight and activation data
49*a8f7f3fcSMatthias Ringwald    * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
50*a8f7f3fcSMatthias Ringwald    * kernels is included in the function description. The implementation details are also
51*a8f7f3fcSMatthias Ringwald    * described in this paper [1].
52*a8f7f3fcSMatthias Ringwald    *
53*a8f7f3fcSMatthias Ringwald    * Block Diagram
54*a8f7f3fcSMatthias Ringwald    * --------
55*a8f7f3fcSMatthias Ringwald    * \image html CMSIS-NN-OVERVIEW.PNG
56*a8f7f3fcSMatthias Ringwald    *
57*a8f7f3fcSMatthias Ringwald    * Examples
58*a8f7f3fcSMatthias Ringwald    * --------
59*a8f7f3fcSMatthias Ringwald    *
60*a8f7f3fcSMatthias Ringwald    * The library ships with a number of examples which demonstrate how to use the library functions.
61*a8f7f3fcSMatthias Ringwald    *
62*a8f7f3fcSMatthias Ringwald    * Pre-processor Macros
63*a8f7f3fcSMatthias Ringwald    * ------------
64*a8f7f3fcSMatthias Ringwald    *
65*a8f7f3fcSMatthias Ringwald    * Each library project has different pre-processor macros.
66*a8f7f3fcSMatthias Ringwald    *
67*a8f7f3fcSMatthias Ringwald    * - ARM_MATH_DSP:
68*a8f7f3fcSMatthias Ringwald    *
69*a8f7f3fcSMatthias Ringwald    * Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
70*a8f7f3fcSMatthias Ringwald    *
71*a8f7f3fcSMatthias Ringwald    * - ARM_MATH_BIG_ENDIAN:
72*a8f7f3fcSMatthias Ringwald    *
73*a8f7f3fcSMatthias Ringwald    * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default, the library builds for little endian targets.
74*a8f7f3fcSMatthias Ringwald    *
75*a8f7f3fcSMatthias Ringwald    * - ARM_NN_TRUNCATE:
76*a8f7f3fcSMatthias Ringwald    *
77*a8f7f3fcSMatthias Ringwald    * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
78*a8f7f3fcSMatthias Ringwald    *
79*a8f7f3fcSMatthias Ringwald    * Copyright Notice
80*a8f7f3fcSMatthias Ringwald    * ------------
81*a8f7f3fcSMatthias Ringwald    *
82*a8f7f3fcSMatthias Ringwald    * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
83*a8f7f3fcSMatthias Ringwald    *
84*a8f7f3fcSMatthias Ringwald    * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
85*a8f7f3fcSMatthias Ringwald    */
86*a8f7f3fcSMatthias Ringwald 
87*a8f7f3fcSMatthias Ringwald /**
88*a8f7f3fcSMatthias Ringwald  * @defgroup groupNN Neural Network Functions
89*a8f7f3fcSMatthias Ringwald  * These functions perform basic operations for neural network layers.
90*a8f7f3fcSMatthias Ringwald  */
91*a8f7f3fcSMatthias Ringwald 
92*a8f7f3fcSMatthias Ringwald #ifndef _ARM_NNFUNCTIONS_H
93*a8f7f3fcSMatthias Ringwald #define _ARM_NNFUNCTIONS_H
94*a8f7f3fcSMatthias Ringwald 
95*a8f7f3fcSMatthias Ringwald #include "arm_nnsupportfunctions.h"
96*a8f7f3fcSMatthias Ringwald #include "arm_nn_tables.h"
97*a8f7f3fcSMatthias Ringwald 
98*a8f7f3fcSMatthias Ringwald #define USE_INTRINSIC
99*a8f7f3fcSMatthias Ringwald 
100*a8f7f3fcSMatthias Ringwald //#define ARM_NN_TRUNCATE /* Selects the rounding mode: define to use floor instead of round-to-the-nearest-int */
101*a8f7f3fcSMatthias Ringwald 
102*a8f7f3fcSMatthias Ringwald #ifdef __cplusplus
103*a8f7f3fcSMatthias Ringwald extern    "C"
104*a8f7f3fcSMatthias Ringwald {
105*a8f7f3fcSMatthias Ringwald #endif
106*a8f7f3fcSMatthias Ringwald 
107*a8f7f3fcSMatthias Ringwald /**
108*a8f7f3fcSMatthias Ringwald  * @defgroup NNConv Neural Network Convolution Functions
109*a8f7f3fcSMatthias Ringwald  *
110*a8f7f3fcSMatthias Ringwald  * Perform convolution layer
111*a8f7f3fcSMatthias Ringwald  *
112*a8f7f3fcSMatthias Ringwald  * The convolution is implemented in 2 steps: im2col and GEMM
113*a8f7f3fcSMatthias Ringwald  *
114*a8f7f3fcSMatthias Ringwald  * im2col is a process of converting each patch of image data into
115*a8f7f3fcSMatthias Ringwald  * a column. After im2col, the convolution is computed as matrix-matrix
116*a8f7f3fcSMatthias Ringwald  * multiplication.
117*a8f7f3fcSMatthias Ringwald  *
118*a8f7f3fcSMatthias Ringwald  * To reduce the memory footprint, the im2col is performed partially.
119*a8f7f3fcSMatthias Ringwald  * In each iteration, only a few columns (i.e., patches) are generated and
120*a8f7f3fcSMatthias Ringwald  * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
121*a8f7f3fcSMatthias Ringwald  *
122*a8f7f3fcSMatthias Ringwald  */
123*a8f7f3fcSMatthias Ringwald 
124*a8f7f3fcSMatthias Ringwald   /**
125*a8f7f3fcSMatthias Ringwald    * @brief Basic Q7 convolution function
126*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
127*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
128*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
129*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
130*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
131*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
132*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
133*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
134*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
135*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
136*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
137*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
138*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
139*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
140*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
141*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
142*a8f7f3fcSMatthias Ringwald    *
143*a8f7f3fcSMatthias Ringwald    */
144*a8f7f3fcSMatthias Ringwald 
145*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
146*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_im_in,
147*a8f7f3fcSMatthias Ringwald                                          const uint16_t ch_im_in,
148*a8f7f3fcSMatthias Ringwald                                          const q7_t * wt,
149*a8f7f3fcSMatthias Ringwald                                          const uint16_t ch_im_out,
150*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_kernel,
151*a8f7f3fcSMatthias Ringwald                                          const uint16_t padding,
152*a8f7f3fcSMatthias Ringwald                                          const uint16_t stride,
153*a8f7f3fcSMatthias Ringwald                                          const q7_t * bias,
154*a8f7f3fcSMatthias Ringwald                                          const uint16_t bias_shift,
155*a8f7f3fcSMatthias Ringwald                                          const uint16_t out_shift,
156*a8f7f3fcSMatthias Ringwald                                          q7_t * Im_out,
157*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_im_out,
158*a8f7f3fcSMatthias Ringwald                                          q15_t * bufferA,
159*a8f7f3fcSMatthias Ringwald                                          q7_t * bufferB);
160*a8f7f3fcSMatthias Ringwald 
161*a8f7f3fcSMatthias Ringwald   /**
162*a8f7f3fcSMatthias Ringwald    * @brief Basic Q7 convolution function (non-square shape)
163*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in        pointer to input tensor
164*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_x  input tensor dimension x
165*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_y  input tensor dimension y
166*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in     number of input tensor channels
167*a8f7f3fcSMatthias Ringwald    * @param[in]       wt           pointer to kernel weights
168*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
169*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_x filter kernel size x
170*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_y filter kernel size y
171*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_x    padding size x
172*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_y    padding size y
173*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_x     convolution stride x
174*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_y     convolution stride y
175*a8f7f3fcSMatthias Ringwald    * @param[in]       bias         pointer to bias
176*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift   amount of left-shift for bias
177*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift    amount of right-shift for output
178*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out       pointer to output tensor
179*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_x output tensor dimension x
180*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_y output tensor dimension y
181*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA      pointer to buffer space for input
182*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB      pointer to buffer space for output
183*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
184*a8f7f3fcSMatthias Ringwald    */
185*a8f7f3fcSMatthias Ringwald 
186*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
187*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_in_x,
188*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_in_y,
189*a8f7f3fcSMatthias Ringwald                                                   const uint16_t ch_im_in,
190*a8f7f3fcSMatthias Ringwald                                                   const q7_t * wt,
191*a8f7f3fcSMatthias Ringwald                                                   const uint16_t ch_im_out,
192*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_kernel_x,
193*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_kernel_y,
194*a8f7f3fcSMatthias Ringwald                                                   const uint16_t padding_x,
195*a8f7f3fcSMatthias Ringwald                                                   const uint16_t padding_y,
196*a8f7f3fcSMatthias Ringwald                                                   const uint16_t stride_x,
197*a8f7f3fcSMatthias Ringwald                                                   const uint16_t stride_y,
198*a8f7f3fcSMatthias Ringwald                                                   const q7_t * bias,
199*a8f7f3fcSMatthias Ringwald                                                   const uint16_t bias_shift,
200*a8f7f3fcSMatthias Ringwald                                                   const uint16_t out_shift,
201*a8f7f3fcSMatthias Ringwald                                                   q7_t * Im_out,
202*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_out_x,
203*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_out_y,
204*a8f7f3fcSMatthias Ringwald                                                   q15_t * bufferA,
205*a8f7f3fcSMatthias Ringwald                                                   q7_t * bufferB);
206*a8f7f3fcSMatthias Ringwald 
207*a8f7f3fcSMatthias Ringwald   /**
208*a8f7f3fcSMatthias Ringwald    * @brief Basic Q15 convolution function
209*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
210*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
211*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
212*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
213*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
214*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
215*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
216*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
217*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
218*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
219*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
220*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
221*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
222*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
223*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
224*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
225*a8f7f3fcSMatthias Ringwald    *
226*a8f7f3fcSMatthias Ringwald    */
227*a8f7f3fcSMatthias Ringwald 
228*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
229*a8f7f3fcSMatthias Ringwald                                           const uint16_t dim_im_in,
230*a8f7f3fcSMatthias Ringwald                                           const uint16_t ch_im_in,
231*a8f7f3fcSMatthias Ringwald                                           const q15_t * wt,
232*a8f7f3fcSMatthias Ringwald                                           const uint16_t ch_im_out,
233*a8f7f3fcSMatthias Ringwald                                           const uint16_t dim_kernel,
234*a8f7f3fcSMatthias Ringwald                                           const uint16_t padding,
235*a8f7f3fcSMatthias Ringwald                                           const uint16_t stride,
236*a8f7f3fcSMatthias Ringwald                                           const q15_t * bias,
237*a8f7f3fcSMatthias Ringwald                                           const uint16_t bias_shift,
238*a8f7f3fcSMatthias Ringwald                                           const uint16_t out_shift,
239*a8f7f3fcSMatthias Ringwald                                           q15_t * Im_out,
240*a8f7f3fcSMatthias Ringwald                                           const uint16_t dim_im_out,
241*a8f7f3fcSMatthias Ringwald                                           q15_t * bufferA,
242*a8f7f3fcSMatthias Ringwald                                           q7_t * bufferB);
243*a8f7f3fcSMatthias Ringwald 
244*a8f7f3fcSMatthias Ringwald   /**
245*a8f7f3fcSMatthias Ringwald    * @brief Fast Q7 convolution function
246*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
247*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
248*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
249*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
250*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
251*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
252*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
253*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
254*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
255*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
256*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
257*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
258*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
259*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
260*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
261*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
262*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
263*a8f7f3fcSMatthias Ringwald    *
264*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
265*a8f7f3fcSMatthias Ringwald    * some constraints:
266*a8f7f3fcSMatthias Ringwald    *   ch_im_in is a multiple of 4
267*a8f7f3fcSMatthias Ringwald    *   ch_im_out is a multiple of 2
268*a8f7f3fcSMatthias Ringwald    */
269*a8f7f3fcSMatthias Ringwald 
270*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
271*a8f7f3fcSMatthias Ringwald                                         const uint16_t dim_im_in,
272*a8f7f3fcSMatthias Ringwald                                         const uint16_t ch_im_in,
273*a8f7f3fcSMatthias Ringwald                                         const q7_t * wt,
274*a8f7f3fcSMatthias Ringwald                                         const uint16_t ch_im_out,
275*a8f7f3fcSMatthias Ringwald                                         const uint16_t dim_kernel,
276*a8f7f3fcSMatthias Ringwald                                         const uint16_t padding,
277*a8f7f3fcSMatthias Ringwald                                         const uint16_t stride,
278*a8f7f3fcSMatthias Ringwald                                         const q7_t * bias,
279*a8f7f3fcSMatthias Ringwald                                         const uint16_t bias_shift,
280*a8f7f3fcSMatthias Ringwald                                         const uint16_t out_shift,
281*a8f7f3fcSMatthias Ringwald                                         q7_t * Im_out,
282*a8f7f3fcSMatthias Ringwald                                         const uint16_t dim_im_out,
283*a8f7f3fcSMatthias Ringwald                                         q15_t * bufferA,
284*a8f7f3fcSMatthias Ringwald                                         q7_t * bufferB);
285*a8f7f3fcSMatthias Ringwald 
286*a8f7f3fcSMatthias Ringwald   /**
287*a8f7f3fcSMatthias Ringwald    * @brief Fast Q7 convolution function (non-square shape)
288*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in        pointer to input tensor
289*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_x  input tensor dimension x
290*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_y  input tensor dimension y
291*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in     number of input tensor channels
292*a8f7f3fcSMatthias Ringwald    * @param[in]       wt           pointer to kernel weights
293*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
294*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_x filter kernel size x
295*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_y filter kernel size y
296*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_x    padding size x
297*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_y    padding size y
298*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_x     convolution stride x
299*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_y     convolution stride y
300*a8f7f3fcSMatthias Ringwald    * @param[in]       bias         pointer to bias
301*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift   amount of left-shift for bias
302*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift    amount of right-shift for output
303*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out       pointer to output tensor
304*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_x output tensor dimension x
305*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_y output tensor dimension y
306*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA      pointer to buffer space for input
307*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB      pointer to buffer space for output
308*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
309*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
310*a8f7f3fcSMatthias Ringwald    *
311*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
312*a8f7f3fcSMatthias Ringwald    * some constraints:
313*a8f7f3fcSMatthias Ringwald    *   ch_im_in is a multiple of 4
314*a8f7f3fcSMatthias Ringwald    *   ch_im_out is a multiple of 2
315*a8f7f3fcSMatthias Ringwald    */
316*a8f7f3fcSMatthias Ringwald 
317*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
318*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_in_x,
319*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_in_y,
320*a8f7f3fcSMatthias Ringwald                                                   const uint16_t ch_im_in,
321*a8f7f3fcSMatthias Ringwald                                                   const q7_t * wt,
322*a8f7f3fcSMatthias Ringwald                                                   const uint16_t ch_im_out,
323*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_kernel_x,
324*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_kernel_y,
325*a8f7f3fcSMatthias Ringwald                                                   const uint16_t padding_x,
326*a8f7f3fcSMatthias Ringwald                                                   const uint16_t padding_y,
327*a8f7f3fcSMatthias Ringwald                                                   const uint16_t stride_x,
328*a8f7f3fcSMatthias Ringwald                                                   const uint16_t stride_y,
329*a8f7f3fcSMatthias Ringwald                                                   const q7_t * bias,
330*a8f7f3fcSMatthias Ringwald                                                   const uint16_t bias_shift,
331*a8f7f3fcSMatthias Ringwald                                                   const uint16_t out_shift,
332*a8f7f3fcSMatthias Ringwald                                                   q7_t * Im_out,
333*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_out_x,
334*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_im_out_y,
335*a8f7f3fcSMatthias Ringwald                                                   q15_t * bufferA,
336*a8f7f3fcSMatthias Ringwald                                                   q7_t * bufferB);
337*a8f7f3fcSMatthias Ringwald 
338*a8f7f3fcSMatthias Ringwald   /**
339*a8f7f3fcSMatthias Ringwald    * @brief Fast Q7 version of 1x1 convolution (non-square shape)
340*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in        pointer to input tensor
341*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_x  input tensor dimension x
342*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_y  input tensor dimension y
343*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in     number of input tensor channels
344*a8f7f3fcSMatthias Ringwald    * @param[in]       wt           pointer to kernel weights
345*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
346*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_x filter kernel size x
347*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_y filter kernel size y
348*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_x    padding size x
349*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_y    padding size y
350*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_x     convolution stride x
351*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_y     convolution stride y
352*a8f7f3fcSMatthias Ringwald    * @param[in]       bias         pointer to bias
353*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift   amount of left-shift for bias
354*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift    amount of right-shift for output
355*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out       pointer to output tensor
356*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_x output tensor dimension x
357*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_y output tensor dimension y
358*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA      pointer to buffer space for input
359*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB      pointer to buffer space for output
360*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
361*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
362*a8f7f3fcSMatthias Ringwald    *
363*a8f7f3fcSMatthias Ringwald    * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
364*a8f7f3fcSMatthias Ringwald    * and dim_kernel_y=1). It can be used for the
365*a8f7f3fcSMatthias Ringwald    * second half of MobileNets after depthwise separable convolution.
366*a8f7f3fcSMatthias Ringwald    *
367*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
368*a8f7f3fcSMatthias Ringwald    * some constraints:
369*a8f7f3fcSMatthias Ringwald    *   ch_im_in is a multiple of 4
370*a8f7f3fcSMatthias Ringwald    *   ch_im_out is a multiple of 2
371*a8f7f3fcSMatthias Ringwald    */
372*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
373*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_im_in_x,
374*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_im_in_y,
375*a8f7f3fcSMatthias Ringwald                                                       const uint16_t ch_im_in,
376*a8f7f3fcSMatthias Ringwald                                                       const q7_t * wt,
377*a8f7f3fcSMatthias Ringwald                                                       const uint16_t ch_im_out,
378*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_kernel_x,
379*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_kernel_y,
380*a8f7f3fcSMatthias Ringwald                                                       const uint16_t padding_x,
381*a8f7f3fcSMatthias Ringwald                                                       const uint16_t padding_y,
382*a8f7f3fcSMatthias Ringwald                                                       const uint16_t stride_x,
383*a8f7f3fcSMatthias Ringwald                                                       const uint16_t stride_y,
384*a8f7f3fcSMatthias Ringwald                                                       const q7_t * bias,
385*a8f7f3fcSMatthias Ringwald                                                       const uint16_t bias_shift,
386*a8f7f3fcSMatthias Ringwald                                                       const uint16_t out_shift,
387*a8f7f3fcSMatthias Ringwald                                                       q7_t * Im_out,
388*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_im_out_x,
389*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_im_out_y,
390*a8f7f3fcSMatthias Ringwald                                                       q15_t * bufferA,
391*a8f7f3fcSMatthias Ringwald                                                       q7_t * bufferB);
392*a8f7f3fcSMatthias Ringwald 
393*a8f7f3fcSMatthias Ringwald   /**
394*a8f7f3fcSMatthias Ringwald    * @brief Q7 version of convolution for RGB image
395*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
396*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
397*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
398*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
399*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
400*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
401*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
402*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
403*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
404*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
405*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
406*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
407*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
408*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
409*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
410*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
411*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
412*a8f7f3fcSMatthias Ringwald    *
413*a8f7f3fcSMatthias Ringwald    * This kernel is written exclusively for convolution with ch_im_in
414*a8f7f3fcSMatthias Ringwald    * equals 3. This applies on the first layer of CNNs which has input
415*a8f7f3fcSMatthias Ringwald    * image with RGB format.
416*a8f7f3fcSMatthias Ringwald    */
417*a8f7f3fcSMatthias Ringwald 
418*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
419*a8f7f3fcSMatthias Ringwald                                        const uint16_t dim_im_in,
420*a8f7f3fcSMatthias Ringwald                                        const uint16_t ch_im_in,
421*a8f7f3fcSMatthias Ringwald                                        const q7_t * wt,
422*a8f7f3fcSMatthias Ringwald                                        const uint16_t ch_im_out,
423*a8f7f3fcSMatthias Ringwald                                        const uint16_t dim_kernel,
424*a8f7f3fcSMatthias Ringwald                                        const uint16_t padding,
425*a8f7f3fcSMatthias Ringwald                                        const uint16_t stride,
426*a8f7f3fcSMatthias Ringwald                                        const q7_t * bias,
427*a8f7f3fcSMatthias Ringwald                                        const uint16_t bias_shift,
428*a8f7f3fcSMatthias Ringwald                                        const uint16_t out_shift,
429*a8f7f3fcSMatthias Ringwald                                        q7_t * Im_out,
430*a8f7f3fcSMatthias Ringwald                                        const uint16_t dim_im_out,
431*a8f7f3fcSMatthias Ringwald                                        q15_t * bufferA,
432*a8f7f3fcSMatthias Ringwald                                        q7_t * bufferB);
433*a8f7f3fcSMatthias Ringwald 
434*a8f7f3fcSMatthias Ringwald   /**
435*a8f7f3fcSMatthias Ringwald    * @brief Fast Q15 convolution function
436*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
437*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
438*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
439*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
440*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
441*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
442*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
443*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
444*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
445*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
446*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
447*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
448*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
449*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
450*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
451*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
452*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
453*a8f7f3fcSMatthias Ringwald    *
454*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
455*a8f7f3fcSMatthias Ringwald    * some constraints:
456*a8f7f3fcSMatthias Ringwald    *   ch_im_in is multiple of 2
457*a8f7f3fcSMatthias Ringwald    *   ch_im_out is multiple of 2
458*a8f7f3fcSMatthias Ringwald    */
459*a8f7f3fcSMatthias Ringwald 
460*a8f7f3fcSMatthias Ringwald     arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
461*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_im_in,
462*a8f7f3fcSMatthias Ringwald                                          const uint16_t ch_im_in,
463*a8f7f3fcSMatthias Ringwald                                          const q15_t * wt,
464*a8f7f3fcSMatthias Ringwald                                          const uint16_t ch_im_out,
465*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_kernel,
466*a8f7f3fcSMatthias Ringwald                                          const uint16_t padding,
467*a8f7f3fcSMatthias Ringwald                                          const uint16_t stride,
468*a8f7f3fcSMatthias Ringwald                                          const q15_t * bias,
469*a8f7f3fcSMatthias Ringwald                                          const uint16_t bias_shift,
470*a8f7f3fcSMatthias Ringwald                                          const uint16_t out_shift,
471*a8f7f3fcSMatthias Ringwald                                          q15_t * Im_out,
472*a8f7f3fcSMatthias Ringwald                                          const uint16_t dim_im_out,
473*a8f7f3fcSMatthias Ringwald                                          q15_t * bufferA,
474*a8f7f3fcSMatthias Ringwald                                          q7_t * bufferB);
475*a8f7f3fcSMatthias Ringwald 
476*a8f7f3fcSMatthias Ringwald   /**
477*a8f7f3fcSMatthias Ringwald    * @brief Fast Q15 convolution function (non-square shape)
478*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in        pointer to input tensor
479*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_x  input tensor dimension x
480*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_y  input tensor dimension y
481*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in     number of input tensor channels
482*a8f7f3fcSMatthias Ringwald    * @param[in]       wt           pointer to kernel weights
483*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
484*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_x filter kernel size x
485*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_y filter kernel size y
486*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_x    padding size x
487*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_y    padding size y
488*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_x     convolution stride x
489*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_y     convolution stride y
490*a8f7f3fcSMatthias Ringwald    * @param[in]       bias         pointer to bias
491*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift   amount of left-shift for bias
492*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift    amount of right-shift for output
493*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out       pointer to output tensor
494*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_x output tensor dimension x
495*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_y output tensor dimension y
496*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA      pointer to buffer space for input
497*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB      pointer to buffer space for output
498*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
499*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
500*a8f7f3fcSMatthias Ringwald    *
501*a8f7f3fcSMatthias Ringwald    * @details
502*a8f7f3fcSMatthias Ringwald    *
503*a8f7f3fcSMatthias Ringwald    * <b>Buffer size:</b>
504*a8f7f3fcSMatthias Ringwald    *
505*a8f7f3fcSMatthias Ringwald    * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel (NOTE(review): this variant has no dim_kernel parameter; presumably dim_kernel_x*dim_kernel_y is meant - confirm against the implementation)
506*a8f7f3fcSMatthias Ringwald    *
507*a8f7f3fcSMatthias Ringwald    * bufferB size: 0
508*a8f7f3fcSMatthias Ringwald    *
509*a8f7f3fcSMatthias Ringwald    * <b>Input dimension constraints:</b>
510*a8f7f3fcSMatthias Ringwald    *
511*a8f7f3fcSMatthias Ringwald    * ch_im_in is multiple of 2
512*a8f7f3fcSMatthias Ringwald    *
513*a8f7f3fcSMatthias Ringwald    * ch_im_out is multiple of 2
514*a8f7f3fcSMatthias Ringwald    *
515*a8f7f3fcSMatthias Ringwald    */
516*a8f7f3fcSMatthias Ringwald 
517*a8f7f3fcSMatthias Ringwald     arm_status
518*a8f7f3fcSMatthias Ringwald     arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
519*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_im_in_x,
520*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_im_in_y,
521*a8f7f3fcSMatthias Ringwald                               const uint16_t ch_im_in,
522*a8f7f3fcSMatthias Ringwald                               const q15_t * wt,
523*a8f7f3fcSMatthias Ringwald                               const uint16_t ch_im_out,
524*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_kernel_x,
525*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_kernel_y,
526*a8f7f3fcSMatthias Ringwald                               const uint16_t padding_x,
527*a8f7f3fcSMatthias Ringwald                               const uint16_t padding_y,
528*a8f7f3fcSMatthias Ringwald                               const uint16_t stride_x,
529*a8f7f3fcSMatthias Ringwald                               const uint16_t stride_y,
530*a8f7f3fcSMatthias Ringwald                               const q15_t * bias,
531*a8f7f3fcSMatthias Ringwald                               const uint16_t bias_shift,
532*a8f7f3fcSMatthias Ringwald                               const uint16_t out_shift,
533*a8f7f3fcSMatthias Ringwald                               q15_t * Im_out,
534*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_im_out_x,
535*a8f7f3fcSMatthias Ringwald                               const uint16_t dim_im_out_y,
536*a8f7f3fcSMatthias Ringwald                               q15_t * bufferA,
537*a8f7f3fcSMatthias Ringwald                               q7_t * bufferB);
538*a8f7f3fcSMatthias Ringwald 
539*a8f7f3fcSMatthias Ringwald   /**
540*a8f7f3fcSMatthias Ringwald    * @brief Q7 depthwise separable convolution function
541*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
542*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
543*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
544*a8f7f3fcSMatthias Ringwald    * @param[in]       wt          pointer to kernel weights
545*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
546*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
547*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
548*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      convolution stride
549*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias
550*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
551*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
552*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
553*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
554*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
555*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB     pointer to buffer space for output
556*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
557*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
558*a8f7f3fcSMatthias Ringwald    *
559*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
560*a8f7f3fcSMatthias Ringwald    * some constraints:
561*a8f7f3fcSMatthias Ringwald    *   ch_im_in is multiple of 2
562*a8f7f3fcSMatthias Ringwald    *   ch_im_out is multiple of 2
563*a8f7f3fcSMatthias Ringwald    */
564*a8f7f3fcSMatthias Ringwald 
565*a8f7f3fcSMatthias Ringwald     arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
566*a8f7f3fcSMatthias Ringwald                                                    const uint16_t dim_im_in,
567*a8f7f3fcSMatthias Ringwald                                                    const uint16_t ch_im_in,
568*a8f7f3fcSMatthias Ringwald                                                    const q7_t * wt,
569*a8f7f3fcSMatthias Ringwald                                                    const uint16_t ch_im_out,
570*a8f7f3fcSMatthias Ringwald                                                    const uint16_t dim_kernel,
571*a8f7f3fcSMatthias Ringwald                                                    const uint16_t padding,
572*a8f7f3fcSMatthias Ringwald                                                    const uint16_t stride,
573*a8f7f3fcSMatthias Ringwald                                                    const q7_t * bias,
574*a8f7f3fcSMatthias Ringwald                                                    const uint16_t bias_shift,
575*a8f7f3fcSMatthias Ringwald                                                    const uint16_t out_shift,
576*a8f7f3fcSMatthias Ringwald                                                    q7_t * Im_out,
577*a8f7f3fcSMatthias Ringwald                                                    const uint16_t dim_im_out,
578*a8f7f3fcSMatthias Ringwald                                                    q15_t * bufferA,
579*a8f7f3fcSMatthias Ringwald                                                    q7_t * bufferB);
580*a8f7f3fcSMatthias Ringwald 
581*a8f7f3fcSMatthias Ringwald   /**
582*a8f7f3fcSMatthias Ringwald    * @brief Q7 depthwise separable convolution function (non-square shape)
583*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in         pointer to input tensor
584*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_x   input tensor dimension x
585*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in_y   input tensor dimension y
586*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in      number of input tensor channels
587*a8f7f3fcSMatthias Ringwald    * @param[in]       wt            pointer to kernel weights
588*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
589*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_x  filter kernel size x
590*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel_y  filter kernel size y
591*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_x     padding sizes x
592*a8f7f3fcSMatthias Ringwald    * @param[in]       padding_y     padding sizes y
593*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_x      convolution stride x
594*a8f7f3fcSMatthias Ringwald    * @param[in]       stride_y      convolution stride y
595*a8f7f3fcSMatthias Ringwald    * @param[in]       bias          pointer to bias
596*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift    amount of left-shift for bias
597*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift     amount of right-shift for output
598*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out        pointer to output tensor
599*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_x  output tensor dimension x
600*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out_y  output tensor dimension y
601*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA       pointer to buffer space for input
602*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferB       pointer to buffer space for output
603*a8f7f3fcSMatthias Ringwald    * @return     The function returns either
604*a8f7f3fcSMatthias Ringwald    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
605*a8f7f3fcSMatthias Ringwald    *
606*a8f7f3fcSMatthias Ringwald    * This function is the version with the full list of optimization tricks, but with
607*a8f7f3fcSMatthias Ringwald    * some constraints:
608*a8f7f3fcSMatthias Ringwald    *   ch_im_in is multiple of 2
609*a8f7f3fcSMatthias Ringwald    *   ch_im_out is multiple of 2
610*a8f7f3fcSMatthias Ringwald    */
611*a8f7f3fcSMatthias Ringwald     arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
612*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_im_in_x,
613*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_im_in_y,
614*a8f7f3fcSMatthias Ringwald                                                              const uint16_t ch_im_in,
615*a8f7f3fcSMatthias Ringwald                                                              const q7_t * wt,
616*a8f7f3fcSMatthias Ringwald                                                              const uint16_t ch_im_out,
617*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_kernel_x,
618*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_kernel_y,
619*a8f7f3fcSMatthias Ringwald                                                              const uint16_t padding_x,
620*a8f7f3fcSMatthias Ringwald                                                              const uint16_t padding_y,
621*a8f7f3fcSMatthias Ringwald                                                              const uint16_t stride_x,
622*a8f7f3fcSMatthias Ringwald                                                              const uint16_t stride_y,
623*a8f7f3fcSMatthias Ringwald                                                              const q7_t * bias,
624*a8f7f3fcSMatthias Ringwald                                                              const uint16_t bias_shift,
625*a8f7f3fcSMatthias Ringwald                                                              const uint16_t out_shift,
626*a8f7f3fcSMatthias Ringwald                                                              q7_t * Im_out,
627*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_im_out_x,
628*a8f7f3fcSMatthias Ringwald                                                              const uint16_t dim_im_out_y,
629*a8f7f3fcSMatthias Ringwald                                                              q15_t * bufferA,
630*a8f7f3fcSMatthias Ringwald                                                              q7_t * bufferB);
631*a8f7f3fcSMatthias Ringwald 
632*a8f7f3fcSMatthias Ringwald 
633*a8f7f3fcSMatthias Ringwald /**
634*a8f7f3fcSMatthias Ringwald  * @defgroup FC Fully-connected Layer Functions
635*a8f7f3fcSMatthias Ringwald  *
636*a8f7f3fcSMatthias Ringwald  * Perform fully-connected layer
637*a8f7f3fcSMatthias Ringwald  *
638*a8f7f3fcSMatthias Ringwald  * Fully-connected layer is basically a matrix-vector multiplication
639*a8f7f3fcSMatthias Ringwald  * with bias. The matrix is the weights and the input/output vectors
640*a8f7f3fcSMatthias Ringwald  * are the activation values. Supported {weight, activation} precisions
641*a8f7f3fcSMatthias Ringwald  * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
642*a8f7f3fcSMatthias Ringwald  *
643*a8f7f3fcSMatthias Ringwald  * Here we have two types of kernel functions. The basic functions
644*a8f7f3fcSMatthias Ringwald  * implement the layer using a regular GEMV approach. The opt functions
645*a8f7f3fcSMatthias Ringwald  * operate on weights stored in interleaved formats.
646*a8f7f3fcSMatthias Ringwald  *
647*a8f7f3fcSMatthias Ringwald  */
648*a8f7f3fcSMatthias Ringwald 
649*a8f7f3fcSMatthias Ringwald   /**
650*a8f7f3fcSMatthias Ringwald    * @brief Q7 basic fully-connected layer function (regular GEMV approach)
651*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q7_t)
652*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q7_t)
653*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
654*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
655*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
656*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
657*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q7_t)
658*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q7_t)
659*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
660*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
661*a8f7f3fcSMatthias Ringwald    *
662*a8f7f3fcSMatthias Ringwald    */
663*a8f7f3fcSMatthias Ringwald 
664*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_q7(const q7_t * pV,
665*a8f7f3fcSMatthias Ringwald                                       const q7_t * pM,
666*a8f7f3fcSMatthias Ringwald                                       const uint16_t dim_vec,
667*a8f7f3fcSMatthias Ringwald                                       const uint16_t num_of_rows,
668*a8f7f3fcSMatthias Ringwald                                       const uint16_t bias_shift,
669*a8f7f3fcSMatthias Ringwald                                       const uint16_t out_shift,
670*a8f7f3fcSMatthias Ringwald                                       const q7_t * bias,
671*a8f7f3fcSMatthias Ringwald                                       q7_t * pOut,
672*a8f7f3fcSMatthias Ringwald                                       q15_t * vec_buffer);
673*a8f7f3fcSMatthias Ringwald 
674*a8f7f3fcSMatthias Ringwald   /**
675*a8f7f3fcSMatthias Ringwald    * @brief Q7 opt fully-connected layer function (weights in interleaved format)
676*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q7_t)
677*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q7_t)
678*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
679*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
680*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
681*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
682*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q7_t)
683*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q7_t)
684*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
685*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
686*a8f7f3fcSMatthias Ringwald    *
687*a8f7f3fcSMatthias Ringwald    */
688*a8f7f3fcSMatthias Ringwald 
689*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_q7_opt(const q7_t * pV,
690*a8f7f3fcSMatthias Ringwald                                           const q7_t * pM,
691*a8f7f3fcSMatthias Ringwald                                           const uint16_t dim_vec,
692*a8f7f3fcSMatthias Ringwald                                           const uint16_t num_of_rows,
693*a8f7f3fcSMatthias Ringwald                                           const uint16_t bias_shift,
694*a8f7f3fcSMatthias Ringwald                                           const uint16_t out_shift,
695*a8f7f3fcSMatthias Ringwald                                           const q7_t * bias,
696*a8f7f3fcSMatthias Ringwald                                           q7_t * pOut,
697*a8f7f3fcSMatthias Ringwald                                           q15_t * vec_buffer);
698*a8f7f3fcSMatthias Ringwald 
699*a8f7f3fcSMatthias Ringwald   /**
700*a8f7f3fcSMatthias Ringwald    * @brief Q15 basic fully-connected layer function (regular GEMV approach)
701*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q15_t)
702*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q15_t)
703*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
704*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
705*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
706*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
707*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q15_t)
708*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q15_t)
709*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
710*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
711*a8f7f3fcSMatthias Ringwald    *
712*a8f7f3fcSMatthias Ringwald    */
713*a8f7f3fcSMatthias Ringwald 
714*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_q15(const q15_t * pV,
715*a8f7f3fcSMatthias Ringwald                                        const q15_t * pM,
716*a8f7f3fcSMatthias Ringwald                                        const uint16_t dim_vec,
717*a8f7f3fcSMatthias Ringwald                                        const uint16_t num_of_rows,
718*a8f7f3fcSMatthias Ringwald                                        const uint16_t bias_shift,
719*a8f7f3fcSMatthias Ringwald                                        const uint16_t out_shift,
720*a8f7f3fcSMatthias Ringwald                                        const q15_t * bias,
721*a8f7f3fcSMatthias Ringwald                                        q15_t * pOut,
722*a8f7f3fcSMatthias Ringwald                                        q15_t * vec_buffer);
723*a8f7f3fcSMatthias Ringwald 
724*a8f7f3fcSMatthias Ringwald   /**
725*a8f7f3fcSMatthias Ringwald    * @brief Q15 opt fully-connected layer function (weights in interleaved format)
726*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q15_t)
727*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q15_t)
728*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
729*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
730*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
731*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
732*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q15_t)
733*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q15_t)
734*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
735*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
736*a8f7f3fcSMatthias Ringwald    *
737*a8f7f3fcSMatthias Ringwald    */
738*a8f7f3fcSMatthias Ringwald 
739*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_q15_opt(const q15_t * pV,
740*a8f7f3fcSMatthias Ringwald                                            const q15_t * pM,
741*a8f7f3fcSMatthias Ringwald                                            const uint16_t dim_vec,
742*a8f7f3fcSMatthias Ringwald                                            const uint16_t num_of_rows,
743*a8f7f3fcSMatthias Ringwald                                            const uint16_t bias_shift,
744*a8f7f3fcSMatthias Ringwald                                            const uint16_t out_shift,
745*a8f7f3fcSMatthias Ringwald                                            const q15_t * bias,
746*a8f7f3fcSMatthias Ringwald                                            q15_t * pOut,
747*a8f7f3fcSMatthias Ringwald                                            q15_t * vec_buffer);
748*a8f7f3fcSMatthias Ringwald 
749*a8f7f3fcSMatthias Ringwald   /**
750*a8f7f3fcSMatthias Ringwald    * @brief Mixed Q15-Q7 fully-connected layer function (q7_t weights, q15_t vectors)
751*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q15_t)
752*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q7_t)
753*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
754*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
755*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
756*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
757*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q7_t)
758*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q15_t)
759*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
760*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
761*a8f7f3fcSMatthias Ringwald    *
762*a8f7f3fcSMatthias Ringwald    */
763*a8f7f3fcSMatthias Ringwald 
764*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
765*a8f7f3fcSMatthias Ringwald                                                   const q7_t * pM,
766*a8f7f3fcSMatthias Ringwald                                                   const uint16_t dim_vec,
767*a8f7f3fcSMatthias Ringwald                                                   const uint16_t num_of_rows,
768*a8f7f3fcSMatthias Ringwald                                                   const uint16_t bias_shift,
769*a8f7f3fcSMatthias Ringwald                                                   const uint16_t out_shift,
770*a8f7f3fcSMatthias Ringwald                                                   const q7_t * bias,
771*a8f7f3fcSMatthias Ringwald                                                   q15_t * pOut,
772*a8f7f3fcSMatthias Ringwald                                                   q15_t * vec_buffer);
773*a8f7f3fcSMatthias Ringwald 
774*a8f7f3fcSMatthias Ringwald   /**
775*a8f7f3fcSMatthias Ringwald    * @brief Mixed Q15-Q7 opt fully-connected layer function (q7_t weights in interleaved format, q15_t vectors)
776*a8f7f3fcSMatthias Ringwald    * @param[in]       pV          pointer to input vector (q15_t)
777*a8f7f3fcSMatthias Ringwald    * @param[in]       pM          pointer to matrix weights (q7_t)
778*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     length of the vector
779*a8f7f3fcSMatthias Ringwald    * @param[in]       num_of_rows number of rows in weight matrix
780*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
781*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
782*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to bias (q7_t)
783*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output vector (q15_t)
784*a8f7f3fcSMatthias Ringwald    * @param[in,out]   vec_buffer  pointer to buffer space for input (q15_t)
785*a8f7f3fcSMatthias Ringwald    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
786*a8f7f3fcSMatthias Ringwald    *
787*a8f7f3fcSMatthias Ringwald    */
788*a8f7f3fcSMatthias Ringwald 
789*a8f7f3fcSMatthias Ringwald     arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
790*a8f7f3fcSMatthias Ringwald                                                       const q7_t * pM,
791*a8f7f3fcSMatthias Ringwald                                                       const uint16_t dim_vec,
792*a8f7f3fcSMatthias Ringwald                                                       const uint16_t num_of_rows,
793*a8f7f3fcSMatthias Ringwald                                                       const uint16_t bias_shift,
794*a8f7f3fcSMatthias Ringwald                                                       const uint16_t out_shift,
795*a8f7f3fcSMatthias Ringwald                                                       const q7_t * bias,
796*a8f7f3fcSMatthias Ringwald                                                       q15_t * pOut,
797*a8f7f3fcSMatthias Ringwald                                                       q15_t * vec_buffer);
798*a8f7f3fcSMatthias Ringwald 
799*a8f7f3fcSMatthias Ringwald /**
800*a8f7f3fcSMatthias Ringwald  * @brief Matrix-Multiplication Kernels for Convolution
801*a8f7f3fcSMatthias Ringwald  *
802*a8f7f3fcSMatthias Ringwald  * These functions are used within convolution layer functions for
803*a8f7f3fcSMatthias Ringwald  * matrix multiplication.
804*a8f7f3fcSMatthias Ringwald  *
805*a8f7f3fcSMatthias Ringwald  * The implementation is similar to CMSIS-DSP arm_mat_mult functions
806*a8f7f3fcSMatthias Ringwald  * with one Q7 and one Q15 operand. The Q15 operand is the im2col
807*a8f7f3fcSMatthias Ringwald  * output, which always has 2 columns.
808*a8f7f3fcSMatthias Ringwald  *
809*a8f7f3fcSMatthias Ringwald  */
810*a8f7f3fcSMatthias Ringwald 
811*a8f7f3fcSMatthias Ringwald   /**
812*a8f7f3fcSMatthias Ringwald    * @brief Matrix-multiplication function for convolution
813*a8f7f3fcSMatthias Ringwald    * @param[in]       pA          pointer to operand A (q7_t elements)
814*a8f7f3fcSMatthias Ringwald    * @param[in]       pInBuffer   pointer to operand B (q15_t elements), always consists of 2 vectors
815*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of rows of A
816*a8f7f3fcSMatthias Ringwald    * @param[in]       numCol_A    number of columns of A
817*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
818*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
819*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to the bias (q7_t elements)
820*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output (q7_t elements)
821*a8f7f3fcSMatthias Ringwald    * @return     The function returns the incremented output pointer
822*a8f7f3fcSMatthias Ringwald    */
823*a8f7f3fcSMatthias Ringwald 
824*a8f7f3fcSMatthias Ringwald     q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
825*a8f7f3fcSMatthias Ringwald                                             const q15_t * pInBuffer,
826*a8f7f3fcSMatthias Ringwald                                             const uint16_t ch_im_out,
827*a8f7f3fcSMatthias Ringwald                                             const uint16_t numCol_A,
828*a8f7f3fcSMatthias Ringwald                                             const uint16_t bias_shift,
829*a8f7f3fcSMatthias Ringwald                                             const uint16_t out_shift,
830*a8f7f3fcSMatthias Ringwald                                             const q7_t * bias,
831*a8f7f3fcSMatthias Ringwald                                             q7_t * pOut);
832*a8f7f3fcSMatthias Ringwald 
833*a8f7f3fcSMatthias Ringwald   /**
834*a8f7f3fcSMatthias Ringwald    * @brief Matrix-multiplication function for convolution with reordered columns
835*a8f7f3fcSMatthias Ringwald    * @param[in]       pA          pointer to operand A (q7_t elements)
836*a8f7f3fcSMatthias Ringwald    * @param[in]       pInBuffer   pointer to operand B (q15_t elements), always consists of 2 vectors
837*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_out   number of rows of A
838*a8f7f3fcSMatthias Ringwald    * @param[in]       numCol_A    number of columns of A
839*a8f7f3fcSMatthias Ringwald    * @param[in]       bias_shift  amount of left-shift for bias
840*a8f7f3fcSMatthias Ringwald    * @param[in]       out_shift   amount of right-shift for output
841*a8f7f3fcSMatthias Ringwald    * @param[in]       bias        pointer to the bias (q7_t elements)
842*a8f7f3fcSMatthias Ringwald    * @param[in,out]   pOut        pointer to output (q7_t elements)
843*a8f7f3fcSMatthias Ringwald    * @return     The function returns the incremented output pointer
844*a8f7f3fcSMatthias Ringwald    */
845*a8f7f3fcSMatthias Ringwald 
846*a8f7f3fcSMatthias Ringwald     q7_t     *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
847*a8f7f3fcSMatthias Ringwald                                                       const q15_t * pInBuffer,
848*a8f7f3fcSMatthias Ringwald                                                       const uint16_t ch_im_out,
849*a8f7f3fcSMatthias Ringwald                                                       const uint16_t numCol_A,
850*a8f7f3fcSMatthias Ringwald                                                       const uint16_t bias_shift,
851*a8f7f3fcSMatthias Ringwald                                                       const uint16_t out_shift,
852*a8f7f3fcSMatthias Ringwald                                                       const q7_t * bias,
853*a8f7f3fcSMatthias Ringwald                                                       q7_t * pOut);
854*a8f7f3fcSMatthias Ringwald 
855*a8f7f3fcSMatthias Ringwald #ifdef __cplusplus
856*a8f7f3fcSMatthias Ringwald }
857*a8f7f3fcSMatthias Ringwald #endif
858*a8f7f3fcSMatthias Ringwald 
859*a8f7f3fcSMatthias Ringwald /*
860*a8f7f3fcSMatthias Ringwald  *  Other functions
861*a8f7f3fcSMatthias Ringwald  *  These layers are typically not timing-critical,
862*a8f7f3fcSMatthias Ringwald  *  so only basic implementations are provided here.
863*a8f7f3fcSMatthias Ringwald  */
864*a8f7f3fcSMatthias Ringwald 
865*a8f7f3fcSMatthias Ringwald #ifdef __cplusplus
866*a8f7f3fcSMatthias Ringwald extern    "C"
867*a8f7f3fcSMatthias Ringwald {
868*a8f7f3fcSMatthias Ringwald #endif
869*a8f7f3fcSMatthias Ringwald 
870*a8f7f3fcSMatthias Ringwald /**
871*a8f7f3fcSMatthias Ringwald  * @defgroup Acti Neural Network Activation Functions
872*a8f7f3fcSMatthias Ringwald  *
873*a8f7f3fcSMatthias Ringwald  * Perform activation layers, including ReLU (Rectified Linear Unit),
874*a8f7f3fcSMatthias Ringwald  * sigmoid and tanh
875*a8f7f3fcSMatthias Ringwald  *
876*a8f7f3fcSMatthias Ringwald  */
877*a8f7f3fcSMatthias Ringwald 
878*a8f7f3fcSMatthias Ringwald   /**
879*a8f7f3fcSMatthias Ringwald    * @brief Q7 ReLU (Rectified Linear Unit) function
880*a8f7f3fcSMatthias Ringwald    * @param[in,out]   data        pointer to input (q7_t elements); in/out parameter
881*a8f7f3fcSMatthias Ringwald    * @param[in]       size        number of elements
882*a8f7f3fcSMatthias Ringwald    * @return none.
883*a8f7f3fcSMatthias Ringwald    */
884*a8f7f3fcSMatthias Ringwald 
885*a8f7f3fcSMatthias Ringwald     void      arm_relu_q7(q7_t * data, uint16_t size);
886*a8f7f3fcSMatthias Ringwald 
887*a8f7f3fcSMatthias Ringwald   /**
888*a8f7f3fcSMatthias Ringwald    * @brief Q15 ReLU (Rectified Linear Unit) function
889*a8f7f3fcSMatthias Ringwald    * @param[in,out]   data        pointer to input (q15_t elements); in/out parameter
890*a8f7f3fcSMatthias Ringwald    * @param[in]       size        number of elements
891*a8f7f3fcSMatthias Ringwald    * @return none.
892*a8f7f3fcSMatthias Ringwald    */
893*a8f7f3fcSMatthias Ringwald 
894*a8f7f3fcSMatthias Ringwald     void      arm_relu_q15(q15_t * data, uint16_t size);
895*a8f7f3fcSMatthias Ringwald 
896*a8f7f3fcSMatthias Ringwald   /**
897*a8f7f3fcSMatthias Ringwald    * @brief Q7 neural network activation function using direct table look-up
898*a8f7f3fcSMatthias Ringwald    * @param[in,out]   data        pointer to input (q7_t elements); in/out parameter
899*a8f7f3fcSMatthias Ringwald    * @param[in]       size        number of elements
900*a8f7f3fcSMatthias Ringwald    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
901*a8f7f3fcSMatthias Ringwald    * @param[in]       type        type of activation function (an arm_nn_activation_type value)
902*a8f7f3fcSMatthias Ringwald    * @return none.
903*a8f7f3fcSMatthias Ringwald    */
904*a8f7f3fcSMatthias Ringwald 
905*a8f7f3fcSMatthias Ringwald     void      arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
906*a8f7f3fcSMatthias Ringwald                                            arm_nn_activation_type type);
907*a8f7f3fcSMatthias Ringwald 
908*a8f7f3fcSMatthias Ringwald   /**
909*a8f7f3fcSMatthias Ringwald    * @brief Q15 neural network activation function using direct table look-up
910*a8f7f3fcSMatthias Ringwald    * @param[in,out]   data        pointer to input (q15_t elements); in/out parameter
911*a8f7f3fcSMatthias Ringwald    * @param[in]       size        number of elements
912*a8f7f3fcSMatthias Ringwald    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
913*a8f7f3fcSMatthias Ringwald    * @param[in]       type        type of activation function (an arm_nn_activation_type value)
914*a8f7f3fcSMatthias Ringwald    * @return none.
915*a8f7f3fcSMatthias Ringwald    */
916*a8f7f3fcSMatthias Ringwald 
917*a8f7f3fcSMatthias Ringwald     void      arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
918*a8f7f3fcSMatthias Ringwald                                             arm_nn_activation_type type);
919*a8f7f3fcSMatthias Ringwald 
920*a8f7f3fcSMatthias Ringwald /**
921*a8f7f3fcSMatthias Ringwald  * @defgroup Pooling Neural Network Pooling Functions
922*a8f7f3fcSMatthias Ringwald  *
923*a8f7f3fcSMatthias Ringwald  * Perform pooling functions, including max pooling and average pooling
924*a8f7f3fcSMatthias Ringwald  *
925*a8f7f3fcSMatthias Ringwald  */
926*a8f7f3fcSMatthias Ringwald 
927*a8f7f3fcSMatthias Ringwald   /**
928*a8f7f3fcSMatthias Ringwald    * @brief Q7 max pooling function
929*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
930*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
931*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
932*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
933*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
934*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      pooling stride
935*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
936*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
937*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
938*a8f7f3fcSMatthias Ringwald    * @return none.
939*a8f7f3fcSMatthias Ringwald    * NOTE(review): Im_in is tagged [in] but is not const-qualified -- confirm whether the implementation writes to it.
940*a8f7f3fcSMatthias Ringwald    */
941*a8f7f3fcSMatthias Ringwald 
942*a8f7f3fcSMatthias Ringwald     void      arm_maxpool_q7_HWC(q7_t * Im_in,
943*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_im_in,
944*a8f7f3fcSMatthias Ringwald                                  const uint16_t ch_im_in,
945*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_kernel,
946*a8f7f3fcSMatthias Ringwald                                  const uint16_t padding,
947*a8f7f3fcSMatthias Ringwald                                  const uint16_t stride,
948*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_im_out,
949*a8f7f3fcSMatthias Ringwald                                  q7_t * bufferA,
950*a8f7f3fcSMatthias Ringwald                                  q7_t * Im_out);
951*a8f7f3fcSMatthias Ringwald 
952*a8f7f3fcSMatthias Ringwald   /**
953*a8f7f3fcSMatthias Ringwald    * @brief Q7 average pooling function
954*a8f7f3fcSMatthias Ringwald    * @param[in]       Im_in       pointer to input tensor
955*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_in   input tensor dimension
956*a8f7f3fcSMatthias Ringwald    * @param[in]       ch_im_in    number of input tensor channels
957*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_kernel  filter kernel size
958*a8f7f3fcSMatthias Ringwald    * @param[in]       padding     padding sizes
959*a8f7f3fcSMatthias Ringwald    * @param[in]       stride      pooling stride
960*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_im_out  output tensor dimension
961*a8f7f3fcSMatthias Ringwald    * @param[in,out]   bufferA     pointer to buffer space for input
962*a8f7f3fcSMatthias Ringwald    * @param[in,out]   Im_out      pointer to output tensor
963*a8f7f3fcSMatthias Ringwald    * @return none.
964*a8f7f3fcSMatthias Ringwald    * NOTE(review): Im_in is tagged [in] but is not const-qualified -- confirm whether the implementation writes to it.
965*a8f7f3fcSMatthias Ringwald    */
966*a8f7f3fcSMatthias Ringwald 
967*a8f7f3fcSMatthias Ringwald     void      arm_avepool_q7_HWC(q7_t * Im_in,
968*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_im_in,
969*a8f7f3fcSMatthias Ringwald                                  const uint16_t ch_im_in,
970*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_kernel,
971*a8f7f3fcSMatthias Ringwald                                  const uint16_t padding,
972*a8f7f3fcSMatthias Ringwald                                  const uint16_t stride,
973*a8f7f3fcSMatthias Ringwald                                  const uint16_t dim_im_out,
974*a8f7f3fcSMatthias Ringwald                                  q7_t * bufferA,
975*a8f7f3fcSMatthias Ringwald                                  q7_t * Im_out);
976*a8f7f3fcSMatthias Ringwald 
977*a8f7f3fcSMatthias Ringwald /**
978*a8f7f3fcSMatthias Ringwald  * @defgroup Softmax Softmax Functions
979*a8f7f3fcSMatthias Ringwald  *
980*a8f7f3fcSMatthias Ringwald  * Base-2 (2^x) exponent based softmax function
981*a8f7f3fcSMatthias Ringwald  *
982*a8f7f3fcSMatthias Ringwald  */
983*a8f7f3fcSMatthias Ringwald 
984*a8f7f3fcSMatthias Ringwald   /**
985*a8f7f3fcSMatthias Ringwald    * @brief Q7 softmax function
986*a8f7f3fcSMatthias Ringwald    * @param[in]       vec_in      pointer to input vector (q7_t elements)
987*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     input vector dimension
988*a8f7f3fcSMatthias Ringwald    * @param[out]      p_out       pointer to output vector (q7_t elements)
989*a8f7f3fcSMatthias Ringwald    * @return none.
990*a8f7f3fcSMatthias Ringwald    *
991*a8f7f3fcSMatthias Ringwald    */
992*a8f7f3fcSMatthias Ringwald 
993*a8f7f3fcSMatthias Ringwald     void      arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
994*a8f7f3fcSMatthias Ringwald 
995*a8f7f3fcSMatthias Ringwald   /**
996*a8f7f3fcSMatthias Ringwald    * @brief Q15 softmax function
997*a8f7f3fcSMatthias Ringwald    * @param[in]       vec_in      pointer to input vector (q15_t elements)
998*a8f7f3fcSMatthias Ringwald    * @param[in]       dim_vec     input vector dimension
999*a8f7f3fcSMatthias Ringwald    * @param[out]      p_out       pointer to output vector (q15_t elements)
1000*a8f7f3fcSMatthias Ringwald    * @return none.
1001*a8f7f3fcSMatthias Ringwald    *
1002*a8f7f3fcSMatthias Ringwald    */
1003*a8f7f3fcSMatthias Ringwald 
1004*a8f7f3fcSMatthias Ringwald     void      arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
1005*a8f7f3fcSMatthias Ringwald 
1006*a8f7f3fcSMatthias Ringwald #ifdef __cplusplus
1007*a8f7f3fcSMatthias Ringwald }
1008*a8f7f3fcSMatthias Ringwald #endif
1009*a8f7f3fcSMatthias Ringwald 
1010*a8f7f3fcSMatthias Ringwald #endif
1011