1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_convolve_HWC_q7_basic.c
22  * Description:	 Q7 version of convolution
23  *
24  * $Date:        17. January 2018
25  * $Revision:    V.1.0.0
26  *
27  * Target Processor:  Cortex-M cores
28  *
29  * -------------------------------------------------------------------- */
30 #include "arm_math.h"
31 #include "arm_nnfunctions.h"
32 
33 /**
34  *  @ingroup groupNN
35  */
36 
37 /**
38  * @addtogroup NNConv
39  * @{
40  */
41 
42   /**
43    * @brief Basic Q7 convolution function
44    * @param[in]       Im_in       pointer to input tensor
45    * @param[in]       dim_im_in   input tensor dimension
46    * @param[in]       ch_im_in    number of input tensor channels
47    * @param[in]       wt          pointer to kernel weights
48    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
49    * @param[in]       dim_kernel  filter kernel size
50    * @param[in]       padding     padding sizes
51    * @param[in]       stride      convolution stride
52    * @param[in]       bias        pointer to bias
53    * @param[in]       bias_shift  amount of left-shift for bias
54    * @param[in]       out_shift   amount of right-shift for output
55    * @param[in,out]   Im_out      pointer to output tensor
56    * @param[in]       dim_im_out  output tensor dimension
57    * @param[in,out]   bufferA     pointer to buffer space for input
58    * @param[in,out]   bufferB     pointer to buffer space for output
59    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
60    *
61    * @details
62    *
63    * <b>Buffer size:</b>
64    *
65    * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
66    *
67    * bufferB size: 0
68    *
69    * This basic version is designed to work for any input tensor and weight
70    * dimension.
71    */
72 
73 arm_status
arm_convolve_HWC_q7_basic(const q7_t * Im_in,const uint16_t dim_im_in,const uint16_t ch_im_in,const q7_t * wt,const uint16_t ch_im_out,const uint16_t dim_kernel,const uint16_t padding,const uint16_t stride,const q7_t * bias,const uint16_t bias_shift,const uint16_t out_shift,q7_t * Im_out,const uint16_t dim_im_out,q15_t * bufferA,q7_t * bufferB)74 arm_convolve_HWC_q7_basic(const q7_t * Im_in,
75                           const uint16_t dim_im_in,
76                           const uint16_t ch_im_in,
77                           const q7_t * wt,
78                           const uint16_t ch_im_out,
79                           const uint16_t dim_kernel,
80                           const uint16_t padding,
81                           const uint16_t stride,
82                           const q7_t * bias,
83                           const uint16_t bias_shift,
84                           const uint16_t out_shift,
85                           q7_t * Im_out,
86                           const uint16_t dim_im_out,
87                           q15_t * bufferA,
88                           q7_t * bufferB)
89 {
90 
91 #if defined (ARM_MATH_DSP)
92     /* Run the following code for Cortex-M4 and Cortex-M7 */
93 
94     int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;
95 
96     /*
97      *  Here we use bufferA as q15_t internally as computation are done with q15_t level
98      *  im2col are done to output in q15_t format from q7_t input
99      */
100     q15_t    *pBuffer = bufferA;
101     q7_t     *pOut = Im_out;
102 
103     /* This part implements the im2col function */
104     for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
105     {
106         for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
107         {
108             for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
109             {
110                 for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
111                 {
112                     if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
113                     {
114                         /* Filling 0 for out-of-bound paddings */
115                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
116                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
117                     } else
118                     {
119                         /* Copying the pixel data to column */
120                         arm_q7_to_q15_no_shift((q7_t *)
121                                                Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
122                     }
123                     pBuffer += ch_im_in;
124                 }
125             }
126 
127             /* Computation is filed for every 2 columns */
128             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
129             {
130                 pOut =
131                     arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
132                                                   ch_im_out,
133                                                   ch_im_in *
134                                                   dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
135 
136                 /* counter reset */
137                 pBuffer = bufferA;
138             }
139         }
140     }
141 
142     /* left-over because odd number of output pixels */
143     if (pBuffer != bufferA)
144     {
145         const q7_t *pA = wt;
146         int       i;
147 
148         for (i = 0; i < ch_im_out; i++)
149         {
150             /* Load the accumulator with bias first */
151             q31_t     sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
152 
153             /* Point to the beging of the im2col buffer */
154             q15_t    *pB = bufferA;
155 
156             /* Each time it process 4 entries */
157             uint16_t  colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
158 
159             while (colCnt)
160             {
161                 q31_t     inA1, inA2;
162                 q31_t     inB1, inB2;
163 
164                 pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
165 
166                 inB1 = *__SIMD32(pB)++;
167                 sum = __SMLAD(inA1, inB1, sum);
168                 inB2 = *__SIMD32(pB)++;
169                 sum = __SMLAD(inA2, inB2, sum);
170 
171                 colCnt--;
172             }
173             colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
174             while (colCnt)
175             {
176                 q7_t      inA1 = *pA++;
177                 q15_t     inB1 = *pB++;
178                 sum += inA1 * inB1;
179                 colCnt--;
180             }
181             *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
182         }
183     }
184 #else
185     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
186 
187     uint16_t  i, j, k, l, m, n;
188     int       conv_out;
189     signed char in_row, in_col;
190 
191     for (i = 0; i < ch_im_out; i++)
192     {
193         for (j = 0; j < dim_im_out; j++)
194         {
195             for (k = 0; k < dim_im_out; k++)
196             {
197                 conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
198                 for (m = 0; m < dim_kernel; m++)
199                 {
200                     for (n = 0; n < dim_kernel; n++)
201                     {
202                         // if-for implementation
203                         in_row = stride * j + m - padding;
204                         in_col = stride * k + n - padding;
205                         if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
206                         {
207                             for (l = 0; l < ch_im_in; l++)
208                             {
209                                 conv_out +=
210                                     Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
211                                           l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
212                                                                                             n) * ch_im_in + l];
213                             }
214                         }
215                     }
216                 }
217                 Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
218             }
219         }
220     }
221 
222 #endif                          /* ARM_MATH_DSP */
223 
224     /* Return to application */
225     return ARM_MATH_SUCCESS;
226 }
227 
228 /**
229  * @} end of NNConv group
230  */
231