/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q15_fast_nonsquare.c
 * Description:  Fast Q15 version of non-square convolution
 *
 * $Date:        24. May 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

/**
 *  @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

  /**
   * @brief Fast Q15 convolution function (non-square shape)
   * @param[in]       Im_in        pointer to input tensor
   * @param[in]       dim_im_in_x  input tensor dimension x
   * @param[in]       dim_im_in_y  input tensor dimension y
   * @param[in]       ch_im_in     number of input tensor channels
   * @param[in]       wt           pointer to kernel weights
   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
   * @param[in]       dim_kernel_x filter kernel size x
   * @param[in]       dim_kernel_y filter kernel size y
   * @param[in]       padding_x    padding size x
   * @param[in]       padding_y    padding size y
   * @param[in]       stride_x     convolution stride x
   * @param[in]       stride_y     convolution stride y
   * @param[in]       bias         pointer to bias
   * @param[in]       bias_shift   amount of left-shift for bias
   * @param[in]       out_shift    amount of right-shift for output
   * @param[in,out]   Im_out       pointer to output tensor
   * @param[in]       dim_im_out_x output tensor dimension x
   * @param[in]       dim_im_out_y output tensor dimension y
   * @param[in,out]   bufferA      pointer to buffer space for input
   * @param[in,out]   bufferB      pointer to buffer space for output
   * @return     The function returns either
   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
   *
   * @details
   *
   * <b>Buffer size:</b>
   *
   * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
   *
   * bufferB size: 0
   *
   * <b>Input dimension constraints:</b>
   *
   * ch_im_in is a multiple of 2
   *
   * ch_im_out is a multiple of 2
   *
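   * <b>Usage sketch:</b> a minimal, hypothetical call; all sizes and shift
   * values below are illustrative assumptions chosen only to satisfy the
   * constraints above, not requirements of the function:
   *
   * <pre>
   * q15_t im_in[8 * 8 * 4];            // 8x8 input, 4 channels, HWC layout
   * q15_t wt[8 * 3 * 3 * 4];           // 8 filters of size 3x3x4
   * q15_t bias[8];
   * q15_t im_out[8 * 8 * 8];           // 8x8 output, 8 channels (stride 1, padding 1)
   * q15_t buffer_a[2 * 4 * 3 * 3];     // 2 * ch_im_in * dim_kernel_x * dim_kernel_y
   *
   * arm_status status = arm_convolve_HWC_q15_fast_nonsquare(
   *     im_in, 8, 8, 4, wt, 8, 3, 3, 1, 1, 1, 1,
   *     bias, 0, 9, im_out, 8, 8, buffer_a, NULL);   // bufferB unused (size 0)
   * </pre>
   *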
   */

arm_status
arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
                                    const uint16_t dim_im_in_x,
                                    const uint16_t dim_im_in_y,
                                    const uint16_t ch_im_in,
                                    const q15_t * wt,
                                    const uint16_t ch_im_out,
                                    const uint16_t dim_kernel_x,
                                    const uint16_t dim_kernel_y,
                                    const uint16_t padding_x,
                                    const uint16_t padding_y,
                                    const uint16_t stride_x,
                                    const uint16_t stride_y,
                                    const q15_t * bias,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    q15_t * Im_out,
                                    const uint16_t dim_im_out_x,
                                    const uint16_t dim_im_out_y,
                                    q15_t * bufferA,
                                    q7_t * bufferB)
{

#if defined (ARM_MATH_DSP)
    int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;

    q15_t    *pBuffer = bufferA;
    q15_t    *im_buffer = bufferA;
    q15_t    *pOut = Im_out;
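    /* pBuffer walks bufferA as im2col data is gathered; im_buffer marks the
     * start of the gathered columns for the matrix multiplication; pOut walks
     * the output tensor */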

    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* This part implements the im2col function */
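    /* For each output pixel, one im2col column of
     * ch_im_in * dim_kernel_y * dim_kernel_x entries is gathered into bufferA,
     * zero-filled wherever the kernel window overlaps the padding. Two such
     * columns are collected before each matrix-multiplication pass below. */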
    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
                        memcpy(pBuffer, (q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, sizeof(q15_t)*ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

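            /* Once two im2col columns are ready (i.e., on every odd i_out_x),
             * run a 2x2 matrix-multiplication kernel: two filter rows
             * (pA / pA2) against two input columns (pB / pB2), producing two
             * channels for each of two adjacent output pixels (pOut / pOut2). */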
            if (i_out_x & 0x1)
            {
                int       i;
                /* initialize the matrix pointers for A */
                const q15_t *pA = wt;

                /* set up the second output pointers */
                q15_t    *pOut2 = pOut + ch_im_out;

                /* loop over the rows of A, two at a time */
                for (i = 0; i < ch_im_out; i += 2)
                {
                    /* setup pointers for B */
                    q15_t    *pB = im_buffer;
                    const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x;

                    /* align the second pointer for A */
                    const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x;

                    /* init the sum with bias */
                    q31_t     sum =  ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t     sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t     sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    q31_t     sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);

                    uint16_t  colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1;
                    /* accumulate over the vector */
                    while (colCnt)
                    {
                        q31_t     inA1 = *__SIMD32(pA)++;
                        q31_t     inB1 = *__SIMD32(pB)++;
                        q31_t     inA2 = *__SIMD32(pA2)++;
                        q31_t     inB2 = *__SIMD32(pB2)++;

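                        /* each __SMLAD performs two signed 16x16 multiply-
                         * accumulates, so one iteration consumes two q15
                         * elements from each of the four streams */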
                        sum = __SMLAD(inA1, inB1, sum);
                        sum2 = __SMLAD(inA1, inB2, sum2);
                        sum3 = __SMLAD(inA2, inB1, sum3);
                        sum4 = __SMLAD(inA2, inB2, sum4);

                        colCnt--;
                    }           /* while over colCnt */
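                    /* handle the leftover element when the column count is odd */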
                    colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1;
                    while (colCnt)
                    {
                        q15_t     inA1 = *pA++;
                        q15_t     inB1 = *pB++;
                        q15_t     inA2 = *pA2++;
                        q15_t     inB2 = *pB2++;

                        sum += inA1 * inB1;
                        sum2 += inA1 * inB2;
                        sum3 += inA2 * inB1;
                        sum4 += inA2 * inB2;
                        colCnt--;
                    }           /* while over colCnt */
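                    /* requantize: shift right by out_shift and saturate to 16 bits */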
                    *pOut++ = (q15_t) __SSAT(sum >> out_shift, 16);
                    *pOut++ = (q15_t) __SSAT(sum3 >> out_shift, 16);
                    *pOut2++ = (q15_t) __SSAT(sum2 >> out_shift, 16);
                    *pOut2++ = (q15_t) __SSAT(sum4 >> out_shift, 16);

                    /* skip the row computed with A2 */
                    pA += ch_im_in * dim_kernel_y * dim_kernel_x;
                }               /* for over ch_im_out */

                pOut += ch_im_out;
                /* reset the buffer pointer for the next pair of im2col columns */
                pBuffer = im_buffer;
            }
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    uint16_t  i, j, k, l, m, n;
    int       conv_out;
    int       in_row, in_col;   /* int, not signed char: kernel positions can exceed the range of a char */

    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

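    /* direct convolution: for every filter and output pixel, accumulate the
     * products of the kernel window with the corresponding input window */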
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
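                        /* skip kernel taps that fall in the zero-padding region */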
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) __SSAT((conv_out >> out_shift), 16);
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}


/**
 * @} end of NNConv group
 */