/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_fully_connected_mat_q7_vec_q15_opt.c
 * Description:  Mixed Q15-Q7 opt fully-connected layer function
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup FC
 * @{
 */

/**
 * @brief Mixed Q15-Q7 opt fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * vec_buffer size: 0
 *
 * Q7_Q15 version of the fully-connected layer
 *
 * Weights are in q7_t and activations are in q15_t
 *
 * Limitation: the x4 version requires weight reordering to work
 *
 * Here we use only one pointer to read 4 rows in the weight
 * matrix. So if the original q7_t matrix looks like this:
 *
 * | a11 | a12 | a13 | a14 | a15 | a16 | a17 |
 *
 * | a21 | a22 | a23 | a24 | a25 | a26 | a27 |
 *
 * | a31 | a32 | a33 | a34 | a35 | a36 | a37 |
 *
 * | a41 | a42 | a43 | a44 | a45 | a46 | a47 |
 *
 * | a51 | a52 | a53 | a54 | a55 | a56 | a57 |
 *
 * | a61 | a62 | a63 | a64 | a65 | a66 | a67 |
 *
 * We operate on multiple-of-4 rows, so the first four rows become
 *
 * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 |
 *
 * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 |
 *
 * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 |
 *
 * The left-over column stays in its original row order, which is:
 *
 * | a17 | a27 | a37 | a47 |
 *
 * For the left-over rows, we do a 1x1 computation, so the data remains
 * in its original order.
 *
 * So the stored weight matrix looks like this:
 *
 * | a11 | a21 | a12 | a22 | a31 | a41 |
 *
 * | a32 | a42 | a13 | a23 | a14 | a24 |
 *
 * | a33 | a43 | a34 | a44 | a15 | a25 |
 *
 * | a16 | a26 | a35 | a45 | a36 | a46 |
 *
 * | a17 | a27 | a37 | a47 | a51 | a52 |
 *
 * | a53 | a54 | a55 | a56 | a57 | a61 |
 *
 * | a62 | a63 | a64 | a65 | a66 | a67 |
 *
 * An illustrative reordering sketch is provided after the function body below.
 */

arm_status
arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
                                       const q7_t * pM,
                                       const uint16_t dim_vec,
                                       const uint16_t num_of_rows,
                                       const uint16_t bias_shift,
                                       const uint16_t out_shift, const q7_t * bias, q15_t * pOut, q15_t * vec_buffer)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    const q7_t *pB = pM;
    q15_t *pO = pOut;
    const q7_t *pBias = bias;
    const q15_t *pA = pV;

    uint16_t rowCnt = num_of_rows >> 2;

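    /* Each iteration of the outer loop below computes four outputs,
     * walking the interleaved four-row groups of the weight matrix. */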
    while (rowCnt)
    {
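        /* Each bias is pre-shifted left by bias_shift and offset by the
         * rounding constant that the final right shift by out_shift consumes. */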
        q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

        uint16_t colCnt = dim_vec >> 1;

        pA = pV;

#ifdef USE_INTRINSIC

#ifndef ARM_MATH_BIG_ENDIAN

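        /* Little-endian path: each 32-bit weight load packs bytes from two
         * interleaved rows. __SXTB16(x) sign-extends the even bytes and
         * __SXTB16(__ROR(x, 8)) the odd bytes, giving two q15 pairs that
         * feed the __SMLAD dual multiply-accumulates. */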
        while (colCnt)
        {
            q31_t inM11, inM12, inM13, inM14;
            q31_t inV;

            inV = *__SIMD32(pA)++;
            inM11 = *__SIMD32(pB)++;
            inM12 = __SXTB16(__ROR(inM11, 8));
            inM11 = __SXTB16(inM11);
            sum = __SMLAD(inM11, inV, sum);
            sum2 = __SMLAD(inM12, inV, sum2);
            inM13 = *__SIMD32(pB)++;
            inM14 = __SXTB16(__ROR(inM13, 8));
            inM13 = __SXTB16(inM13);
            sum3 = __SMLAD(inM13, inV, sum3);
            sum4 = __SMLAD(inM14, inV, sum4);
            colCnt--;
        }

#else

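        /* Big-endian path: the byte lanes land in the opposite halfwords,
         * so the even/odd extractions feed the swapped accumulators. */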
        while (colCnt)
        {
            q31_t inM11, inM12, inM13, inM14;
            q31_t inV;

            inV = *__SIMD32(pA)++;
            inM11 = *__SIMD32(pB)++;
            inM12 = __SXTB16(__ROR(inM11, 8));
            inM11 = __SXTB16(inM11);
            sum = __SMLAD(inM12, inV, sum);
            sum2 = __SMLAD(inM11, inV, sum2);
            inM13 = *__SIMD32(pB)++;
            inM14 = __SXTB16(__ROR(inM13, 8));
            inM13 = __SXTB16(inM13);
            sum3 = __SMLAD(inM14, inV, sum3);
            sum4 = __SMLAD(inM13, inV, sum4);
            colCnt--;
        }

#endif /* ARM_MATH_BIG_ENDIAN */

#else

        /*
         * Registers needed:
         * loop counter: colCnt
         * accumulators: sum, sum2, sum3, sum4
         * pointers: pB, pA
         * weight data: inM11, inM12, inM13, inM14
         * activation data: inV
         */

#ifndef ARM_MATH_BIG_ENDIAN
        asm volatile ("COL_LOOP_%=:\n"
                      "ldr.w r4, [%[pA]], #4\n"
                      "ldr.w r1, [%[pB]], #8\n"
                      "mov.w r0, r1, ror #8\n"
                      "sxtb16 r0, r0\n"
                      "sxtb16 r1, r1\n"
                      "smlad %[sum], r4, r1, %[sum]\n"
                      "smlad %[sum2], r4, r0, %[sum2]\n"
                      "ldr.w r3, [%[pB], #-4]\n"
                      "mov.w r2, r3, ror #8\n"
                      "sxtb16 r2, r2\n"
                      "sxtb16 r3, r3\n"
                      "smlad %[sum3], r4, r3, %[sum3]\n"
                      "smlad %[sum4], r4, r2, %[sum4]\n"
                      "subs %[colCnt], #1\n"
                      "bne COL_LOOP_%=\n"
                      :[sum] "+r"(sum), [sum2] "+r"(sum2), [sum3] "+r"(sum3),
                       [sum4] "+r"(sum4), [pB] "+r"(pB), [pA] "+r"(pA)
                      :[colCnt] "r"(colCnt)
                      :"r0", "r1", "r2", "r3", "r4");
#else
        asm volatile ("COL_LOOP_%=:\n"
                      "ldr.w r4, [%[pA]], #4\n"
                      "ldr.w r1, [%[pB]], #8\n"
                      "mov.w r0, r1, ror #8\n"
                      "sxtb16 r0, r0\n"
                      "sxtb16 r1, r1\n"
                      "smlad %[sum], r4, r0, %[sum]\n"
                      "smlad %[sum2], r4, r1, %[sum2]\n"
                      "ldr.w r3, [%[pB], #-4]\n"
                      "mov.w r2, r3, ror #8\n"
                      "sxtb16 r2, r2\n"
                      "sxtb16 r3, r3\n"
                      "smlad %[sum3], r4, r2, %[sum3]\n"
                      "smlad %[sum4], r4, r3, %[sum4]\n"
                      "subs %[colCnt], #1\n"
                      "bne COL_LOOP_%=\n"
                      :[sum] "+r"(sum), [sum2] "+r"(sum2), [sum3] "+r"(sum3),
                       [sum4] "+r"(sum4), [pB] "+r"(pB), [pA] "+r"(pA)
                      :[colCnt] "r"(colCnt)
                      :"r0", "r1", "r2", "r3", "r4");
#endif /* ARM_MATH_BIG_ENDIAN */

#endif /* USE_INTRINSIC */

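        /* Left-over column when dim_vec is odd: the four weights for this
         * column are stored consecutively, one per row, in row order. */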
        colCnt = dim_vec & 0x1;
        while (colCnt)
        {
            q15_t inV = *pA++;
            q7_t inM = *pB++;
            q7_t inM2 = *pB++;
            q7_t inM3 = *pB++;
            q7_t inM4 = *pB++;

            sum += inV * inM;
            sum2 += inV * inM2;
            sum3 += inV * inM3;
            sum4 += inV * inM4;
            colCnt--;
        }                       /* while over colCnt */
        *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
        *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
        *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16));
        *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16));

        /* adjust the pointers and counters */
        rowCnt--;
    }

    /* left-over part of the rows */
    rowCnt = num_of_rows & 0x3;

    while (rowCnt)
    {
        q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

        uint16_t colCnt = dim_vec >> 2;

        pA = pV;

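        /* read_and_pad() expands four consecutive q7_t weights into two
         * 32-bit words, each holding two sign-extended q15 values, ready
         * for the __SMLAD dual multiply-accumulate. */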
        while (colCnt)
        {
            q31_t inV1, inV2, inM11, inM12;

            pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12);

            inV1 = *__SIMD32(pA)++;
            sum = __SMLAD(inV1, inM11, sum);

            inV2 = *__SIMD32(pA)++;
            sum = __SMLAD(inV2, inM12, sum);

            colCnt--;
        }

        /* left-over of the vector */
        colCnt = dim_vec & 0x3;
        while (colCnt)
        {
            q15_t inV = *pA++;
            q7_t inM = *pB++;
            sum += inV * inM;
            colCnt--;
        }

        *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));

        rowCnt--;
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    uint16_t rowCnt = num_of_rows >> 2;
    const q7_t *pB = pM;
    const q15_t *pA;
    q15_t *pO = pOut;
    const q7_t *pBias = bias;

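    /* Pure C path: walk the same interleaved weight layout explicitly,
     * two columns of four rows per inner-loop iteration. */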
    while (rowCnt)
    {
        q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        uint16_t colCnt = dim_vec >> 1;

        pA = pV;

        while (colCnt)
        {
            q15_t inA1 = *pA++;
            q15_t inA2 = *pA++;

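            /* Weights are interleaved pairwise as | r1c1 | r2c1 | r1c2 | r2c2 |,
             * so inB3 (row 2, col 1) is read before inB2 (row 1, col 2). */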
            q7_t inB1 = *pB++;
            q7_t inB3 = *pB++;
            q7_t inB2 = *pB++;
            q7_t inB4 = *pB++;

            sum += inA1 * inB1 + inA2 * inB2;
            sum2 += inA1 * inB3 + inA2 * inB4;

            inB1 = *pB++;
            inB3 = *pB++;
            inB2 = *pB++;
            inB4 = *pB++;

            sum3 += inA1 * inB1 + inA2 * inB2;
            sum4 += inA1 * inB3 + inA2 * inB4;

            colCnt--;
        }

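        /* Left-over column: four consecutive weights, one per row. */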
        colCnt = dim_vec & 0x1;
        while (colCnt)
        {
            q15_t inA = *pA++;
            q7_t inB = *pB++;
            sum += inA * inB;
            inB = *pB++;
            sum2 += inA * inB;
            inB = *pB++;
            sum3 += inA * inB;
            inB = *pB++;
            sum4 += inA * inB;

            colCnt--;
        }
        *pO++ = (q15_t) __SSAT((sum >> out_shift), 16);
        *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16);
        *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16);
        *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16);

        rowCnt--;
    }

    rowCnt = num_of_rows & 0x3;

    while (rowCnt)
    {
        int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
        int j;

        pA = pV;
        for (j = 0; j < dim_vec; j++)
        {
            q15_t inA = *pA++;
            q7_t inB = *pB++;
            ip_out += inA * inB;
        }
        *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16);

        rowCnt--;
    }

#endif /* ARM_MATH_DSP */

    /* Return ARM_MATH_SUCCESS */
    return (ARM_MATH_SUCCESS);

}

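/*
 * Illustrative sketch (not part of the CMSIS NN API): one way a host-side
 * tool could produce the interleaved weight layout consumed by the x4 loop
 * above. The helper name reorder_weights_q7 and its signature are
 * hypothetical; only the resulting layout, described in the header comment
 * of this file, is prescribed. Kept compiled-out as documentation.
 */
#if 0
static void reorder_weights_q7(const q7_t * src, q7_t * dst,
                               uint16_t num_of_rows, uint16_t dim_vec)
{
    uint32_t dst_idx = 0;
    uint16_t row = 0;
    uint16_t col;

    /* Groups of four rows, interleaved two columns at a time:
     * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | ... */
    for (; row + 4 <= num_of_rows; row += 4)
    {
        for (col = 0; col + 2 <= dim_vec; col += 2)
        {
            dst[dst_idx++] = src[(row + 0) * dim_vec + col];
            dst[dst_idx++] = src[(row + 1) * dim_vec + col];
            dst[dst_idx++] = src[(row + 0) * dim_vec + col + 1];
            dst[dst_idx++] = src[(row + 1) * dim_vec + col + 1];
            dst[dst_idx++] = src[(row + 2) * dim_vec + col];
            dst[dst_idx++] = src[(row + 3) * dim_vec + col];
            dst[dst_idx++] = src[(row + 2) * dim_vec + col + 1];
            dst[dst_idx++] = src[(row + 3) * dim_vec + col + 1];
        }
        /* Odd left-over column stays in row order: | a17 | a27 | a37 | a47 | */
        if (dim_vec & 0x1)
        {
            dst[dst_idx++] = src[(row + 0) * dim_vec + dim_vec - 1];
            dst[dst_idx++] = src[(row + 1) * dim_vec + dim_vec - 1];
            dst[dst_idx++] = src[(row + 2) * dim_vec + dim_vec - 1];
            dst[dst_idx++] = src[(row + 3) * dim_vec + dim_vec - 1];
        }
    }
    /* Left-over rows are stored untouched, in original order */
    for (; row < num_of_rows; row++)
    {
        for (col = 0; col < dim_vec; col++)
        {
            dst[dst_idx++] = src[row * dim_vec + col];
        }
    }
}
#endif
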
/**
 * @} end of FC group
 */