1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15// void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
16//     size_t mr,                            r0
17//     size_t nc,                            r1
18//     size_t kc,                            (r2) -> r5
19//     const int8_t*restrict a,              r3
20//     size_t a_stride,           sp + 96 -> (unused)
21//     const void*restrict w,     sp + 100 -> r9
22//     int8_t*restrict c,         sp + 104 -> r11
23//     size_t cm_stride,          sp + 108 -> (unused)
24//     size_t cn_stride,          sp + 112 -> r7
25//     xnn_qs8_conv_minmax_params params)  sp + 116 -> (r5)
26
27// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
28
29// Based on cortex_a53 microkernel but with Neon loads
30
31// Register usage
32// A0   r3  d0-d1 q0
33
34// B    r9  d8-d9 q4 q5
35
36// C0  r11 d16-d17  q8  d18-d19  q9
37//         q2, q3 acc2
38
// Unused r4, r6, r8, r10, r12, d15, q1, q10-q11, q13-q15
// (q2/q3 are the second accumulator set; d24/d25 (q12) hold output min/max)
40
41// params structure is 16 bytes
42//  struct {
43//    int32_t right_pre_shift;    d12[0]
44//    int32_t multiplier;         d12[1]
45//    int32_t right_post_shift;   d13[0]
46//    int16_t output_zero_point;  d13[2]
47//    int8_t output_min;          d13[6]
48//    int8_t output_max;          d13[7]
49//  } rndnu_neon;
50
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 96 bytes total: 16 (GPRs) + 32 (alignment pad) + 48 (d8-d13).
        # Stack arguments are then found at sp + 96 .. sp + 116 (see header).
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 32                          // +32
        VPUSH   {d8-d13}                            // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLDM    r5, {d12-d13}           // RNDNU params (16 bytes, layout in header)
        LDR     r7, [sp, 112]            // cn_stride


        .p2align 3
0:
        # Outer loop over output columns: process 8 channels of N per iteration.
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8/q9 (primary accumulators)
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                      // k < 8 bytes? go straight to remainder

        // Prologue - load A0 and B0 so the main loop can overlap
        // loads, widening, and multiply-accumulates.
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // fewer than 2 full groups? go to epilogue

        // Main loop - 8 bytes of A per iteration.
        // 64 bytes of weights consumed per iteration (8 rows of 8).
        // B alternates between q4 (d8/d9) and q5 (d10/d11) so each load can
        // be issued while the previous B is still being accumulated.

        .p2align 3
1:
        // Extend: widen int8 A and B to int16 for VMLAL.S16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]        // odd rows accumulate into q2/q3
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0 for next iteration (d0 lanes done)
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]         // switch to high half of A (d1)
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0 for next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue: same as one main-loop iteration, but without
        // prefetching A0/B0 for a following iteration.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore k; Z set iff kc was a multiple of 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     4f

3:
        # Merge the two pipelined accumulator sets.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization: pre-shift, saturating-doubling multiply,
        # rounding post-shift, add zero point, narrow, clamp.
        VDUP.32 q0, d12[0]              // right_pre_shift

        VQSHL.S32 q8,  q8, q0
        VQSHL.S32 q9,  q9, q0

        VDUP.32 q2, d13[0]              // right_post_shift

        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
        VQDMULH.S32 q9,  q9, d12[1]

        VRSHL.S32 q8,  q8, q2

        VRSHL.S32 q9,  q9, q2

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8              // saturating narrow int32 -> int16
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8              // saturating narrow int16 -> int8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8 (flags consumed by BLO/BHI below)

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f                      // nc < 8: store partial row
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2              // rewind A pointer by kc for next tile
        BHI     0b                      // nc > 0 remaining? next 8 columns

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip the 32-byte alignment pad
        ADD     sp, sp, 16              // (16 + 16 = 32 bytes)
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder- 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        // NOTE(review): VLD1.8 reads a full 8 bytes of A but advances r3 by
        // only r5 — assumes up to 7 bytes of over-read past A is safe (the
        // extra lanes are never accumulated). TODO confirm against callers.
        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        // One widen + accumulate per remaining byte of K (1..7).
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b                      // exactly 1 byte done

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b                      // exactly 2

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b                      // exactly 3

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b                      // exactly 4

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b                      // exactly 5

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b                      // exactly 6

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b                      // 7

        # Store odd width: emit 4, then 2, then 1 bytes per the low bits of
        # nc, rotating d0 down after each partial store.
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip the 32-byte alignment pad
        ADD     sp, sp, 16              // (16 + 16 = 32 bytes)
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
315
316#ifdef __ELF__
317.section ".note.GNU-stack","",%progbits
318#endif
319
320