// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neonv8-mlal-lane-cortex-a35.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const int8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     int8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qs8_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d11 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d12, d14-d15, q10-q11, q13-q15

// params structure is 4 bytes
//  struct {
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } xnn_qs8_minmax_params.neonv8;
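//
// Note: the per-channel fp32 requantization multipliers for QC8 are not part
// of params; they are read from the weights stream w right after the packed
// weights (see the VLD1.8 {q0-q1}, [r9]! load under "QC8 FP32 quantization").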

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 32                          // +32
        VPUSH   {d8-d13}                            // +48 = 96
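        # Frame: 16 (r5,r7,r9,r11) + 32 (pad) + 48 (d8-d13) = 96 bytes, which
        # matches the sp + 96 .. sp + 116 argument offsets listed above.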

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params
        LDR     r7, [sp, 112]           // cn_stride


        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
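        //
        // Each iteration consumes 8 int8 values of A0 and an 8x8 int8 block of
        // B. Every BLOCK loads the next B row, multiply-accumulates the
        // previously widened row against one lane of A0, then widens the row
        // just loaded. Accumulation alternates between q8/q9 and q2/q3 so that
        // consecutive VMLALs are independent; BLOCK 3 also loads the next
        // 8 bytes of A0.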

        .p2align 3
1:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue
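        // Same 8-block pattern as the main loop, using the A0/B0 values already
        // loaded and without fetching data for another iteration. ADDS restores
        // k so the BNE below can branch to the 1-7 byte remainder path.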

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
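        # Convert the int32 accumulators to fp32, scale by the 8 per-channel
        # multipliers that follow the weights in w, convert back to int32 with
        # round-to-nearest-even (VCVTN), add output_zero_point with saturation,
        # narrow to int8 and clamp to [output_min, output_max].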
        VLD1.8  {q0-q1},  [r9]!

        VCVT.F32.S32 q8,  q8
        VCVT.F32.S32 q9,  q9

        VMUL.F32 q8,  q8, q0            // multiplier
        VMUL.F32 q9,  q9, q1

        VCVTN.S32.F32 q8,  q8
        VCVTN.S32.F32 q9,  q9

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
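        # nc >= 8: store all 8 outputs, advance c by cn_stride, rewind a by kc
        # (SUB r3, r3, r2) and loop back to 0b while columns remain.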
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip 32 byte pad
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder- 1 to 7 bytes of A
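        # Process the last 1-7 bytes of A one lane at a time: each step loads
        # one more 8-wide B row and accumulates into q8/q9 only; the CMP/BEQ
        # chain branches back to 3b once all remainder lanes are consumed.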
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
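        # nc of 1-7: store 4, 2, then 1 byte according to the bits of nc,
        # using VEXT to rotate the already-stored bytes out of d0 after each
        # partial store.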
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip 32 byte pad
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif