1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35(
16//     size_t mr,                            r0
17//     size_t nc,                            r1
18//     size_t kc,                            (r2) -> r5
19//     const int8_t*restrict a,              r3
20//     size_t a_stride,           sp + 96 -> (unused)
21//     const void*restrict w,     sp + 100 -> r9
22//     int8_t*restrict c,         sp + 104 -> r11
23//     size_t cm_stride,          sp + 108 -> (unused)
24//     size_t cn_stride,          sp + 112 -> r7
25//     xnn_qs8_minmax_params params)  sp + 116 -> (r5)
26
27// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
28
29// Based on cortex_a53 microkernel but with Neon loads
30
31// Register usage
32// A0   r3  d0-d1 q0
33
// B    r9  d8-d11 q4 q5
35
36// C0  r11 d16-d17  q8  d18-d19  q9
37//         q2, q3 acc2
38
// Unused r4, r6, r8, r10, r12, d12, d14-d15, q10-q11, q13-q15
// (q1 holds multipliers during quantization; q2-q3 are acc2; d24-d25/q12 hold min/max)
40
41// params structure is 4 bytes
42//  struct {
43//    int16_t output_zero_point;  d13[2]
44//    int8_t output_min;          d13[6]
45//    int8_t output_max;          d13[7]
46//  } xnn_qs8_minmax_params.neonv8;
47
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
        # Prologue: push 96 bytes total (16 GPR + 32 pad + 48 VFP).
        # Only the callee-saved registers actually used are preserved:
        # r5, r7, r9, r11 and d8-d13 (AAPCS requires r4-r11, d8-d15 if used).
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 32                          // +32
        VPUSH   {d8-d13}                            // +48 = 96

        # Stack arguments now sit 96 bytes further up (offsets in the header).
        LDR     r11, [sp, 104]          // c (output pointer)
        LDR     r9, [sp, 100]           // w (bias + int8 weights + fp32 multipliers)
        LDR     r5, [sp, 116]           // params

        # Load params values: one 32-bit load replicated to both halves of d13,
        # so zero_point is at d13[2] (s16 lane), min/max at d13[6]/d13[7] (s8 lanes).
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params
        LDR     r7, [sp, 112]            // cn_stride

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        # Outer loop: one iteration per 8-column tile of C (nc in r1).
        .p2align 3
0:
        # Load initial bias from w into accumulators (q8, q9 = acc1).
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second accumulator set (acc2) breaks the MLAL dependency chain
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        PLD     [r3,  64]               // Prefetch A
        BLO     4f                      // less than 8 channels? (kc < 8: go straight to remainder)

        // Prologue - load A0 and B0 so the main loop starts with data in flight
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // less than 8 channels? (exactly one 8-wide step: epilogue only)

        // Main loop - 8 bytes of A per iteration against 64 bytes of weights.
        // Loads, sign-extensions (VMOVL.S8) and multiply-accumulates (VMLAL.S16)
        // are interleaved; B alternates between q4/d8 and q5/d10, and the MACs
        // alternate between acc1 (q8/q9) and acc2 (q2/q3) to hide latency.

        .p2align 3
1:
        // Extend the prefetched A0 and B0 bytes to 16-bit lanes
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0 for the next iteration (d1 lanes still pending below)
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0 for the next iteration
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue: same as the main loop body, but does not prefetch or
        // preload A0/B0 for a following iteration.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore k remainder (k was taken 8 below zero)

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? 1-7 bytes of A
        BNE     4f

3:
        # Reduce the two accumulator sets into one
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization: per-channel fp32 multipliers follow the
        # int8 weights in w (8 floats = 32 bytes into q0-q1).
        VLD1.8  {q0-q1},  [r9]!

        VCVT.F32.S32 q8,  q8
        VCVT.F32.S32 q9,  q9

        VMUL.F32 q8,  q8, q0            // multiplier
        VMUL.F32 q9,  q9, q1

        # VCVTN: ARMv8 round-to-nearest-even float->int conversion
        VCVTN.S32.F32 q8,  q8
        VCVTN.S32.F32 q9,  q9

        VDUP.16 q0, d13[2]              // output_zero_point

        # Narrow 32->16 with saturation, add zero point, narrow 16->8
        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8 (fewer than 8 columns left -> partial store at 5f)
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2              // rewind A by kc for the next column tile
        BHI     0b

        # Epilogue for the exact-multiple-of-8 nc case
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip the 32-byte pad (split into two adds)
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder: 1 to 7 bytes of A. Unrolled channel-by-channel; falls
        # back to 3b (quantize/store) as soon as r5 channels are consumed.
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        # NOTE(review): loads a full 8 bytes of A but advances only by r5;
        # assumes up to 7 bytes past the A row may be read safely - confirm
        # against the caller's buffer guarantees.
        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width: write 4, then 2, then 1 byte(s) based on the low
        # bits of nc, rotating d0 after each partial store.
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        # Epilogue for the odd-width (final) tile: restore and return
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip the 32-byte pad (split into two adds)
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
318
#ifdef __ELF__
// Mark the stack as non-executable on ELF targets (GNU linkers may otherwise
// default to an executable stack for objects built from assembly).
.section ".note.GNU-stack","",%progbits
#endif
322
323