// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const int8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     int8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qs8_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14(lr) need to be preserved if used. r13(sp), r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d11 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d14-d15, q10-q11, q13-q15

// params structure is 10 bytes
// struct {
//   float magic_bias;                           d12[0]
//   int32_t magic_bias_less_output_zero_point;  d12[1]
//   int8_t output_min;                          d13[6]
//   int8_t output_max;                          d13[7]
// } xnn_qs8_minmax_params.neon;
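//
// The first 8 bytes (magic_bias and magic_bias_less_output_zero_point) are
// loaded into d12 with VLDM below.  VLD1.16 {d13[]} then replicates the packed
// output_min/output_max byte pair into every 16-bit lane of d13, which is why
// the clamp values are read from byte lanes d13[6] and d13[7].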

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 32                          // +32
        VPUSH   {d8-d13}                            // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLDM    r5!, {d12}              // QC8 neon params
        VLD1.16 {d13[]}, [r5]           // output_min/max
        LDR     r7, [sp, 112]           // cn_stride

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining MLAL
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        PLD     [r3,  64]               // Prefetch A
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
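        //
        // Each iteration consumes 8 bytes of A and eight 8-byte rows of B.
        // Every BLOCK widens one row of B to 16 bits (VMOVL.S8) and
        // multiply-accumulates it against a single lane of A (VMLAL.S16).
        // Even blocks accumulate into q8/q9, odd blocks into q2/q3, so two
        // MLAL chains stay in flight; B is loaded one block ahead and the next
        // A0 is fetched in BLOCK 3 to hide load latency.  The two accumulator
        // sets are summed after the loop (label 3).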

        .p2align 3
1:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue
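        //
        // A final unrolled pass over the last full 8 bytes of A.  It mirrors
        // the main loop but issues no prefetch and does not load the next
        // A0/B0.  ADDS brings r5 back to the 0-7 byte remainder of kc so the
        // BNE below can branch to the remainder handler when it is non-zero.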

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? - 1 to 7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
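        //
        // Per-channel FP32 multipliers for the 8 output channels follow the
        // weights in w and are loaded into q0-q1.  The accumulators are
        // converted to float and scaled; adding the "magic bias" constant
        // places the rounded result in the low mantissa bits of the float.
        // VQSUB.S32 then treats those bits as int32 and subtracts
        // magic_bias_less_output_zero_point (the bias bit pattern minus the
        // output zero point), giving the rounded, zero-point-adjusted value
        // with saturation.  VQMOVN narrows to int16 and then int8 before the
        // final min/max clamp.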
        VLD1.8  {q0-q1},  [r9]!

        VDUP.32 q2, d12[0]              // magic_bias
        VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point

        VCVT.F32.S32 q8,  q8
        VCVT.F32.S32 q9,  q9

        VMUL.F32 q8,  q8, q0            // multiplier
        VMUL.F32 q9,  q9, q1

        VADD.F32 q8,  q8, q2            // magic_bias
        VADD.F32 q9,  q9, q2

        VQSUB.S32 q8,  q8, q3           // magic_bias_less_output_zero_point
        VQSUB.S32 q9,  q9, q3

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
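        //
        // nc (r1) was decremented by 8 above.  If fewer than 8 outputs remain,
        // branch to the partial store.  Otherwise store all 8 int8 outputs,
        // advance c by cn_stride, rewind the A pointer by kc (r2), and loop
        // back for the next group of 8 output channels.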
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip 32 bytes of stack
        ADD     sp, sp, 16              //   reserved in the prologue
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7
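        //
        // VLD1.8 fills all of d0, but the A pointer only advances by the r5
        // valid bytes and only lanes 0..r5-1 are consumed: the CMP/BLO/BEQ
        // chain below branches back to 3b after r5 of the per-byte steps.
        // B still supplies one full 8-byte row per step.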

        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
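        //
        // Store the remaining 1-7 outputs: 4 bytes if bit 2 of nc is set,
        // then 2 bytes if bit 1, then 1 byte if bit 0.  After each partial
        // store VEXT rotates d0 down so the next unwritten lanes move into
        // position.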
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip 32 bytes of stack
        ADD     sp, sp, 16              //   reserved in the prologue
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif