// xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const int8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     int8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qs8_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on the cortex_a53 microkernel, but with Neon loads.

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d9 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 second accumulator set (acc2)

// Unused: r4, r6, r8, r10, r12, d15, q10-q15, q1

// params structure is 10 bytes
// struct {
//   float magic_bias;                           d12[0]
//   int32_t magic_bias_less_output_zero_point;  d12[1]
//   int8_t output_min;                          d13[6]
//   int8_t output_max;                          d13[7]
// } xnn_qs8_minmax_params.neon;
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 32                          // +32
        VPUSH   {d8-d13}                            // +48 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLDM    r5!, {d12}              // QC8 neon params
        VLD1.16 {d13[]}, [r5]           // output_min/max
        LDR     r7, [sp, 112]            // cn_stride


        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue - same as main loop body, but without reloading A0/B0 for
        // the next iteration.

        .p2align 3
2:
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? (1-7 bytes of A)
        BNE     4f

3:
        # Merge the two accumulator sets
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization
        VLD1.8  {q0-q1},  [r9]!         // per-channel multipliers

        VDUP.32 q2, d12[0]              // magic_bias
        VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point

        VCVT.F32.S32 q8,  q8
        VCVT.F32.S32 q9,  q9

        VMUL.F32 q8,  q8, q0            // multiplier
        VMUL.F32 q9,  q9, q1

        VADD.F32 q8,  q8, q2            // magic_bias
        VADD.F32 q9,  q9, q2

        VQSUB.S32 q8,  q8, q3           // magic_bias_less_output_zero_point
        VQSUB.S32 q9,  q9, q3


        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9


        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f                      // nc < 8? store odd width
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2              // rewind A0 for the next column block
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // restore the 32 bytes reserved in the prologue
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7

        VLD1.8  {d0},  [r3], r5         // load 8 bytes, but advance A0 by only r5
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width (nc < 8): store 4/2/1 bytes, shifting d0 down as we go
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // restore the 32 bytes reserved in the prologue
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
