1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
// void xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                            (r0)
//     size_t nc,                             r1
//     size_t kc,                            (r2) -> sp + 56 -> r5
//     size_t ks,                            (r3) -> sp + 60 -> r14
//     const int8_t** restrict a,             sp + 88  -> r2
//     const void* restrict w,                sp + 92  -> r9
//     int8_t* restrict c,                    sp + 96  -> r11
//     size_t cm_stride,                      sp + 100 -> r6
//     size_t cn_stride,                      sp + 104 -> r12
//     size_t a_offset,                       sp + 108 -> (r5)
//     const int8_t* zero,                    sp + 112 -> r7
//     xnn_qs8_minmax_params* params)         sp + 116 -> (r5)
28
29// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
30
31// Based on cortex_a53 microkernel but with Neon loads
32
33// Register usage
34// A0   r3  d0-d1 q0
35
// B    r9  d8-d11 q4 q5
37
38// C0  r11 d16-d17  q8  d18-d19  q9
39//         q2, q3 acc2
40
// Unused r4, r8, r10, d14-d15 (q7), q10-q11, q13-q15
42
43// params structure is 10 bytes
44// struct {
45//   float magic_bias;                           d12[0]
46//   int32_t magic_bias_less_output_zero_point;  d12[1]
47//   int8_t output_min;                          d13[6]
48//   int8_t output_max;                          d13[7]
49// } xnn_qs8_minmax_params.neon;
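
// Illustrative scalar sketch (not part of this kernel) of the FP32 "magic
// bias" requantization these params drive; fp32_to_bits() stands in for a
// float-to-bits cast helper and is an assumption here:
//
//   float fpacc = (float) acc * scale;                 // per-channel scale stored with w
//   fpacc += magic_bias;                               // pins the FP32 exponent
//   int32_t out = (int32_t) fp32_to_bits(fpacc)
//                 - magic_bias_less_output_zero_point; // VQSUB.S32 saturates this
//   // narrow to int8 with saturation (VQMOVN), then clamp to
//   // [output_min, output_max] with VMAX.S8/VMIN.S8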
50
51BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
52        # Push 88 bytes
53        # r2, r3 will be reloaded in outer loop.
54        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
55        SUB     sp, sp, 8                           // +8
56        VPUSH   {d8-d13}                            // +48 = 88
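
        # Frame layout after the pushes above: 32 bytes of GPRs + 8 bytes of
        # padding + 48 bytes of VFP registers = 88 bytes, so the stack
        # arguments start at sp + 88 and the saved kc/ks (r2/r3) sit at
        # sp + 56 / sp + 60, matching the offsets in the signature comment.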
57
58        LDR     r2,  [sp, 88]           // a
59        LDR     r9,  [sp, 92]           // w
60        LDR     r11, [sp, 96]           // c
61        LDR     r6,  [sp, 100]          // cm_stride
62        LDR     r12, [sp, 104]          // cn_stride
63        LDR     r7,  [sp, 112]          // zero
64        LDR     r5,  [sp, 116]          // params
65        MOV     r14, r3                 // p = ks
66
67        # Load params values
68        VLDM    r5!, {d12}              // QC8 neon params
69        VLD1.16 {d13[]}, [r5]
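        # d12 now holds magic_bias (lane 0) and magic_bias_less_output_zero_point
        # (lane 1); the VLD1.16 with [] replicates the output_min/output_max byte
        # pair across d13, so they are read back from byte lanes 6 and 7 below.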
70
71
72        .p2align 3
730:
74        # Load initial bias from w into accumulators
75        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
77        VMOV.I32 q3, 0
78
79        .p2align 3
801:
81        # Load next A pointer
82        LDR     r3, [r2,  0]
83
84        # Add a_offset
85        LDR     r5, [sp, 108]           // a_offset
86        ADD     r2, r2, 4
87        CMP     r3,  r7                 // if a0 == zero
88        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 //   a0 = zero, else a0 += a_offset
90
91        LDR     r5, [sp, 56]            // kc
92        SUBS    r5, r5, 8               // kc - 8
93        BLO     5f                      // less than 8 channels?
94
95        // Prologue - load A0 and B0
96        VLD1.8  {d0},  [r3]!            // A0
97        SUBS    r5, r5, 8               // k = k - 8
98        VLD1.8  {d8},  [r9]!            // B0
99        BLO     3f                      // less than 8 channels?
100
101        // Main loop - 8 bytes
102        // 64 bytes for weights.
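        // Each iteration consumes 8 bytes of A0 and 8x8 bytes of B: the int8
        // vectors are widened to int16 with VMOVL.S8 and accumulated with
        // VMLAL.S16, alternating between the q8/q9 and q2/q3 accumulator sets
        // so consecutive multiply-accumulates target independent registers.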
103
104        .p2align 3
1052:
106        // Extend
107        VMOVL.S8 q0, d0
108        VMOVL.S8 q4, d8
109
110        // BLOCK 0
111        VLD1.8  {d10},  [r9]!           // B1
112        VMLAL.S16 q8, d8, d0[0]
113        VMLAL.S16 q9, d9, d0[0]
114        VMOVL.S8 q5, d10
115
116        // BLOCK 1
117        VLD1.8  {d8},  [r9]!            // B2
118        VMLAL.S16 q2, d10, d0[1]
119        VMLAL.S16 q3, d11, d0[1]
120        VMOVL.S8 q4, d8
121
122        // BLOCK 2
123        VLD1.8  {d10},  [r9]!           // B3
124        VMLAL.S16 q8, d8, d0[2]
125        VMLAL.S16 q9, d9, d0[2]
126        VMOVL.S8 q5, d10
127
128        // BLOCK 3
129        VLD1.8  {d8},  [r9]!            // B4
130        VMLAL.S16 q2, d10, d0[3]
131        VMLAL.S16 q3, d11, d0[3]
132        VLD1.8  {d0},  [r3]!            // A0
133        VMOVL.S8 q4, d8
134
135        // BLOCK 4
136        VLD1.8  {d10},  [r9]!           // B5
137        VMLAL.S16 q8, d8, d1[0]
138        VMLAL.S16 q9, d9, d1[0]
139        VMOVL.S8 q5, d10
140
141        // BLOCK 5
142        VLD1.8  {d8},  [r9]!            // B6
143        VMLAL.S16 q2, d10, d1[1]
144        VMLAL.S16 q3, d11, d1[1]
145        VMOVL.S8 q4, d8
146
147        // BLOCK 6
148        VLD1.8  {d10},  [r9]!           // B7
149        VMLAL.S16 q8, d8, d1[2]
150        VMLAL.S16 q9, d9, d1[2]
151        VMOVL.S8 q5, d10
152        SUBS    r5, r5, 8
153
154        // BLOCK 7
155        VLD1.8  {d8},  [r9]!            // B0
156        VMLAL.S16 q2, d10, d1[3]
157        VMLAL.S16 q3, d11, d1[3]
158        BHS     2b
159
160        // Epilogue
161
162        .p2align 3
1633:
164        // Extend
165        VMOVL.S8 q0, d0
166        VMOVL.S8 q4, d8
167
168        // BLOCK 0
169        VLD1.8  {d10},  [r9]!           // B1
170        VMLAL.S16 q8, d8, d0[0]
171        VMLAL.S16 q9, d9, d0[0]
172        VMOVL.S8 q5, d10
173
174        // BLOCK 1
175        VLD1.8  {d8},  [r9]!            // B2
176        VMLAL.S16 q2, d10, d0[1]
177        VMLAL.S16 q3, d11, d0[1]
178        VMOVL.S8 q4, d8
179
180        // BLOCK 2
181        VLD1.8  {d10},  [r9]!           // B3
182        VMLAL.S16 q8, d8, d0[2]
183        VMLAL.S16 q9, d9, d0[2]
184        VMOVL.S8 q5, d10
185
186        // BLOCK 3
187        VLD1.8  {d8},  [r9]!            // B4
188        VMLAL.S16 q2, d10, d0[3]
189        VMLAL.S16 q3, d11, d0[3]
190        VMOVL.S8 q4, d8
191
192        // BLOCK 4
193        VLD1.8  {d10},  [r9]!           // B5
194        VMLAL.S16 q8, d8, d1[0]
195        VMLAL.S16 q9, d9, d1[0]
196        VMOVL.S8 q5, d10
197
198        // BLOCK 5
199        VLD1.8  {d8},  [r9]!            // B6
200        VMLAL.S16 q2, d10, d1[1]
201        VMLAL.S16 q3, d11, d1[1]
202        VMOVL.S8 q4, d8
203
204        // BLOCK 6
205        VLD1.8  {d10},  [r9]!           // B7
206        VMLAL.S16 q8, d8, d1[2]
207        VMLAL.S16 q9, d9, d1[2]
208        VMOVL.S8 q5, d10
209        ADDS    r5, r5, 8
210
211        VMLAL.S16 q2, d10, d1[3]
212        VMLAL.S16 q3, d11, d1[3]
213
        # Is there a remainder? 1 to 7 bytes of A
215        BNE     6f
216
2174:
218        # ks loop
219        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
220        BHI     1b
221
222        LDR     r14, [sp, 60]           // p = ks
223
224        VADD.S32 q8, q8, q2
225        VADD.S32 q9, q9, q3
226
227        # QC8 FP32 quantization
        VLD1.8  {q0-q1},  [r9]!         // per-channel fp32 multipliers
229
230        VDUP.32 q2, d12[0]              // magic_bias
231        VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point
232
233        VCVT.F32.S32 q8,  q8
234        VCVT.F32.S32 q9,  q9
235
236        VMUL.F32 q8,  q8, q0            // multiplier
237        VMUL.F32 q9,  q9, q1
238
239        VADD.F32 q8,  q8, q2            // magic_bias
240        VADD.F32 q9,  q9, q2
241
242        VQSUB.S32 q8,  q8, q3           // magic_bias_less_output_zero_point
243        VQSUB.S32 q9,  q9, q3
244
245
246        VQMOVN.S32 d16, q8
247        VQMOVN.S32 d17, q9
248
249
250        VDUP.8  d24, d13[6]             // output_min
251
252        VQMOVN.S16 d0,  q8
253
254        VDUP.8  d25, d13[7]             // output_max
255
256        VMAX.S8 d0, d0, d24
257
        SUBS    r1, r1, 8               // nc -= 8
259
260        VMIN.S8 d0, d0, d25
261
262        # Store full 1 x 8
263        BLO     7f
264        VST1.8  {d0}, [r11], r12
265        SUB     r2, r2, r14             // a -= ks
266        BHI     0b
267
268        VPOP    {d8-d13}
269        ADD     sp, sp, 16              // skip pad of 8, r2, r3
270        POP     {r5, r6, r7, r9, r11, pc}
271
        # Remainder - 1 to 7 bytes of A
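        # The partial A vector is loaded once (no post-increment on r3) and the
        # matching B vectors are multiplied in lane by lane, branching back to
        # the ks loop at 4b after the last valid lane.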
273        .p2align 3
2745:
275        AND     r5, r5, 7               // kc remainder 1 to 7
2766:
277        VLD1.8  {d0},  [r3]
278        VLD1.8  {d8},  [r9]!
279
280        VMOVL.S8 q0, d0
281        VMOVL.S8 q4, d8
282        VMLAL.S16 q8, d8, d0[0]
283        VMLAL.S16 q9, d9, d0[0]
284        CMP     r5, 2
285        BLO     4b
286
287        VLD1.8  {d8},  [r9]!
288        VMOVL.S8 q4, d8
289        VMLAL.S16 q8, d8, d0[1]
290        VMLAL.S16 q9, d9, d0[1]
291        BEQ     4b
292
293        VLD1.8  {d8},  [r9]!
294        VMOVL.S8 q4, d8
295        VMLAL.S16 q8, d8, d0[2]
296        VMLAL.S16 q9, d9, d0[2]
297        CMP     r5, 4
298        BLO     4b
299
300        VLD1.8  {d8},  [r9]!
301        VMOVL.S8 q4, d8
302        VMLAL.S16 q8, d8, d0[3]
303        VMLAL.S16 q9, d9, d0[3]
304        BEQ     4b
305
306        VLD1.8  {d8},  [r9]!
307        VMOVL.S8 q4, d8
308        VMLAL.S16 q8, d8, d1[0]
309        VMLAL.S16 q9, d9, d1[0]
310        CMP     r5, 6
311        BLO     4b
312
313        VLD1.8  {d8},  [r9]!
314        VMOVL.S8 q4, d8
315        VMLAL.S16 q8, d8, d1[1]
316        VMLAL.S16 q9, d9, d1[1]
317        BEQ     4b
318
319        VLD1.8  {d8},  [r9]!
320        VMOVL.S8 q4, d8
321        VMLAL.S16 q8, d8, d1[2]
322        VMLAL.S16 q9, d9, d1[2]
323        B       4b
324
325        # Store odd width
326        .p2align 3
3277:
328        TST     r1, 4
329        BEQ     8f
330        VST1.32 {d0[0]}, [r11]!
331        VEXT.8  q0, q0, q0, 4
3328:
333        TST     r1, 2
334        BEQ     9f
335        VST1.16 {d0[0]}, [r11]!
336        VEXT.8  q0, q0, q0, 2
337
3389:
339        TST     r1, 1
340        BEQ     10f
341        VST1.8  {d0[0]}, [r11]
342
34310:
344        VPOP    {d8-d13}
345        ADD     sp, sp, 16              // skip pad of 8, r2, r3
346        POP     {r5, r6, r7, r9, r11, pc}
347
348END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
349
350#ifdef __ELF__
351.section ".note.GNU-stack","",%progbits
352#endif
353