1// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neonv8-mlal-lane-cortex-a35.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15// void xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
16//     size_t mr,                                     (r0)
17//     size_t nc,                                      r1
18//     size_t kc,                                     (r2) -> sp + 56 -> r5
19//     size_t ks,                                     (r3) -> sp + 60 -> r14
20//     const int8_t**restrict a,            sp + 88  -> r2
21//     const void*restrict w,              sp + 92  -> r9
22//     int8_t*restrict c,                   sp + 96  -> r11
23//     size_t cm_stride,                   sp + 100  -> r6
24//     size_t cn_stride,                   sp + 104  -> r12
25//     size_t a_offset,                    sp + 108 -> (r5)
26//     const int8_t* zero,                  sp + 112 -> r7
27//     xnn_qs8_minmax_params*params); sp + 116 -> (r5)
28
29// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
30
31// Based on cortex_a53 microkernel but with Neon loads
32
33// Register usage
34// A0   r3  d0-d1 q0
35
36// B    r9  d8-d9 q4 q5
37
38// C0  r11 d16-d17  q8  d18-d19  q9
39//         q2, q3 acc2
40
// Unused: r4, r8, r10, d14-d15, q10-q11, q13-q15 (q2-q3 hold acc2 as noted above; q0-q1 and d24-d25 are reused during quantization/clamping)
42
43// params structure is 4 bytes
44//  struct {
45//    int16_t output_zero_point;  d13[2]
46//    int8_t output_min;          d13[6]
47//    int8_t output_max;          d13[7]
48//  } xnn_qs8_minmax_params.neonv8;
49
BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
        # Push 88 bytes total so the stack arguments land at the sp offsets
        # documented above (a at sp+88 ... params at sp+116).
        # r2 (kc) and r3 (ks) are spilled here (sp+56 / sp+60) and reloaded
        # each time through the loops below.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
        SUB     sp, sp, 8                           // +8 (pad)
        VPUSH   {d8-d13}                            // +48 = 88 (d8-d13 are callee-saved per AAPCS)

        LDR     r2,  [sp, 88]           // a (array of MR=1 A pointers)
        LDR     r9,  [sp, 92]           // w (bias, int8 weights, then float multipliers)
        LDR     r11, [sp, 96]           // c
        LDR     r6,  [sp, 100]          // cm_stride (loaded but unused for MR=1 — presumably template symmetry; TODO confirm)
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7,  [sp, 112]          // zero
        LDR     r5,  [sp, 116]          // params
        MOV     r14, r3                 // p = ks

        # Load params values
        VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params: broadcast the 4-byte struct
                                        // {int16 zero_point, int8 min, int8 max} into both halves of d13

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 112]               // NOTE(review): breaks the 64-byte stride pattern (64, 128, 192, ...) — confirm against the generator template
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Outer (nc) loop: produce 8 output channels per iteration.
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8, q9
        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
        VMOV.I32 q3, 0

        .p2align 3
1:
        # ks loop: one indirection pointer per iteration (MR=1).
        # Load next A pointer
        LDR     r3, [r2,  0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3,  r7                 // if a0 == zero
        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 // a0 = zero (padding row, no offset), else a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes of A per iteration
        // 64 bytes for weights.
        // Software-pipelined: each BLOCK loads the next B row while the
        // previous (already widened) row feeds the lane MLALs; accumulators
        // alternate q8/q9 and q2/q3 to hide MLAL latency.

        .p2align 3
2:
        // Extend int8 -> int16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0 (next 8 bytes, for the following iteration)
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0 (first row of the next iteration)
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue: same 8 blocks as the main loop, but does not load the
        // next A or the next B0.

        .p2align 3
3:
        // Extend int8 -> int16
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // restore remainder count; Z set when kc was a multiple of 8

        // BLOCK 7
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder? 1-7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks (reload for the next nc iteration)

        VADD.S32 q8, q8, q2             // fold the second accumulator set into q8/q9
        VADD.S32 q9, q9, q3

        # QC8 FP32 quantization:
        # acc = round_to_nearest_even(float(acc) * channel_multiplier),
        # + zero_point, saturate to int8, clamp to [min, max].
        VLD1.8  {q0-q1},  [r9]!         // 8 float multipliers stored after the weights

        VCVT.F32.S32 q8,  q8
        VCVT.F32.S32 q9,  q9

        VMUL.F32 q8,  q8, q0            // multiplier
        VMUL.F32 q9,  q9, q1

        VCVTN.S32.F32 q8,  q8           // round to nearest even (ARMv8 VCVTN)
        VCVTN.S32.F32 q9,  q9

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8; flags drive BLO/BHI below

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f                      // fewer than 8 channels left -> partial store
        VST1.8  {d0}, [r11], r12
        SUB     r2, r2, r14             // a -= ks (plain SUB preserves the SUBS r1 flags for BHI)
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        VLD1.8  {d0},  [r3]             // NOTE: reads a full 8 bytes of A (no writeback); only the first r5 lanes are consumed below
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]         // channel 0 (always present)
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8},  [r9]!            // channel 1
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!            // channel 2
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8},  [r9]!            // channel 3
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8},  [r9]!            // channel 4
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8},  [r9]!            // channel 5
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!            // channel 6
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width (nc remainder 1-7): widest piece first,
        # rotating the next bytes down with VEXT after each store.
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35
351
#ifdef __ELF__
// Mark the stack as non-executable on ELF targets (avoids an executable-stack warning/default).
.section ".note.GNU-stack","",%progbits
#endif
355