1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15// void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
16//     size_t mr,                                     (r0)
17//     size_t nc,                                      r1
18//     size_t kc,                                     (r2) -> sp + 56 -> r5
19//     size_t ks,                                     (r3) -> sp + 60 -> r14
20//     const int8_t**restrict a,            sp + 88  -> r2
21//     const void*restrict w,              sp + 92  -> r9
22//     int8_t*restrict c,                   sp + 96  -> r11
23//     size_t cm_stride,                   sp + 100  -> r6
24//     size_t cn_stride,                   sp + 104  -> r12
25//     size_t a_offset,                    sp + 108 -> (r5)
26//     const int8_t* zero,                  sp + 112 -> r7
27//     xnn_qs8_conv_minmax_params*params); sp + 116 -> (r5)
28
29// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
30
31// Based on cortex_a53 microkernel but with Neon loads
32
33// Register usage
34// A0   r3  d0-d1 q0
35
36// B    r9  d8-d9 q4 q5
37
38// C0  r11 d16-d17  q8  d18-d19  q9
39//         q2, q3 acc2
40
41// Unused r4, r8, r10, q1, d14-d15 (q7), q10-q11, q13-q15
41// (q2-q3 hold the second accumulator set; d24-d25/q12 hold output_min/max)
42
43// params structure is 16 bytes
44//  struct {
45//    int32_t right_pre_shift;    d12[0]
46//    int32_t multiplier;         d12[1]
47//    int32_t right_post_shift;   d13[0]
48//    int16_t output_zero_point;  d13[2]
49//    int8_t output_min;          d13[6]
50//    int8_t output_max;          d13[7]
51//  } rndnu_neon;
52
53BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
        # Prologue: spill r2 (kc) and r3 (ks) first so they can be reloaded
        # from the stack on every loop iteration, then save the callee-saved
        # registers that this kernel uses (r5-r7, r9, r11, lr, d8-d13).
        # Total frame is 88 bytes, so the stack arguments listed in the
        # header above are found at [sp + 88] and up after these pushes.
54        # Push 88 bytes
55        # r2, r3 will be reloaded in outer loop.
56        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
57        SUB     sp, sp, 8                           // +8
58        VPUSH   {d8-d13}                            // +48 = 88
59
        # Move the stack-passed arguments into their working registers
        # (register assignments documented in the header comment above).
60        LDR     r2,  [sp, 88]           // a
61        LDR     r9,  [sp, 92]           // w
62        LDR     r11, [sp, 96]           // c
63        LDR     r6,  [sp, 100]          // cm_stride
64        LDR     r12, [sp, 104]          // cn_stride
65        LDR     r7,  [sp, 112]          // zero
66        LDR     r5,  [sp, 116]          // params
67        MOV     r14, r3                 // p = ks
68
69        # Load params values
70        VLDM    r5, {d12-d13}           // RNDNU params
71
72
        # Outer loop: one iteration per 8-wide column tile of the output.
        # nc is counted down in r1 near the bottom of the loop (SUBS r1, r1, 8).
73        .p2align 3
740:
75        # Load initial bias from w into accumulators
76        VLDM    r9!, {d16-d19}          // Bias
77        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
78        VMOV.I32 q3, 0
79
        # ks loop: one A pointer consumed per iteration (MR = 1, so the
        # indirection buffer r2 advances by 4 bytes per pass).
80        .p2align 3
811:
82        # Load next A pointer
83        LDR     r3, [r2,  0]
84
85        # Add a_offset
86        LDR     r5, [sp, 108]           // a_offset
87        ADD     r2, r2, 4
        # If a0 is the zero-buffer sentinel, use the zero buffer directly
        # (without a_offset).  ADD does not modify the flags set by CMP,
        # so MOVEQ still observes the comparison result.
88        CMP     r3,  r7                 // if a0 == zero
89        ADD     r3,  r3, r5             // a0 += a_offset
90        MOVEQ   r3,  r7                 //   a0 = zero, else += a0 + a_offset
91
92        LDR     r5, [sp, 56]            // kc
93        SUBS    r5, r5, 8               // kc - 8
94        BLO     5f                      // less than 8 channels?
95
96        // Prologue - load A0 and B0
97        VLD1.8  {d0},  [r3]!            // A0
98        SUBS    r5, r5, 8               // k = k - 8
99        VLD1.8  {d8},  [r9]!            // B0
100        BLO     3f                      // less than 8 channels?
101
102        // Main loop - 8 bytes
103        // 64 bytes for weights.
        // Software-pipelined: each BLOCK issues the VLD1 of the next 8
        // weights (alternating d8/d10) while multiply-accumulating the
        // previously loaded and sign-extended pair.  The 8 A lanes
        // d0[0..3]/d1[0..3] drive one VMLAL pair each, alternating between
        // accumulator sets q8/q9 and q2/q3 (merged after the ks loop).
104
105        .p2align 3
1062:
107        // Extend
108        VMOVL.S8 q0, d0
109        VMOVL.S8 q4, d8
110
111        // BLOCK 0
112        VLD1.8  {d10},  [r9]!           // B1
113        VMLAL.S16 q8, d8, d0[0]
114        VMLAL.S16 q9, d9, d0[0]
115        VMOVL.S8 q5, d10
116
117        // BLOCK 1
118        VLD1.8  {d8},  [r9]!            // B2
119        VMLAL.S16 q2, d10, d0[1]
120        VMLAL.S16 q3, d11, d0[1]
121        VMOVL.S8 q4, d8
122
123        // BLOCK 2
124        VLD1.8  {d10},  [r9]!           // B3
125        VMLAL.S16 q8, d8, d0[2]
126        VMLAL.S16 q9, d9, d0[2]
127        VMOVL.S8 q5, d10
128
129        // BLOCK 3
130        VLD1.8  {d8},  [r9]!            // B4
131        VMLAL.S16 q2, d10, d0[3]
132        VMLAL.S16 q3, d11, d0[3]
133        VLD1.8  {d0},  [r3]!            // A0
134        VMOVL.S8 q4, d8
135
136        // BLOCK 4
137        VLD1.8  {d10},  [r9]!           // B5
138        VMLAL.S16 q8, d8, d1[0]
139        VMLAL.S16 q9, d9, d1[0]
140        VMOVL.S8 q5, d10
141
142        // BLOCK 5
143        VLD1.8  {d8},  [r9]!            // B6
144        VMLAL.S16 q2, d10, d1[1]
145        VMLAL.S16 q3, d11, d1[1]
146        VMOVL.S8 q4, d8
147
148        // BLOCK 6
149        VLD1.8  {d10},  [r9]!           // B7
150        VMLAL.S16 q8, d8, d1[2]
151        VMLAL.S16 q9, d9, d1[2]
152        VMOVL.S8 q5, d10
153        SUBS    r5, r5, 8
154
155        // BLOCK 7
156        VLD1.8  {d8},  [r9]!            // B0
157        VMLAL.S16 q2, d10, d1[3]
158        VMLAL.S16 q3, d11, d1[3]
159        BHS     2b
160
161        // Epilogue
        // Same 8-k block as the main loop, but without prefetching the next
        // A vector or the next B0 — this drains the pipeline for the final
        // full group of 8 k values.
162
163        .p2align 3
1643:
165        // Extend
166        VMOVL.S8 q0, d0
167        VMOVL.S8 q4, d8
168
169        // BLOCK 0
170        VLD1.8  {d10},  [r9]!           // B1
171        VMLAL.S16 q8, d8, d0[0]
172        VMLAL.S16 q9, d9, d0[0]
173        VMOVL.S8 q5, d10
174
175        // BLOCK 1
176        VLD1.8  {d8},  [r9]!            // B2
177        VMLAL.S16 q2, d10, d0[1]
178        VMLAL.S16 q3, d11, d0[1]
179        VMOVL.S8 q4, d8
180
181        // BLOCK 2
182        VLD1.8  {d10},  [r9]!           // B3
183        VMLAL.S16 q8, d8, d0[2]
184        VMLAL.S16 q9, d9, d0[2]
185        VMOVL.S8 q5, d10
186
187        // BLOCK 3
188        VLD1.8  {d8},  [r9]!            // B4
189        VMLAL.S16 q2, d10, d0[3]
190        VMLAL.S16 q3, d11, d0[3]
191        VMOVL.S8 q4, d8
192
193        // BLOCK 4
194        VLD1.8  {d10},  [r9]!           // B5
195        VMLAL.S16 q8, d8, d1[0]
196        VMLAL.S16 q9, d9, d1[0]
197        VMOVL.S8 q5, d10
198
199        // BLOCK 5
200        VLD1.8  {d8},  [r9]!            // B6
201        VMLAL.S16 q2, d10, d1[1]
202        VMLAL.S16 q3, d11, d1[1]
203        VMOVL.S8 q4, d8
204
205        // BLOCK 6
206        VLD1.8  {d10},  [r9]!           // B7
207        VMLAL.S16 q8, d8, d1[2]
208        VMLAL.S16 q9, d9, d1[2]
209        VMOVL.S8 q5, d10
        # r5 went negative when the main loop exited; adding 8 back yields
        # the 0..7 byte k remainder (Z flag set means no remainder).
210        ADDS    r5, r5, 8
211
212        VMLAL.S16 q2, d10, d1[3]
213        VMLAL.S16 q3, d11, d1[3]
214
215        # Is there a remainder?- 1-7 bytes of A
216        BNE     6f
217
2184:
219        # ks loop
220        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
221        BHI     1b
222
223        LDR     r14, [sp, 60]           // p = ks
224
        # Merge the two accumulator sets, then requantize (RNDNU):
        # saturating pre-shift, saturating doubling multiply-high,
        # rounding post-shift, add zero point, narrow to int8, clamp.
        # NOTE(review): VQSHL shifts left by a signed per-lane amount, so
        # right_pre_shift is presumably stored negated — confirm in the
        # params initialization.
225        VADD.S32 q8, q8, q2
226        VADD.S32 q9, q9, q3
227
228        # RNDNU quantization
229        VDUP.32 q0, d12[0]              // right_pre_shift
230
231        VQSHL.S32 q8,  q8, q0
232        VQSHL.S32 q9,  q9, q0
233
234        VDUP.32 q2, d13[0]              // right_post_shift
235
236        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
237        VQDMULH.S32 q9,  q9, d12[1]
238
239        VRSHL.S32 q8,  q8, q2
240        VRSHL.S32 q9,  q9, q2
241
242        VDUP.16 q0, d13[2]              // output_zero_point
243
244        VQMOVN.S32 d16, q8
245        VQMOVN.S32 d17, q9
246
247        VQADD.S16 q8,  q8, q0
248
249        VDUP.8  d24, d13[6]             // output_min
250
251        VQMOVN.S16 d0,  q8
252
253        VDUP.8  d25, d13[7]             // output_max
254
255        VMAX.S8 d0, d0, d24
256
        # nc -= 8.  The flags set here survive the stores/SUB below:
        # BLO -> partial tile (nc < 8), BHI -> more full tiles remain.
257        SUBS    r1, r1, 8
258
259        VMIN.S8 d0, d0, d25
260
261        # Store full 1 x 8
262        BLO     7f
263        VST1.8  {d0}, [r11], r12
        # Rewind the indirection buffer so the next nc tile re-reads the
        # same ks A pointers.
264        SUB     r2, r2, r14             // a -= ks
265        BHI     0b
266
267        VPOP    {d8-d13}
268        ADD     sp, sp, 16              // skip pad of 8, r2, r3
269        POP     {r5, r6, r7, r9, r11, pc}
270
271        # Remainder- 1 to 7 bytes of A
        # Unrolled tail: one widened B column accumulated per remaining k
        # value, into the q8/q9 accumulator set only.  Falls back to the
        # ks loop (label 4) as soon as r5 k values have been consumed.
272        .p2align 3
2735:
274        AND     r5, r5, 7               // kc remainder 1 to 7
2756:
        # NOTE(review): loads a full 8 bytes of A even when fewer remain —
        # assumes the A rows are readable past the end; confirm padding
        # guarantees at the call site.
276        VLD1.8  {d0},  [r3]
277        VLD1.8  {d8},  [r9]!
278
279        VMOVL.S8 q0, d0
280        VMOVL.S8 q4, d8
281        VMLAL.S16 q8, d8, d0[0]
282        VMLAL.S16 q9, d9, d0[0]
283        CMP     r5, 2
284        BLO     4b
285
286        VLD1.8  {d8},  [r9]!
287        VMOVL.S8 q4, d8
288        VMLAL.S16 q8, d8, d0[1]
289        VMLAL.S16 q9, d9, d0[1]
290        BEQ     4b
291
292        VLD1.8  {d8},  [r9]!
293        VMOVL.S8 q4, d8
294        VMLAL.S16 q8, d8, d0[2]
295        VMLAL.S16 q9, d9, d0[2]
296        CMP     r5, 4
297        BLO     4b
298
299        VLD1.8  {d8},  [r9]!
300        VMOVL.S8 q4, d8
301        VMLAL.S16 q8, d8, d0[3]
302        VMLAL.S16 q9, d9, d0[3]
303        BEQ     4b
304
305        VLD1.8  {d8},  [r9]!
306        VMOVL.S8 q4, d8
307        VMLAL.S16 q8, d8, d1[0]
308        VMLAL.S16 q9, d9, d1[0]
309        CMP     r5, 6
310        BLO     4b
311
312        VLD1.8  {d8},  [r9]!
313        VMOVL.S8 q4, d8
314        VMLAL.S16 q8, d8, d1[1]
315        VMLAL.S16 q9, d9, d1[1]
316        BEQ     4b
317
318        VLD1.8  {d8},  [r9]!
319        VMOVL.S8 q4, d8
320        VMLAL.S16 q8, d8, d1[2]
321        VMLAL.S16 q9, d9, d1[2]
322        B       4b
323
324        # Store odd width
        # Partial tile (nc in 1..7): store 4, then 2, then 1 byte according
        # to the low bits of nc, rotating the vector down with VEXT after
        # each store so the next unwritten lane is at d0[0].
325        .p2align 3
3267:
327        TST     r1, 4
328        BEQ     8f
329        VST1.32 {d0[0]}, [r11]!
330        VEXT.8  q0, q0, q0, 4
3318:
332        TST     r1, 2
333        BEQ     9f
334        VST1.16 {d0[0]}, [r11]!
335        VEXT.8  q0, q0, q0, 2
336
3379:
338        TST     r1, 1
339        BEQ     10f
340        VST1.8  {d0[0]}, [r11]
341
34210:
        # Epilogue: restore NEON callee-saved registers, drop the 16 bytes
        # covering the 8-byte pad plus spilled r2/r3, pop and return.
343        VPOP    {d8-d13}
344        ADD     sp, sp, 16              // skip pad of 8, r2, r3
345        POP     {r5, r6, r7, r9, r11, pc}
346
347END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
348
349#ifdef __ELF__
350.section ".note.GNU-stack","",%progbits
351#endif
352