// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                                     (r0)
//     size_t nc,                                      r1
//     size_t kc,                                     (r2) -> sp + 56 -> r5
//     size_t ks,                                     (r3) -> sp + 60 -> r14
//     const int8_t**restrict a,            sp + 88  -> r2
//     const void*restrict w,              sp + 92  -> r9
//     int8_t*restrict c,                   sp + 96  -> r11
//     size_t cm_stride,                   sp + 100  -> r6
//     size_t cn_stride,                   sp + 104  -> r12
//     size_t a_offset,                    sp + 108 -> (r5)
//     const int8_t* zero,                  sp + 112 -> r7
//     xnn_qs8_conv_minmax_params*params); sp + 116 -> (r5)
//
// The sp offsets above are relative to sp after the prologue (88 bytes pushed).
// Registers in parentheses are only held temporarily. mr (r0) is never read
// and cm_stride (r6) is loaded but not used afterwards: the kernel handles a
// single row.

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d9 q4  d10-d11 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 acc2

// params   d12-d13 q6
// min/max  d24, d25

// Unused r4, r8, r10, q1, d14-d15, q10-q11, q13-q15

// params structure is 16 bytes
//  struct {
//    int32_t right_pre_shift;    d12[0]
//    int32_t multiplier;         d12[1]
//    int32_t right_post_shift;   d13[0]
//    int16_t output_zero_point;  d13[2]
//    int8_t output_min;          d13[6]
//    int8_t output_max;          d13[7]
//  } rndnu_neon;

// Stack frame after the prologue (offsets from sp):
//    0 - 47   d8-d13 (callee-saved NEON registers)
//   48 - 55   padding
//   56        saved r2 = kc   (reloaded once per A pointer)
//   60        saved r3 = ks   (reloaded once per group of 8 outputs)
//   64 - 87   saved r5, r6, r7, r9, r11, lr
//   88 - ...  stack-passed arguments (a, w, c, ...)

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
        SUB     sp, sp, 8                           // +8
        VPUSH   {d8-d13}                            // +48 = 88

        LDR     r2,  [sp, 88]           // a
        LDR     r9,  [sp, 92]           // w
        LDR     r11, [sp, 96]           // c
        LDR     r6,  [sp, 100]          // cm_stride
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7,  [sp, 112]          // zero
        LDR     r5,  [sp, 116]          // params
        MOV     r14, r3                 // p = ks

        # Load params values
        VLDM    r5, {d12-d13}           // RNDNU params

        // Prefetch hints for the packed weights. Offsets advance in 64-byte
        // steps so that every PLD touches a different 64-byte window.
        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        // Outer loop: one iteration per group of up to 8 output channels.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        VMOV.I32 q3, 0

        // ks loop: one iteration per entry of the indirection buffer a.
        .p2align 3
1:
        # Load next A pointer
        LDR     r3, [r2,  0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3,  r7                 // if a0 == zero
        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 //   a0 = zero, else += a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
        // Even blocks accumulate into q8/q9, odd blocks into q2/q3; the two
        // sets are summed after the ks loop.

        .p2align 3
2:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        // d0 is free after its last lane is consumed above; d1 still holds
        // lanes 4-7 of the current A for blocks 4-7.
        VLD1.8  {d0},  [r3]!            // A0
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        SUBS    r5, r5, 8               // flags are consumed by BHS below

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue
        // Same as the main loop but without loading the next A0 / B0.

        .p2align 3
3:
        // Extend
        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VMOVL.S8 q5, d10
        ADDS    r5, r5, 8               // r5 = remaining channels (0 to 7)

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks

        // Merge the two accumulator sets.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization
        // NOTE(review): VQSHL/VRSHL shift left for positive counts, so the
        // right_*_shift fields are presumably stored negated - confirm
        // against the params initialization code.
        VDUP.32 q0, d12[0]              // right_pre_shift

        VQSHL.S32 q8,  q8, q0
        VQSHL.S32 q9,  q9, q0

        VDUP.32 q2, d13[0]              // right_post_shift

        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
        VQDMULH.S32 q9,  q9, d12[1]

        VRSHL.S32 q8,  q8, q2
        VRSHL.S32 q9,  q9, q2

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.S8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8; flags drive BLO and BHI below

        VMIN.S8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f
        VST1.8  {d0}, [r11], r12        // c += cn_stride
        SUB     r2, r2, r14             // a -= ks
        BHI     0b

        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        // Each CMP below feeds two branches: BLO right after it and BEQ after
        // the following step (the NEON instructions in between leave the
        // flags untouched).
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        // Loads a full 8 bytes of A even when fewer remain.
        // NOTE(review): assumes the caller tolerates this over-read - confirm.
        VLD1.8  {d0},  [r3]
        VLD1.8  {d8},  [r9]!

        VMOVL.S8 q0, d0
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b                      // remainder == 1

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b                      // remainder == 2

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b                      // remainder == 3

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b                      // remainder == 4

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b                      // remainder == 5

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b                      // remainder == 6

        VLD1.8  {d8},  [r9]!
        VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b                      // remainder == 7

        # Store odd width
        // The low three bits of r1 select a 4-, 2- and 1-byte store; VEXT
        // rotates the already stored bytes out of the low lanes of d0.
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d13}
        ADD     sp, sp, 16              // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

// Mark the stack as non-executable on ELF targets.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
