1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15// void xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
16//     size_t mr,                                     (r0)
17//     size_t nc,                                      r1
18//     size_t kc,                                     (r2) -> sp + 56 -> r5
19//     size_t ks,                                     (r3) -> sp + 60 -> r14
20//     const uint8_t**restrict a,            sp + 88  -> r2
21//     const void*restrict w,              sp + 92  -> r9
22//     uint8_t*restrict c,                   sp + 96  -> r11
23//     size_t cm_stride,                   sp + 100  -> r6
24//     size_t cn_stride,                   sp + 104  -> r12
25//     size_t a_offset,                    sp + 108 -> (r5)
26//     const uint8_t* zero,                  sp + 112 -> r7
//     xnn_qu8_conv_minmax_params*params); sp + 116 -> (r5)
28
29// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
30
31// Based on cortex_a53 microkernel but with Neon loads
32
33// Register usage
34// A0   r3  d0-d1 q0
35
// B    r9  d8-d11 q4 q5
37
38// C0  r11 d16-d17  q8  d18-d19  q9
39//         q2, q3 acc2
40
// Unused r4, r8, r10, d15, q1, q10-q11, q13-q15 (q12 = d24-d25 holds output min/max)
42
43// params structure is 20 bytes
44//  struct {
45//    uint8_t kernel_zero_point[4];  d14
46//    int32_t right_pre_shift;       d12[0]
47//    int32_t multiplier;            d12[1]
48//    int32_t right_post_shift;      d13[0]
49//    int16_t output_zero_point;     d13[2]
50//    uint8_t output_min;            d13[6]
51//    uint8_t output_max;            d13[7]
52//  } rndnu_neon;
53
// QU8 indirect GEMM microkernel: one output row, 8 output columns per tile.
// On entry r1 = nc, r2 = kc, r3 = ks; the remaining arguments are read from
// the stack above the 88-byte register save area.
//
// Label map:
//   0:    per 8-column tile  - load bias, clear the second accumulator pair
//   1:    per A pointer (ks) - fetch a0, add a_offset unless a0 == zero
//   2:    main loop          - 8 bytes of A against 64 bytes of B per pass
//   3:    epilogue           - last full group of 8, no further A or B0 load
//   4:    ks loop tail, RNDNU requantization and full 8-byte store
//   5/6:  remainder          - 1 to 7 trailing bytes of A
//   7-10: partial store of 4/2/1 bytes selected by the low bits of nc
BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 88 bytes
        # kc (r2) and ks (r3) are saved so they can be reloaded from
        # sp + 56 and sp + 60 inside the loops.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
        VPUSH   {d8-d14}                            // +56 = 88

        LDR     r2,  [sp, 88]           // a
        LDR     r9,  [sp, 92]           // w
        LDR     r11, [sp, 96]           // c
        LDR     r6,  [sp, 100]          // cm_stride (not read again in this function)
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7,  [sp, 112]          // zero
        LDR     r5,  [sp, 116]          // params
        MOV     r14, r3                 // p = ks

        # Load params values
        VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point, replicated; r5 += 4
        VLDM    r5, {d12-d13}           // RNDNU params (remaining 16 bytes)

        // Prefetch B at a uniform 64-byte stride
        PLD     [r9,  64]
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> q8, q9
        // Second accumulator pair: consecutive VMLAL blocks alternate between
        // q8/q9 and q2/q3; the pairs are summed after the ks loop.
        VMOV.I32 q2, 0
        VMOV.I32 q3, 0

        .p2align 3
1:
        # Load next A pointer
        LDR     r3, [r2,  0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4               // advance to the next A pointer
        CMP     r3,  r7                 // if a0 == zero
        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 //   a0 = zero, else += a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
        // Each block loads the next 8 weight bytes, subtracts the kernel zero
        // point while widening to 16 bits, and multiply-accumulates the
        // previously widened weights against one lane of A.

        .p2align 3
2:
        // Extend
        VMOVL.U8 q0, d0                 // A bytes -> 16-bit lanes in d0, d1
        VSUBL.U8 q4, d8, d14            // B0 - kernel_zero_point
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        // d0 lanes 0-3 are consumed; reload d0 with the next 8 bytes of A.
        // d1 still holds lanes 4-7 of the current group.
        VLD1.8  {d0},  [r3]!            // A0
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        SUBS    r5, r5, 8               // flags consumed by BHS below

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0 of the next group
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue
        // Same as the main loop body but without loading the next A group or
        // the next B0.

        .p2align 3
3:
        // Extend
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        ADDS    r5, r5, 8               // r5 = leftover bytes of A (0 to 7)

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     6f                      // flags from ADDS above

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks

        // Fold the second accumulator pair into the first
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization
        VDUP.32 q0, d12[0]              // right_pre_shift

        VQSHL.S32 q8,  q8, q0
        VQSHL.S32 q9,  q9, q0

        VDUP.32 q2, d13[0]              // right_post_shift

        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
        VQDMULH.S32 q9,  q9, d12[1]

        VRSHL.S32 q8,  q8, q2
        VRSHL.S32 q9,  q9, q2

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8              // narrow 32 -> 16 with saturation
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVUN.S16 d0,  q8             // narrow 16 -> unsigned 8 with saturation

        VDUP.8  d25, d13[7]             // output_max

        VMAX.U8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8; flags used by BLO and BHI below

        VMIN.U8 d0, d0, d25

        # Store full 1 x 8
        BLO     7f                      // fewer than 8 columns left
        VST1.8  {d0}, [r11], r12        // store 8 bytes, c += cn_stride
        SUB     r2, r2, r14             // a -= ks
        BHI     0b                      // more columns remain (flags from SUBS r1)

        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        // Entry at 5: r5 = kc - 8 (negative), masked down to kc mod 8.
        // Entry at 6: r5 already holds the leftover count.
        // A full 8 bytes of A are loaded; the compare and branch chain limits
        // how many lanes are accumulated. NEON instructions leave the flags
        // untouched, so each CMP feeds both the BLO and the following BEQ.
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        VLD1.8  {d0},  [r3]
        VLD1.8  {d8},  [r9]!

        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b                      // remainder == 1

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b                      // remainder == 2

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b                      // remainder == 3

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b                      // remainder == 4

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b                      // remainder == 5

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b                      // remainder == 6

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b                      // remainder == 7

        # Store odd width
        // r1 = nc - 8 here; only bits 2, 1 and 0 are tested to pick the
        // 4-, 2- and 1-byte stores. After each store the result is rotated so
        // the next unwritten byte sits in lane 0 of d0.
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
357
358#ifdef __ELF__
359.section ".note.GNU-stack","",%progbits
360#endif
361