// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified

// void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const uint8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     uint8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qu8_conv_minmax_params params)  sp + 116 -> (r5)
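//
// Computes one row (mr = 1) by up to 8 columns of C: out = requantize(A*B + bias),
// where A holds kc uint8 values and w points at packed weights: 8 int32 bias
// values followed by kc rows of 8 uint8 weights.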

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d11 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d15, q1, q10-q11, q13-q15

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];  d14
#    int32_t right_pre_shift;       d12[0]
#    int32_t multiplier;            d12[1]
#    int32_t right_post_shift;      d13[0]
#    int16_t output_zero_point;     d13[2]
#    uint8_t output_min;            d13[6]
#    uint8_t output_max;            d13[7]
#  } rndnu_neon;

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}                   // 16
        SUB     sp, sp, 24                          // +24
        VPUSH   {d8-d14}                            // +56 = 96

        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w
        LDR     r5, [sp, 116]           // params

        # Load params values
        VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point
        VLDM    r5, {d12-d13}           // RNDNU params
        LDR     r7, [sp, 112]           // cn_stride

        PLD     [r9,  64]               // Prefetch B
        PLD     [r9, 128]
        PLD     [r9, 192]
        PLD     [r9, 256]
        PLD     [r9, 320]
        PLD     [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        SUBS    r5, r2, 8               // k = kc - 8
        VMOV.I32 q3, 0
        PLD     [r3,  64]               // Prefetch A
        BLO     4f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     2f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
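        // Two accumulator pairs (q8/q9 and q2/q3, summed at label 3) let
        // consecutive VMLAL.S16 instructions write independent registers,
        // hiding the multiply-accumulate latency; each BLOCK also loads the
        // weights for the following block while the current one is consumed.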

        .p2align 3
1:
        // Extend
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!            // A0
        VSUBL.U8 q4, d8, d14

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b
        // Epilogue - final 8-byte block; consumes the A0/B0 preloaded by the
        // prologue or main loop, without fetching data for another iteration.

        .p2align 3
2:
        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        VSUBL.U8 q5, d10, d14

        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        VSUBL.U8 q4, d8, d14

        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        VSUBL.U8 q5, d10, d14
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        # Is there a remainder? 1 to 7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        # RNDNU quantization
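        # Per 32-bit lane, the sequence below computes (both shift amounts are
        # stored as negative left shifts, so VQSHL/VRSHL shift right):
        #   acc = sat32(acc << right_pre_shift)                // VQSHL.S32
        #   acc = sat32((2 * acc * multiplier) >> 32)          // VQDMULH.S32
        #   acc = rounding_shift(acc, right_post_shift)        // VRSHL.S32
        #   out = clamp(sat16(acc) + output_zero_point, output_min, output_max)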
        VDUP.32 q0, d12[0]              // right_pre_shift

        VQSHL.S32 q8,  q8, q0
        VQSHL.S32 q9,  q9, q0

        VDUP.32 q2, d13[0]              // right_post_shift

        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
        VQDMULH.S32 q9,  q9, d12[1]

        VRSHL.S32 q8,  q8, q2
        VRSHL.S32 q9,  q9, q2

        VDUP.16 q0, d13[2]              // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        VQMOVUN.S16 d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        VMAX.U8 d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8

        VMIN.U8 d0, d0, d25

        # Store full 1 x 8
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2              // rewind A pointer by kc
        BHI     0b

        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip pad of 8
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder - 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7               // kc remainder 1 to 7
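        # A is loaded once (8 bytes, with the pointer advanced by only the
        # remainder); each block below widens one row of B, subtracts
        # kernel_zero_point, and accumulates it against the matching A lane,
        # branching back to label 3 as soon as the remainder is exhausted.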

        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        VMOVL.U8 q0, d0
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        VSUBL.U8 q4, d8, d14
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
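        # The 1-7 leftover columns are stored by binary decomposition of nc:
        # a 4-byte, then 2-byte, then 1-byte store as the low bits of r1
        # dictate, with VEXT rotating the already-stored lanes out of d0.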
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        VPOP    {d8-d14}
        ADD     sp, sp, 8               // skip pad of 8
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif