// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x8-aarch32-neon-mlal-lane-cortex-a7.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

.syntax unified
// void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            (r2) -> r5
//     const uint8_t*restrict a,              r3
//     size_t a_stride,           sp + 96 -> (unused)
//     const void*restrict w,     sp + 100 -> r9
//     uint8_t*restrict c,         sp + 104 -> r11
//     size_t cm_stride,          sp + 108 -> (unused)
//     size_t cn_stride,          sp + 112 -> r7
//     xnn_qs8_conv_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1 q0

// B    r9  d8-d9 q4 q5

// C0  r11 d16-d17  q8  d18-d19  q9
//         q2, q3 acc2

// Clamp  d24 (q12 low) output_min, d25 (q12 high) output_max

// Unused r4, r6, r8, r10, r12, d15, q1, q10-q11, q13-q15

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];  d14
#    int32_t right_pre_shift;       d12[0]
#    int32_t multiplier;            d12[1]
#    int32_t right_post_shift;      d13[0]
#    int16_t output_zero_point;     d13[2]
#    uint8_t output_min;            d13[6]
#    uint8_t output_max;            d13[7]
#  } rndnu_neon;
52BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
53        # Push 96 bytes
54        PUSH    {r5, r7, r9, r11}                   // 16
55        SUB     sp, sp, 24                          // +24
56        VPUSH   {d8-d14}                            // +56 = 96
57
58        LDR     r11, [sp, 104]          // c
59        LDR     r9, [sp, 100]           // w
60        LDR     r5, [sp, 116]           // params
61
62        # Load params values
63        VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point
64        VLDM    r5, {d12-d13}           // RNDNU params
65        LDR     r7, [sp, 112]            // cn_stride
66
67
68        .p2align 3
690:
70        # Load initial bias from w into accumulators
71        VLDM    r9!, {d16-d19}          // Bias
72        VMOV.I32 q2, 0                  // second set of C for pipelining FMLA
73        SUBS    r5, r2, 8               // k = kc - 8
74        VMOV.I32 q3, 0
75        BLO     4f                      // less than 8 channels?
76
77        // Prologue - load A0 and B0
78        VLD1.8  {d0},  [r3]!            // A0
79        SUBS    r5, r5, 8               // k = k - 8
80        VLD1.8  {d8},  [r9]!            // B0
81        BLO     2f                      // less than 8 channels?
82
83        // Main loop - 8 bytes
84        // 64 bytes for weights.
85
86        .p2align 3
871:
88        // Extend
89        VMOVL.U8 q0, d0
90        VSUBL.U8 q4, d8, d14
91
92        // BLOCK 0
93        VLD1.8  {d10},  [r9]!           // B1
94        VMLAL.S16 q8, d8, d0[0]
95        VMLAL.S16 q9, d9, d0[0]
96        VSUBL.U8 q5, d10, d14
97
98        // BLOCK 1
99        VLD1.8  {d8},  [r9]!            // B2
100        VMLAL.S16 q2, d10, d0[1]
101        VMLAL.S16 q3, d11, d0[1]
102        VSUBL.U8 q4, d8, d14
103
104        // BLOCK 2
105        VLD1.8  {d10},  [r9]!           // B3
106        VMLAL.S16 q8, d8, d0[2]
107        VMLAL.S16 q9, d9, d0[2]
108        VSUBL.U8 q5, d10, d14
109
110        // BLOCK 3
111        VLD1.8  {d8},  [r9]!            // B4
112        VMLAL.S16 q2, d10, d0[3]
113        VMLAL.S16 q3, d11, d0[3]
114        VLD1.8  {d0},  [r3]!            // A0
115        VSUBL.U8 q4, d8, d14
116
117        // BLOCK 4
118        VLD1.8  {d10},  [r9]!           // B5
119        VMLAL.S16 q8, d8, d1[0]
120        VMLAL.S16 q9, d9, d1[0]
121        VSUBL.U8 q5, d10, d14
122
123        // BLOCK 5
124        VLD1.8  {d8},  [r9]!            // B6
125        VMLAL.S16 q2, d10, d1[1]
126        VMLAL.S16 q3, d11, d1[1]
127        VSUBL.U8 q4, d8, d14
128
129        // BLOCK 6
130        VLD1.8  {d10},  [r9]!           // B7
131        VMLAL.S16 q8, d8, d1[2]
132        VMLAL.S16 q9, d9, d1[2]
133        VSUBL.U8 q5, d10, d14
134
135        // BLOCK 7
136        VLD1.8  {d8},  [r9]!            // B0
137        VMLAL.S16 q2, d10, d1[3]
138        VMLAL.S16 q3, d11, d1[3]
139        SUBS    r5, r5, 8
140        BHS     1b
141
142        // Epilogue
143
144        .p2align 3
1452:
146        VMOVL.U8 q0, d0
147        VSUBL.U8 q4, d8, d14
148
149        VLD1.8  {d10},  [r9]!           // B1
150        VMLAL.S16 q8, d8, d0[0]
151        VMLAL.S16 q9, d9, d0[0]
152        VSUBL.U8 q5, d10, d14
153
154        VLD1.8  {d8},  [r9]!            // B2
155        VMLAL.S16 q2, d10, d0[1]
156        VMLAL.S16 q3, d11, d0[1]
157        VSUBL.U8 q4, d8, d14
158
159        VLD1.8  {d10},  [r9]!           // B3
160        VMLAL.S16 q8, d8, d0[2]
161        VMLAL.S16 q9, d9, d0[2]
162        VSUBL.U8 q5, d10, d14
163
164        VLD1.8  {d8},  [r9]!            // B4
165        VMLAL.S16 q2, d10, d0[3]
166        VMLAL.S16 q3, d11, d0[3]
167        VSUBL.U8 q4, d8, d14
168
169        VLD1.8  {d10},  [r9]!           // B5
170        VMLAL.S16 q8, d8, d1[0]
171        VMLAL.S16 q9, d9, d1[0]
172        VSUBL.U8 q5, d10, d14
173
174        VLD1.8  {d8},  [r9]!            // B6
175        VMLAL.S16 q2, d10, d1[1]
176        VMLAL.S16 q3, d11, d1[1]
177        VSUBL.U8 q4, d8, d14
178
179        VLD1.8  {d10},  [r9]!           // B7
180        VMLAL.S16 q8, d8, d1[2]
181        VMLAL.S16 q9, d9, d1[2]
182        VSUBL.U8 q5, d10, d14
183        ADDS    r5, r5, 8
184
185        VMLAL.S16 q2, d10, d1[3]
186        VMLAL.S16 q3, d11, d1[3]
187
188        # Is there a remainder?- 1-7 bytes of A
189        BNE     4f
190
1913:
192        VADD.S32 q8, q8, q2
193        VADD.S32 q9, q9, q3
194
195        # RNDNU quantization
196        VDUP.32 q0, d12[0]              // right_pre_shift
197
198        VQSHL.S32 q8,  q8, q0
199        VQSHL.S32 q9,  q9, q0
200
201        VDUP.32 q2, d13[0]              // right_post_shift
202
203        VQDMULH.S32 q8,  q8, d12[1]     // multiplier
204        VQDMULH.S32 q9,  q9, d12[1]
205
206        VRSHL.S32 q8,  q8, q2
207        VRSHL.S32 q9,  q9, q2
208
209        VDUP.16 q0, d13[2]              // output_zero_point
210
211        VQMOVN.S32 d16, q8
212        VQMOVN.S32 d17, q9
213
214        VQADD.S16 q8,  q8, q0
215
216        VDUP.8  d24, d13[6]             // output_min
217
218        VQMOVUN.S16 d0,  q8
219
220        VDUP.8  d25, d13[7]             // output_max
221
222        VMAX.U8 d0, d0, d24
223
224        SUBS    r1, r1, 8
225
226        VMIN.U8 d0, d0, d25
227
228        # Store full 1 x 8
229        BLO     5f
230        VST1.8  {d0}, [r11], r7
231        SUB     r3, r3, r2
232        BHI     0b
233
234        VPOP    {d8-d14}
235        ADD     sp, sp, 8               // skip pad of 8
236        ADD     sp, sp, 16
237        POP     {r5, r7, r9, r11}
238        BX      lr
239
240        # Remainder- 1 to 7 bytes of A
241        .p2align 3
2424:
243        AND     r5, r5, 7               // kc remainder 1 to 7
244
245        VLD1.8  {d0},  [r3], r5
246        VLD1.8  {d8},  [r9]!
247
248        VMOVL.U8 q0, d0
249        VSUBL.U8 q4, d8, d14
250        VMLAL.S16 q8, d8, d0[0]
251        VMLAL.S16 q9, d9, d0[0]
252        CMP     r5, 2
253        BLO     3b
254
255        VLD1.8  {d8},  [r9]!
256        VSUBL.U8 q4, d8, d14
257        VMLAL.S16 q8, d8, d0[1]
258        VMLAL.S16 q9, d9, d0[1]
259        BEQ     3b
260
261        VLD1.8  {d8},  [r9]!
262        VSUBL.U8 q4, d8, d14
263        VMLAL.S16 q8, d8, d0[2]
264        VMLAL.S16 q9, d9, d0[2]
265        CMP     r5, 4
266        BLO     3b
267
268        VLD1.8  {d8},  [r9]!
269        VSUBL.U8 q4, d8, d14
270        VMLAL.S16 q8, d8, d0[3]
271        VMLAL.S16 q9, d9, d0[3]
272        BEQ     3b
273
274        VLD1.8  {d8},  [r9]!
275        VSUBL.U8 q4, d8, d14
276        VMLAL.S16 q8, d8, d1[0]
277        VMLAL.S16 q9, d9, d1[0]
278        CMP     r5, 6
279        BLO     3b
280
281        VLD1.8  {d8},  [r9]!
282        VSUBL.U8 q4, d8, d14
283        VMLAL.S16 q8, d8, d1[1]
284        VMLAL.S16 q9, d9, d1[1]
285        BEQ     3b
286
287        VLD1.8  {d8},  [r9]!
288        VSUBL.U8 q4, d8, d14
289        VMLAL.S16 q8, d8, d1[2]
290        VMLAL.S16 q9, d9, d1[2]
291        B       3b
292
293        # Store odd width
294        .p2align 3
2955:
296        TST     r1, 4
297        BEQ     6f
298        VST1.32 {d0[0]}, [r11]!
299        VEXT.8  q0, q0, q0, 4
3006:
301        TST     r1, 2
302        BEQ     7f
303        VST1.16 {d0[0]}, [r11]!
304        VEXT.8  q0, q0, q0, 2
3057:
306        TST     r1, 1
307        BEQ     8f
308        VST1.8  {d0[0]}, [r11]
3098:
310        VPOP    {d8-d14}
311        ADD     sp, sp, 8               // skip pad of 8
312        ADD     sp, sp, 16
313        POP     {r5, r7, r9, r11}
314        BX      lr
315
316END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7
317
318#ifdef __ELF__
319.section ".note.GNU-stack","",%progbits
320#endif
321
322