xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0  (x2 = kc rounded up to 8; x0 is reused as the k loop counter)
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# A1  x4  v1  v7
30# B   x5  v4  v5  v8  v9  (v6 v7 in the 8-byte remainder)
31# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
32# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
33# temp0   v2 v10 v12 v14
34# temp1   v3 v11 v13 v15
35
36
37BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
38        # Computes a 2-row x 8-column int8 output tile per pass: int32 accumulate, fp32 scale, saturate, clamp.
39        # Clamp A and C pointers (d8-d15 saves are interleaved; STP/ADD leave the CMP flags intact for CSEL)
40        CMP     x0, 2                   // if mr < 2
41        STP     d8, d9, [sp, -64]!      // allocate 64 bytes of stack and start saving d8-d15
42        ADD     x4, x3, x4              // a1 = a0 + a_stride
43        STP     d10, d11, [sp, 16]
44        ADD     x7, x6, x7              // c1 = c0 + cm_stride
45        STP     d12, d13, [sp, 32]
46        CSEL    x4, x3, x4, LO          //   a1 = a0
47        STP     d14, d15, [sp, 48]
48        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
49        CSEL    x7, x6, x7, LO          //   c1 = c0
50        BIC     x2, x2, 7               // clear low 3 bits: kc is now a multiple of 8
51
52        .p2align 3
530:
54        # Outer loop, one pass per 8 output columns: load initial bias from w into accumulators
55        SUBS    x0, x2, 16              // k = kc - 16 (flags are consumed by the B.LO below)
56        LDP     s16, s18, [x5], 8       // 2 x 32-bit bias into lane 0; S-register writes zero the other lanes
57        MOV     v17.16b, v16.16b        // row 1 accumulator starts from the same bias as row 0
58        MOV     v19.16b, v18.16b
59        LDP     s20, s22, [x5], 8
60        MOV     v21.16b, v20.16b
61        MOV     v23.16b, v22.16b
62        LDP     s24, s26, [x5], 8
63        MOV     v25.16b, v24.16b
64        MOV     v27.16b, v26.16b
65        LDP     s28, s30, [x5], 8       // x5 has now advanced 32 bytes past the 8 bias words
66        MOV     v29.16b, v28.16b
67        LDP     x10, x11, [sp, 64]       // cn_stride, params (stack args sit above the 64-byte save area)
68        MOV     v31.16b, v30.16b
69        # Is there at least 16 bytes for epilogue?
70        B.LO    4f                      // kc < 16: skip straight to the 8-byte remainder
71
72        # Prologue: load A0, A1 and 2 B's
73        LDP     d4, d5, [x5]            // B bytes 0..15
74        LDP     d0, d6, [x3], 16        // A0: d0 = first 8 bytes, d6 = next 8 bytes
75        LDP     d1, d7, [x4], 16        // A1: d1 = first 8 bytes, d7 = next 8 bytes
76        LDP     d8, d9, [x5, 64]        // B bytes 64..79, multiplied with the second 8 bytes of A
77
78        # Is there at least 16 bytes for main loop?
79        SUBS    x0, x0, 16              // k = k - 16
80        B.LO    2f                      // fewer than 16 further bytes: run the epilogue only
81
82         # Main loop - 16 bytes of A per row, 128 bytes of B per pass (B[n] = byte n from x5)
83        .p2align 3
841:
85        SMULL   v2.8h, v4.8b, v0.8b     // B[0..7]  * A0[0..7] -> 8 x int16
86        SMULL   v3.8h, v4.8b, v1.8b     // B[0..7]  * A1[0..7]
87        SMULL   v10.8h, v5.8b, v0.8b    // B[8..15] * A0[0..7]
88        SMULL   v11.8h, v5.8b, v1.8b    // B[8..15] * A1[0..7]
89        LDP     d4, d5, [x5, 16]        // B[16..31] for the next group
90        SMLAL   v2.8h, v8.8b, v6.8b     // += B[64..71] * A0[8..15], still in int16
91        SMLAL   v3.8h, v8.8b, v7.8b     // NOTE(review): two int8 products summed in int16; presumably
92        SMLAL   v10.8h, v9.8b, v6.8b    //   relies on the packed weights never being -128 - confirm
93        SMLAL   v11.8h, v9.8b, v7.8b
94
95        LDP     d8, d9, [x5, 80]        // B[80..95]
96        SMULL   v12.8h, v4.8b, v0.8b
97        SADALP  v16.4s,  v2.8h          // pairwise-add int16 lanes and accumulate into int32
98        SMULL   v13.8h, v4.8b, v1.8b
99        SADALP  v17.4s,  v3.8h
100        SMULL   v14.8h, v5.8b, v0.8b
101        SADALP  v18.4s, v10.8h
102        SMULL   v15.8h, v5.8b, v1.8b
103        SADALP  v19.4s, v11.8h
104        LDP     d4, d5, [x5, 32]        // B[32..47]
105        SMLAL   v12.8h, v8.8b, v6.8b
106        SMLAL   v13.8h, v8.8b, v7.8b
107        SMLAL   v14.8h, v9.8b, v6.8b
108        SMLAL   v15.8h, v9.8b, v7.8b
109
110        LDP     d8, d9, [x5, 96]        // B[96..111]
111        SMULL   v2.8h, v4.8b, v0.8b
112        SADALP  v20.4s, v12.8h
113        SMULL   v3.8h, v4.8b, v1.8b
114        SADALP  v21.4s, v13.8h
115        SMULL   v10.8h, v5.8b, v0.8b
116        SADALP  v22.4s, v14.8h
117        SMULL   v11.8h, v5.8b, v1.8b
118        SADALP  v23.4s, v15.8h
119        LDP     d4, d5, [x5, 48]        // B[48..63]
120        SMLAL   v2.8h, v8.8b, v6.8b
121        SMLAL   v3.8h, v8.8b, v7.8b
122        SMLAL   v10.8h, v9.8b, v6.8b
123        SMLAL   v11.8h, v9.8b, v7.8b
124
125        LDP     d8, d9, [x5, 112]       // B[112..127]
126        SMULL   v12.8h, v4.8b, v0.8b
127        ADD     x5, x5, 128             // advance w past the 128 bytes of B used in this pass
128        SADALP  v24.4s,  v2.8h
129        SMULL   v13.8h, v4.8b, v1.8b
130        SADALP  v25.4s,  v3.8h
131        SMULL   v14.8h, v5.8b, v0.8b
132        SADALP  v26.4s, v10.8h
133        SMULL   v15.8h, v5.8b, v1.8b
134        SADALP  v27.4s, v11.8h
135        SMLAL   v12.8h, v8.8b, v6.8b
136        LDP     d4, d5, [x5]            // Read B for the next pass (x5 already advanced)
137        SMLAL   v13.8h, v8.8b, v7.8b
138        SUBS    x0, x0, 16              // k -= 16; flags stay live until the B.HS below
139        SMLAL   v14.8h, v9.8b, v6.8b
140        LDP     d0, d6, [x3], 16        // Read A0
141        SMLAL   v15.8h, v9.8b, v7.8b
142
143        SADALP  v28.4s, v12.8h
144        LDP     d1, d7, [x4], 16        // Read A1
145        SADALP  v29.4s, v13.8h
146        SADALP  v30.4s, v14.8h
147        LDP     d8, d9, [x5, 64]        // Read B
148        SADALP  v31.4s, v15.8h
149        B.HS    1b                      // loop while the SUBS above did not borrow
150
151        # Epilogue
152        # Same as main loop except no loads at end of loop
153        .p2align 3
1542:
155        SMULL   v2.8h, v4.8b, v0.8b     // consumes the A and B registers loaded by the prologue or last loop pass
156        SMULL   v3.8h, v4.8b, v1.8b
157        SMULL   v10.8h, v5.8b, v0.8b
158        SMULL   v11.8h, v5.8b, v1.8b
159        LDP     d4, d5, [x5, 16]
160        SMLAL   v2.8h, v8.8b, v6.8b
161        SMLAL   v3.8h, v8.8b, v7.8b
162        SMLAL   v10.8h, v9.8b, v6.8b
163        SMLAL   v11.8h, v9.8b, v7.8b
164
165        LDP     d8, d9, [x5, 80]
166        SMULL   v12.8h, v4.8b, v0.8b
167        SADALP  v16.4s,  v2.8h
168        SMULL   v13.8h, v4.8b, v1.8b
169        SADALP  v17.4s,  v3.8h
170        SMULL   v14.8h, v5.8b, v0.8b
171        SADALP  v18.4s, v10.8h
172        SMULL   v15.8h, v5.8b, v1.8b
173        SADALP  v19.4s, v11.8h
174        LDP     d4, d5, [x5, 32]
175        SMLAL   v12.8h, v8.8b, v6.8b
176        SMLAL   v13.8h, v8.8b, v7.8b
177        SMLAL   v14.8h, v9.8b, v6.8b
178        SMLAL   v15.8h, v9.8b, v7.8b
179
180        LDP     d8, d9, [x5, 96]
181        SMULL   v2.8h, v4.8b, v0.8b
182        SADALP  v20.4s, v12.8h
183        SMULL   v3.8h, v4.8b, v1.8b
184        SADALP  v21.4s, v13.8h
185        SMULL   v10.8h, v5.8b, v0.8b
186        SADALP  v22.4s, v14.8h
187        SMULL   v11.8h, v5.8b, v1.8b
188        SADALP  v23.4s, v15.8h
189        LDP     d4, d5, [x5, 48]
190        SMLAL   v2.8h, v8.8b, v6.8b
191        SMLAL   v3.8h, v8.8b, v7.8b
192        SMLAL   v10.8h, v9.8b, v6.8b
193        SMLAL   v11.8h, v9.8b, v7.8b
194
195        LDP     d8, d9, [x5, 112]
196        SMULL   v12.8h, v4.8b, v0.8b
197        SADALP  v24.4s,  v2.8h
198        SMULL   v13.8h, v4.8b, v1.8b
199        SADALP  v25.4s,  v3.8h
200        SMULL   v14.8h, v5.8b, v0.8b
201        SADALP  v26.4s, v10.8h
202        SMULL   v15.8h, v5.8b, v1.8b
203        SADALP  v27.4s, v11.8h
204        SMLAL   v12.8h, v8.8b, v6.8b
205        SMLAL   v13.8h, v8.8b, v7.8b
206        SMLAL   v14.8h, v9.8b, v6.8b
207        SMLAL   v15.8h, v9.8b, v7.8b
208        ADD     x5, x5, 128             // advance w past this 128-byte block of B
209
210        SADALP  v28.4s, v12.8h
211        SADALP  v29.4s, v13.8h
212        SADALP  v30.4s, v14.8h
213        SADALP  v31.4s, v15.8h
214
215        # Is there a remainder? - 8 bytes of A
216        TBNZ    x0, 3, 4f               // k is negative here; bit 3 set means 8 bytes of A are still unprocessed
217
218        .p2align 3
2193:
220        # Add columns: reduce the 4 int32 lanes of every accumulator to a single sum
221        ADDP    v16.4s, v16.4s, v18.4s
222        ADDP    v20.4s, v20.4s, v22.4s
223        ADDP    v24.4s, v24.4s, v26.4s
224        ADDP    v28.4s, v28.4s, v30.4s
225        ADDP    v17.4s, v17.4s, v19.4s
226        ADDP    v21.4s, v21.4s, v23.4s
227        ADDP    v25.4s, v25.4s, v27.4s
228        ADDP    v29.4s, v29.4s, v31.4s
229        ADDP    v0.4s, v16.4s, v20.4s   // v0 = row 0 totals of v16, v18, v20, v22
230        ADDP    v1.4s, v24.4s, v28.4s   // v1 = row 0 totals of v24, v26, v28, v30
231        ADDP    v2.4s, v17.4s, v21.4s   // v2 = row 1 totals of v17, v19, v21, v23
232        ADDP    v3.4s, v25.4s, v29.4s   // v3 = row 1 totals of v25, v27, v29, v31
233
234        # Apply params - scale, bias and clamp (x11 walks the params block; it is reloaded at label 0)
235        SCVTF   v0.4s, v0.4s            // int32 -> fp32
236        LD1R    {v4.4s}, [x11], 4       // broadcast the 32-bit scale, advance params by 4
237        SCVTF   v1.4s, v1.4s
238        SCVTF   v2.4s, v2.4s
239        SCVTF   v3.4s, v3.4s
240        FMUL    v0.4s, v0.4s, v4.4s
241        FMUL    v1.4s, v1.4s, v4.4s
242        FMUL    v2.4s, v2.4s, v4.4s
243        FMUL    v3.4s, v3.4s, v4.4s
244
245        FCVTNS  v0.4s, v0.4s            // fp32 -> int32, round to nearest with ties to even
246        FCVTNS  v1.4s, v1.4s
247        FCVTNS  v2.4s, v2.4s
248        FCVTNS  v3.4s, v3.4s
249
250        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit bias (presumably the output zero point - confirm)
251        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
252        SQXTN   v2.4h, v2.4s
253        SQXTN2  v0.8h, v1.4s            // v0 = row 0 as 8 x int16
254        SQXTN2  v2.8h, v3.4s            // v2 = row 1 as 8 x int16
255        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
256        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the bias, row 0
257        SQADD   v1.8h, v2.8h, v5.8h     // saturating add of the bias, row 1 (result in v1)
258        SQXTN   v0.8b, v0.8h            // bytes 0..7 of v0 = row 0 as int8
259        SQXTN2  v0.16b, v1.8h           // bytes 8..15 of v0 = row 1 as int8
260        LD1R    {v1.16b}, [x11], 1      // broadcast the lower clamp bound
261        LD1R    {v2.16b}, [x11]         // broadcast the upper clamp bound
262        SMAX    v0.16b, v0.16b, v1.16b
263        SMIN    v0.16b, v0.16b, v2.16b
264        B.LO    5f                      // fewer than 8 columns were left: partial-width store
265
266        # Store full 2 x 8
267        ST1     {v0.8b}, [x6], x10      // row 0, then c0 += cn_stride
268        SUB     x3, x3, x2              // a0 -= kc
269        ST1     {v0.d}[1], [x7], x10    // row 1 (upper 8 bytes of v0), then c1 += cn_stride
270        SUB     x4, x4, x2              // a1 -= kc
271        B.HI    0b                      // columns remain: next group of 8
272
273        # Restore d8-d15 from stack
274        LDP     d14, d15, [sp, 48]
275        LDP     d12, d13, [sp, 32]
276        LDP     d10, d11, [sp, 16]
277        LDP     d8, d9, [sp], 64        // post-increment releases the 64-byte save area
278        RET
279
280        # Remainder - 8 bytes of A against 64 bytes of B; v6/v7 hold B here rather than A
281        .p2align 3
2824:
283        LDR     d0, [x3], 8             // last 8 bytes of A0
284        LDP     d4, d5, [x5]
285        LDR     d1, [x4], 8             // last 8 bytes of A1
286        LDP     d6, d7, [x5, 16]
287        SMULL   v2.8h, v4.8b, v0.8b
288        SMULL   v3.8h, v4.8b, v1.8b
289        SMULL   v10.8h, v5.8b, v0.8b
290        SMULL   v11.8h, v5.8b, v1.8b
291        SMULL   v12.8h, v6.8b, v0.8b
292        SADALP  v16.4s,  v2.8h
293        SMULL   v13.8h, v6.8b, v1.8b
294        SADALP  v17.4s,  v3.8h
295        SMULL   v14.8h, v7.8b, v0.8b
296        SADALP  v18.4s, v10.8h
297        SMULL   v15.8h, v7.8b, v1.8b
298        SADALP  v19.4s, v11.8h
299        LDP     d4, d5, [x5, 32]
300        SMULL   v2.8h, v4.8b, v0.8b
301        SADALP  v20.4s, v12.8h
302        SMULL   v3.8h, v4.8b, v1.8b
303        SADALP  v21.4s, v13.8h
304        SMULL   v10.8h, v5.8b, v0.8b
305        SADALP  v22.4s, v14.8h
306        SMULL   v11.8h, v5.8b, v1.8b
307        SADALP  v23.4s, v15.8h
308        LDP     d6, d7, [x5, 48]
309        SMULL   v12.8h, v6.8b, v0.8b
310        SADALP  v24.4s,  v2.8h
311        SMULL   v13.8h, v6.8b, v1.8b
312        SADALP  v25.4s,  v3.8h
313        SMULL   v14.8h, v7.8b, v0.8b
314        SADALP  v26.4s, v10.8h
315        SMULL   v15.8h, v7.8b, v1.8b
316        SADALP  v27.4s, v11.8h
317        ADD     x5, x5, 64              // advance w past the 64 bytes of B used here
318        SADALP  v28.4s, v12.8h
319        SADALP  v29.4s, v13.8h
320        SADALP  v30.4s, v14.8h
321        SADALP  v31.4s, v15.8h
322        B       3b                      // rejoin the reduction and requantization path
323
324        # Store odd width: x1 = nc - 8 here, so bits 2..0 still encode the 4/2/1 column pieces to write
325        .p2align 3
3265:
327        TBZ     x1, 2, 6f
328        STR     s0, [x6], 4             // row 0: 4 columns
329        ST1     {v0.s}[2], [x7], 4      // row 1: 4 columns (row 1 starts at byte 8 of v0)
330        EXT     v0.16b, v0.16b, v0.16b, 4 // rotate so the next columns sit at bytes 0 and 8
331
3326:
333        TBZ     x1, 1, 7f
334        STR     h0, [x6], 2             // row 0: 2 columns
335        ST1     {v0.h}[4], [x7], 2      // row 1: 2 columns
336        EXT     v0.16b, v0.16b, v0.16b, 2 // rotate by 2 more bytes
3377:
338        TBZ     x1, 0, 8f
339        STR     b0, [x6]                // row 0: last column
340        ST1     {v0.b}[8], [x7]         // row 1: last column
3418:
342        # Restore d8-d15 from stack
343        LDP     d14, d15, [sp, 48]
344        LDP     d12, d13, [sp, 32]
345        LDP     d10, d11, [sp, 16]
346        LDP     d8, d9, [sp], 64        // post-increment releases the 64-byte save area
347        RET
348
349END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
350
351#ifdef __ELF__
352.section ".note.GNU-stack","",%progbits
353#endif
354
355