xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# A1  x4  v1  v7
30# B   x5  v4  v5  v8  v9
31# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
32# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
33# temp0   v2 v10 v12 v14
34# temp1   v3 v11 v13 v15
35
36
37BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
38
        # Overview, as read from the code below. Each pass of the outer loop
        # at label 0 produces 2 rows x 8 int8 outputs and walks kc bytes of
        # both A rows. Registers v16-v31 hold int32 partial sums, one register
        # per row and column. Label 3 reduces them, scales them in floating
        # point, narrows them to int8 with saturation and clamps them.
        # Registers d8-d15 are callee-saved, so they are spilled to a 64-byte
        # stack frame here and reloaded before each RET.
39        # Clamp A and C pointers: with mr < 2, row 1 aliases row 0
40        CMP     x0, 2                   // if mr < 2
41        STP     d8, d9, [sp, -64]!      // sp -= 64; save d8, d9 at [sp]
42        ADD     x4, x3, x4              // a1 = a0 + a_stride
43        STP     d10, d11, [sp, 16]      // save d10, d11
44        ADD     x7, x6, x7              // c1 = c0 + cm_stride
45        STP     d12, d13, [sp, 32]      // save d12, d13
46        CSEL    x4, x3, x4, LO          //   a1 = a0
47        STP     d14, d15, [sp, 48]      // save d14, d15
48        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
49        CSEL    x7, x6, x7, LO          //   c1 = c0
50        BIC     x2, x2, 7               // kc is now a multiple of 8
51
52        .p2align 3
530:
54        # Load initial bias from w into accumulators. Each LDP fills lane 0
        # of two row-0 accumulators (a scalar S-register load zeroes the
        # remaining lanes) and advances w by 8 bytes. The MOVs copy them into
        # the matching row-1 accumulators.
55        SUBS    x0, x2, 16              // k = kc - 16; flags are tested by B.LO 4f below
56        LDP     s16, s18, [x5], 8
57        MOV     v17.16b, v16.16b
58        MOV     v19.16b, v18.16b
59        LDP     s20, s22, [x5], 8
60        MOV     v21.16b, v20.16b
61        MOV     v23.16b, v22.16b
62        LDP     s24, s26, [x5], 8
63        MOV     v25.16b, v24.16b
64        MOV     v27.16b, v26.16b
65        LDP     s28, s30, [x5], 8
66        MOV     v29.16b, v28.16b
67        LDP     x10, x11, [sp, 64]       // cn_stride, params: stack args above the 64-byte save area
68        MOV     v31.16b, v30.16b
69        # Is there at least 16 bytes for epilogue? Otherwise take the 8-byte remainder at 4
70        B.LO    4f
71
72        # Prologue: load 16 bytes each of A0 and A1, and 2 pairs of B
73        LDP     d4, d5, [x5]            // B pair at w + 0
74        LDP     d0, d6, [x3], 16        // A0: v0 = bytes 0-7, v6 = bytes 8-15; a0 += 16
75        LDP     d1, d7, [x4], 16        // A1: v1 = bytes 0-7, v7 = bytes 8-15; a1 += 16
76        LDP     d8, d9, [x5, 64]        // B pair at w + 64
77
78        # Is there at least 16 bytes for main loop?
79        SUBS    x0, x0, 16              // k = k - 16
80        B.LO    2f
81
82        # Main loop - 16 bytes of A. Four groups follow, one per pair of B
        # vectors at offsets 0, 16, 32 and 48 from w. In each group SMULL
        # multiplies the pair by A bytes 0-7 (v0 and v1), SMLAL adds the
        # products of the pair 64 bytes further on with A bytes 8-15 (v6 and
        # v7), and SADALP pairwise-adds the int16 results into the int32
        # accumulators. Loads for the following group and for the next
        # iteration are interleaved with the arithmetic.
83        .p2align 3
841:
85        SMULL   v2.8h, v4.8b, v0.8b     // int16 products: first B vector x A0 bytes 0-7
86        SMULL   v3.8h, v4.8b, v1.8b     // same B vector x A1 bytes 0-7
87        SMULL   v10.8h, v5.8b, v0.8b    // second B vector x A0 bytes 0-7
88        SMULL   v11.8h, v5.8b, v1.8b    // second B vector x A1 bytes 0-7
89        LDP     d4, d5, [x5, 16]        // B pair for the next group
90        SMLAL   v2.8h, v8.8b, v6.8b     // add products of B at +64 with A0 bytes 8-15
91        SMLAL   v3.8h, v8.8b, v7.8b
92        SMLAL   v10.8h, v9.8b, v6.8b
93        SMLAL   v11.8h, v9.8b, v7.8b
94
95        LDP     d8, d9, [x5, 80]
96        SMULL   v12.8h, v4.8b, v0.8b
97        SADALP  v16.4s,  v2.8h          // pairwise-add int16 sums into int32 accumulator
98        SMULL   v13.8h, v4.8b, v1.8b
99        SADALP  v17.4s,  v3.8h
100        SMULL   v14.8h, v5.8b, v0.8b
101        SADALP  v18.4s, v10.8h
102        SMULL   v15.8h, v5.8b, v1.8b
103        SADALP  v19.4s, v11.8h
104        LDP     d4, d5, [x5, 32]
105        SMLAL   v12.8h, v8.8b, v6.8b
106        SMLAL   v13.8h, v8.8b, v7.8b
107        SMLAL   v14.8h, v9.8b, v6.8b
108        SMLAL   v15.8h, v9.8b, v7.8b
109
110        LDP     d8, d9, [x5, 96]
111        SMULL   v2.8h, v4.8b, v0.8b
112        SADALP  v20.4s, v12.8h
113        SMULL   v3.8h, v4.8b, v1.8b
114        SADALP  v21.4s, v13.8h
115        SMULL   v10.8h, v5.8b, v0.8b
116        SADALP  v22.4s, v14.8h
117        SMULL   v11.8h, v5.8b, v1.8b
118        SADALP  v23.4s, v15.8h
119        LDP     d4, d5, [x5, 48]
120        SMLAL   v2.8h, v8.8b, v6.8b
121        SMLAL   v3.8h, v8.8b, v7.8b
122        SMLAL   v10.8h, v9.8b, v6.8b
123        SMLAL   v11.8h, v9.8b, v7.8b
124
125        LDP     d8, d9, [x5, 112]
126        SMULL   v12.8h, v4.8b, v0.8b
127        ADD     x5, x5, 128             // w += 128: both 64-byte B blocks consumed
128        SADALP  v24.4s,  v2.8h
129        SMULL   v13.8h, v4.8b, v1.8b
130        SADALP  v25.4s,  v3.8h
131        SMULL   v14.8h, v5.8b, v0.8b
132        SADALP  v26.4s, v10.8h
133        SMULL   v15.8h, v5.8b, v1.8b
134        SADALP  v27.4s, v11.8h
135        SMLAL   v12.8h, v8.8b, v6.8b
136        LDP     d4, d5, [x5]            // Read B
137        SMLAL   v13.8h, v8.8b, v7.8b
138        SUBS    x0, x0, 16              // k -= 16; flags are tested by B.HS 1b below
139        SMLAL   v14.8h, v9.8b, v6.8b
140        LDP     d0, d6, [x3], 16        // Read A0
141        SMLAL   v15.8h, v9.8b, v7.8b
142
143        SADALP  v28.4s, v12.8h
144        LDP     d1, d7, [x4], 16        // Read A1
145        SADALP  v29.4s, v13.8h
146        SADALP  v30.4s, v14.8h
147        LDP     d8, d9, [x5, 64]        // Read B
148        SADALP  v31.4s, v15.8h
149        B.HS    1b                      // loop while the SUBS above did not borrow (k >= 0)
150
151        # Epilogue
152        # Same as main loop except no loads at end of loop
153        .p2align 3
1542:
155        SMULL   v2.8h, v4.8b, v0.8b
156        SMULL   v3.8h, v4.8b, v1.8b
157        SMULL   v10.8h, v5.8b, v0.8b
158        SMULL   v11.8h, v5.8b, v1.8b
159        LDP     d4, d5, [x5, 16]
160        SMLAL   v2.8h, v8.8b, v6.8b
161        SMLAL   v3.8h, v8.8b, v7.8b
162        SMLAL   v10.8h, v9.8b, v6.8b
163        SMLAL   v11.8h, v9.8b, v7.8b
164
165        LDP     d8, d9, [x5, 80]
166        SMULL   v12.8h, v4.8b, v0.8b
167        SADALP  v16.4s,  v2.8h
168        SMULL   v13.8h, v4.8b, v1.8b
169        SADALP  v17.4s,  v3.8h
170        SMULL   v14.8h, v5.8b, v0.8b
171        SADALP  v18.4s, v10.8h
172        SMULL   v15.8h, v5.8b, v1.8b
173        SADALP  v19.4s, v11.8h
174        LDP     d4, d5, [x5, 32]
175        SMLAL   v12.8h, v8.8b, v6.8b
176        SMLAL   v13.8h, v8.8b, v7.8b
177        SMLAL   v14.8h, v9.8b, v6.8b
178        SMLAL   v15.8h, v9.8b, v7.8b
179
180        LDP     d8, d9, [x5, 96]
181        SMULL   v2.8h, v4.8b, v0.8b
182        SADALP  v20.4s, v12.8h
183        SMULL   v3.8h, v4.8b, v1.8b
184        SADALP  v21.4s, v13.8h
185        SMULL   v10.8h, v5.8b, v0.8b
186        SADALP  v22.4s, v14.8h
187        SMULL   v11.8h, v5.8b, v1.8b
188        SADALP  v23.4s, v15.8h
189        LDP     d4, d5, [x5, 48]
190        SMLAL   v2.8h, v8.8b, v6.8b
191        SMLAL   v3.8h, v8.8b, v7.8b
192        SMLAL   v10.8h, v9.8b, v6.8b
193        SMLAL   v11.8h, v9.8b, v7.8b
194
195        LDP     d8, d9, [x5, 112]
196        SMULL   v12.8h, v4.8b, v0.8b
197        SADALP  v24.4s,  v2.8h
198        SMULL   v13.8h, v4.8b, v1.8b
199        SADALP  v25.4s,  v3.8h
200        SMULL   v14.8h, v5.8b, v0.8b
201        SADALP  v26.4s, v10.8h
202        SMULL   v15.8h, v5.8b, v1.8b
203        SADALP  v27.4s, v11.8h
204        SMLAL   v12.8h, v8.8b, v6.8b
205        SMLAL   v13.8h, v8.8b, v7.8b
206        SMLAL   v14.8h, v9.8b, v6.8b
207        SMLAL   v15.8h, v9.8b, v7.8b
208        ADD     x5, x5, 128             // w += 128: both 64-byte B blocks consumed
209
210        SADALP  v28.4s, v12.8h
211        SADALP  v29.4s, v13.8h
212        SADALP  v30.4s, v14.8h
213        SADALP  v31.4s, v15.8h
214
215        # Is there a remainder? - 8 bytes of A. Here k is -16 or -8, and bit 3 is set only for -8
216        TBNZ    x0, 3, 4f
217
218        .p2align 3
2193:
220        # Add columns: two levels of ADDP reduce the 4 lanes of every accumulator to a single sum
221        ADDP    v16.4s, v16.4s, v18.4s
222        ADDP    v20.4s, v20.4s, v22.4s
223        ADDP    v24.4s, v24.4s, v26.4s
224        ADDP    v28.4s, v28.4s, v30.4s
225        ADDP    v17.4s, v17.4s, v19.4s
226        ADDP    v21.4s, v21.4s, v23.4s
227        ADDP    v25.4s, v25.4s, v27.4s
228        ADDP    v29.4s, v29.4s, v31.4s
229        ADDP    v0.4s, v16.4s, v20.4s   // row 0, sums for outputs 0-3
230        ADDP    v1.4s, v24.4s, v28.4s   // row 0, sums for outputs 4-7
231        ADDP    v2.4s, v17.4s, v21.4s   // row 1, sums for outputs 0-3
232        ADDP    v3.4s, v25.4s, v29.4s   // row 1, sums for outputs 4-7
233
234        # Load per channel scale values from weights
235        SCVTF   v0.4s, v0.4s            // int32 -> float
236        LDR     q4, [x5], 16            // 4 scales for outputs 0-3; w += 16
237        SCVTF   v1.4s, v1.4s
238        LDR     q5, [x5], 16            // 4 scales for outputs 4-7; w += 16
239        SCVTF   v2.4s, v2.4s
240        SCVTF   v3.4s, v3.4s
241        FMUL    v0.4s, v0.4s, v4.4s     // both rows share the same scales
242        FMUL    v1.4s, v1.4s, v5.4s
243        FMUL    v2.4s, v2.4s, v4.4s
244        FMUL    v3.4s, v3.4s, v5.4s
245
246        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest, ties to even
247        FCVTNS  v1.4s, v1.4s
248        FCVTNS  v2.4s, v2.4s
249        FCVTNS  v3.4s, v3.4s
250
251        LD1R    {v5.8h}, [x11], 2       // broadcast int16 at params + 0 (presumably the output zero point - confirm); x11 += 2
252        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
253        SQXTN   v2.4h, v2.4s
254        SQXTN2  v0.8h, v1.4s            // v0 = row 0, all 8 outputs as int16
255        SQXTN2  v2.8h, v3.4s            // v2 = row 1, all 8 outputs as int16
256        SUBS    x1, x1, 8               // nc -= 8; flags are tested by B.LO 5f and B.HI 0b below
257        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the broadcast int16
258        SQADD   v1.8h, v2.8h, v5.8h
259        SQXTN   v0.8b, v0.8h            // saturating narrow to int8: row 0 in bytes 0-7
260        SQXTN2  v0.16b, v1.8h           // row 1 in bytes 8-15
261        LD1R    {v1.16b}, [x11], 1      // broadcast byte at params + 2: lower bound; x11 += 1
262        LD1R    {v2.16b}, [x11]         // broadcast byte at params + 3: upper bound
263        SMAX    v0.16b, v0.16b, v1.16b  // clamp from below
264        SMIN    v0.16b, v0.16b, v2.16b  // clamp from above
265        B.LO    5f                      // fewer than 8 columns were left: partial store
266
267        # Store full 2 x 8
268        ST1     {v0.8b}, [x6], x10      // row 0: 8 bytes; c0 += cn_stride
269        SUB     x3, x3, x2              // a0 -= kc
270        ST1     {v0.d}[1], [x7], x10    // row 1: 8 bytes; c1 += cn_stride
271        SUB     x4, x4, x2              // a1 -= kc
272        B.HI    0b                      // columns remain (nc > 0): x10 and x11 are reloaded at 0
273
274        # Restore d8-d15 from stack
275        LDP     d14, d15, [sp, 48]
276        LDP     d12, d13, [sp, 32]
277        LDP     d10, d11, [sp, 16]
278        LDP     d8, d9, [sp], 64        // reload d8, d9 and release the 64-byte frame
279        RET
280
281        # Remainder - 8 bytes of A. Reached from the bias load when kc < 16
        # or from the epilogue when 8 bytes are left. There is no SMLAL
        # here: every SMULL result goes straight into SADALP, and v6 and v7
        # carry B vectors instead of A.
282        .p2align 3
2834:
284        LDR     d0, [x3], 8             // A0: 8 bytes; a0 += 8
285        LDP     d4, d5, [x5]            // B vectors at w + 0 and w + 8
286        LDR     d1, [x4], 8             // A1: 8 bytes; a1 += 8
287        LDP     d6, d7, [x5, 16]        // B vectors at w + 16 and w + 24
288        SMULL   v2.8h, v4.8b, v0.8b
289        SMULL   v3.8h, v4.8b, v1.8b
290        SMULL   v10.8h, v5.8b, v0.8b
291        SMULL   v11.8h, v5.8b, v1.8b
292        SMULL   v12.8h, v6.8b, v0.8b
293        SADALP  v16.4s,  v2.8h
294        SMULL   v13.8h, v6.8b, v1.8b
295        SADALP  v17.4s,  v3.8h
296        SMULL   v14.8h, v7.8b, v0.8b
297        SADALP  v18.4s, v10.8h
298        SMULL   v15.8h, v7.8b, v1.8b
299        SADALP  v19.4s, v11.8h
300        LDP     d4, d5, [x5, 32]        // B vectors at w + 32 and w + 40
301        SMULL   v2.8h, v4.8b, v0.8b
302        SADALP  v20.4s, v12.8h
303        SMULL   v3.8h, v4.8b, v1.8b
304        SADALP  v21.4s, v13.8h
305        SMULL   v10.8h, v5.8b, v0.8b
306        SADALP  v22.4s, v14.8h
307        SMULL   v11.8h, v5.8b, v1.8b
308        SADALP  v23.4s, v15.8h
309        LDP     d6, d7, [x5, 48]        // B vectors at w + 48 and w + 56
310        SMULL   v12.8h, v6.8b, v0.8b
311        SADALP  v24.4s,  v2.8h
312        SMULL   v13.8h, v6.8b, v1.8b
313        SADALP  v25.4s,  v3.8h
314        SMULL   v14.8h, v7.8b, v0.8b
315        SADALP  v26.4s, v10.8h
316        SMULL   v15.8h, v7.8b, v1.8b
317        SADALP  v27.4s, v11.8h
318        ADD     x5, x5, 64              // w += 64: x5 now points at the scales read at 3
319        SADALP  v28.4s, v12.8h
320        SADALP  v29.4s, v13.8h
321        SADALP  v30.4s, v14.8h
322        SADALP  v31.4s, v15.8h
323        B       3b
324
325        # Store odd width. Here x1 = nc - 8 is negative and its low 3 bits
        # equal those of the remaining column count, so bits 2, 1 and 0
        # select a 4-byte, 2-byte and 1-byte store. Each EXT rotates v0 so
        # that the next unwritten byte of row 0 sits at byte 0 and the next
        # unwritten byte of row 1 sits at byte 8.
326        .p2align 3
3275:
328        TBZ     x1, 2, 6f
329        STR     s0, [x6], 4             // row 0: 4 bytes; c0 += 4
330        ST1     {v0.s}[2], [x7], 4      // row 1: bytes 8-11 of v0; c1 += 4
331        EXT     v0.16b, v0.16b, v0.16b, 4   // rotate v0 down by 4 bytes
332
3336:
334        TBZ     x1, 1, 7f
335        STR     h0, [x6], 2             // row 0: 2 bytes; c0 += 2
336        ST1     {v0.h}[4], [x7], 2      // row 1: bytes 8-9 of v0; c1 += 2
337        EXT     v0.16b, v0.16b, v0.16b, 2   // rotate v0 down by 2 bytes
3387:
339        TBZ     x1, 0, 8f
340        STR     b0, [x6]                // row 0: last byte
341        ST1     {v0.b}[8], [x7]         // row 1: last byte
3428:
343        # Restore d8-d15 from stack
344        LDP     d14, d15, [sp, 48]
345        LDP     d12, d13, [sp, 32]
346        LDP     d10, d11, [sp, 16]
347        LDP     d8, d9, [sp], 64        // reload d8, d9 and release the 64-byte frame
348        RET
349
350END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
351
352#ifdef __ELF__
353.section ".note.GNU-stack","",%progbits
354#endif
355
356