xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# A1 x15  v1  v7
32# B   x5  v4  v5  v8  v9  (the 8-byte remainder path also loads B into v6 v7)
33# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
34# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
35# temp0   v2 v10 v12 v14
36# temp1   v3 v11 v13 v15
37
38BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
        // Overview (derived from the code in this function):
        //  - nc loop (label 0): one pass per block of 8 output columns. The 16
        //    32-bit accumulators are seeded from 8 bias words read from w.
        //  - ks loop (label 1): fetches 2 A row pointers from the indirection
        //    buffer `a` and multiplies kc bytes of each row with packed B from w.
        //    Products are formed in 16-bit lanes (SMULL/SMLAL) and pairwise
        //    accumulated into 32-bit lanes (SADALP).
        //  - label 4: reduces each accumulator to one sum per column, converts
        //    to float, multiplies by 8 per-column values read from w, converts
        //    back to int32, narrows with saturation to int8, adds a 16-bit value
        //    from params and clamps against two bytes from params.
        //  - A full 2 x 8 tile is stored and the nc loop repeats; a tile of
        //    fewer than 8 columns is stored piecewise (label 6) and the
        //    function returns.
        // Two A pointers are always read per ks step. When mr < 2, c1 aliases
        // c0 and row 1 is stored before row 0, so row 0's bytes are what remain.
39
40        # Clamp C pointers
        // The stack arguments are read relative to the entry sp, i.e. before
        // the pre-decrementing STP below moves sp.
41        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
42        CMP     x0, 2                   // if mr < 2
43        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
44        ADD     x7, x6, x7              // c1 = c0 + cm_stride
45        STP     d8, d9, [sp, -64]!      // sp -= 64; save callee-saved d8-d15 (4 STPs)
46        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
47        STP     d10, d11, [sp, 16]
48        CSEL    x7, x6, x7, LO          //   c1 = c0
49        STP     d12, d13, [sp, 32]
50        BIC     x2, x2, 7               // clear low 3 bits: kc is now a multiple of 8
51        STP     d14, d15, [sp, 48]
52
53        .p2align 3
540:
55        # Load initial bias from w into accumulators
        // Each LDP reads two 32-bit words into lane 0 of two row-0 accumulators
        // (a scalar load clears the remaining lanes); the MOVs copy them to the
        // matching row-1 accumulators. 8 words (32 bytes) of w are consumed.
56        LDP     s16, s18, [x5], 8
57        MOV     v17.16b, v16.16b
58        MOV     v19.16b, v18.16b
59        LDP     s20, s22, [x5], 8
60        MOV     v21.16b, v20.16b
61        MOV     v23.16b, v22.16b
62        LDP     s24, s26, [x5], 8
63        MOV     v25.16b, v24.16b
64        MOV     v27.16b, v26.16b
65        LDP     s28, s30, [x5], 8
66        MOV     v29.16b, v28.16b
67        MOV     v31.16b, v30.16b
68        MOV     x9, x3                  // p = ks
69
70        .p2align 3
711:
72        # Load next 2 A pointers
        // A pointer equal to `zero` is used as-is; any other pointer has
        // a_offset added to it.
73        LDP     x13, x15, [x4], 16
74        CMP     x13, x12                // if a0 == zero
75        ADD     x13, x13, x8            // a0 += a_offset
76        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 = a0 + a_offset
77        CMP     x15, x12                // if a1 == zero
78        ADD     x15, x15, x8            // a1 += a_offset
79        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 = a1 + a_offset
80
81        # Is there at least 16 bytes for epilogue?
82        SUBS    x0, x2, 16              // k = kc - 16
83        B.LO    5f                      // kc < 16: only the 8-byte remainder path runs
84
85        # Prologue: load A0, A1 and 2 B's
        // v0/v1 = first 8 bytes of A0/A1, v6/v7 = next 8 bytes of A0/A1.
        // v4/v5 = B at w+0 (used with v0/v1), v8/v9 = B at w+64 (used with v6/v7).
86        LDP     d4, d5, [x5]
87        LDP     d0, d6, [x13], 16
88        LDP     d1, d7, [x15], 16
89        LDP     d8, d9, [x5, 64]
90
91        # Is there at least 16 bytes for main loop?
92        SUBS    x0, x0, 16              // k = k - 16
93        B.LO    3f                      // no: the data loaded above is consumed by the epilogue
94
95         # Main loop - 16 bytes of A
        // Each iteration consumes 16 bytes of each A row and 128 bytes of B.
        // B vectors at offsets 0/16/32/48 are multiplied with v0/v1, and those
        // at 64/80/96/112 with v6/v7. Each LDP supplies two adjacent columns.
        // Every SMULL + SMLAL pair sums two int8*int8 products per 16-bit lane
        // before SADALP widens and accumulates into the 32-bit accumulators.
        // NOTE(review): the 16-bit sum of two products presumably relies on the
        // packed weights never being -128 - confirm against the packing code.
        // Loads for the following step are interleaved with the arithmetic.
96        .p2align 3
972:
98        SMULL   v2.8h, v4.8b, v0.8b
99        SMULL   v3.8h, v4.8b, v1.8b
100        SMULL   v10.8h, v5.8b, v0.8b
101        SMULL   v11.8h, v5.8b, v1.8b
102        LDP     d4, d5, [x5, 16]
103        SMLAL   v2.8h, v8.8b, v6.8b
104        SMLAL   v3.8h, v8.8b, v7.8b
105        SMLAL   v10.8h, v9.8b, v6.8b
106        SMLAL   v11.8h, v9.8b, v7.8b
107
108        LDP     d8, d9, [x5, 80]
109        SMULL   v12.8h, v4.8b, v0.8b
110        SADALP  v16.4s,  v2.8h
111        SMULL   v13.8h, v4.8b, v1.8b
112        SADALP  v17.4s,  v3.8h
113        SMULL   v14.8h, v5.8b, v0.8b
114        SADALP  v18.4s, v10.8h
115        SMULL   v15.8h, v5.8b, v1.8b
116        SADALP  v19.4s, v11.8h
117        LDP     d4, d5, [x5, 32]
118        SMLAL   v12.8h, v8.8b, v6.8b
119        SMLAL   v13.8h, v8.8b, v7.8b
120        SMLAL   v14.8h, v9.8b, v6.8b
121        SMLAL   v15.8h, v9.8b, v7.8b
122
123        LDP     d8, d9, [x5, 96]
124        SMULL   v2.8h, v4.8b, v0.8b
125        SADALP  v20.4s, v12.8h
126        SMULL   v3.8h, v4.8b, v1.8b
127        SADALP  v21.4s, v13.8h
128        SMULL   v10.8h, v5.8b, v0.8b
129        SADALP  v22.4s, v14.8h
130        SMULL   v11.8h, v5.8b, v1.8b
131        SADALP  v23.4s, v15.8h
132        LDP     d4, d5, [x5, 48]
133        SMLAL   v2.8h, v8.8b, v6.8b
134        SMLAL   v3.8h, v8.8b, v7.8b
135        SMLAL   v10.8h, v9.8b, v6.8b
136        SMLAL   v11.8h, v9.8b, v7.8b
137
138        LDP     d8, d9, [x5, 112]
139        SMULL   v12.8h, v4.8b, v0.8b
140        ADD     x5, x5, 128             // w += 128; the loads below address the next B block
141        SADALP  v24.4s,  v2.8h
142        SMULL   v13.8h, v4.8b, v1.8b
143        SADALP  v25.4s,  v3.8h
144        SMULL   v14.8h, v5.8b, v0.8b
145        SADALP  v26.4s, v10.8h
146        SMULL   v15.8h, v5.8b, v1.8b
147        SADALP  v27.4s, v11.8h
148        SMLAL   v12.8h, v8.8b, v6.8b
149        LDP     d4, d5, [x5]            // Read B
150        SMLAL   v13.8h, v8.8b, v7.8b
151        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
152        SMLAL   v14.8h, v9.8b, v6.8b
153        LDP     d0, d6, [x13], 16       // Read A0
154        SMLAL   v15.8h, v9.8b, v7.8b
155
156        SADALP  v28.4s, v12.8h
157        LDP     d1, d7, [x15], 16       // Read A1
158        SADALP  v29.4s, v13.8h
159        SADALP  v30.4s, v14.8h
160        LDP     d8, d9, [x5, 64]        // Read B
161        SADALP  v31.4s, v15.8h
162        B.HS    2b                      // repeat while the SUBS above did not borrow
163
164        # Epilogue
165        # Same as main loop except no loads at end of loop
        // Consumes the 16 bytes of A and first B vectors already held in
        // v0/v1/v6/v7 and v4/v5/v8/v9 on entry.
166        .p2align 3
1673:
168        SMULL   v2.8h, v4.8b, v0.8b
169        SMULL   v3.8h, v4.8b, v1.8b
170        SMULL   v10.8h, v5.8b, v0.8b
171        SMULL   v11.8h, v5.8b, v1.8b
172        LDP     d4, d5, [x5, 16]
173        SMLAL   v2.8h, v8.8b, v6.8b
174        SMLAL   v3.8h, v8.8b, v7.8b
175        SMLAL   v10.8h, v9.8b, v6.8b
176        SMLAL   v11.8h, v9.8b, v7.8b
177
178        LDP     d8, d9, [x5, 80]
179        SMULL   v12.8h, v4.8b, v0.8b
180        SADALP  v16.4s,  v2.8h
181        SMULL   v13.8h, v4.8b, v1.8b
182        SADALP  v17.4s,  v3.8h
183        SMULL   v14.8h, v5.8b, v0.8b
184        SADALP  v18.4s, v10.8h
185        SMULL   v15.8h, v5.8b, v1.8b
186        SADALP  v19.4s, v11.8h
187        LDP     d4, d5, [x5, 32]
188        SMLAL   v12.8h, v8.8b, v6.8b
189        SMLAL   v13.8h, v8.8b, v7.8b
190        SMLAL   v14.8h, v9.8b, v6.8b
191        SMLAL   v15.8h, v9.8b, v7.8b
192
193        LDP     d8, d9, [x5, 96]
194        SMULL   v2.8h, v4.8b, v0.8b
195        SADALP  v20.4s, v12.8h
196        SMULL   v3.8h, v4.8b, v1.8b
197        SADALP  v21.4s, v13.8h
198        SMULL   v10.8h, v5.8b, v0.8b
199        SADALP  v22.4s, v14.8h
200        SMULL   v11.8h, v5.8b, v1.8b
201        SADALP  v23.4s, v15.8h
202        LDP     d4, d5, [x5, 48]
203        SMLAL   v2.8h, v8.8b, v6.8b
204        SMLAL   v3.8h, v8.8b, v7.8b
205        SMLAL   v10.8h, v9.8b, v6.8b
206        SMLAL   v11.8h, v9.8b, v7.8b
207
208        LDP     d8, d9, [x5, 112]
209        SMULL   v12.8h, v4.8b, v0.8b
210        SADALP  v24.4s,  v2.8h
211        SMULL   v13.8h, v4.8b, v1.8b
212        SADALP  v25.4s,  v3.8h
213        SMULL   v14.8h, v5.8b, v0.8b
214        SADALP  v26.4s, v10.8h
215        SMULL   v15.8h, v5.8b, v1.8b
216        SADALP  v27.4s, v11.8h
217        SMLAL   v12.8h, v8.8b, v6.8b
218        SMLAL   v13.8h, v8.8b, v7.8b
219        SMLAL   v14.8h, v9.8b, v6.8b
220        SMLAL   v15.8h, v9.8b, v7.8b
221        ADD     x5, x5, 128             // w += 128: step past the B block just consumed
222
223        SADALP  v28.4s, v12.8h
224        SADALP  v29.4s, v13.8h
225        SADALP  v30.4s, v14.8h
226        SADALP  v31.4s, v15.8h
227
228        # Is there a remainder? - 8 bytes of A
        // k differs from kc by a multiple of 16, so bit 3 of k equals bit 3 of
        // the (8-aligned) kc: it is set exactly when 8 bytes are still left.
229        TBNZ    x0, 3, 5f
230
231        # ks loop
232        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
233        B.HI    1b                      // more A pointer pairs to process
234
2354:
236        # Add columns
        // Pairwise adds collapse the 4 lanes of every accumulator to one sum:
        //   v0 = row 0, columns 0-3    v1 = row 0, columns 4-7
        //   v2 = row 1, columns 0-3    v3 = row 1, columns 4-7
237        ADDP    v16.4s, v16.4s, v18.4s
238        ADDP    v20.4s, v20.4s, v22.4s
239        ADDP    v24.4s, v24.4s, v26.4s
240        ADDP    v28.4s, v28.4s, v30.4s
241        ADDP    v17.4s, v17.4s, v19.4s
242        ADDP    v21.4s, v21.4s, v23.4s
243        ADDP    v25.4s, v25.4s, v27.4s
244        ADDP    v29.4s, v29.4s, v31.4s
245        ADDP    v0.4s, v16.4s, v20.4s
246        ADDP    v1.4s, v24.4s, v28.4s
247        ADDP    v2.4s, v17.4s, v21.4s
248        ADDP    v3.4s, v25.4s, v29.4s
249
250        # Load per channel scale values from weights
        // 32 bytes of w are consumed: v4 scales columns 0-3 and v5 columns 4-7,
        // and both are applied to row 0 and row 1.
251        SCVTF   v0.4s, v0.4s            // int32 -> float
252        LDR     q4, [x5], 16
253        SCVTF   v1.4s, v1.4s
254        LDR     q5, [x5], 16
255        SCVTF   v2.4s, v2.4s
256        SCVTF   v3.4s, v3.4s
257        FMUL    v0.4s, v0.4s, v4.4s
258        FMUL    v1.4s, v1.4s, v5.4s
259        FMUL    v2.4s, v2.4s, v4.4s
260        FMUL    v3.4s, v3.4s, v5.4s
261
262        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest (ties to even)
263        FCVTNS  v1.4s, v1.4s
264        FCVTNS  v2.4s, v2.4s
265        FCVTNS  v3.4s, v3.4s
266
        // params is read as a 16-bit value at +0 followed by bytes at +2 and +3.
        // NOTE(review): the 16-bit value is presumably the output zero point -
        // confirm against the params struct definition.
267        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit value at params + 0
268        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
269        SQXTN   v2.4h, v2.4s
270        SQXTN2  v0.8h, v1.4s            // v0 = row 0, 8 columns
271        SQXTN2  v2.8h, v3.4s            // v2 = row 1, 8 columns
272        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO below and B.HI of the nc loop
273        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the broadcast 16-bit value
274        SQADD   v1.8h, v2.8h, v5.8h
275        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8: low half = row 0
276        SQXTN2  v0.16b, v1.8h           //                                  high half = row 1
277        LD1R    {v1.16b}, [x11], 1      // broadcast byte at params + 2: lower bound (SMAX)
278        LD1R    {v2.16b}, [x11]         // broadcast byte at params + 3: upper bound (SMIN)
279        SMAX    v0.16b, v0.16b, v1.16b
280        SUB     x11, x11, 3          // rewind params pointer
281        SMIN    v0.16b, v0.16b, v2.16b
282        B.LO    6f                      // fewer than 8 columns were left: partial store
283
284        # Store full 2 x 8
        // Row 1 is written before row 0; both pointers advance by cn_stride.
285        ST1     {v0.d}[1], [x7], x10
286        ST1     {v0.8b}, [x6], x10
287
288        SUB     x4, x4, x3              // a -= ks
289
290        # nc loop
        // Nothing since the SUBS on nc has written the condition flags.
291        B.HI    0b                      // loop while columns remain (nc > 0)
292
293        # Restore d8-d15 from stack
294        LDP     d14, d15, [sp, 48]
295        LDP     d12, d13, [sp, 32]
296        LDP     d10, d11, [sp, 16]
297        LDP     d8, d9, [sp], 64        // also releases the 64-byte save area
298        RET
299
300        # Remainder - 8 bytes of A
        // Handles the last 8 bytes of each A row against 64 bytes of B.
        // Here v6/v7 hold B (not A), and each 16-bit lane receives a single
        // product (SMULL only) before SADALP accumulates it.
        // The A pointers are not advanced; they are reloaded at label 1.
301        .p2align 3
3025:
303        LDR     d0, [x13]
304        LDP     d4, d5, [x5]
305        LDR     d1, [x15]
306        LDP     d6, d7, [x5, 16]
307        SMULL   v2.8h, v4.8b, v0.8b
308        SMULL   v3.8h, v4.8b, v1.8b
309        SMULL   v10.8h, v5.8b, v0.8b
310        SMULL   v11.8h, v5.8b, v1.8b
311        SMULL   v12.8h, v6.8b, v0.8b
312        SADALP  v16.4s,  v2.8h
313        SMULL   v13.8h, v6.8b, v1.8b
314        SADALP  v17.4s,  v3.8h
315        SMULL   v14.8h, v7.8b, v0.8b
316        SADALP  v18.4s, v10.8h
317        SMULL   v15.8h, v7.8b, v1.8b
318        SADALP  v19.4s, v11.8h
319        LDP     d4, d5, [x5, 32]
320        SMULL   v2.8h, v4.8b, v0.8b
321        SADALP  v20.4s, v12.8h
322        SMULL   v3.8h, v4.8b, v1.8b
323        SADALP  v21.4s, v13.8h
324        SMULL   v10.8h, v5.8b, v0.8b
325        SADALP  v22.4s, v14.8h
326        SMULL   v11.8h, v5.8b, v1.8b
327        SADALP  v23.4s, v15.8h
328        LDP     d6, d7, [x5, 48]
329        SMULL   v12.8h, v6.8b, v0.8b
330        SADALP  v24.4s,  v2.8h
331        SMULL   v13.8h, v6.8b, v1.8b
332        SADALP  v25.4s,  v3.8h
333        SMULL   v14.8h, v7.8b, v0.8b
334        SADALP  v26.4s, v10.8h
335        SMULL   v15.8h, v7.8b, v1.8b
336        SADALP  v27.4s, v11.8h
337        ADD     x5, x5, 64              // w += 64: step past the B block just consumed
338        SADALP  v28.4s, v12.8h
339        SADALP  v29.4s, v13.8h
340        SADALP  v30.4s, v14.8h
341        SADALP  v31.4s, v15.8h
342
343        # ks loop
344        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
345        B.HI    1b
346        B       4b                      // all A pointers done: reduce, requantize and store
347
348        # Store odd width
        // x1 holds nc - 8 here; its low 3 bits equal those of the number of
        // columns left (1-7). 4, 2 and 1 columns are stored according to bits
        // 2, 1 and 0. Row 0 is taken from the low half of v0, row 1 from the
        // high half. Each EXT rotates v0 so the next unstored columns sit at
        // byte 0 (row 0) and byte 8 (row 1).
        // This path returns without advancing by cn_stride or rewinding a.
349        .p2align 3
3506:
351        TBZ     x1, 2, 7f
352        ST1     {v0.s}[2], [x7], 4
353        STR     s0, [x6], 4
354        EXT     v0.16b, v0.16b, v0.16b, 4
355
3567:
357        TBZ     x1, 1, 8f
358        ST1     {v0.h}[4], [x7], 2
359        STR     h0, [x6], 2
360        EXT     v0.16b, v0.16b, v0.16b, 2
3618:
362        TBZ     x1, 0, 9f
363        ST1     {v0.b}[8], [x7]
364        STR     b0, [x6]
3659:
366        # Restore d8-d15 from stack
367        LDP     d14, d15, [sp, 48]
368        LDP     d12, d13, [sp, 32]
369        LDP     d10, d11, [sp, 16]
370        LDP     d8, d9, [sp], 64        // also releases the 64-byte save area
371        RET
372
373END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal
374
375#ifdef __ELF__
376.section ".note.GNU-stack","",%progbits
377#endif
378