xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# A1 x15  v1  v7
32# B   x5  v4  v5  v8  v9
33# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
34# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
35# temp0   v2 v10 v12 v14
36# temp1   v3 v11 v13 v15
37
38BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
        # Overview: produces a tile of up to 2 rows by 8 columns of int8 output
        # per pass of the nc loop. Each pass seeds eight int32 accumulators per
        # row with the bias read from w, walks the A pointer list two pointers
        # at a time while accumulating int8 x int8 products over kc bytes, then
        # reduces, scales by per-channel floats read from w, converts back to
        # integer, adds a 16-bit value from params, clamps and stores.
        # Accumulator v(16 + 2*j) belongs to row 0, column j and v(17 + 2*j) to
        # row 1, column j; each holds 4 partial sums until the ADDP reduction.
39
40        # Clamp C pointers
        # The four stack arguments are read before sp is moved. d8-d15 are then
        # saved in a 64-byte frame because v8-v15 are written by this kernel.
        # CSEL below consumes the flags of CMP x0, 2; nothing in between sets flags.
41        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
42        CMP     x0, 2                   // if mr < 2
43        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
44        ADD     x7, x6, x7              // c1 = c0 + cm_stride
45        STP     d8, d9, [sp, -64]!
46        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
47        STP     d10, d11, [sp, 16]
48        CSEL    x7, x6, x7, LO          //   c1 = c0
49        STP     d12, d13, [sp, 32]
50        BIC     x2, x2, 7               // clear the low 3 bits, completing the round-up of kc
51        STP     d14, d15, [sp, 48]
52
53        .p2align 3
540:
55        # Load initial bias from w into accumulators
        # Top of the nc loop. Eight 32-bit bias values (32 bytes) are read into
        # lane 0 of the row 0 accumulators; a scalar S-register load clears the
        # other lanes. Each one is then copied to the matching row 1 accumulator.
56        LDP     s16, s18, [x5], 8
57        MOV     v17.16b, v16.16b
58        MOV     v19.16b, v18.16b
59        LDP     s20, s22, [x5], 8
60        MOV     v21.16b, v20.16b
61        MOV     v23.16b, v22.16b
62        LDP     s24, s26, [x5], 8
63        MOV     v25.16b, v24.16b
64        MOV     v27.16b, v26.16b
65        LDP     s28, s30, [x5], 8
66        MOV     v29.16b, v28.16b
67        MOV     v31.16b, v30.16b
68        MOV     x9, x3                  // p = ks
69
70        .p2align 3
711:
72        # Load next 2 A pointers
        # Top of the ks loop. A pointer equal to zero is used as is; any other
        # pointer has a_offset added. The ADD is done unconditionally and CSEL
        # discards its result when the pointer matched zero.
73        LDP     x13, x15, [x4], 16
74        CMP     x13, x12                // if a0 == zero
75        ADD     x13, x13, x8            // a0 += a_offset
76        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
77        CMP     x15, x12                // if a1 == zero
78        ADD     x15, x15, x8            // a1 += a_offset
79        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
80
81        # Is there at least 16 bytes for epilogue?
82        SUBS    x0, x2, 16              // k = kc - 16
83        B.LO    5f
84
85        # Prologue: load A0, A1 and 2 B's
        # v0/v1 receive the first 8 bytes of A0/A1 and v6/v7 the next 8 bytes.
        # v4/v5 receive B for the first 8 bytes of k (columns 0 and 1) and v8/v9
        # the B data 64 bytes further into w, which pairs with v6/v7.
86        LDP     d4, d5, [x5]
87        LDP     d0, d6, [x13], 16
88        LDP     d1, d7, [x15], 16
89        LDP     d8, d9, [x5, 64]
90
91        # Is there at least 16 bytes for main loop?
92        SUBS    x0, x0, 16              // k = k - 16
93        B.LO    3f
94
95         # Main loop - 16 bytes of A
        # Each pass consumes the 16 bytes of A per row and the 128 bytes of B
        # that are already loaded or loaded during the pass, and preloads the
        # A and B registers for the next pass (or for the epilogue).
        # Per column pair: SMULL forms 16-bit products of B with the first 8
        # bytes of A, SMLAL adds the products for the next 8 bytes, and SADALP
        # pairwise-adds the 16-bit results into the 32-bit accumulators.
96        .p2align 3
972:
98        SMULL   v2.8h, v4.8b, v0.8b
99        SMULL   v3.8h, v4.8b, v1.8b
100        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B 448 bytes ahead
101        SMULL   v10.8h, v5.8b, v0.8b
102        SMULL   v11.8h, v5.8b, v1.8b
103        LDP     d4, d5, [x5, 16]
104        SMLAL   v2.8h, v8.8b, v6.8b
105        SMLAL   v3.8h, v8.8b, v7.8b
106        PRFM    PLDL1KEEP, [x5, 512]    // prefetch B 512 bytes ahead
107        SMLAL   v10.8h, v9.8b, v6.8b
108        SMLAL   v11.8h, v9.8b, v7.8b
109
110        LDP     d8, d9, [x5, 80]
111        SMULL   v12.8h, v4.8b, v0.8b
112        SADALP  v16.4s,  v2.8h
113        SMULL   v13.8h, v4.8b, v1.8b
114        SADALP  v17.4s,  v3.8h
115        SMULL   v14.8h, v5.8b, v0.8b
116        SADALP  v18.4s, v10.8h
117        SMULL   v15.8h, v5.8b, v1.8b
118        SADALP  v19.4s, v11.8h
119        LDP     d4, d5, [x5, 32]
120        SMLAL   v12.8h, v8.8b, v6.8b
121        SMLAL   v13.8h, v8.8b, v7.8b
122        PRFM    PLDL1KEEP, [x13, 128]   // prefetch A0 128 bytes ahead
123        SMLAL   v14.8h, v9.8b, v6.8b
124        SMLAL   v15.8h, v9.8b, v7.8b
125
126        LDP     d8, d9, [x5, 96]
127        SMULL   v2.8h, v4.8b, v0.8b
128        SADALP  v20.4s, v12.8h
129        SMULL   v3.8h, v4.8b, v1.8b
130        SADALP  v21.4s, v13.8h
131        SMULL   v10.8h, v5.8b, v0.8b
132        SADALP  v22.4s, v14.8h
133        SMULL   v11.8h, v5.8b, v1.8b
134        SADALP  v23.4s, v15.8h
135        LDP     d4, d5, [x5, 48]
136        SMLAL   v2.8h, v8.8b, v6.8b
137        SMLAL   v3.8h, v8.8b, v7.8b
138        PRFM    PLDL1KEEP, [x15, 128]   // prefetch A1 128 bytes ahead
139        SMLAL   v10.8h, v9.8b, v6.8b
140        SMLAL   v11.8h, v9.8b, v7.8b
141
142        LDP     d8, d9, [x5, 112]
143        SMULL   v12.8h, v4.8b, v0.8b
144        ADD     x5, x5, 128             // w advances past the 128 bytes of B used by this pass
145        SADALP  v24.4s,  v2.8h
146        SMULL   v13.8h, v4.8b, v1.8b
147        SADALP  v25.4s,  v3.8h
148        SMULL   v14.8h, v5.8b, v0.8b
149        SADALP  v26.4s, v10.8h
150        SMULL   v15.8h, v5.8b, v1.8b
151        SADALP  v27.4s, v11.8h
152        SMLAL   v12.8h, v8.8b, v6.8b
153        LDP     d4, d5, [x5]            // Read B
154        SMLAL   v13.8h, v8.8b, v7.8b
155        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
156        SMLAL   v14.8h, v9.8b, v6.8b
157        LDP     d0, d6, [x13], 16       // Read A0
158        SMLAL   v15.8h, v9.8b, v7.8b
159
160        SADALP  v28.4s, v12.8h
161        LDP     d1, d7, [x15], 16       // Read A1
162        SADALP  v29.4s, v13.8h
163        SADALP  v30.4s, v14.8h
164        LDP     d8, d9, [x5, 64]        // Read B
165        SADALP  v31.4s, v15.8h
166        B.HS    2b                      // repeat while the subtraction did not borrow
167
168        # Epilogue
169        # Same as main loop except no loads at end of loop
        # It works on the A and B registers left loaded by the prologue or by
        # the last main loop pass, and still reads the rest of the 128-byte B
        # block through x5.
170        .p2align 3
1713:
172        SMULL   v2.8h, v4.8b, v0.8b
173        SMULL   v3.8h, v4.8b, v1.8b
174        SMULL   v10.8h, v5.8b, v0.8b
175        SMULL   v11.8h, v5.8b, v1.8b
176        LDP     d4, d5, [x5, 16]
177        SMLAL   v2.8h, v8.8b, v6.8b
178        SMLAL   v3.8h, v8.8b, v7.8b
179        SMLAL   v10.8h, v9.8b, v6.8b
180        SMLAL   v11.8h, v9.8b, v7.8b
181
182        LDP     d8, d9, [x5, 80]
183        SMULL   v12.8h, v4.8b, v0.8b
184        SADALP  v16.4s,  v2.8h
185        SMULL   v13.8h, v4.8b, v1.8b
186        SADALP  v17.4s,  v3.8h
187        SMULL   v14.8h, v5.8b, v0.8b
188        SADALP  v18.4s, v10.8h
189        SMULL   v15.8h, v5.8b, v1.8b
190        SADALP  v19.4s, v11.8h
191        LDP     d4, d5, [x5, 32]
192        SMLAL   v12.8h, v8.8b, v6.8b
193        SMLAL   v13.8h, v8.8b, v7.8b
194        SMLAL   v14.8h, v9.8b, v6.8b
195        SMLAL   v15.8h, v9.8b, v7.8b
196
197        LDP     d8, d9, [x5, 96]
198        SMULL   v2.8h, v4.8b, v0.8b
199        SADALP  v20.4s, v12.8h
200        SMULL   v3.8h, v4.8b, v1.8b
201        SADALP  v21.4s, v13.8h
202        SMULL   v10.8h, v5.8b, v0.8b
203        SADALP  v22.4s, v14.8h
204        SMULL   v11.8h, v5.8b, v1.8b
205        SADALP  v23.4s, v15.8h
206        LDP     d4, d5, [x5, 48]
207        SMLAL   v2.8h, v8.8b, v6.8b
208        SMLAL   v3.8h, v8.8b, v7.8b
209        SMLAL   v10.8h, v9.8b, v6.8b
210        SMLAL   v11.8h, v9.8b, v7.8b
211
212        LDP     d8, d9, [x5, 112]
213        SMULL   v12.8h, v4.8b, v0.8b
214        SADALP  v24.4s,  v2.8h
215        SMULL   v13.8h, v4.8b, v1.8b
216        SADALP  v25.4s,  v3.8h
217        SMULL   v14.8h, v5.8b, v0.8b
218        SADALP  v26.4s, v10.8h
219        SMULL   v15.8h, v5.8b, v1.8b
220        SADALP  v27.4s, v11.8h
221        SMLAL   v12.8h, v8.8b, v6.8b
222        SMLAL   v13.8h, v8.8b, v7.8b
223        SMLAL   v14.8h, v9.8b, v6.8b
224        SMLAL   v15.8h, v9.8b, v7.8b
225        ADD     x5, x5, 128
226
227        SADALP  v28.4s, v12.8h
228        SADALP  v29.4s, v13.8h
229        SADALP  v30.4s, v14.8h
230        SADALP  v31.4s, v15.8h
231
232        # Is there a remainder?- 8 bytes of A
        # kc was rounded up to a multiple of 8, so k is either -16 (nothing
        # left, bit 3 clear) or -8 (8 bytes left, bit 3 set) at this point.
233        TBNZ    x0, 3, 5f
234
235        # ks loop
236        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
237        B.HI    1b
238
2394:
240        # Add columns
        # Three levels of pairwise adds collapse the 4 partial sums of every
        # accumulator into one int32 per output:
        # v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        # v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
241        ADDP    v16.4s, v16.4s, v18.4s
242        ADDP    v20.4s, v20.4s, v22.4s
243        ADDP    v24.4s, v24.4s, v26.4s
244        ADDP    v28.4s, v28.4s, v30.4s
245        ADDP    v17.4s, v17.4s, v19.4s
246        ADDP    v21.4s, v21.4s, v23.4s
247        ADDP    v25.4s, v25.4s, v27.4s
248        ADDP    v29.4s, v29.4s, v31.4s
249        ADDP    v0.4s, v16.4s, v20.4s
250        ADDP    v1.4s, v24.4s, v28.4s
251        ADDP    v2.4s, v17.4s, v21.4s
252        ADDP    v3.4s, v25.4s, v29.4s
253
254        # Load per channel scale values from weights
        # The 8 scales (32 bytes) follow the B data in w; v4 covers columns 0-3
        # and v5 columns 4-7, and both rows use the same scales. The sums are
        # converted to float, multiplied by the scale, and converted back to
        # int32 with FCVTNS (round to nearest, ties to even).
255        SCVTF   v0.4s, v0.4s
256        LDR     q4, [x5], 16
257        SCVTF   v1.4s, v1.4s
258        LDR     q5, [x5], 16
259        SCVTF   v2.4s, v2.4s
260        SCVTF   v3.4s, v3.4s
261        FMUL    v0.4s, v0.4s, v4.4s
262        FMUL    v1.4s, v1.4s, v5.4s
263        FMUL    v2.4s, v2.4s, v4.4s
264        FMUL    v3.4s, v3.4s, v5.4s
265
266        FCVTNS  v0.4s, v0.4s
267        FCVTNS  v1.4s, v1.4s
268        FCVTNS  v2.4s, v2.4s
269        FCVTNS  v3.4s, v3.4s
270
        # Narrow to int16 with saturation, add the 16-bit value at params + 0
        # (presumably the output zero point - confirm against the params
        # layout), narrow to int8 with saturation, then clamp between the byte
        # at params + 2 (lower bound, SMAX) and the byte at params + 3 (upper
        # bound, SMIN). After packing, v0 bytes 0-7 are row 0 and 8-15 row 1.
        # SUBS x1 sets the flags tested by B.LO 6f and B.HI 0b further down;
        # no instruction in between modifies the flags.
271        LD1R    {v5.8h}, [x11], 2
272        SQXTN   v0.4h, v0.4s
273        SQXTN   v2.4h, v2.4s
274        SQXTN2  v0.8h, v1.4s
275        SQXTN2  v2.8h, v3.4s
276        SUBS    x1, x1, 8               // nc -= 8
277        SQADD   v0.8h, v0.8h, v5.8h
278        SQADD   v1.8h, v2.8h, v5.8h
279        SQXTN   v0.8b, v0.8h
280        SQXTN2  v0.16b, v1.8h
281        LD1R    {v1.16b}, [x11], 1
282        LD1R    {v2.16b}, [x11]
283        SMAX    v0.16b, v0.16b, v1.16b
284        SUB     x11, x11, 3          // rewind params pointer
285        SMIN    v0.16b, v0.16b, v2.16b
286        B.LO    6f                      // fewer than 8 columns were left: partial store
287
288        # Store full 2 x 8
        # Row 1 (upper 8 bytes of v0) goes to c1 and row 0 (lower 8 bytes) to
        # c0; both pointers then advance by cn_stride.
289        ST1     {v0.d}[1], [x7], x10
290        ST1     {v0.8b}, [x6], x10
291
292        SUB     x4, x4, x3              // a -= ks
293
294        # nc loop
295        B.HI    0b
296
297        # Restore d8-d15 from stack
298        LDP     d14, d15, [sp, 48]
299        LDP     d12, d13, [sp, 32]
300        LDP     d10, d11, [sp, 16]
301        LDP     d8, d9, [sp], 64
302        RET
303
304        # Remainder - 8 bytes of A
        # Reached when kc < 16, or from the epilogue when 8 bytes of k are
        # left. Only 8 bytes per A row and 64 bytes of B are consumed, so there
        # is no SMLAL step: every SMULL result goes straight to SADALP.
        # v6/v7 hold B here instead of the second half of A.
305        .p2align 3
3065:
307        LDR     d0, [x13]
308        LDP     d4, d5, [x5]
309        LDR     d1, [x15]
310        LDP     d6, d7, [x5, 16]
311        SMULL   v2.8h, v4.8b, v0.8b
312        SMULL   v3.8h, v4.8b, v1.8b
313        SMULL   v10.8h, v5.8b, v0.8b
314        SMULL   v11.8h, v5.8b, v1.8b
315        SMULL   v12.8h, v6.8b, v0.8b
316        SADALP  v16.4s,  v2.8h
317        SMULL   v13.8h, v6.8b, v1.8b
318        SADALP  v17.4s,  v3.8h
319        SMULL   v14.8h, v7.8b, v0.8b
320        SADALP  v18.4s, v10.8h
321        SMULL   v15.8h, v7.8b, v1.8b
322        SADALP  v19.4s, v11.8h
323        LDP     d4, d5, [x5, 32]
324        SMULL   v2.8h, v4.8b, v0.8b
325        SADALP  v20.4s, v12.8h
326        SMULL   v3.8h, v4.8b, v1.8b
327        SADALP  v21.4s, v13.8h
328        SMULL   v10.8h, v5.8b, v0.8b
329        SADALP  v22.4s, v14.8h
330        SMULL   v11.8h, v5.8b, v1.8b
331        SADALP  v23.4s, v15.8h
332        LDP     d6, d7, [x5, 48]
333        SMULL   v12.8h, v6.8b, v0.8b
334        SADALP  v24.4s,  v2.8h
335        SMULL   v13.8h, v6.8b, v1.8b
336        SADALP  v25.4s,  v3.8h
337        SMULL   v14.8h, v7.8b, v0.8b
338        SADALP  v26.4s, v10.8h
339        SMULL   v15.8h, v7.8b, v1.8b
340        SADALP  v27.4s, v11.8h
341        ADD     x5, x5, 64              // w advances past the 64 bytes of B used here
342        SADALP  v28.4s, v12.8h
343        SADALP  v29.4s, v13.8h
344        SADALP  v30.4s, v14.8h
345        SADALP  v31.4s, v15.8h
346
347        # ks loop
348        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
349        B.HI    1b
350        B       4b
351
352        # Store odd width
        # x1 holds nc - 8 here, so its low 3 bits equal the number of columns
        # still to store. Bits 2, 1 and 0 select a 4-, 2- and 1-byte store per
        # row. After each store EXT rotates v0 so that the next unstored bytes
        # of row 0 start at byte 0 and those of row 1 start at byte 8.
353        .p2align 3
3546:
355        TBZ     x1, 2, 7f
356        ST1     {v0.s}[2], [x7], 4
357        STR     s0, [x6], 4
358        EXT     v0.16b, v0.16b, v0.16b, 4
359
3607:
361        TBZ     x1, 1, 8f
362        ST1     {v0.h}[4], [x7], 2
363        STR     h0, [x6], 2
364        EXT     v0.16b, v0.16b, v0.16b, 2
3658:
366        TBZ     x1, 0, 9f
367        ST1     {v0.b}[8], [x7]
368        STR     b0, [x6]
3699:
370        # Restore d8-d15 from stack
371        LDP     d14, d15, [sp, 48]
372        LDP     d12, d13, [sp, 32]
373        LDP     d10, d11, [sp, 16]
374        LDP     d8, d9, [sp], 64
375        RET
376
377END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm
378
379#ifdef __ELF__
380.section ".note.GNU-stack","",%progbits
381#endif
382