// xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

// Computes a 2-row by 8-column tile of int8 outputs per pass of the nc loop.
// A rows are read through the pointer array `a` (two pointers per ks step),
// and K is consumed in groups of 8 bytes. x0 (mr) is reused as the K byte
// counter and x9 holds the remaining ks bytes of the current tile.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# A1 x15  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
//
// In the 8-byte remainder path (label 5) v6 and v7 hold B data instead of A.
// Each accumulator register holds four partial sums for one output element;
// they are reduced with ADDP at label 4.

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm

        # Clamp C pointers
        // The stack arguments are read before sp is moved by the spill below.
        // d8-d15 are spilled because v8-v15 are used as B and temp registers.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -64]!      // Allocate 64 bytes and save d8, d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // (second half of the kc round-up)
        STP     d14, d15, [sp, 48]

        // nc loop head: one pass per 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each LDP puts two 32-bit values into lane 0 of two row-0
        // accumulators (a scalar load clears the remaining lanes); the row-1
        // accumulators start as copies of the row-0 ones.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        // ks loop head: one pass per pair of A row pointers.
        .p2align 3
1:
        # Load next 2 A pointers
        // A pointer equal to `zero` is used unchanged; any other pointer has
        // a_offset added to it.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc < 16: only the 8-byte remainder

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        LDP     d8, d9, [x5, 64]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        // Per iteration: 16 bytes of each A row and 128 bytes of B.
        // B at [x5 + 0..63] is multiplied with the first 8 bytes of A (v0/v1),
        // B at [x5 + 64..127] with the second 8 bytes (v6/v7).
        // SMULL forms int16 products, SMLAL adds the second product into the
        // same int16 lanes, SADALP pairwise-adds them into the int32
        // accumulators. PRFM prefetches ahead in B and in both A rows.
        .p2align 3
2:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 512]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        // The loads marked "Read" fetch data for the next iteration (or for
        // the epilogue) and are placed after the last use of each register.
        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        ADD     x5, x5, 128
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]            // Read B
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        SMLAL   v14.8h, v9.8b, v6.8b
        LDP     d0, d6, [x13], 16       // Read A0
        SMLAL   v15.8h, v9.8b, v7.8b

        SADALP  v28.4s, v12.8h
        LDP     d1, d7, [x15], 16       // Read A1
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]        // Read B
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        // (and no prefetches)
        .p2align 3
3:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        // x0 is kc minus a multiple of 16 and kc is a multiple of 8, so bit 3
        // of x0 is set exactly when 8 bytes of K are still unprocessed.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        // Reduce the four partial sums of every accumulator. Afterwards:
        //   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7
        //   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        // Convert to float, multiply by the 32-bit value broadcast from
        // params, and convert back to int32 with round-to-nearest (FCVTNS).
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // Narrow with saturation to int16, add the 16-bit value broadcast from
        // params, narrow to int8, then clamp between the two byte values that
        // follow in params. Result: v0 bytes 0-7 = row 0, bytes 8-15 = row 1.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // lower clamp bound
        LD1R    {v2.16b}, [x11]         // upper clamp bound
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7          // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // fewer than 8 columns left

        # Store full 2 x 8
        // Both C pointers are post-incremented by cn_stride (x10).
        ST1     {v0.d}[1], [x7], x10
        ST1     {v0.8b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder - 8 bytes of A
        // Uses 8 bytes of each A row and 64 bytes of B; v4-v7 all hold B here.
        .p2align 3
5:
        LDR     d0, [x13]
        LDP     d4, d5, [x5]
        LDR     d1, [x15]
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        // x1 is nc - 8 here, so its low three bits equal those of the
        // remaining column count. 4, 2 and 1 columns are stored in turn; EXT
        // rotates v0 so the next unwritten column of row 0 is at byte 0 and
        // that of row 1 at byte 8.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm

// On ELF targets, emit an empty .note.GNU-stack section; by convention this
// tells the linker the object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
