xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# A1  x4  v1  v7
30# B   x5  v4  v5  v8  v9
31# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
32# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
33# temp0   v2 v10 v12 v14
34# temp1   v3 v11 v13 v15
35# x16, x17, x20, x21 temporary a53 gpr load data
36
37
38BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
39
        # Overview (as implemented below):
        #   Each pass of the outer loop at label 0 produces one 2-row x 8-column
        #   tile of int8 output. Per tile, the weights stream at x5 is read in
        #   this order: eight 32-bit bias values, 64 bytes of B for every
        #   8 bytes of A, then eight 32-bit float per-channel scales.
        #   Accumulators v16,v18,...,v30 belong to row 0 and v17,v19,...,v31 to
        #   row 1; each even/odd pair covers one output column.
        #   Local labels: 0 = per-tile setup, 1 = main loop (16 bytes of A),
        #   2 = epilogue (final 16 bytes, no look-ahead loads), 3 = reduce,
        #   requantize and store, 4 = 8-byte remainder, 5 to 8 = partial-width
        #   store and return.
        #   Stack frame: 80 bytes; d8-d15 at [sp, 0..63], x20-x21 at [sp, 64].
        #   The stack saves below are interleaved with the pointer arithmetic.
40        # Clamp A and C pointers
41        CMP     x0, 2                   // if mr < 2
42        STP     d8, d9, [sp, -80]!      // allocate the 80-byte frame; save d8-d9 at [sp, 0]
43        ADD     x4, x3, x4              // a1 = a0 + a_stride
44        STP     d10, d11, [sp, 16]      // save d10-d11
45        ADD     x7, x6, x7              // c1 = c0 + cm_stride
46        STP     d12, d13, [sp, 32]      // save d12-d13
47        CSEL    x4, x3, x4, LO          //   a1 = a0
48        STP     d14, d15, [sp, 48]      // save d14-d15
49        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
50        CSEL    x7, x6, x7, LO          //   c1 = c0
51        BIC     x2, x2, 7               // kc is now a multiple of 8
52        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack
53
54        .p2align 3
550:
        # Outer loop entry: one pass per 8 output columns (re-entered by B.HI 0b).
        # Each scalar LDP below places one 32-bit bias in the low lane of an
        # accumulator (AArch64 scalar SIMD loads zero the remaining lanes); the
        # MOVs copy the row 0 start value into the matching row 1 accumulator.
56        # Load initial bias from w into accumulators
57        SUBS    x0, x2, 16              // k = kc - 16
58        LDP     s16, s18, [x5], 8       // bias for columns 0, 1
59        MOV     v17.16b, v16.16b
60        MOV     v19.16b, v18.16b
61        LDP     s20, s22, [x5], 8       // bias for columns 2, 3
62        MOV     v21.16b, v20.16b
63        MOV     v23.16b, v22.16b
64        LDP     s24, s26, [x5], 8       // bias for columns 4, 5
65        MOV     v25.16b, v24.16b
66        MOV     v27.16b, v26.16b
67        LDP     s28, s30, [x5], 8       // bias for columns 6, 7
68        MOV     v29.16b, v28.16b
69        LDP     x10, x11, [sp, 80]       // cn_stride, params (stack args sit above the 80-byte frame; x11 is reloaded every pass because label 3 post-increments it)
70        MOV     v31.16b, v30.16b
71        # Is there at least 16 bytes for epilogue?
72        B.LO    4f                      // flags still from SUBS above: kc < 16, only the 8-byte remainder runs
73
74        # Prologue: load A0, A1 and 2 B's
75        LDP     d4, d5, [x5]            // Read B (bytes 0..15: columns 0, 1 for the first 8 bytes of A)
76        LDP     d0, d6, [x3], 16        // Read A0 (d0 = first 8 bytes, d6 = second 8 bytes)
77        LDR     x17, [x5, 64]           // Read B (bytes 64..71, inserted into v8 in BLOCK 0)
78        LDP     d1, d7, [x4], 16        // Read A1 (d1 = first 8 bytes, d7 = second 8 bytes)
79        LDR     x16, [x5, 16]           // B bytes 16..23, inserted into v4 for BLOCK 1
80
81        # Is there at least 16 bytes for main loop?
82        SUBS    x0, x0, 16              // k = k - 16
83        B.LO    2f
84
85         # Main loop - 16 bytes of A
86         # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
87         # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        # Each BLOCK handles two output columns for both rows: SMULL multiplies
        # B by the first 8 bytes of A (v0/v1), SMLAL adds the products for the
        # second 8 bytes (v6/v7, B taken from 64 bytes further on), and SADALP
        # pairwise-adds the int16 results into the int32 accumulators.
        # Operands for upcoming blocks are loaded either straight into d
        # registers or into x16/x17/x20/x21 and moved across with INS.
        # NOTE(review): presumably this split is a Cortex-A53 scheduling
        # choice - confirm against the template before reordering anything.
88
89        .p2align 3
901:
90        # BLOCK 0 - 18 cycles - includes prfm
        # NOTE(review): the heading mentions prfm, but no PRFM instruction
        # appears in this block.
92        LDR     d9, [x5, 72]            // Read B
93        INS     v8.d[0], x17            // v8 = B bytes fetched earlier into x17
94        SMULL   v2.8h, v4.8b, v0.8b     // column 0, row 0
95        SMULL   v3.8h, v4.8b, v1.8b     // column 0, row 1
96        LDR     x17, [x5, 80]           // next v8 (used by BLOCK 1)
97        SMULL   v10.8h, v5.8b, v0.8b    // column 1, row 0
98        SMULL   v11.8h, v5.8b, v1.8b    // column 1, row 1
99        LDR     d5, [x5, 24]            // v5 for BLOCK 1 (v5 was last read just above)
100        INS     v4.d[0], x16           // v4 for BLOCK 1
101        SMLAL   v2.8h, v8.8b, v6.8b    // second 8 bytes of A
102        SMLAL   v3.8h, v8.8b, v7.8b
103        LDR     x16, [x5, 32]          // next v4 (used by BLOCK 2)
104        SMLAL   v10.8h, v9.8b, v6.8b
105        SMLAL   v11.8h, v9.8b, v7.8b
106        SADALP  v16.4s,  v2.8h         // accumulate column 0, row 0
107        SADALP  v17.4s,  v3.8h         // accumulate column 0, row 1
108        SADALP  v18.4s, v10.8h         // accumulate column 1, row 0
109        SADALP  v19.4s, v11.8h         // accumulate column 1, row 1
110
111        # BLOCK 1- 18 cycles
        # Columns 2 and 3, accumulated into v20-v23.
112        LDR     d9, [x5, 88]
113        INS     v8.d[0], x17
114        SMULL   v12.8h, v4.8b, v0.8b
115        SMULL   v13.8h, v4.8b, v1.8b
116        LDR     x17, [x5, 96]
117        SMULL   v14.8h, v5.8b, v0.8b
118        SMULL   v15.8h, v5.8b, v1.8b
119        LDR     d5, [x5, 40]
120        INS     v4.d[0], x16
121        SMLAL   v12.8h, v8.8b, v6.8b
122        SMLAL   v13.8h, v8.8b, v7.8b
123        LDR     x16, [x5, 48]
124        SMLAL   v14.8h, v9.8b, v6.8b
125        SMLAL   v15.8h, v9.8b, v7.8b
126        SADALP  v20.4s, v12.8h
127        SADALP  v21.4s, v13.8h
128        SADALP  v22.4s, v14.8h
129        SADALP  v23.4s, v15.8h
130
131        # BLOCK 2 - 18 cycles
        # Columns 4 and 5, accumulated into v24-v27. Also starts fetching the
        # next 16 bytes of A and the first B bytes of the next iteration.
132        LDR     d9, [x5, 104]
133        INS     v8.d[0], x17
134        SMULL   v2.8h, v4.8b, v0.8b
135        SMULL   v3.8h, v4.8b, v1.8b
136        LDR     x17, [x5, 112]
137        SMULL   v10.8h, v5.8b, v0.8b
138        SMULL   v11.8h, v5.8b, v1.8b
139        LDR     d5, [x5, 56]
140        INS     v4.d[0], x16
141        SMLAL   v2.8h, v8.8b, v6.8b
142        SMLAL   v3.8h, v8.8b, v7.8b
143        LDR     x16, [x5, 128]         // B bytes 0..7 of the next 128-byte group
144        SMLAL   v10.8h, v9.8b, v6.8b
145        SMLAL   v11.8h, v9.8b, v7.8b
146        SADALP  v24.4s,  v2.8h
147        LDR     x20, [x3], 8            // Read A0
148        SADALP  v25.4s,  v3.8h
149        LDR     x21, [x4], 8            // Read A1
150        SADALP  v26.4s, v10.8h
151        SADALP  v27.4s, v11.8h
152        SUBS    x0, x0, 16             // k -= 16; flags are consumed by B.HS at the end of BLOCK 3
153
154        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Columns 6 and 7, accumulated into v28-v31. v0/v1/v4/v5/v6/v7 are
        # overwritten with next-iteration data only after their last use here.
155        LDR     d9, [x5, 120]
156        INS     v8.d[0], x17
157        SMULL   v12.8h, v4.8b, v0.8b
158        SMULL   v13.8h, v4.8b, v1.8b
159        LDR     x17, [x5, 192]          // Read B
160        SMULL   v14.8h, v5.8b, v0.8b
161        SMULL   v15.8h, v5.8b, v1.8b
162        LDR     d5, [x5, 136]           // Read B
163        INS     v4.d[0], x16
164        SMLAL   v12.8h, v8.8b, v6.8b
165        SMLAL   v13.8h, v8.8b, v7.8b
166        LDR     x16, [x5, 144]
167        SMLAL   v14.8h, v9.8b, v6.8b
168        SMLAL   v15.8h, v9.8b, v7.8b
169        LDR     d6, [x3], 8             // Read A0
170        INS     v0.d[0], x20
171        LDR     d7, [x4], 8             // Read A1
172        INS     v1.d[0], x21
173        SADALP  v28.4s, v12.8h
174        SADALP  v29.4s, v13.8h
175        ADD     x5, x5, 128            // advance w past the 128 bytes of B just consumed
176        SADALP  v30.4s, v14.8h
177        SADALP  v31.4s, v15.8h
178        B.HS    1b                     // loop while the SUBS in BLOCK 2 did not borrow
179
180        # Epilogue
181        # Same as main loop except no loads at end of loop
182
183        .p2align 3
1842:
185        # BLOCK 0 - 18 cycles
186        LDR     d9, [x5, 72]            // Read B
187        INS     v8.d[0], x17
188        SMULL   v2.8h, v4.8b, v0.8b
189        SMULL   v3.8h, v4.8b, v1.8b
190        LDR     x17, [x5, 80]
191        SMULL   v10.8h, v5.8b, v0.8b
192        SMULL   v11.8h, v5.8b, v1.8b
193        LDR     d5, [x5, 24]
194        INS     v4.d[0], x16
195        SMLAL   v2.8h, v8.8b, v6.8b
196        SMLAL   v3.8h, v8.8b, v7.8b
197        LDR     x16, [x5, 32]
198        SMLAL   v10.8h, v9.8b, v6.8b
199        SMLAL   v11.8h, v9.8b, v7.8b
200        SADALP  v16.4s,  v2.8h
201        SADALP  v17.4s,  v3.8h
202        SADALP  v18.4s, v10.8h
203        SADALP  v19.4s, v11.8h
204
205        # BLOCK 1- 18 cycles
206        LDR     d9, [x5, 88]
207        INS     v8.d[0], x17
208        SMULL   v12.8h, v4.8b, v0.8b
209        SMULL   v13.8h, v4.8b, v1.8b
210        LDR     x17, [x5, 96]
211        SMULL   v14.8h, v5.8b, v0.8b
212        SMULL   v15.8h, v5.8b, v1.8b
213        LDR     d5, [x5, 40]
214        INS     v4.d[0], x16
215        SMLAL   v12.8h, v8.8b, v6.8b
216        SMLAL   v13.8h, v8.8b, v7.8b
217        LDR     x16, [x5, 48]
218        SMLAL   v14.8h, v9.8b, v6.8b
219        SMLAL   v15.8h, v9.8b, v7.8b
220        SADALP  v20.4s, v12.8h
221        SADALP  v21.4s, v13.8h
222        SADALP  v22.4s, v14.8h
223        SADALP  v23.4s, v15.8h
224
225        # BLOCK 2 - 18 cycles
        # Unlike the main loop, no A data and no next-group B data is fetched.
226        LDR     d9, [x5, 104]
227        INS     v8.d[0], x17
228        SMULL   v2.8h, v4.8b, v0.8b
229        SMULL   v3.8h, v4.8b, v1.8b
230        LDR     x17, [x5, 112]
231        SMULL   v10.8h, v5.8b, v0.8b
232        SMULL   v11.8h, v5.8b, v1.8b
233        LDR     d5, [x5, 56]
234        INS     v4.d[0], x16
235        SMLAL   v2.8h, v8.8b, v6.8b
236        SMLAL   v3.8h, v8.8b, v7.8b
237        SMLAL   v10.8h, v9.8b, v6.8b
238        SMLAL   v11.8h, v9.8b, v7.8b
239        SADALP  v24.4s,  v2.8h
240        SADALP  v25.4s,  v3.8h
241        SADALP  v26.4s, v10.8h
242        SADALP  v27.4s, v11.8h
243
244        # BLOCK 3 - 17 cycles
245        LDR     d9, [x5, 120]
246        INS     v8.d[0], x17
247        SMULL   v12.8h, v4.8b, v0.8b
248        SMULL   v13.8h, v4.8b, v1.8b
249        SMULL   v14.8h, v5.8b, v0.8b
250        SMULL   v15.8h, v5.8b, v1.8b
251        SMLAL   v12.8h, v8.8b, v6.8b
252        SMLAL   v13.8h, v8.8b, v7.8b
253        SMLAL   v14.8h, v9.8b, v6.8b
254        SMLAL   v15.8h, v9.8b, v7.8b
255        SADALP  v28.4s, v12.8h
256        SADALP  v29.4s, v13.8h
257        ADD     x5, x5, 128            // advance w past the 128 bytes of B just consumed
258        SADALP  v30.4s, v14.8h
259        SADALP  v31.4s, v15.8h
260
261        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8, so k (x0) is either -16 (nothing left, bit 3
        # clear) or -8 (8 bytes left, bit 3 set) at this point.
262        TBNZ    x0, 3, 4f
263
264        .p2align 3
2653:
266        # Add columns
        # Two rounds of pairwise adds collapse the four int32 lanes of every
        # accumulator into one value per output column.
267        ADDP    v16.4s, v16.4s, v18.4s
268        ADDP    v20.4s, v20.4s, v22.4s
269        ADDP    v24.4s, v24.4s, v26.4s
270        ADDP    v28.4s, v28.4s, v30.4s
271        ADDP    v17.4s, v17.4s, v19.4s
272        ADDP    v21.4s, v21.4s, v23.4s
273        ADDP    v25.4s, v25.4s, v27.4s
274        ADDP    v29.4s, v29.4s, v31.4s
275        ADDP    v0.4s, v16.4s, v20.4s   // row 0, columns 0-3
276        ADDP    v1.4s, v24.4s, v28.4s   // row 0, columns 4-7
277        ADDP    v2.4s, v17.4s, v21.4s   // row 1, columns 0-3
278        ADDP    v3.4s, v25.4s, v29.4s   // row 1, columns 4-7
279
280        # Load per channel scale values from weights
281        SCVTF   v0.4s, v0.4s            // int32 sums -> float
282        LDR     q4, [x5], 16            // scales for columns 0-3
283        SCVTF   v1.4s, v1.4s
284        LDR     q5, [x5], 16            // scales for columns 4-7
285        SCVTF   v2.4s, v2.4s
286        SCVTF   v3.4s, v3.4s
287        FMUL    v0.4s, v0.4s, v4.4s     // both rows use the same per-column scales
288        FMUL    v1.4s, v1.4s, v5.4s
289        FMUL    v2.4s, v2.4s, v4.4s
290        FMUL    v3.4s, v3.4s, v5.4s
291
292        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest with ties to even
293        FCVTNS  v1.4s, v1.4s
294        FCVTNS  v2.4s, v2.4s
295        FCVTNS  v3.4s, v3.4s
296
297        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit value at params + 0 (presumably the output zero point - confirm against the params struct)
298        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
299        SQXTN   v2.4h, v2.4s
300        SQXTN2  v0.8h, v1.4s            // v0 = row 0, columns 0-7
301        SQXTN2  v2.8h, v3.4s            // v2 = row 1, columns 0-7
302        SUBS    x1, x1, 8               // nc -= 8; nothing below changes NZCV before B.LO and B.HI
303        SQADD   v0.8h, v0.8h, v5.8h
304        SQADD   v1.8h, v2.8h, v5.8h
305        SQXTN   v0.8b, v0.8h            // row 0 -> bytes 0..7 of v0
306        SQXTN2  v0.16b, v1.8h           // row 1 -> bytes 8..15 of v0
307        LD1R    {v1.16b}, [x11], 1      // broadcast the byte at params + 2: lower clamp bound
308        LD1R    {v2.16b}, [x11]         // broadcast the byte at params + 3: upper clamp bound
309        SMAX    v0.16b, v0.16b, v1.16b
310        SMIN    v0.16b, v0.16b, v2.16b
311        B.LO    5f                      // fewer than 8 columns were left: partial-width store
312
313        # Store full 2 x 8
314        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
315        SUB     x3, x3, x2              // a0 -= kc
316        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
317        SUB     x4, x4, x2              // a1 -= kc
318        B.HI    0b                      // columns remain: start the next tile
319
320        # Restore x20,x21 from stack
321        LDP     x20, x21, [sp, 64]
322
323        # Restore d8-d15 from stack
324        LDP     d14, d15, [sp, 48]
325        LDP     d12, d13, [sp, 32]
326        LDP     d10, d11, [sp, 16]
327        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
328        RET
329
330        # Remainder - 8 bytes of A
        # Plain SMULL + SADALP over 64 bytes of B (8 columns x 8 bytes); there
        # is no second half of A here, so no SMLAL is needed.
331        .p2align 3
3324:
333        LDR     d0, [x3], 8             // A0, 8 bytes
334        LDP     d4, d5, [x5]            // B for columns 0, 1
335        LDR     d1, [x4], 8             // A1, 8 bytes
336        LDP     d6, d7, [x5, 16]        // B for columns 2, 3
337        SMULL   v2.8h, v4.8b, v0.8b
338        SMULL   v3.8h, v4.8b, v1.8b
339        SMULL   v10.8h, v5.8b, v0.8b
340        SMULL   v11.8h, v5.8b, v1.8b
341        SMULL   v12.8h, v6.8b, v0.8b
342        SADALP  v16.4s,  v2.8h
343        SMULL   v13.8h, v6.8b, v1.8b
344        SADALP  v17.4s,  v3.8h
345        SMULL   v14.8h, v7.8b, v0.8b
346        SADALP  v18.4s, v10.8h
347        SMULL   v15.8h, v7.8b, v1.8b
348        SADALP  v19.4s, v11.8h
349        LDP     d4, d5, [x5, 32]        // B for columns 4, 5
350        SMULL   v2.8h, v4.8b, v0.8b
351        SADALP  v20.4s, v12.8h
352        SMULL   v3.8h, v4.8b, v1.8b
353        SADALP  v21.4s, v13.8h
354        SMULL   v10.8h, v5.8b, v0.8b
355        SADALP  v22.4s, v14.8h
356        SMULL   v11.8h, v5.8b, v1.8b
357        SADALP  v23.4s, v15.8h
358        LDP     d6, d7, [x5, 48]        // B for columns 6, 7
359        SMULL   v12.8h, v6.8b, v0.8b
360        SADALP  v24.4s,  v2.8h
361        SMULL   v13.8h, v6.8b, v1.8b
362        SADALP  v25.4s,  v3.8h
363        SMULL   v14.8h, v7.8b, v0.8b
364        SADALP  v26.4s, v10.8h
365        SMULL   v15.8h, v7.8b, v1.8b
366        SADALP  v27.4s, v11.8h
367        ADD     x5, x5, 64              // advance w past the 64 bytes of B just consumed
368        SADALP  v28.4s, v12.8h
369        SADALP  v29.4s, v13.8h
370        SADALP  v30.4s, v14.8h
371        SADALP  v31.4s, v15.8h
372        B       3b
373
374        # Store odd width
        # x1 is (columns left - 8) here, so its low three bits equal the number
        # of columns still to write (1..7). Row 0 sits in bytes 0..7 of v0 and
        # row 1 in bytes 8..15; each EXT rotates v0 so that the next unwritten
        # columns of both rows move down to byte 0 and byte 8.
375        .p2align 3
3765:
377        TBZ     x1, 2, 6f
378        STR     s0, [x6], 4             // 4 columns of row 0
379        ST1     {v0.s}[2], [x7], 4      // 4 columns of row 1
380        EXT     v0.16b, v0.16b, v0.16b, 4
381
3826:
383        TBZ     x1, 1, 7f
384        STR     h0, [x6], 2             // 2 columns of row 0
385        ST1     {v0.h}[4], [x7], 2      // 2 columns of row 1
386        EXT     v0.16b, v0.16b, v0.16b, 2
3877:
388        TBZ     x1, 0, 8f
389        STR     b0, [x6]                // last column of row 0
390        ST1     {v0.b}[8], [x7]         // last column of row 1
3918:
392        # Restore x20,x21 from stack
393        LDP     x20, x21, [sp, 64]
394
395        # Restore d8-d15 from stack
396        LDP     d14, d15, [sp, 48]
397        LDP     d12, d13, [sp, 32]
398        LDP     d10, d11, [sp, 16]
399        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
400        RET
401
402END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
403
404#ifdef __ELF__
405.section ".note.GNU-stack","",%progbits
406#endif
407
408