xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# A1  x4  v1  v7
30# B   x5  v4  v5  v8  v9
31# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
32# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
33# temp0   v2 v10 v12 v14
34# temp1   v3 v11 v13 v15
35# x16, x17, x20, x21 temporary a53 gpr load data
36
37
38BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
        // Overview of the code below:
        //  - Outer loop 0: emits one tile of 2 rows x 8 columns of 8-bit
        //    outputs per pass and repeats while nc (x1) stays above zero.
        //  - Sixteen 32-bit accumulators (v16-v31) are seeded from w, then
        //    updated with SMULL/SMLAL/SADALP over kc bytes of each A row.
        //  - Label 3: reduces them to one sum per output, scales in fp32,
        //    rounds, narrows with saturation, adds a 16-bit bias and clamps.
39
40        # Clamp A and C pointers
        // Only CMP writes the flags here; the interleaved ADD/STP/BIC leave
        // them intact, so both CSEL ... LO below still test mr < 2.
41        CMP     x0, 2                   // if mr < 2
42        STP     d8, d9, [sp, -80]!      // open an 80-byte frame; d8,d9 at [sp]
43        ADD     x4, x3, x4              // a1 = a0 + a_stride
44        STP     d10, d11, [sp, 16]
45        ADD     x7, x6, x7              // c1 = c0 + cm_stride
46        STP     d12, d13, [sp, 32]
47        CSEL    x4, x3, x4, LO          //   a1 = a0
48        STP     d14, d15, [sp, 48]
49        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
50        CSEL    x7, x6, x7, LO          //   c1 = c0
51        BIC     x2, x2, 7
52        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack
53
54        .p2align 3
550:
56        # Load initial bias from w into accumulators
        // Each LDP puts a 32-bit bias into lane 0 of two row-0 accumulators
        // (a scalar load clears the remaining lanes); the MOVs give the
        // row-1 accumulators (odd-numbered registers) the same start values.
        // The flags written by SUBS are not modified by the LDP/MOV
        // instructions and are consumed by B.LO 4f below.
57        SUBS    x0, x2, 16              // k = kc - 16
58        LDP     s16, s18, [x5], 8
59        MOV     v17.16b, v16.16b
60        MOV     v19.16b, v18.16b
61        LDP     s20, s22, [x5], 8
62        MOV     v21.16b, v20.16b
63        MOV     v23.16b, v22.16b
64        LDP     s24, s26, [x5], 8
65        MOV     v25.16b, v24.16b
66        MOV     v27.16b, v26.16b
67        LDP     s28, s30, [x5], 8
68        MOV     v29.16b, v28.16b
        // The two stack arguments sit directly above the 80-byte frame.
        // They are reloaded on every pass; x11 is advanced by the LD1R
        // loads at label 3, so it must be reset before the next tile.
69        LDP     x10, x11, [sp, 80]       // cn_stride, params
70        MOV     v31.16b, v30.16b
71        # Is there at least 16 bytes for epilogue?
72        B.LO    4f                      // kc < 16: 8-byte remainder path only
73
74        # Prologue: load A0, A1 and 2 B's
        // v4,v5 receive the first 16 bytes of B. v0/v6 and v1/v7 receive the
        // two 8-byte halves of the current 16-byte step of A0 and A1.
        // x17 (B bytes 64-71) and x16 (B bytes 16-23) are moved into v8 and
        // v4 by the INS instructions in block 0.
75        LDP     d4, d5, [x5]            // Read B
76        LDP     d0, d6, [x3], 16        // Read A0
77        LDR     x17, [x5, 64]           // Read B
78        LDP     d1, d7, [x4], 16        // Read A1
79        LDR     x16, [x5, 16]
80
81        # Is there at least 16 bytes for main loop?
82        SUBS    x0, x0, 16              // k = k - 16
83        B.LO    2f                      // kc < 32: go straight to the epilogue
84
85         # Main loop - 16 bytes of A
86         # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
87         # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
88
        // Every block follows one pattern: SMULL forms 16-bit products with
        // the first 8 bytes of the A step (v0/v1), SMLAL adds the products
        // with the second 8 bytes (v6/v7), and SADALP pairwise-adds the
        // 16-bit lanes into the 32-bit accumulators. One block covers two
        // output columns for both rows. v4/v5 are refilled for the next
        // block once the SMULLs of the current block have read them.
89        .p2align 3
901:
91        # BLOCK 0 - 18 cycles - includes prfm
        // NOTE(review): no PRFM instruction appears in this block.
        // Columns 0,1 -> v16-v19.
92        LDR     d9, [x5, 72]            // Read B
93        INS     v8.d[0], x17
94        SMULL   v2.8h, v4.8b, v0.8b
95        SMULL   v3.8h, v4.8b, v1.8b
96        LDR     x17, [x5, 80]
97        SMULL   v10.8h, v5.8b, v0.8b
98        SMULL   v11.8h, v5.8b, v1.8b
99        LDR     d5, [x5, 24]
100        INS     v4.d[0], x16
101        SMLAL   v2.8h, v8.8b, v6.8b
102        SMLAL   v3.8h, v8.8b, v7.8b
103        LDR     x16, [x5, 32]
104        SMLAL   v10.8h, v9.8b, v6.8b
105        SMLAL   v11.8h, v9.8b, v7.8b
106        SADALP  v16.4s,  v2.8h
107        SADALP  v17.4s,  v3.8h
108        SADALP  v18.4s, v10.8h
109        SADALP  v19.4s, v11.8h
110
111        # BLOCK 1- 18 cycles
        // Columns 2,3 -> v20-v23.
112        LDR     d9, [x5, 88]
113        INS     v8.d[0], x17
114        SMULL   v12.8h, v4.8b, v0.8b
115        SMULL   v13.8h, v4.8b, v1.8b
116        LDR     x17, [x5, 96]
117        SMULL   v14.8h, v5.8b, v0.8b
118        SMULL   v15.8h, v5.8b, v1.8b
119        LDR     d5, [x5, 40]
120        INS     v4.d[0], x16
121        SMLAL   v12.8h, v8.8b, v6.8b
122        SMLAL   v13.8h, v8.8b, v7.8b
123        LDR     x16, [x5, 48]
124        SMLAL   v14.8h, v9.8b, v6.8b
125        SMLAL   v15.8h, v9.8b, v7.8b
126        SADALP  v20.4s, v12.8h
127        SADALP  v21.4s, v13.8h
128        SADALP  v22.4s, v14.8h
129        SADALP  v23.4s, v15.8h
130
131        # BLOCK 2 - 18 cycles
        // Columns 4,5 -> v24-v27. Also starts fetching the next 16-byte step:
        // x16 takes the first 8 bytes of the next B group (offset 128) and
        // x20/x21 take the first 8 bytes of the next A0/A1 step.
132        LDR     d9, [x5, 104]
133        INS     v8.d[0], x17
134        SMULL   v2.8h, v4.8b, v0.8b
135        SMULL   v3.8h, v4.8b, v1.8b
136        LDR     x17, [x5, 112]
137        SMULL   v10.8h, v5.8b, v0.8b
138        SMULL   v11.8h, v5.8b, v1.8b
139        LDR     d5, [x5, 56]
140        INS     v4.d[0], x16
141        SMLAL   v2.8h, v8.8b, v6.8b
142        SMLAL   v3.8h, v8.8b, v7.8b
143        LDR     x16, [x5, 128]
144        SMLAL   v10.8h, v9.8b, v6.8b
145        SMLAL   v11.8h, v9.8b, v7.8b
146        SADALP  v24.4s,  v2.8h
147        LDR     x20, [x3], 8            // Read A0
148        SADALP  v25.4s,  v3.8h
149        LDR     x21, [x4], 8            // Read A1
150        SADALP  v26.4s, v10.8h
151        SADALP  v27.4s, v11.8h
152        SUBS    x0, x0, 16              // k -= 16; flags are read by B.HS 1b
153
154        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        // Columns 6,7 -> v28-v31. The loads at offsets 136, 144 and 192 are
        // offsets 8, 16 and 64 of the next B group (x5 advances by 128
        // below). v0/v1/v6/v7 are overwritten with the next A step only
        // after the last SMLAL of this block has read them.
155        LDR     d9, [x5, 120]
156        INS     v8.d[0], x17
157        SMULL   v12.8h, v4.8b, v0.8b
158        SMULL   v13.8h, v4.8b, v1.8b
159        LDR     x17, [x5, 192]          // Read B
160        SMULL   v14.8h, v5.8b, v0.8b
161        SMULL   v15.8h, v5.8b, v1.8b
162        LDR     d5, [x5, 136]           // Read B
163        INS     v4.d[0], x16
164        SMLAL   v12.8h, v8.8b, v6.8b
165        SMLAL   v13.8h, v8.8b, v7.8b
166        LDR     x16, [x5, 144]
167        SMLAL   v14.8h, v9.8b, v6.8b
168        SMLAL   v15.8h, v9.8b, v7.8b
169        LDR     d6, [x3], 8             // Read A0
170        INS     v0.d[0], x20
171        LDR     d7, [x4], 8             // Read A1
172        INS     v1.d[0], x21
173        SADALP  v28.4s, v12.8h
174        SADALP  v29.4s, v13.8h
175        ADD     x5, x5, 128             // w += 128 (ADD leaves the flags alone)
176        SADALP  v30.4s, v14.8h
177        SADALP  v31.4s, v15.8h
178        B.HS    1b                      // repeat while the SUBS above did not borrow
179
180        # Epilogue
181        # Same as main loop except no loads at end of loop
        // Consumes the 16-byte step that is already held in v0/v1/v6/v7
        // (A), v4/v5 (B) and x16/x17 (B), without fetching a further step.
182
183        .p2align 3
1842:
185        # BLOCK 0 - 18 cycles
186        LDR     d9, [x5, 72]            // Read B
187        INS     v8.d[0], x17
188        SMULL   v2.8h, v4.8b, v0.8b
189        SMULL   v3.8h, v4.8b, v1.8b
190        LDR     x17, [x5, 80]
191        SMULL   v10.8h, v5.8b, v0.8b
192        SMULL   v11.8h, v5.8b, v1.8b
193        LDR     d5, [x5, 24]
194        INS     v4.d[0], x16
195        SMLAL   v2.8h, v8.8b, v6.8b
196        SMLAL   v3.8h, v8.8b, v7.8b
197        LDR     x16, [x5, 32]
198        SMLAL   v10.8h, v9.8b, v6.8b
199        SMLAL   v11.8h, v9.8b, v7.8b
200        SADALP  v16.4s,  v2.8h
201        SADALP  v17.4s,  v3.8h
202        SADALP  v18.4s, v10.8h
203        SADALP  v19.4s, v11.8h
204
205        # BLOCK 1- 18 cycles
206        LDR     d9, [x5, 88]
207        INS     v8.d[0], x17
208        SMULL   v12.8h, v4.8b, v0.8b
209        SMULL   v13.8h, v4.8b, v1.8b
210        LDR     x17, [x5, 96]
211        SMULL   v14.8h, v5.8b, v0.8b
212        SMULL   v15.8h, v5.8b, v1.8b
213        LDR     d5, [x5, 40]
214        INS     v4.d[0], x16
215        SMLAL   v12.8h, v8.8b, v6.8b
216        SMLAL   v13.8h, v8.8b, v7.8b
217        LDR     x16, [x5, 48]
218        SMLAL   v14.8h, v9.8b, v6.8b
219        SMLAL   v15.8h, v9.8b, v7.8b
220        SADALP  v20.4s, v12.8h
221        SADALP  v21.4s, v13.8h
222        SADALP  v22.4s, v14.8h
223        SADALP  v23.4s, v15.8h
224
225        # BLOCK 2 - 18 cycles
226        LDR     d9, [x5, 104]
227        INS     v8.d[0], x17
228        SMULL   v2.8h, v4.8b, v0.8b
229        SMULL   v3.8h, v4.8b, v1.8b
230        LDR     x17, [x5, 112]
231        SMULL   v10.8h, v5.8b, v0.8b
232        SMULL   v11.8h, v5.8b, v1.8b
233        LDR     d5, [x5, 56]
234        INS     v4.d[0], x16
235        SMLAL   v2.8h, v8.8b, v6.8b
236        SMLAL   v3.8h, v8.8b, v7.8b
237        SMLAL   v10.8h, v9.8b, v6.8b
238        SMLAL   v11.8h, v9.8b, v7.8b
239        SADALP  v24.4s,  v2.8h
240        SADALP  v25.4s,  v3.8h
241        SADALP  v26.4s, v10.8h
242        SADALP  v27.4s, v11.8h
243
244        # BLOCK 3 - 17 cycles
245        LDR     d9, [x5, 120]
246        INS     v8.d[0], x17
247        SMULL   v12.8h, v4.8b, v0.8b
248        SMULL   v13.8h, v4.8b, v1.8b
249        SMULL   v14.8h, v5.8b, v0.8b
250        SMULL   v15.8h, v5.8b, v1.8b
251        SMLAL   v12.8h, v8.8b, v6.8b
252        SMLAL   v13.8h, v8.8b, v7.8b
253        SMLAL   v14.8h, v9.8b, v6.8b
254        SMLAL   v15.8h, v9.8b, v7.8b
255        SADALP  v28.4s, v12.8h
256        SADALP  v29.4s, v13.8h
257        ADD     x5, x5, 128
258        SADALP  v30.4s, v14.8h
259        SADALP  v31.4s, v15.8h
260
261        # Is there a remainder?- 8 bytes of A
        // kc was rounded to a multiple of 8, so x0 is -16 or -8 at this
        // point; bit 3 is set only for -8, meaning one 8-byte step is left.
262        TBNZ    x0, 3, 4f
263
264        .p2align 3
2653:
266        # Add columns
        // The first eight ADDPs merge each pair of column accumulators; the
        // last four finish the horizontal sums, leaving
        //   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        //   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
267        ADDP    v16.4s, v16.4s, v18.4s
268        ADDP    v20.4s, v20.4s, v22.4s
269        ADDP    v24.4s, v24.4s, v26.4s
270        ADDP    v28.4s, v28.4s, v30.4s
271        ADDP    v17.4s, v17.4s, v19.4s
272        ADDP    v21.4s, v21.4s, v23.4s
273        ADDP    v25.4s, v25.4s, v27.4s
274        ADDP    v29.4s, v29.4s, v31.4s
275        ADDP    v0.4s, v16.4s, v20.4s
276        ADDP    v1.4s, v24.4s, v28.4s
277        ADDP    v2.4s, v17.4s, v21.4s
278        ADDP    v3.4s, v25.4s, v29.4s
279
280        # Apply params - scale, bias and clamp
        // params (x11) is read sequentially: a 32-bit multiplier, a 16-bit
        // bias, then two single bytes used as lower and upper clamp bounds.
281        SCVTF   v0.4s, v0.4s            // int32 sums -> fp32
282        LD1R    {v4.4s}, [x11], 4       // broadcast the 32-bit scale
283        SCVTF   v1.4s, v1.4s
284        SCVTF   v2.4s, v2.4s
285        SCVTF   v3.4s, v3.4s
286        FMUL    v0.4s, v0.4s, v4.4s
287        FMUL    v1.4s, v1.4s, v4.4s
288        FMUL    v2.4s, v2.4s, v4.4s
289        FMUL    v3.4s, v3.4s, v4.4s
290
291        FCVTNS  v0.4s, v0.4s            // fp32 -> int32, round to nearest (ties to even)
292        FCVTNS  v1.4s, v1.4s
293        FCVTNS  v2.4s, v2.4s
294        FCVTNS  v3.4s, v3.4s
295
296        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit bias
297        SQXTN   v0.4h, v0.4s            // saturating narrow: v0 = row 0, 8 x int16
298        SQXTN   v2.4h, v2.4s            // saturating narrow: v2 = row 1, 8 x int16
299        SQXTN2  v0.8h, v1.4s
300        SQXTN2  v2.8h, v3.4s
301        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO 5f and B.HI 0b
302        SQADD   v0.8h, v0.8h, v5.8h
303        SQADD   v1.8h, v2.8h, v5.8h
304        SQXTN   v0.8b, v0.8h            // bytes 0-7 of v0 = row 0
305        SQXTN2  v0.16b, v1.8h           // bytes 8-15 of v0 = row 1
306        LD1R    {v1.16b}, [x11], 1      // lower clamp bound
307        LD1R    {v2.16b}, [x11]         // upper clamp bound
308        SMAX    v0.16b, v0.16b, v1.16b
309        SMIN    v0.16b, v0.16b, v2.16b
310        B.LO    5f                      // fewer than 8 columns were left
311
312        # Store full 2 x 8
313        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
314        SUB     x3, x3, x2              // a0 -= kc
315        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
316        SUB     x4, x4, x2              // a1 -= kc
317        B.HI    0b                      // columns remain (nc was > 8)
318
319        # Restore x20,x21 from stack
320        LDP     x20, x21, [sp, 64]
321
322        # Restore d8-d15 from stack
323        LDP     d14, d15, [sp, 48]
324        LDP     d12, d13, [sp, 32]
325        LDP     d10, d11, [sp, 16]
326        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
327        RET
328
329        # Remainder - 8 bytes of A
        // Handles a single 8-byte step of K for all 8 columns: 64 bytes of B
        // are read, each product comes from one SMULL (no SMLAL), and the
        // code then rejoins the reduction at label 3.
330        .p2align 3
3314:
332        LDR     d0, [x3], 8
333        LDP     d4, d5, [x5]
334        LDR     d1, [x4], 8
335        LDP     d6, d7, [x5, 16]
336        SMULL   v2.8h, v4.8b, v0.8b
337        SMULL   v3.8h, v4.8b, v1.8b
338        SMULL   v10.8h, v5.8b, v0.8b
339        SMULL   v11.8h, v5.8b, v1.8b
340        SMULL   v12.8h, v6.8b, v0.8b
341        SADALP  v16.4s,  v2.8h
342        SMULL   v13.8h, v6.8b, v1.8b
343        SADALP  v17.4s,  v3.8h
344        SMULL   v14.8h, v7.8b, v0.8b
345        SADALP  v18.4s, v10.8h
346        SMULL   v15.8h, v7.8b, v1.8b
347        SADALP  v19.4s, v11.8h
348        LDP     d4, d5, [x5, 32]
349        SMULL   v2.8h, v4.8b, v0.8b
350        SADALP  v20.4s, v12.8h
351        SMULL   v3.8h, v4.8b, v1.8b
352        SADALP  v21.4s, v13.8h
353        SMULL   v10.8h, v5.8b, v0.8b
354        SADALP  v22.4s, v14.8h
355        SMULL   v11.8h, v5.8b, v1.8b
356        SADALP  v23.4s, v15.8h
357        LDP     d6, d7, [x5, 48]
358        SMULL   v12.8h, v6.8b, v0.8b
359        SADALP  v24.4s,  v2.8h
360        SMULL   v13.8h, v6.8b, v1.8b
361        SADALP  v25.4s,  v3.8h
362        SMULL   v14.8h, v7.8b, v0.8b
363        SADALP  v26.4s, v10.8h
364        SMULL   v15.8h, v7.8b, v1.8b
365        SADALP  v27.4s, v11.8h
366        ADD     x5, x5, 64
367        SADALP  v28.4s, v12.8h
368        SADALP  v29.4s, v13.8h
369        SADALP  v30.4s, v14.8h
370        SADALP  v31.4s, v15.8h
371        B       3b
372
373        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so
        // they still give the number of columns to write (4, 2, 1 pieces).
        // Row 0 is stored from the low half of v0 and row 1 from the high
        // half; each EXT rotates v0 so the next unwritten columns of both
        // rows move to byte 0 and byte 8.
374        .p2align 3
3755:
376        TBZ     x1, 2, 6f
377        STR     s0, [x6], 4
378        ST1     {v0.s}[2], [x7], 4
379        EXT     v0.16b, v0.16b, v0.16b, 4
380
3816:
382        TBZ     x1, 1, 7f
383        STR     h0, [x6], 2
384        ST1     {v0.h}[4], [x7], 2
385        EXT     v0.16b, v0.16b, v0.16b, 2
3867:
387        TBZ     x1, 0, 8f
388        STR     b0, [x6]
389        ST1     {v0.b}[8], [x7]
3908:
391        # Restore x20,x21 from stack
392        LDP     x20, x21, [sp, 64]
393
394        # Restore d8-d15 from stack
395        LDP     d14, d15, [sp, 48]
396        LDP     d12, d13, [sp, 32]
397        LDP     d10, d11, [sp, 16]
398        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
399        RET
400
401END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
402
403#ifdef __ELF__
404.section ".note.GNU-stack","",%progbits
405#endif
406
407