xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# B   x5  v4  v5  v2  v3
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32# x16, x17, x7 temporary a53 gpr load data
33
34BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
        # Signed 8-bit GEMM microkernel: one row of A (A0, via x3) against 8 columns
        # of packed weights per pass of the outer loop at label 0.
        # Flow per pass:
        #   Seed eight int32 accumulators (v16 v18 v20 v22 v24 v26 v28 v30, one per
        #   output column) from eight 32-bit bias words at w.
        #   Consume K in 8-byte groups: 16 bytes of A0 per main-loop or epilogue
        #   step, then an optional single 8-byte step at label 4. Every 8-byte
        #   group of A0 is paired with 64 bytes of w (8 columns x 8 bytes).
        #   Reduce, scale in fp32, round to int32, narrow to int16, add a 16-bit
        #   offset, narrow to int8, clamp, then store 8 bytes (or 1-7 bytes on a
        #   final partial pass).
        # Register notes: x0 (mr) is overwritten and reused as the k counter, x4
        # (a_stride) is never read, x7 (cm_stride) is reused as scratch. No
        # callee-saved register (d8-d15, x19-x30) is written and sp is only read.
        # Notation in the comments below: cN = output column N; k-lo / k-hi = first /
        # second 8-byte K group of the current 16-byte step.
35
36        LDP     x10, x11, [sp]          // cn_stride, params
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
38        BIC     x2, x2, 7               // kc is now a multiple of 8
39
        # Outer loop: one pass per group of 8 output columns (nc counted down in x1)
40        .p2align 3
410:
42        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of its accumulator and zeroes the other lanes.
43        LDP     s16, s18, [x5], 8       // bias c0, c1
44        SUBS    x0, x2, 16              // k = kc - 16
45        LDP     s20, s22, [x5], 8       // bias c2, c3
46        LDP     s24, s26, [x5], 8       // bias c4, c5
47        LDP     s28, s30, [x5], 8       // bias c6, c7
48        # Is there at least 16 bytes for epilogue?
49        B.LO    4f                      // no (kc < 16): only the 8-byte remainder step runs
50
51        # Prologue: load A0 and 4 B's
52        LDP     d0, d6, [x3], 16        // Read A0: k-lo -> v0, k-hi -> v6
53        LDP     d4, d5, [x5]            // Read B: c0, c1 k-lo
54        LDP     d2, d3, [x5, 64]        // Read B: c0, c1 k-hi
55        LDR     x16, [x5, 16]           // Read B: c2 k-lo, staged in a GPR
56
57        # Is there at least 16 bytes for main loop?
58        SUBS    x0, x0, 16              // k = k - 16
59        B.LO    2f                      // no: run the epilogue only
60
61        # Main loop - 16 bytes of A
62        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
63        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        # Each block multiplies two columns while loading the operands of the next
        # two. The low 8 bytes of v4, v2 and v0 are refilled through x16, x17 and x7
        # with INS; v5, v3 and v6 are loaded directly as D registers.
64
65        .p2align 3
661:
67        # BLOCK 0 - 6 cycles
68        SMULL   v17.8h, v4.8b, v0.8b    // c0: B k-lo * A0 k-lo
69        LDR     x17, [x5, 80]           // c2 k-hi
70        SMULL   v19.8h, v5.8b, v0.8b    // c1: B k-lo * A0 k-lo
71        LDR     d5, [x5, 24]            // c3 k-lo
72        INS     v4.d[0], x16            // c2 k-lo
73        SMLAL   v17.8h, v2.8b, v6.8b    // c0: += B k-hi * A0 k-hi
74        LDR     x16, [x5, 32]           // c4 k-lo
75        SMLAL   v19.8h, v3.8b, v6.8b    // c1: += B k-hi * A0 k-hi
76        LDR     d3, [x5, 88]            // c3 k-hi
77        INS     v2.d[0], x17            // c2 k-hi
78
79        # BLOCK 1 - 10 cycles
80        SMULL   v21.8h, v4.8b, v0.8b    // c2
81        LDR     x17, [x5, 96]           // c4 k-hi
82        SMULL   v23.8h, v5.8b, v0.8b    // c3
83        SADALP  v16.4s, v17.8h          // c0 accumulator += pairwise sums of v17
84        SADALP  v18.4s, v19.8h          // c1 accumulator
85        LDR     d5, [x5, 40]            // c5 k-lo
86        INS     v4.d[0], x16            // c4 k-lo
87        SMLAL   v21.8h, v2.8b, v6.8b
88        LDR     x16, [x5, 48]           // c6 k-lo
89        SMLAL   v23.8h, v3.8b, v6.8b
90        LDR     d3, [x5, 104]           // c5 k-hi
91        INS     v2.d[0], x17            // c4 k-hi
92
93        # BLOCK 2 - 10 cycles
94        SMULL   v17.8h, v4.8b, v0.8b    // c4
95        LDR     x17, [x5, 112]          // c6 k-hi
96        SMULL   v19.8h, v5.8b, v0.8b    // c5
97        SADALP  v20.4s, v21.8h          // c2 accumulator
98        SADALP  v22.4s, v23.8h          // c3 accumulator
99        LDR     d5, [x5, 56]            // c7 k-lo
100        INS     v4.d[0], x16           // c6 k-lo
101        SMLAL   v17.8h, v2.8b, v6.8b
102        LDR     x16, [x5, 128]         // next step: c0 k-lo
103        SMLAL   v19.8h, v3.8b, v6.8b
104        LDR     d3, [x5, 120]          // c7 k-hi
105        INS     v2.d[0], x17           // c6 k-hi
106
107        # BLOCK 3 - 15 cycles
108        SMULL   v21.8h, v4.8b, v0.8b   // c6
109        LDR     x7, [x3], 8             // Read A0
110        SMULL   v23.8h, v5.8b, v0.8b   // c7
111        LDR     x17, [x5, 192]          // Read B
112        SADALP  v24.4s, v17.8h         // c4 accumulator
113        SUBS    x0, x0, 16             // k -= 16; flags are consumed by B.HS below
114        SADALP  v26.4s, v19.8h         // c5 accumulator
115        LDR     d5, [x5, 136]           // Read B
116        INS     v4.d[0], x16           // next step: c0 k-lo
117        SMLAL   v21.8h, v2.8b, v6.8b
118        LDR     x16, [x5, 144]         // next step: c2 k-lo
119        SMLAL   v23.8h, v3.8b, v6.8b
120        LDR     d6, [x3], 8             // Read A0
121        INS     v0.d[0], x7            // next step: A0 k-lo
122        LDR     d3, [x5, 200]           // Read B
123        INS     v2.d[0], x17           // next step: c0 k-hi
124        SADALP  v28.4s, v21.8h         // c6 accumulator
125        ADD     x5, x5, 128            // w += 128 (16 k-bytes x 8 columns)
126        SADALP  v30.4s, v23.8h         // c7 accumulator
127        B.HS    1b                     // repeat while the SUBS above did not borrow
128
129        # Epilogue
130        # Same as main loop except no loads at end of loop
131
132        .p2align 3
1332:
134        # BLOCK 0 - 6 cycles
135        SMULL   v17.8h, v4.8b, v0.8b   // c0
136        LDR     x17, [x5, 80]
137        SMULL   v19.8h, v5.8b, v0.8b   // c1
138        LDR     d5, [x5, 24]
139        INS     v4.d[0], x16
140        SMLAL   v17.8h, v2.8b, v6.8b
141        LDR     x16, [x5, 32]
142        SMLAL   v19.8h, v3.8b, v6.8b
143        LDR     d3, [x5, 88]
144        INS     v2.d[0], x17
145
146        # BLOCK 1 - 10 cycles
147        SMULL   v21.8h, v4.8b, v0.8b   // c2
148        LDR     x17, [x5, 96]
149        SMULL   v23.8h, v5.8b, v0.8b   // c3
150        SADALP  v16.4s, v17.8h
151        SADALP  v18.4s, v19.8h
152        LDR     d5, [x5, 40]
153        INS     v4.d[0], x16
154        SMLAL   v21.8h, v2.8b, v6.8b
155        LDR     x16, [x5, 48]
156        SMLAL   v23.8h, v3.8b, v6.8b
157        LDR     d3, [x5, 104]
158        INS     v2.d[0], x17
159
160        # BLOCK 2 - 10 cycles
161        SMULL   v17.8h, v4.8b, v0.8b   // c4
162        LDR     x17, [x5, 112]
163        SMULL   v19.8h, v5.8b, v0.8b   // c5
164        SADALP  v20.4s, v21.8h
165        SADALP  v22.4s, v23.8h
166        LDR     d5, [x5, 56]
167        INS     v4.d[0], x16           // c6 k-lo (last use of x16 in this pass)
168        SMLAL   v17.8h, v2.8b, v6.8b
169        SMLAL   v19.8h, v3.8b, v6.8b
170        LDR     d3, [x5, 120]
171        INS     v2.d[0], x17
172
173        # BLOCK 3 - 12 cycles
174        SMULL   v21.8h, v4.8b, v0.8b   // c6
175        SMULL   v23.8h, v5.8b, v0.8b   // c7
176        SADALP  v24.4s, v17.8h
177        SADALP  v26.4s, v19.8h
178        SMLAL   v21.8h, v2.8b, v6.8b
179        SMLAL   v23.8h, v3.8b, v6.8b
180        SADALP  v28.4s, v21.8h
181        ADD     x5, x5, 128            // w += 128 (16 k-bytes x 8 columns)
182        SADALP  v30.4s, v23.8h
183
184        # Is there a remainder?- 8 bytes of A
        # Here k (x0) is -16 or -8; bit 3 is set only for -8, i.e. 8 bytes of A remain.
185        TBNZ    x0, 3, 4f
186
187        .p2align 3
1883:
189        # Add columns
        # Two levels of pairwise adds collapse each 4-lane accumulator to one sum.
190        ADDP    v16.4s, v16.4s, v18.4s
191        ADDP    v20.4s, v20.4s, v22.4s
192        ADDP    v24.4s, v24.4s, v26.4s
193        ADDP    v28.4s, v28.4s, v30.4s
194        ADDP    v0.4s, v16.4s, v20.4s  // v0 = sums for c0..c3
195        ADDP    v1.4s, v24.4s, v28.4s  // v1 = sums for c4..c7
196
197        # Apply params - scale, bias and clamp
        # Fields read through x11, in order: 32-bit fp scale, 16-bit offset added
        # after narrowing (presumably the output zero point), 8-bit lower clamp,
        # 8-bit upper clamp.
198        SCVTF   v0.4s, v0.4s           // int32 -> fp32
199        LD1R    {v4.4s}, [x11], 4      // broadcast scale
200        SCVTF   v1.4s, v1.4s
201        FMUL    v0.4s, v0.4s, v4.4s
202        FMUL    v1.4s, v1.4s, v4.4s
203
204        FCVTNS  v0.4s, v0.4s           // fp32 -> int32, round to nearest (ties to even)
205        FCVTNS  v1.4s, v1.4s
206
207        LD1R    {v5.8h}, [x11], 2      // broadcast 16-bit offset
208        SQXTN   v0.4h, v0.4s           // saturating narrow to int16, c0..c3
209        SQXTN2  v0.8h, v1.4s           // c4..c7 into the upper half
210        SUBS    x1, x1, 8              // nc -= 8; flags are consumed by B.LO and B.HI below
211        SQADD   v0.8h, v0.8h, v5.8h    // saturating add of the offset
212        LD1R    {v1.16b}, [x11], 1     // broadcast lower clamp
213        SQXTN   v0.8b, v0.8h           // saturating narrow to int8
214        LD1R    {v17.16b}, [x11]       // broadcast upper clamp
215        SMAX    v0.8b, v0.8b, v1.8b
216        SUB     x11, x11, 7            // rewind params pointer
217        SMIN    v0.8b, v0.8b, v17.8b
218        B.LO    5f                     // fewer than 8 columns were left: partial store
219
220        # Store full 1 x 8
221        ST1     {v0.8b}, [x6], x10     // store 8 bytes, then c += cn_stride
222        SUB     x3, x3, x2              // a0 -= kc
223        B.HI    0b                     // nc still > 0: next group of 8 columns
224        RET
225
226        # Remainder - 8 bytes of A
        # Also the only compute path taken when kc < 16 (entered from the B.LO after
        # the bias loads). Uses 64 bytes of w: 8 bytes per column for c0..c7.
227        .p2align 3
2284:
229        LDR     d0, [x3], 8            // last 8 bytes of A0
230        LDP     d4, d5, [x5]           // B for c0, c1
231        LDP     d6, d7, [x5, 16]       // B for c2, c3
232        SMULL   v17.8h, v4.8b, v0.8b
233        SMULL   v19.8h, v5.8b, v0.8b
234        SMULL   v21.8h, v6.8b, v0.8b
235        SMULL   v23.8h, v7.8b, v0.8b
236        LDP     d4, d5, [x5, 32]       // B for c4, c5
237        LDP     d6, d7, [x5, 48]       // B for c6, c7
238        SADALP  v16.4s, v17.8h
239        SADALP  v18.4s, v19.8h
240        SADALP  v20.4s, v21.8h
241        SADALP  v22.4s, v23.8h
242        SMULL   v17.8h, v4.8b, v0.8b
243        SMULL   v19.8h, v5.8b, v0.8b
244        SMULL   v21.8h, v6.8b, v0.8b
245        SMULL   v23.8h, v7.8b, v0.8b
246        ADD     x5, x5, 64             // w += 64 (8 k-bytes x 8 columns)
247        SADALP  v24.4s, v17.8h
248        SADALP  v26.4s, v19.8h
249        SADALP  v28.4s, v21.8h
250        SADALP  v30.4s, v23.8h
251        B       3b                     // join the reduce / requantize / store path
252
253        # Store odd width
        # x1 holds nc - 8 (negative) here; its low three bits equal the number of
        # columns left to store, so 4, 2 and 1 bytes are written as bits 2, 1, 0 say.
254        .p2align 3
2555:
256        TBZ     x1, 2, 6f
257        STR     s0, [x6], 4            // store 4 bytes
258        EXT     v0.16b, v0.16b, v0.16b, 4  // rotate the next bytes down to lane 0
259
2606:
261        TBZ     x1, 1, 7f
262        STR     h0, [x6], 2            // store 2 bytes
263        EXT     v0.16b, v0.16b, v0.16b, 2  // rotate the next byte down to lane 0
2647:
265        TBZ     x1, 0, 8f
266        STR     b0, [x6]               // store the final byte
2678:
268        RET
269
270END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
271
// On ELF targets, emit an empty .note.GNU-stack section with no flags; by GNU
// toolchain convention this marks the object as not needing an executable stack.
272#ifdef __ELF__
273.section ".note.GNU-stack","",%progbits
274#endif
275
276