xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# B   x5  v4  v5  v2  v3
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32# x16, x17, x7 temporary a53 gpr load data
33
// Produces one row of int8 output, up to 8 columns per pass of the outer loop
// (label 0). Eight int32 accumulators (v16, v18, ..., v30; one per output
// column) are seeded with the bias read from w, int8 products of A and B are
// summed into them along k, and the sums are then converted to float, scaled
// by per-channel scales read from w, rounded back to integer, offset and
// clamped with values read from params, narrowed to int8 and stored to c.
// Only caller-saved registers are written (x0-x3, x5-x7, x10, x11, x16, x17,
// v0-v7, v16-v30), so there is no stack frame or register save/restore.
34BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
35
36        LDP     x10, x11, [sp]          // x10 = cn_stride, x11 = params (stack arguments)
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7  (round kc up to a multiple of 8)
38        BIC     x2, x2, 7
39
40        .p2align 3
410:                                    // label 0: outer loop, one group of up to 8 output columns per pass
42        # Load initial bias from w into accumulators
43        LDP     s16, s18, [x5], 8       // two 32-bit biases per LDP, one into the low lane of each accumulator
44        SUBS    x0, x2, 16              // k = kc - 16  (flags are consumed by B.LO below; LDP leaves them intact)
45        LDP     s20, s22, [x5], 8
46        LDP     s24, s26, [x5], 8
47        LDP     s28, s30, [x5], 8
48        # Is there at least 16 bytes for epilogue?
49        B.LO    4f                      // kc < 16: only the 8-byte remainder path runs
50
51        # Prologue: load A0 and 4 B's
52        LDP     d0, d6, [x3], 16        // Read A0: v0 = first 8 bytes, v6 = next 8 bytes, a0 += 16
53        LDP     d4, d5, [x5]            // Read B: 8-byte vectors at w+0 and w+8
54        LDP     d2, d3, [x5, 64]        // Read B: 8-byte vectors at w+64 and w+72
55        LDR     x16, [x5, 16]           // Read B: w+16 into a GPR, moved to v4 later with INS
56
57        # Is there at least 16 bytes for main loop?
58        SUBS    x0, x0, 16              // k = k - 16
59        B.LO    2f                      // fewer than 16 further bytes: skip the main loop
60
61        # Main loop - 16 bytes of A
62        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
63        # 1 load for A0 = +1 cycle.  Total 41 cycles.
64
65        .p2align 3
661:                                    // label 1: main loop; B for the next pass is loaded while this pass computes
67        # BLOCK 0 - 6 cycles
68        SMULL   v17.8h, v4.8b, v0.8b    // int8 x int8 -> int16 products against the first 8 bytes of A
69        LDR     x17, [x5, 80]
70        SMULL   v19.8h, v5.8b, v0.8b
71        LDR     d5, [x5, 24]
72        INS     v4.d[0], x16            // move GPR-loaded B data into the vector register
73        SMLAL   v17.8h, v2.8b, v6.8b    // add the products against the second 8 bytes of A
74        LDR     x16, [x5, 32]
75        SMLAL   v19.8h, v3.8b, v6.8b
76        LDR     d3, [x5, 88]
77        INS     v2.d[0], x17
78
79        # BLOCK 1 - 10 cycles
80        SMULL   v21.8h, v4.8b, v0.8b
81        LDR     x17, [x5, 96]
82        SMULL   v23.8h, v5.8b, v0.8b
83        SADALP  v16.4s, v17.8h          // pairwise-add BLOCK 0's int16 products into the int32 accumulators
84        SADALP  v18.4s, v19.8h
85        LDR     d5, [x5, 40]
86        INS     v4.d[0], x16
87        SMLAL   v21.8h, v2.8b, v6.8b
88        LDR     x16, [x5, 48]
89        SMLAL   v23.8h, v3.8b, v6.8b
90        LDR     d3, [x5, 104]
91        INS     v2.d[0], x17
92
93        # BLOCK 2 - 10 cycles
94        SMULL   v17.8h, v4.8b, v0.8b
95        LDR     x17, [x5, 112]
96        SMULL   v19.8h, v5.8b, v0.8b
97        SADALP  v20.4s, v21.8h          // accumulate BLOCK 1's products
98        SADALP  v22.4s, v23.8h
99        LDR     d5, [x5, 56]
100        INS     v4.d[0], x16
101        SMLAL   v17.8h, v2.8b, v6.8b
102        LDR     x16, [x5, 128]          // B for the next pass (offset 0 once x5 has advanced by 128)
103        SMLAL   v19.8h, v3.8b, v6.8b
104        LDR     d3, [x5, 120]
105        INS     v2.d[0], x17
106
107        # BLOCK 3 - 15 cycles
108        SMULL   v21.8h, v4.8b, v0.8b
109        LDR     x7, [x3], 8             // Read A0: next 8 bytes into GPR x7, a0 += 8
110        SMULL   v23.8h, v5.8b, v0.8b
111        LDR     x17, [x5, 192]          // Read B for the next pass (offset 64 after the advance)
112        SADALP  v24.4s, v17.8h          // accumulate BLOCK 2's products
113        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS at the end of the loop
114        SADALP  v26.4s, v19.8h
115        LDR     d5, [x5, 136]           // Read B for the next pass (offset 8 after the advance)
116        INS     v4.d[0], x16
117        SMLAL   v21.8h, v2.8b, v6.8b
118        LDR     x16, [x5, 144]          // B for the next pass (offset 16 after the advance)
119        SMLAL   v23.8h, v3.8b, v6.8b
120        LDR     d6, [x3], 8             // Read A0: second 8 bytes for the next pass, a0 += 8
121        INS     v0.d[0], x7             // first 8 bytes of A for the next pass
122        LDR     d3, [x5, 200]           // Read B for the next pass (offset 72 after the advance)
123        INS     v2.d[0], x17
124        SADALP  v28.4s, v21.8h          // accumulate BLOCK 3's products
125        ADD     x5, x5, 128             // w += 128 (the loop reads offsets 0..127 per 16 bytes of A)
126        SADALP  v30.4s, v23.8h
127        B.HS    1b                      // loop while the SUBS above did not borrow
128
129        # Epilogue
130        # Same as main loop except no loads at end of loop
131
132        .p2align 3
1332:                                   // label 2: epilogue, consumes the 16 bytes of A already held in v0/v6
134        # BLOCK 0 - 6 cycles
135        SMULL   v17.8h, v4.8b, v0.8b
136        LDR     x17, [x5, 80]
137        SMULL   v19.8h, v5.8b, v0.8b
138        LDR     d5, [x5, 24]
139        INS     v4.d[0], x16
140        SMLAL   v17.8h, v2.8b, v6.8b
141        LDR     x16, [x5, 32]
142        SMLAL   v19.8h, v3.8b, v6.8b
143        LDR     d3, [x5, 88]
144        INS     v2.d[0], x17
145
146        # BLOCK 1 - 10 cycles
147        SMULL   v21.8h, v4.8b, v0.8b
148        LDR     x17, [x5, 96]
149        SMULL   v23.8h, v5.8b, v0.8b
150        SADALP  v16.4s, v17.8h
151        SADALP  v18.4s, v19.8h
152        LDR     d5, [x5, 40]
153        INS     v4.d[0], x16
154        SMLAL   v21.8h, v2.8b, v6.8b
155        LDR     x16, [x5, 48]
156        SMLAL   v23.8h, v3.8b, v6.8b
157        LDR     d3, [x5, 104]
158        INS     v2.d[0], x17
159
160        # BLOCK 2 - 10 cycles
161        SMULL   v17.8h, v4.8b, v0.8b
162        LDR     x17, [x5, 112]
163        SMULL   v19.8h, v5.8b, v0.8b
164        SADALP  v20.4s, v21.8h
165        SADALP  v22.4s, v23.8h
166        LDR     d5, [x5, 56]
167        INS     v4.d[0], x16
168        SMLAL   v17.8h, v2.8b, v6.8b
169        SMLAL   v19.8h, v3.8b, v6.8b
170        LDR     d3, [x5, 120]
171        INS     v2.d[0], x17
172
173        # BLOCK 3 - 12 cycles
174        SMULL   v21.8h, v4.8b, v0.8b
175        SMULL   v23.8h, v5.8b, v0.8b
176        SADALP  v24.4s, v17.8h
177        SADALP  v26.4s, v19.8h
178        SMLAL   v21.8h, v2.8b, v6.8b
179        SMLAL   v23.8h, v3.8b, v6.8b
180        SADALP  v28.4s, v21.8h
181        ADD     x5, x5, 128             // w += 128, past the B data consumed above
182        SADALP  v30.4s, v23.8h
183
184        # Is there a remainder? - 8 bytes of A
185        TBNZ    x0, 3, 4f               // bit 3 of k set: 8 more bytes of A remain
186
187        .p2align 3
1883:                                   // label 3: reduce, requantize and store
189        # Add columns
190        ADDP    v16.4s, v16.4s, v18.4s  // pairwise adds fold the 4 lanes of each accumulator...
191        ADDP    v20.4s, v20.4s, v22.4s
192        ADDP    v24.4s, v24.4s, v26.4s
193        ADDP    v28.4s, v28.4s, v30.4s
194        ADDP    v0.4s, v16.4s, v20.4s   // ...so v0 holds the sums for columns 0-3
195        ADDP    v1.4s, v24.4s, v28.4s   // and v1 holds the sums for columns 4-7
196
197        # Load per channel scale values from weights
198        SCVTF   v0.4s, v0.4s            // int32 sums -> float
199        LDR     q4, [x5], 16            // scales for columns 0-3, w += 16
200        SCVTF   v1.4s, v1.4s
201        LDR     q5, [x5], 16            // scales for columns 4-7, w += 16
202        FMUL    v0.4s, v0.4s, v4.4s
203        FMUL    v1.4s, v1.4s, v5.4s
204
205        FCVTNS  v0.4s, v0.4s            // float -> int32, rounding to nearest with ties to even
206        FCVTNS  v1.4s, v1.4s
207
208        LD1R    {v5.8h}, [x11], 2       // broadcast a 16-bit value from params (presumably the output zero point - confirm against the params layout)
209        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
210        SQXTN2  v0.8h, v1.4s
211        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO 5f and B.HI 0b below
212        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the broadcast 16-bit value
213        LD1R    {v1.16b}, [x11], 1      // broadcast the byte used as the lower clamp (SMAX below)
214        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
215        LD1R    {v17.16b}, [x11]        // broadcast the byte used as the upper clamp (SMIN below)
216        SMAX    v0.8b, v0.8b, v1.8b
217        SUB     x11, x11, 3            // rewind params pointer
218        SMIN    v0.8b, v0.8b, v17.8b
219        B.LO    5f                      // fewer than 8 columns were left: partial store
220
221        # Store full 1 x 8
222        ST1     {v0.8b}, [x6], x10      // store 8 outputs, c += cn_stride
223        SUB     x3, x3, x2              // a0 -= kc
224        B.HI    0b                      // columns remain: next group of 8
225        RET
226
227        # Remainder - 8 bytes of A
228        .p2align 3
2294:                                   // label 4: remainder, one 8-byte step of k for all 8 columns
230        LDR     d0, [x3], 8             // last 8 bytes of A, a0 += 8
231        LDP     d4, d5, [x5]            // B vectors at w+0 .. w+31
232        LDP     d6, d7, [x5, 16]
233        SMULL   v17.8h, v4.8b, v0.8b
234        SMULL   v19.8h, v5.8b, v0.8b
235        SMULL   v21.8h, v6.8b, v0.8b
236        SMULL   v23.8h, v7.8b, v0.8b
237        LDP     d4, d5, [x5, 32]        // B vectors at w+32 .. w+63
238        LDP     d6, d7, [x5, 48]
239        SADALP  v16.4s, v17.8h
240        SADALP  v18.4s, v19.8h
241        SADALP  v20.4s, v21.8h
242        SADALP  v22.4s, v23.8h
243        SMULL   v17.8h, v4.8b, v0.8b
244        SMULL   v19.8h, v5.8b, v0.8b
245        SMULL   v21.8h, v6.8b, v0.8b
246        SMULL   v23.8h, v7.8b, v0.8b
247        ADD     x5, x5, 64              // w += 64 (8 bytes of k for each of 8 columns)
248        SADALP  v24.4s, v17.8h
249        SADALP  v26.4s, v19.8h
250        SADALP  v28.4s, v21.8h
251        SADALP  v30.4s, v23.8h
252        B       3b                      // join the common reduce/requantize/store path
253
254        # Store odd width
255        .p2align 3
2565:                                   // label 5: x1 = nc - 8 here; its low 3 bits equal those of the remaining nc
257        TBZ     x1, 2, 6f               // bit 2 clear: no group of 4 outputs to store
258        STR     s0, [x6], 4             // store 4 outputs, c += 4
259        EXT     v0.16b, v0.16b, v0.16b, 4 // rotate so the next outputs sit in the low bytes
260
2616:
262        TBZ     x1, 1, 7f               // bit 1 clear: no group of 2 outputs to store
263        STR     h0, [x6], 2             // store 2 outputs, c += 2
264        EXT     v0.16b, v0.16b, v0.16b, 2 // rotate so the next output sits in the low byte
2657:
266        TBZ     x1, 0, 8f               // bit 0 clear: no single output left
267        STR     b0, [x6]                // store the final output
2688:
269        RET
270
271END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
272
273#ifdef __ELF__
274.section ".note.GNU-stack","",%progbits  // empty note section: by GNU toolchain convention, tells the linker this object does not need an executable stack
275#endif
276
277