xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# B   x5  v4  v5  v2  v3
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32
33
// Computes one output row, 8 int8 values per pass, from int8 A and the packed
// int8 weights at w. Products are accumulated in int32, then scaled in fp32,
// offset, clamped and narrowed back to int8.
//
// Local labels, by the number used in branch operands:
//   0  start of a pass over 8 output columns (loads the 8 initial values)
//   1  main loop, 16 bytes of A and 128 bytes of w per iteration
//   2  epilogue, the last 16-byte group (no further preloads)
//   4  remainder, 8 bytes of A and 64 bytes of w
//   3  reduce accumulators, requantize, store
//   5-8  partial store when fewer than 8 columns remain
//
// In addition to the register table above: x0 is reused as the k counter,
// x1 counts remaining columns, x2 holds the rounded kc, x10 = cn_stride,
// x11 = params. v6/v7 also hold B in the remainder path, and v0, v1, v4, v5,
// v17 are reused for requantization once accumulation is finished.
// x4 (a_stride) and x7 (cm_stride) are never read.
34BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
35
36        LDP     x10, x11, [sp]          // cn_stride, params
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
38        BIC     x2, x2, 7
39
        // Outer loop: one pass per group of 8 output columns.
40        .p2align 3
410:
42        # Load initial bias from w into accumulators
        // Each S-register load writes lane 0 and zeroes the rest of the
        // vector, so after the ADDP reduction at label 3 every column sum is
        // its initial value plus the dot product. w advances by 32 bytes.
43        LDP     s16, s18, [x5], 8
44        SUBS    x0, x2, 16              // k = kc - 16
45        LDP     s20, s22, [x5], 8
46        LDP     s24, s26, [x5], 8
47        LDP     s28, s30, [x5], 8
48        # Is there at least 16 bytes for epilogue?
        // NOTE(review): kc == 0 also takes this branch and would still run
        // the 8-byte remainder; presumably callers never pass kc == 0 - confirm.
49        B.LO    4f                      // kc < 16: only the 8-byte remainder runs
50
51        # Prologue: load A0 and 4 B's
        // d0 = A bytes 0-7, d6 = A bytes 8-15 of the current 16-byte group.
        // d4/d5 = B for columns 0 and 1 paired with d0; d2/d3 (64 bytes
        // further into w) = B for columns 0 and 1 paired with d6.
52        LDP     d0, d6, [x3], 16        // Read A0
53        LDP     d4, d5, [x5]            // Read B
54        LDP     d2, d3, [x5, 64]        // Read B
55
56        # Is there at least 16 bytes for main loop?
57        SUBS    x0, x0, 16              // k = k - 16
58        B.LO    2f                      // kc < 32: single 16-byte group, epilogue only
59
60        # Main loop - 16 bytes of A
61        # 4 groups of 2 mul/mla/adap = 6 cycles.
62        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
        // Each group below is 2x SMULL, 2x SMLAL and 2x SADALP.
        // NOTE(review): this kernel loads only A0; the A1 mention and the
        // total above look inherited from the shared template and do not
        // match the per-block figures (4 + 6 + 6 + 14) - confirm before
        // relying on them.
        // NOTE(review): SMULL + SMLAL sum two int8*int8 products in 16 bits
        // before SADALP widens them. That fits unless both products are
        // 16384 (-128 * -128); presumably the inputs exclude that - confirm.
63
64        .p2align 3
651:
66        # BLOCK 0 - 4 cycles
        // Products for columns 0 and 1 into v17/v19, then reload v4/v5 and
        // v2/v3 with B for columns 2 and 3 (offsets 16 and 80).
67        SMULL   v17.8h, v4.8b, v0.8b
68        SMULL   v19.8h, v5.8b, v0.8b
69        LDP     d4, d5, [x5, 16]
70        SMLAL   v17.8h, v2.8b, v6.8b
71        SMLAL   v19.8h, v3.8b, v6.8b
72        LDP     d2, d3, [x5, 80]
73
74        # BLOCK 1 - 6 cycles
        // Products for columns 2 and 3 into v21/v23; fold the column 0/1
        // products into v16/v18; reload B for columns 4 and 5.
75        SMULL   v21.8h, v4.8b, v0.8b
76        SMULL   v23.8h, v5.8b, v0.8b
77        SADALP  v16.4s, v17.8h
78        SADALP  v18.4s, v19.8h
79        LDP     d4, d5, [x5, 32]
80        SMLAL   v21.8h, v2.8b, v6.8b
81        SMLAL   v23.8h, v3.8b, v6.8b
82        LDP     d2, d3, [x5, 96]
83
84        # BLOCK 2 - 6 cycles
        // Products for columns 4 and 5 into v17/v19; fold the column 2/3
        // products into v20/v22; reload B for columns 6 and 7.
85        SMULL   v17.8h, v4.8b, v0.8b
86        SMULL   v19.8h, v5.8b, v0.8b
87        SADALP  v20.4s, v21.8h
88        SADALP  v22.4s, v23.8h
89        LDP     d4, d5, [x5, 48]
90        SMLAL   v17.8h, v2.8b, v6.8b
91        SMLAL   v19.8h, v3.8b, v6.8b
92        LDP     d2, d3, [x5, 112]
93
94        # BLOCK 3 - 14 cycles
        // Products for columns 6 and 7 into v21/v23 and fold everything that
        // is pending. w moves past the 128 bytes just consumed, and A plus the
        // first B registers are preloaded for the next 16-byte group, which is
        // handled either by the next iteration or by the epilogue.
95        SMULL   v21.8h, v4.8b, v0.8b
96        ADD     x5, x5, 128
97        SMULL   v23.8h, v5.8b, v0.8b
98        SADALP  v24.4s, v17.8h
99        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
100        SADALP  v26.4s, v19.8h
101        LDP     d4, d5, [x5]            // Read B
102        SMLAL   v21.8h, v2.8b, v6.8b
103        SMLAL   v23.8h, v3.8b, v6.8b
104        LDP     d0, d6, [x3], 16        // Read A0
105        SADALP  v28.4s, v21.8h
106        LDP     d2, d3, [x5, 64]        // Read B
107        SADALP  v30.4s, v23.8h
108        B.HS    1b                      // no borrow: another 16-byte group follows the preloaded one
109
110        # Epilogue
111        # Same as main loop except no loads at end of loop
        // Consumes the 16-byte group already sitting in v0/v6 and v4/v5/v2/v3.
112
113        .p2align 3
1142:
115        # BLOCK 0 - 4 cycles
116        SMULL   v17.8h, v4.8b, v0.8b
117        SMULL   v19.8h, v5.8b, v0.8b
118        LDP     d4, d5, [x5, 16]
119        SMLAL   v17.8h, v2.8b, v6.8b
120        SMLAL   v19.8h, v3.8b, v6.8b
121        LDP     d2, d3, [x5, 80]
122
123        # BLOCK 1 - 6 cycles
124        SMULL   v21.8h, v4.8b, v0.8b
125        SMULL   v23.8h, v5.8b, v0.8b
126        SADALP  v16.4s, v17.8h
127        SADALP  v18.4s, v19.8h
128        LDP     d4, d5, [x5, 32]
129        SMLAL   v21.8h, v2.8b, v6.8b
130        SMLAL   v23.8h, v3.8b, v6.8b
131        LDP     d2, d3, [x5, 96]
132
133        # BLOCK 2 - 6 cycles
134        SMULL   v17.8h, v4.8b, v0.8b
135        SMULL   v19.8h, v5.8b, v0.8b
136        SADALP  v20.4s, v21.8h
137        SADALP  v22.4s, v23.8h
138        LDP     d4, d5, [x5, 48]
139        SMLAL   v17.8h, v2.8b, v6.8b
140        SMLAL   v19.8h, v3.8b, v6.8b
141        LDP     d2, d3, [x5, 112]
142
143        # BLOCK 3 - 8 cycles
144        SMULL   v21.8h, v4.8b, v0.8b
145        ADD     x5, x5, 128
146        SMULL   v23.8h, v5.8b, v0.8b
147        SADALP  v24.4s, v17.8h
148        SADALP  v26.4s, v19.8h
149        SMLAL   v21.8h, v2.8b, v6.8b
150        SMLAL   v23.8h, v3.8b, v6.8b
151        SADALP  v28.4s, v21.8h
152        SADALP  v30.4s, v23.8h
153
154        # Is there a remainder?- 8 bytes of A
        // Only multiples of 16 have been subtracted from the rounded kc, so
        // bit 3 of x0 still equals bit 3 of kc: set means 8 bytes are left.
155        TBNZ    x0, 3, 4f
156
157        .p2align 3
1583:
159        # Add columns
        // Three levels of pairwise adds collapse the 4 lanes of each
        // accumulator: v0 = sums for columns 0-3 (from v16, v18, v20, v22),
        // v1 = sums for columns 4-7 (from v24, v26, v28, v30).
160        ADDP    v16.4s, v16.4s, v18.4s
161        ADDP    v20.4s, v20.4s, v22.4s
162        ADDP    v24.4s, v24.4s, v26.4s
163        ADDP    v28.4s, v28.4s, v30.4s
164        ADDP    v0.4s, v16.4s, v20.4s
165        ADDP    v1.4s, v24.4s, v28.4s
166
167        # Apply params - scale, bias and clamp
        // params is read as 4 + 2 + 1 + 1 bytes: a 32-bit multiplier, a 16-bit
        // addend and two clamp bytes (presumably scale, output zero point,
        // output min and output max - confirm against the params struct).
        // The three post-increments total 7, which the SUB below undoes.
168        SCVTF   v0.4s, v0.4s            // int32 sums -> fp32
169        LD1R    {v4.4s}, [x11], 4       // broadcast the 32-bit multiplier
170        SCVTF   v1.4s, v1.4s
171        FMUL    v0.4s, v0.4s, v4.4s
172        FMUL    v1.4s, v1.4s, v4.4s
173
174        FCVTNS  v0.4s, v0.4s            // fp32 -> int32, round to nearest, ties to even
175        FCVTNS  v1.4s, v1.4s
176
177        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit addend
178        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16, columns 0-3
179        SQXTN2  v0.8h, v1.4s            // columns 4-7 into the upper half
180        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
181        SQADD   v0.8h, v0.8h, v5.8h
182        LD1R    {v1.16b}, [x11], 1      // broadcast the lower clamp byte
183        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
184        LD1R    {v17.16b}, [x11]        // broadcast the upper clamp byte
185        SMAX    v0.8b, v0.8b, v1.8b
186        SUB     x11, x11, 7            // rewind params pointer
187        SMIN    v0.8b, v0.8b, v17.8b
188        B.LO    5f                      // fewer than 8 columns were left: partial store
189
190        # Store full 1 x 8
191        ST1     {v0.8b}, [x6], x10      // write 8 bytes, then c += cn_stride
192        SUB     x3, x3, x2              // a0 -= kc
193        B.HI    0b                      // nc still above zero: next 8 columns
194        RET
195
196        # Remainder - 8 bytes of A
        // Entered from label 0 when kc < 16, or after the epilogue when bit 3
        // of kc is set. All 8 columns use the same 8 bytes of A in d0:
        // first B for columns 0-3 (offsets 0 and 16), then columns 4-7
        // (offsets 32 and 48). w advances by 64 bytes.
197        .p2align 3
1984:
199        LDR     d0, [x3], 8
200        LDP     d4, d5, [x5]
201        LDP     d6, d7, [x5, 16]
202        SMULL   v17.8h, v4.8b, v0.8b
203        SMULL   v19.8h, v5.8b, v0.8b
204        SMULL   v21.8h, v6.8b, v0.8b
205        SMULL   v23.8h, v7.8b, v0.8b
206        LDP     d4, d5, [x5, 32]
207        LDP     d6, d7, [x5, 48]
208        SADALP  v16.4s, v17.8h
209        SADALP  v18.4s, v19.8h
210        SADALP  v20.4s, v21.8h
211        SADALP  v22.4s, v23.8h
212        SMULL   v17.8h, v4.8b, v0.8b
213        SMULL   v19.8h, v5.8b, v0.8b
214        SMULL   v21.8h, v6.8b, v0.8b
215        SMULL   v23.8h, v7.8b, v0.8b
216        ADD     x5, x5, 64
217        SADALP  v24.4s, v17.8h
218        SADALP  v26.4s, v19.8h
219        SADALP  v28.4s, v21.8h
220        SADALP  v30.4s, v23.8h
221        B       3b
222
223        # Store odd width
        // x1 holds nc - 8 here, so its low 3 bits equal those of the column
        // count that was left. Bits 2, 1 and 0 select a 4-, 2- and 1-byte
        // store; each EXT rotates the vector so the next unstored byte
        // becomes lane 0. This is the last pass, so the routine returns.
224        .p2align 3
2255:
226        TBZ     x1, 2, 6f
227        STR     s0, [x6], 4
228        EXT     v0.16b, v0.16b, v0.16b, 4
229
2306:
231        TBZ     x1, 1, 7f
232        STR     h0, [x6], 2
233        EXT     v0.16b, v0.16b, v0.16b, 2
2347:
235        TBZ     x1, 0, 8f
236        STR     b0, [x6]
2378:
238        RET
239
240END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
241
// ELF builds only: emit an empty .note.GNU-stack section with an empty flags
// string, i.e. without the executable flag. GNU toolchains conventionally read
// this as "this object does not need an executable stack" (see the linker
// documentation; the effect is not visible from this file).
242#ifdef __ELF__
243.section ".note.GNU-stack","",%progbits
244#endif
245
246