xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/1x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           (x4)
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6  (remainder path at label 4: v0 only)
29# B   x5  v4  v5  v2  v3  (remainder path at label 4: v4 v5 v6 v7)
30# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
31# temp0  v17 v19 v21 v23
32
33
// Kernel overview (derived from the instructions below):
//   - Outer loop (label 0) produces one row of 8 int8 outputs per pass and
//     repeats while more than 8 columns (nc) were left before the pass.
//   - Inner product: signed 8-bit A and B bytes are multiplied into 16-bit
//     lanes (SMULL/SMLAL) and pairwise-accumulated into 32-bit lanes (SADALP).
//     There is one accumulator per output column: v16, v18, v20, v22, v24,
//     v26, v28, v30 hold columns 0..7 respectively.
//   - B layout as consumed here: for every group of 8 k-values, 8 columns x
//     8 bytes = 64 contiguous bytes (column c at offset 8*c).
//   - Requantization: int32 sums -> float, multiplied by 8 float scales read
//     from w, rounded back to int32 (FCVTNS), offset by a 16-bit value read
//     from params (presumably the output zero point), saturated to int8 and
//     clamped with two further bytes read from params (lower, then upper bound).
// Only registers within x0-x11 and v0-v7 / v16-v30 are written, so nothing
// listed as callee-saved in the header comment is touched and no stack frame
// is created.
34BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
35
36        LDP     x10, x11, [sp]          // cn_stride, params
37        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
38        BIC     x2, x2, 7
39
40        .p2align 3
410:
42        # Load initial bias from w into accumulators
        # Each scalar S-register load clears the remaining lanes of its vector,
        # so every accumulator starts as {bias, 0, 0, 0}. x5 advances 32 bytes.
43        LDP     s16, s18, [x5], 8
        # x0 (mr on entry) is reused as the k counter. The flags set here are
        # consumed by B.LO below; the intervening LDPs do not modify flags.
44        SUBS    x0, x2, 16              // k = kc - 16
45        LDP     s20, s22, [x5], 8
46        LDP     s24, s26, [x5], 8
47        LDP     s28, s30, [x5], 8
48        # Is there at least 16 bytes for epilogue?
        # Taken when kc < 16; since kc was rounded to a multiple of 8 this means
        # a single 8-byte group of A (assuming kc != 0).
49        B.LO    4f
50
51        # Prologue: load A0 and 4 B's
        # d0 / d6 = A0 bytes k[0..7] / k[8..15].
        # d4, d5 = B for columns 0, 1 at k[0..7]; d2, d3 (offset 64) = the same
        # columns at k[8..15].
52        LDP     d0, d6, [x3], 16        // Read A0
53        LDP     d4, d5, [x5]            // Read B
54        LDP     d2, d3, [x5, 64]        // Read B
55
56        # Is there at least 16 bytes for main loop?
        # If fewer than 32 bytes of k were available in total, skip straight to
        # the epilogue, which consumes the 16 bytes loaded by the prologue.
57        SUBS    x0, x0, 16              // k = k - 16
58        B.LO    2f
59
60        # Main loop - 16 bytes of A
61        # 4 groups of 2 mul/mla/adalp = 6 cycles.
62        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
        # Each BLOCK computes two columns: SMULL uses A0 k[0..7] (v0), SMLAL adds
        # A0 k[8..15] (v6) times the matching B bytes. The SADALP that folds a
        # block's 16-bit products into its 32-bit accumulators is issued in the
        # following block, interleaved with that block's multiplies and loads.
63
64        .p2align 3
651:
66        # BLOCK 0 - 4 cycles
        # Columns 0, 1 -> v17, v19 (accumulated into v16, v18 in BLOCK 1).
67        SMULL   v17.8h, v4.8b, v0.8b
68        SMULL   v19.8h, v5.8b, v0.8b
69        LDP     d4, d5, [x5, 16]
70        SMLAL   v17.8h, v2.8b, v6.8b
71        SMLAL   v19.8h, v3.8b, v6.8b
72        LDP     d2, d3, [x5, 80]
73
74        # BLOCK 1 - 6 cycles
        # Columns 2, 3 -> v21, v23 (accumulated into v20, v22 in BLOCK 2).
75        SMULL   v21.8h, v4.8b, v0.8b
76        SMULL   v23.8h, v5.8b, v0.8b
77        SADALP  v16.4s, v17.8h
78        SADALP  v18.4s, v19.8h
79        LDP     d4, d5, [x5, 32]
80        SMLAL   v21.8h, v2.8b, v6.8b
81        SMLAL   v23.8h, v3.8b, v6.8b
82        LDP     d2, d3, [x5, 96]
83
84        # BLOCK 2 - 6 cycles
        # Columns 4, 5 -> v17, v19 (accumulated into v24, v26 in BLOCK 3).
85        SMULL   v17.8h, v4.8b, v0.8b
86        SMULL   v19.8h, v5.8b, v0.8b
87        SADALP  v20.4s, v21.8h
88        SADALP  v22.4s, v23.8h
89        LDP     d4, d5, [x5, 48]
90        SMLAL   v17.8h, v2.8b, v6.8b
91        SMLAL   v19.8h, v3.8b, v6.8b
92        LDP     d2, d3, [x5, 112]
93
94        # BLOCK 3 - 14 cycles
        # Columns 6, 7 -> v21, v23, accumulated into v28, v30 within this block.
        # x5 is advanced by 128 bytes (16 k-values x 8 columns) and the loads
        # that follow fetch A0 and B for the next 16 k-values, which are used
        # either by the next iteration or by the epilogue.
95        SMULL   v21.8h, v4.8b, v0.8b
96        ADD     x5, x5, 128
97        SMULL   v23.8h, v5.8b, v0.8b
98        SADALP  v24.4s, v17.8h
        # Flags from this SUBS drive B.HS below; no instruction in between
        # modifies the flags.
99        SUBS    x0, x0, 16
100        SADALP  v26.4s, v19.8h
101        LDP     d4, d5, [x5]            // Read B
102        SMLAL   v21.8h, v2.8b, v6.8b
103        SMLAL   v23.8h, v3.8b, v6.8b
104        LDP     d0, d6, [x3], 16        // Read A0
105        SADALP  v28.4s, v21.8h
106        LDP     d2, d3, [x5, 64]        // Read B
107        SADALP  v30.4s, v23.8h
108        B.HS    1b
109
110        # Epilogue
111        # Same as main loop except no loads at end of loop
        # Consumes the 16 bytes of A0 and the first B pairs that were already
        # loaded by the prologue or by the last main-loop iteration.
112
113        .p2align 3
1142:
115        # BLOCK 0 - 4 cycles
116        SMULL   v17.8h, v4.8b, v0.8b
117        SMULL   v19.8h, v5.8b, v0.8b
118        LDP     d4, d5, [x5, 16]
119        SMLAL   v17.8h, v2.8b, v6.8b
120        SMLAL   v19.8h, v3.8b, v6.8b
121        LDP     d2, d3, [x5, 80]
122
123        # BLOCK 1 - 6 cycles
124        SMULL   v21.8h, v4.8b, v0.8b
125        SMULL   v23.8h, v5.8b, v0.8b
126        SADALP  v16.4s, v17.8h
127        SADALP  v18.4s, v19.8h
128        LDP     d4, d5, [x5, 32]
129        SMLAL   v21.8h, v2.8b, v6.8b
130        SMLAL   v23.8h, v3.8b, v6.8b
131        LDP     d2, d3, [x5, 96]
132
133        # BLOCK 2 - 6 cycles
134        SMULL   v17.8h, v4.8b, v0.8b
135        SMULL   v19.8h, v5.8b, v0.8b
136        SADALP  v20.4s, v21.8h
137        SADALP  v22.4s, v23.8h
138        LDP     d4, d5, [x5, 48]
139        SMLAL   v17.8h, v2.8b, v6.8b
140        SMLAL   v19.8h, v3.8b, v6.8b
141        LDP     d2, d3, [x5, 112]
142
143        # BLOCK 3 - 8 cycles
144        SMULL   v21.8h, v4.8b, v0.8b
145        ADD     x5, x5, 128
146        SMULL   v23.8h, v5.8b, v0.8b
147        SADALP  v24.4s, v17.8h
148        SADALP  v26.4s, v19.8h
149        SMLAL   v21.8h, v2.8b, v6.8b
150        SMLAL   v23.8h, v3.8b, v6.8b
151        SADALP  v28.4s, v21.8h
152        SADALP  v30.4s, v23.8h
153
154        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8, so x0 is either -16 (nothing left) or -8
        # (8 bytes of A left) here; bit 3 is set only for -8.
155        TBNZ    x0, 3, 4f
156
157        .p2align 3
1583:
159        # Add columns
        # Two rounds of pairwise adds reduce the four 32-bit lanes of each
        # accumulator to one sum: v0 = columns 0..3, v1 = columns 4..7.
160        ADDP    v16.4s, v16.4s, v18.4s
161        ADDP    v20.4s, v20.4s, v22.4s
162        ADDP    v24.4s, v24.4s, v26.4s
163        ADDP    v28.4s, v28.4s, v30.4s
164        ADDP    v0.4s, v16.4s, v20.4s
165        ADDP    v1.4s, v24.4s, v28.4s
166
167        # Load per channel scale values from weights
        # q4 / q5 = float scales for columns 0..3 / 4..7. After these two loads
        # x5 points at the data read by the bias loads of the next pass.
168        SCVTF   v0.4s, v0.4s
169        LDR     q4, [x5], 16
170        SCVTF   v1.4s, v1.4s
171        LDR     q5, [x5], 16
172        FMUL    v0.4s, v0.4s, v4.4s
173        FMUL    v1.4s, v1.4s, v5.4s
174
        # Float -> int32, rounding to nearest with ties to even.
175        FCVTNS  v0.4s, v0.4s
176        FCVTNS  v1.4s, v1.4s
177
        # params is read as: one 16-bit value (added with saturation after the
        # narrow to int16 - presumably the output zero point), then one byte
        # used as the lower clamp (SMAX) and one byte used as the upper clamp
        # (SMIN). The loads are interleaved with the saturating narrows.
178        LD1R    {v5.8h}, [x11], 2
179        SQXTN   v0.4h, v0.4s
180        SQXTN2  v0.8h, v1.4s
        # nc -= 8. The flags set here are consumed by B.LO 5f and, after the
        # store, by B.HI 0b; nothing in between modifies the flags.
181        SUBS    x1, x1, 8
182        SQADD   v0.8h, v0.8h, v5.8h
183        LD1R    {v1.16b}, [x11], 1
184        SQXTN   v0.8b, v0.8h
185        LD1R    {v17.16b}, [x11]
186        SMAX    v0.8b, v0.8b, v1.8b
        # x11 was post-incremented by 2 + 1 bytes above; undo that so the next
        # pass reads the same params.
187        SUB     x11, x11, 3            // rewind params pointer
188        SMIN    v0.8b, v0.8b, v17.8b
        # Fewer than 8 columns were left: store a partial row instead.
189        B.LO    5f
190
191        # Store full 1 x 8
        # c advances by cn_stride (x10). A0 advanced by exactly the rounded kc
        # during the k loop, so subtracting x2 restores its start.
192        ST1     {v0.8b}, [x6], x10
193        SUB     x3, x3, x2              // a0 -= kc
        # Loop again only if columns remain (nc was > 8 before the SUBS).
194        B.HI    0b
195        RET
196
197        # Remainder - 8 bytes of A
        # Reached when kc < 16 (from the check after the bias load) or when
        # 8 bytes of A are left after the epilogue. Processes one 64-byte group
        # of B: d4..d7 first hold columns 0..3, then are reloaded with columns
        # 4..7. Rejoins the common reduction at label 3.
198        .p2align 3
1994:
200        LDR     d0, [x3], 8
201        LDP     d4, d5, [x5]
202        LDP     d6, d7, [x5, 16]
203        SMULL   v17.8h, v4.8b, v0.8b
204        SMULL   v19.8h, v5.8b, v0.8b
205        SMULL   v21.8h, v6.8b, v0.8b
206        SMULL   v23.8h, v7.8b, v0.8b
207        LDP     d4, d5, [x5, 32]
208        LDP     d6, d7, [x5, 48]
209        SADALP  v16.4s, v17.8h
210        SADALP  v18.4s, v19.8h
211        SADALP  v20.4s, v21.8h
212        SADALP  v22.4s, v23.8h
213        SMULL   v17.8h, v4.8b, v0.8b
214        SMULL   v19.8h, v5.8b, v0.8b
215        SMULL   v21.8h, v6.8b, v0.8b
216        SMULL   v23.8h, v7.8b, v0.8b
217        ADD     x5, x5, 64
218        SADALP  v24.4s, v17.8h
219        SADALP  v26.4s, v19.8h
220        SADALP  v28.4s, v21.8h
221        SADALP  v30.4s, v23.8h
222        B       3b
223
224        # Store odd width
        # x1 = nc - 8 here (negative), whose low three bits equal those of the
        # remaining column count (1..7). Bits 2, 1, 0 select a 4-, 2- and
        # 1-byte store; after each store EXT rotates the vector so the next
        # unstored byte moves to lane 0. This is the final pass, so return.
225        .p2align 3
2265:
227        TBZ     x1, 2, 6f
228        STR     s0, [x6], 4
229        EXT     v0.16b, v0.16b, v0.16b, 4
230
2316:
232        TBZ     x1, 1, 7f
233        STR     h0, [x6], 2
234        EXT     v0.16b, v0.16b, v0.16b, 2
2357:
236        TBZ     x1, 0, 8f
237        STR     b0, [x6]
2388:
239        RET
240
241END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
242
// On ELF targets, emit an empty .note.GNU-stack section with no flags. By GNU
// toolchain convention this marks the object as not needing an executable stack.
243#ifdef __ELF__
244.section ".note.GNU-stack","",%progbits
245#endif
246
247