xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3  (8-byte remainder path: v4 v5 v6 v7)
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
34
35
# Micro-kernel body: one output row by 8 output channels per pass of the
# nc loop. Structure, as implemented below:
#   label 0     nc loop: seed 8 int32 accumulators from w, one per channel
#   label 1     ks loop: fetch the next A row pointer from the pointer array a
#   label 2     main loop: 16 bytes of A against 128 bytes of B per iteration
#   label 3     epilogue: the last full 16 bytes of A, without look-ahead loads
#   label 5     remainder: a trailing 8 bytes of A against 64 bytes of B
#   label 4     reduce, apply per channel scale, requantize, clamp
#   labels 6-9  partial store when fewer than 8 channels remain
# Registers touched are x0-x6, x8-x13, v0-v7 and v16-v30; d8-d15 and
# x19-x30 are not used, so nothing is saved and sp is only read.
36BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
37
38        # Load the stack arguments and round kc up to a multiple of 8.
        # With a single output row there is no C pointer to clamp; cm_stride
        # (x7) is never read in this function.
39        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
40        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
41        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
42        BIC     x2, x2, 7
43
44        .p2align 3
450:
46        # Load initial bias from w into accumulators
        # Eight 32-bit values are read, one per accumulator register, and x5
        # advances by 32 bytes. A load to an S register clears the rest of
        # the vector, so lanes 1-3 of every accumulator start at zero.
47        LDP     s16, s18, [x5], 8
48        LDP     s20, s22, [x5], 8
49        LDP     s24, s26, [x5], 8
50        LDP     s28, s30, [x5], 8
51        MOV     x9, x3                  // p = ks
52
53        .p2align 3
541:
55        # Load next A pointer
        # A pointer equal to the zero argument (x12) is used unchanged; any
        # other pointer is advanced by a_offset. CMP sets the flags before
        # ADD modifies x13, and ADD (not ADDS) leaves them intact for CSEL.
56        LDR     x13, [x4], 8
57        CMP     x13, x12                // if a0 == zero
58        ADD     x13, x13, x8            // a0 += a_offset
59        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
60
61        # Is there at least 16 bytes for epilogue?
        # x0 (mr on entry) is reused as the k counter from here on. kc is a
        # multiple of 8 after rounding, so kc < 16 sends a single 8-byte step
        # to label 5.
        # NOTE(review): kc == 0 would also branch to label 5 and read 8 bytes
        # of A and 64 bytes of B - presumably callers never pass kc == 0;
        # confirm against the caller.
62        SUBS    x0, x2, 16              // k = kc - 16
63        B.LO    5f
64
65        # Prologue: load A0 and 4 B's
        # v0 = A bytes 0-7 and v6 = A bytes 8-15 (x13 advances by 16).
        # v4/v5 = B for channels 0 and 1 that pair with v0; v2/v3 = B for the
        # same two channels that pair with v6, located 64 bytes further on.
66        LDP     d0, d6, [x13], 16       // Read A0
67        LDP     d4, d5, [x5]            // Read B
68        LDP     d2, d3, [x5, 64]        // Read B
69
70        # Is there at least 16 bytes for main loop?
        # Taken when fewer than 32 bytes of k remain: the 16 bytes loaded by
        # the prologue are then consumed directly by the epilogue.
71        SUBS    x0, x0, 16              // k = k - 16
72        B.LO    3f
73
74        # Main loop - 16 bytes of A
75        # 4 groups of 2 mul/mla/adalp = 6 cycles.
76        # 2 load for A0, A1 = +4 cycle.  Total 36 cycles.
        # Each block handles two output channels: SMULL forms the 8 int16
        # products for A bytes 0-7, SMLAL adds the products for A bytes 8-15,
        # and the SADALP issued in the following block adds adjacent int16
        # pairs into the 4 int32 lanes of that channel accumulator. The LDPs
        # between them fetch B for the next channel pair.
        # NOTE(review): two int8 x int8 products are summed in 16 bits before
        # widening, which wraps only when both products are -128 * -128;
        # presumably the packed weights never contain -128 - confirm.
77
78        .p2align 3
792:
80        # BLOCK 0 - 4 cycles (channels 0, 1)
81        SMULL   v17.8h, v4.8b, v0.8b
82        SMULL   v19.8h, v5.8b, v0.8b
83        LDP     d4, d5, [x5, 16]
84        SMLAL   v17.8h, v2.8b, v6.8b
85        SMLAL   v19.8h, v3.8b, v6.8b
86        LDP     d2, d3, [x5, 80]
87
88        # BLOCK 1 - 6 cycles (channels 2, 3; accumulate channels 0, 1)
89        SMULL   v21.8h, v4.8b, v0.8b
90        SMULL   v23.8h, v5.8b, v0.8b
91        SADALP  v16.4s, v17.8h
92        SADALP  v18.4s, v19.8h
93        LDP     d4, d5, [x5, 32]
94        SMLAL   v21.8h, v2.8b, v6.8b
95        SMLAL   v23.8h, v3.8b, v6.8b
96        LDP     d2, d3, [x5, 96]
97
98        # BLOCK 2 - 6 cycles (channels 4, 5; accumulate channels 2, 3)
99        SMULL   v17.8h, v4.8b, v0.8b
100        SMULL   v19.8h, v5.8b, v0.8b
101        SADALP  v20.4s, v21.8h
102        SADALP  v22.4s, v23.8h
103        LDP     d4, d5, [x5, 48]
104        SMLAL   v17.8h, v2.8b, v6.8b
105        SMLAL   v19.8h, v3.8b, v6.8b
106        LDP     d2, d3, [x5, 112]
107
108        # BLOCK 3 - 14 cycles (channels 6, 7; accumulate channels 4-7)
        # x5 moves to the next 128-byte B group before the look-ahead loads
        # of A and B for the following iteration. Each register is reloaded
        # only after its last use in this iteration. SUBS sets the flags that
        # B.HS tests; the NEON and load instructions in between do not write
        # the condition flags.
109        SMULL   v21.8h, v4.8b, v0.8b
110        ADD     x5, x5, 128
111        SMULL   v23.8h, v5.8b, v0.8b
112        SADALP  v24.4s, v17.8h
113        SUBS    x0, x0, 16
114        SADALP  v26.4s, v19.8h
115        LDP     d4, d5, [x5]            // Read B
116        SMLAL   v21.8h, v2.8b, v6.8b
117        SMLAL   v23.8h, v3.8b, v6.8b
118        LDP     d0, d6, [x13], 16       // Read A0
119        SADALP  v28.4s, v21.8h
120        LDP     d2, d3, [x5, 64]        // Read B
121        SADALP  v30.4s, v23.8h
122        B.HS    2b
123
124        # Epilogue
125        # Same as main loop except no loads at end of loop
        # Consumes the A and B registers already loaded by the prologue or by
        # the last main loop iteration.
126
127        .p2align 3
1283:
129       # BLOCK 0 - 4 cycles (channels 0, 1)
130        SMULL   v17.8h, v4.8b, v0.8b
131        SMULL   v19.8h, v5.8b, v0.8b
132        LDP     d4, d5, [x5, 16]
133        SMLAL   v17.8h, v2.8b, v6.8b
134        SMLAL   v19.8h, v3.8b, v6.8b
135        LDP     d2, d3, [x5, 80]
136
137        # BLOCK 1 - 6 cycles (channels 2, 3; accumulate channels 0, 1)
138        SMULL   v21.8h, v4.8b, v0.8b
139        SMULL   v23.8h, v5.8b, v0.8b
140        SADALP  v16.4s, v17.8h
141        SADALP  v18.4s, v19.8h
142        LDP     d4, d5, [x5, 32]
143        SMLAL   v21.8h, v2.8b, v6.8b
144        SMLAL   v23.8h, v3.8b, v6.8b
145        LDP     d2, d3, [x5, 96]
146
147        # BLOCK 2 - 6 cycles (channels 4, 5; accumulate channels 2, 3)
148        SMULL   v17.8h, v4.8b, v0.8b
149        SMULL   v19.8h, v5.8b, v0.8b
150        SADALP  v20.4s, v21.8h
151        SADALP  v22.4s, v23.8h
152        LDP     d4, d5, [x5, 48]
153        SMLAL   v17.8h, v2.8b, v6.8b
154        SMLAL   v19.8h, v3.8b, v6.8b
155        LDP     d2, d3, [x5, 112]
156
157        # BLOCK 3 - 8 cycles (channels 6, 7; accumulate channels 4-7)
        # The SUBS below changes x0 by 16, which leaves bit 3 (tested by the
        # TBNZ that follows) unchanged. Its flags are not consumed: the next
        # conditional branch on either path is preceded by a SUBS on x9.
158        SMULL   v21.8h, v4.8b, v0.8b
159        ADD     x5, x5, 128
160        SMULL   v23.8h, v5.8b, v0.8b
161        SADALP  v24.4s, v17.8h
162        SUBS    x0, x0, 16
163        SADALP  v26.4s, v19.8h
164        SMLAL   v21.8h, v2.8b, v6.8b
165        SMLAL   v23.8h, v3.8b, v6.8b
166        SADALP  v28.4s, v21.8h
167        SADALP  v30.4s, v23.8h
168
169        # Is there a remainder?- 8 bytes of A
        # x0 differs from the rounded kc by a multiple of 16, so bit 3 is set
        # exactly when kc is an odd multiple of 8.
170        TBNZ    x0, 3, 5f
171
172        # ks loop
        # x9 counts down in bytes of the pointer array, one 8-byte pointer
        # per pass, and loops to label 1 while pointers remain.
173        SUBS    x9, x9, 8               // p -= MR * sizeof(int8_t*)
174        B.HI    1b
175
1764:
177        # Add columns
        # Two levels of pairwise adds collapse the 4 partial sums held for
        # each channel: v0 ends up with channels 0-3, v1 with channels 4-7.
178        ADDP    v16.4s, v16.4s, v18.4s
179        ADDP    v20.4s, v20.4s, v22.4s
180        ADDP    v24.4s, v24.4s, v26.4s
181        ADDP    v28.4s, v28.4s, v30.4s
182        ADDP    v0.4s, v16.4s, v20.4s
183        ADDP    v1.4s, v24.4s, v28.4s
184
185        # Load per channel scale values from weights
        # Eight 32-bit scale values follow the B data at x5 and are used as
        # single-precision multipliers; x5 advances by 32 bytes, which is
        # where label 0 reads from on the next pass of the nc loop.
186        SCVTF   v0.4s, v0.4s
187        LDR     q4, [x5], 16
188        SCVTF   v1.4s, v1.4s
189        LDR     q5, [x5], 16
190        FMUL    v0.4s, v0.4s, v4.4s
191        FMUL    v1.4s, v1.4s, v5.4s
192
        # Convert back to int32, rounding to nearest with ties to even.
193        FCVTNS  v0.4s, v0.4s
194        FCVTNS  v1.4s, v1.4s
195
        # Requantize to 8 bits. params is read as a 16-bit value at offset 0
        # (added with saturation to the 16-bit results, presumably the output
        # zero point), a byte at offset 2 (lower clamp bound, applied with
        # SMAX) and a byte at offset 3 (upper clamp bound, applied with SMIN).
        # The two post-indexed LD1Rs move x11 forward by 3 in total, so it is
        # rewound for the next pass of the nc loop.
        # SUBS on x1 sets the flags consumed by B.LO and B.HI further down;
        # no instruction in between writes the condition flags.
196        LD1R    {v5.8h}, [x11], 2
197        SQXTN   v0.4h, v0.4s
198        SQXTN2  v0.8h, v1.4s
199        SUBS    x1, x1, 8
200        SQADD   v0.8h, v0.8h, v5.8h
201        LD1R    {v1.16b}, [x11], 1
202        SQXTN   v0.8b, v0.8h
203        LD1R    {v17.16b}, [x11]
204        SMAX    v0.8b, v0.8b, v1.8b
205        SUB     x11, x11, 3          // rewind params pointer
206        SMIN    v0.8b, v0.8b, v17.8b
207        B.LO    6f
208
209        # Store full 1 x 8
        # Writes 8 bytes and advances c by cn_stride. a is rewound by ks bytes
        # so the next channel group walks the same pointer list. B.HI loops
        # while nc is still nonzero after the subtraction above.
210        ST1     {v0.8b}, [x6], x10
211        SUB     x4, x4, x3              // a -= ks
212        B.HI    0b
213        RET
214
215        # Remainder - 8 bytes of A
        # Only one 8-byte half of A is present, so each channel needs a single
        # SMULL and no SMLAL. B for channels 0-3 is loaded first, then B for
        # channels 4-7 reuses v4-v7 once the first four products are formed.
        # x5 advances by 64 bytes (8 channels x 8 bytes).
216        .p2align 3
2175:
218        LDR     d0, [x13], 8
219        LDP     d4, d5, [x5]
220        LDP     d6, d7, [x5, 16]
221        SMULL   v17.8h, v4.8b, v0.8b
222        SMULL   v19.8h, v5.8b, v0.8b
223        SMULL   v21.8h, v6.8b, v0.8b
224        SMULL   v23.8h, v7.8b, v0.8b
225        LDP     d4, d5, [x5, 32]
226        LDP     d6, d7, [x5, 48]
227        SADALP  v16.4s, v17.8h
228        SADALP  v18.4s, v19.8h
229        SADALP  v20.4s, v21.8h
230        SADALP  v22.4s, v23.8h
231        SMULL   v17.8h, v4.8b, v0.8b
232        SMULL   v19.8h, v5.8b, v0.8b
233        SMULL   v21.8h, v6.8b, v0.8b
234        SMULL   v23.8h, v7.8b, v0.8b
235        ADD     x5, x5, 64
236        SADALP  v24.4s, v17.8h
237        SADALP  v26.4s, v19.8h
238        SADALP  v28.4s, v21.8h
239        SADALP  v30.4s, v23.8h
240
241        # ks loop
        # Same loop test as the one after the epilogue; when no pointers
        # remain, branch back to the reduction at label 4.
242        SUBS    x9, x9, 8               // p -= MR * sizeof(int8_t*)
243        B.HI    1b
244        B       4b
245
246        # Store odd width
        # Reached with x1 = nc - 8, which has the same low three bits as the
        # remaining channel count. Bits 2, 1 and 0 select a 4-, 2- and 1-byte
        # store in turn; EXT rotates v0 so the next unstored byte sits in
        # lane 0. The function returns afterwards without looping.
247        .p2align 3
2486:
249        TBZ     x1, 2, 7f
250        STR     s0, [x6], 4
251        EXT     v0.16b, v0.16b, v0.16b, 4
252
2537:
254        TBZ     x1, 1, 8f
255        STR     h0, [x6], 2
256        EXT     v0.16b, v0.16b, v0.16b, 2
2578:
258        TBZ     x1, 0, 9f
259        STR     b0, [x6]
2609:
261        RET
262
263END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal
264
// On ELF targets, emit an empty .note.GNU-stack section with no flags.
// By GNU toolchain convention this marks the object as not requiring an
// executable stack.
265#ifdef __ELF__
266.section ".note.GNU-stack","",%progbits
267#endif
268