xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
34# x16, x17, x7 temporary a53 gpr load data
35
36
37BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
38
        # Overview: for each group of up to 8 output columns the eight
        # accumulators are seeded with bias words read from w. Then, for each
        # A pointer (ks is consumed 8 bytes, i.e. one pointer, at a time), the
        # kernel accumulates int8 products of A0 with the packed B data:
        # 16 bytes of K per main-loop/epilogue pass plus an optional 8-byte
        # remainder. The sums are reduced to one value per column, multiplied
        # by per-channel float scales read from w, converted back to integers,
        # narrowed with saturation, clamped and stored to c.
39        # Load the stack arguments and round kc up to a multiple of 8
40        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
41        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
42        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
43        BIC     x2, x2, 7
44
45        .p2align 3
460:
47        # Load initial bias from w into accumulators
        # Each 32-bit bias goes to lane 0 of one accumulator; a scalar load
        # zeroes the remaining lanes of the destination register.
48        LDP     s16, s18, [x5], 8
49        LDP     s20, s22, [x5], 8
50        LDP     s24, s26, [x5], 8
51        LDP     s28, s30, [x5], 8
52        MOV     x9, x3                  // p = ks
53
54        .p2align 3
551:
56        # Load next A pointer
57        LDR     x13, [x4], 8
58        CMP     x13, x12                // if a0 == zero
59        ADD     x13, x13, x8            // a0 += a_offset
60        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
61
62        # Is there at least 16 bytes for epilogue?
63        SUBS    x0, x2, 16              // k = kc - 16
64        B.LO    5f                      // kc < 16: go straight to the 8-byte remainder
65
66        # Prologue: load A0 and 4 B's
        # d0/d6 = first/second 8 bytes of A0. d4/d5 = B at offsets 0/8 (paired
        # with d0), d2/d3 = B at offsets 64/72 (paired with d6). x16 holds the
        # B data at offset 16 until it is inserted into v4.
67        LDP     d0, d6, [x13], 16       // Read A0
68        LDP     d4, d5, [x5]            // Read B
69        LDP     d2, d3, [x5, 64]        // Read B
70        LDR     x16, [x5, 16]           // Read B
71
72        # Is there at least 16 bytes for main loop?
73        SUBS    x0, x0, 16              // k = k - 16
74        B.LO    3f                      // fewer than 32 bytes in total: skip the main loop
75
76        # Main loop - 16 bytes of A
77        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
78        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        # Each block handles two accumulators: SMULL forms B[0..7] * A0[0..7]
        # as 16-bit products, SMLAL adds B[64..] * A0[8..15], and SADALP (issued
        # in the following block) pairwise-adds the 16-bit sums into the
        # 32-bit accumulator. Every pass consumes 128 bytes of B.
        # Half of the 64-bit loads go through x16/x17/x7 and are moved into
        # the vector register with INS.
        # NOTE(review): presumably this load/INS split is the Cortex-A53
        # scheduling trick named in the register-usage header - confirm.
79
80        .p2align 3
812:
82        # BLOCK 0 - 6 cycles
83        SMULL   v17.8h, v4.8b, v0.8b
84        LDR     x17, [x5, 80]
85        SMULL   v19.8h, v5.8b, v0.8b
86        LDR     d5, [x5, 24]
87        INS     v4.d[0], x16
88        SMLAL   v17.8h, v2.8b, v6.8b
89        LDR     x16, [x5, 32]
90        SMLAL   v19.8h, v3.8b, v6.8b
91        LDR     d3, [x5, 88]
92        INS     v2.d[0], x17
93
94        # BLOCK 1 - 10 cycles
95        SMULL   v21.8h, v4.8b, v0.8b
96        LDR     x17, [x5, 96]
97        SMULL   v23.8h, v5.8b, v0.8b
98        SADALP  v16.4s, v17.8h
99        SADALP  v18.4s, v19.8h
100        LDR     d5, [x5, 40]
101        INS     v4.d[0], x16
102        SMLAL   v21.8h, v2.8b, v6.8b
103        LDR     x16, [x5, 48]
104        SMLAL   v23.8h, v3.8b, v6.8b
105        LDR     d3, [x5, 104]
106        INS     v2.d[0], x17
107
108        # BLOCK 2 - 10 cycles
109        SMULL   v17.8h, v4.8b, v0.8b
110        LDR     x17, [x5, 112]
111        SMULL   v19.8h, v5.8b, v0.8b
112        SADALP  v20.4s, v21.8h
113        SADALP  v22.4s, v23.8h
114        LDR     d5, [x5, 56]
115        INS     v4.d[0], x16
116        SMLAL   v17.8h, v2.8b, v6.8b
117        LDR     x16, [x5, 128]          // first B of the next 128-byte group
118        SMLAL   v19.8h, v3.8b, v6.8b
119        LDR     d3, [x5, 120]
120        INS     v2.d[0], x17
121
122        # BLOCK 3 - 15 cycles
        # Finishes the last two accumulators and reloads A0 and the first
        # B values (offsets 128.. of the current x5) for the next pass.
123        SMULL   v21.8h, v4.8b, v0.8b
124        LDR     x7, [x13], 8            // Read A0
125        SMULL   v23.8h, v5.8b, v0.8b
126        LDR     x17, [x5, 192]          // Read B
127        SADALP  v24.4s, v17.8h
128        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
129        SADALP  v26.4s, v19.8h
130        LDR     d5, [x5, 136]           // Read B
131        INS     v4.d[0], x16
132        SMLAL   v21.8h, v2.8b, v6.8b
133        LDR     x16, [x5, 144]
134        SMLAL   v23.8h, v3.8b, v6.8b
135        LDR     d6, [x13], 8            // Read A0
136        INS     v0.d[0], x7
137        LDR     d3, [x5, 200]           // Read B
138        INS     v2.d[0], x17
139        SADALP  v28.4s, v21.8h
140        ADD     x5, x5, 128             // advance w past the 128 bytes of B just used
141        SADALP  v30.4s, v23.8h
142        B.HS    2b                      // loop while the SUBS above did not borrow
143
144        # Epilogue
145        # Same as main loop except no loads at end of loop
146
147        .p2align 3
1483:
149        # BLOCK 0 - 6 cycles
150        SMULL   v17.8h, v4.8b, v0.8b
151        LDR     x17, [x5, 80]
152        SMULL   v19.8h, v5.8b, v0.8b
153        LDR     d5, [x5, 24]
154        INS     v4.d[0], x16
155        SMLAL   v17.8h, v2.8b, v6.8b
156        LDR     x16, [x5, 32]
157        SMLAL   v19.8h, v3.8b, v6.8b
158        LDR     d3, [x5, 88]
159        INS     v2.d[0], x17
160
161        # BLOCK 1 - 10 cycles
162        SMULL   v21.8h, v4.8b, v0.8b
163        LDR     x17, [x5, 96]
164        SMULL   v23.8h, v5.8b, v0.8b
165        SADALP  v16.4s, v17.8h
166        SADALP  v18.4s, v19.8h
167        LDR     d5, [x5, 40]
168        INS     v4.d[0], x16
169        SMLAL   v21.8h, v2.8b, v6.8b
170        LDR     x16, [x5, 48]
171        SMLAL   v23.8h, v3.8b, v6.8b
172        LDR     d3, [x5, 104]
173        INS     v2.d[0], x17
174
175        # BLOCK 2 - 10 cycles
176        SMULL   v17.8h, v4.8b, v0.8b
177        LDR     x17, [x5, 112]
178        SMULL   v19.8h, v5.8b, v0.8b
179        SADALP  v20.4s, v21.8h
180        SADALP  v22.4s, v23.8h
181        LDR     d5, [x5, 56]
182        INS     v4.d[0], x16
183        SMLAL   v17.8h, v2.8b, v6.8b
184        SMLAL   v19.8h, v3.8b, v6.8b
185        LDR     d3, [x5, 120]
186        INS     v2.d[0], x17
187
188        # BLOCK 3 - 12 cycles
189        SMULL   v21.8h, v4.8b, v0.8b
190        SMULL   v23.8h, v5.8b, v0.8b
191        SADALP  v24.4s, v17.8h
192        SADALP  v26.4s, v19.8h
193        SMLAL   v21.8h, v2.8b, v6.8b
194        SMLAL   v23.8h, v3.8b, v6.8b
195        SADALP  v28.4s, v21.8h
196        ADD     x5, x5, 128             // advance w past the 128 bytes of B just used
197        SADALP  v30.4s, v23.8h
198
199        # Is there a remainder? - 8 bytes of A
        # x0 is negative here; since kc is a multiple of 8, bit 3 is set
        # exactly when 8 bytes of K are still unprocessed.
200        TBNZ    x0, 3, 5f
201
202        # ks loop
203        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
204        B.HI    1b                      // more A pointers to process
205
2064:
207        # Add columns
        # Two rounds of pairwise adds collapse the 4 lanes of each accumulator
        # and pack the results: v0 = columns 0-3, v1 = columns 4-7.
208        ADDP    v16.4s, v16.4s, v18.4s
209        ADDP    v20.4s, v20.4s, v22.4s
210        ADDP    v24.4s, v24.4s, v26.4s
211        ADDP    v28.4s, v28.4s, v30.4s
212        ADDP    v0.4s, v16.4s, v20.4s
213        ADDP    v1.4s, v24.4s, v28.4s
214
215        # Load per channel scale values from weights
        # The int32 sums are converted to float, multiplied by the 8 scales
        # that follow the B data in w, and converted back to int32 with
        # round-to-nearest (FCVTNS).
216        SCVTF   v0.4s, v0.4s
217        LDR     q4, [x5], 16
218        SCVTF   v1.4s, v1.4s
219        LDR     q5, [x5], 16
220        FMUL    v0.4s, v0.4s, v4.4s
221        FMUL    v1.4s, v1.4s, v5.4s
222
223        FCVTNS  v0.4s, v0.4s
224        FCVTNS  v1.4s, v1.4s
225
        # Requantize: saturating narrow to 16 bits, add the 16-bit value at
        # params[0..1] (presumably the output zero point - confirm against the
        # params struct), saturating narrow to 8 bits, then clamp between the
        # byte at params[2] (lower bound) and the byte at params[3] (upper bound).
226        LD1R    {v5.8h}, [x11], 2
227        SQXTN   v0.4h, v0.4s
228        SQXTN2  v0.8h, v1.4s
229        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
230        SQADD   v0.8h, v0.8h, v5.8h
231        LD1R    {v1.16b}, [x11], 1
232        SQXTN   v0.8b, v0.8h
233        LD1R    {v17.16b}, [x11]
234        SMAX    v0.8b, v0.8b, v1.8b
235        SUB     x11, x11, 3          // rewind params pointer
236
237        SMIN    v0.8b, v0.8b, v17.8b
238        B.LO    6f                      // fewer than 8 columns left: partial store
239
240        # Store full 1 x 8
241        ST1     {v0.8b}, [x6], x10      // c += cn_stride
242        SUB     x4, x4, x3              // a -= ks
243        B.HI    0b                      // columns remain (flags still from SUBS x1)
244        RET
245
246        # Remainder - 8 bytes of A
        # One 8-byte slice of A0 against 64 bytes of B (8 bytes per column);
        # only SMULL is needed since there is no second half to accumulate.
247        .p2align 3
2485:
249        LDR     d0, [x13], 8
250        LDP     d4, d5, [x5]
251        LDP     d6, d7, [x5, 16]
252        SMULL   v17.8h, v4.8b, v0.8b
253        SMULL   v19.8h, v5.8b, v0.8b
254        SMULL   v21.8h, v6.8b, v0.8b
255        SMULL   v23.8h, v7.8b, v0.8b
256        LDP     d4, d5, [x5, 32]
257        LDP     d6, d7, [x5, 48]
258        SADALP  v16.4s, v17.8h
259        SADALP  v18.4s, v19.8h
260        SADALP  v20.4s, v21.8h
261        SADALP  v22.4s, v23.8h
262        SMULL   v17.8h, v4.8b, v0.8b
263        SMULL   v19.8h, v5.8b, v0.8b
264        SMULL   v21.8h, v6.8b, v0.8b
265        SMULL   v23.8h, v7.8b, v0.8b
266        ADD     x5, x5, 64              // advance w past the 64 bytes of B just used
267        SADALP  v24.4s, v17.8h
268        SADALP  v26.4s, v19.8h
269        SADALP  v28.4s, v21.8h
270        SADALP  v30.4s, v23.8h
271
272        # ks loop
273        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
274        B.HI    1b                      // more A pointers to process
275        B       4b                      // all pointers done: reduce and store
276
277        # Store odd width
        # x1 = nc - 8 is negative here; its low three bits give the number of
        # columns left (1-7). Store 4, 2 and 1 bytes as those bits dictate,
        # rotating the vector with EXT so the next bytes move to lane 0.
278        .p2align 3
2796:
280        TBZ     x1, 2, 7f
281        STR     s0, [x6], 4
282        EXT     v0.16b, v0.16b, v0.16b, 4
283
2847:
285        TBZ     x1, 1, 8f
286        STR     h0, [x6], 2
287        EXT     v0.16b, v0.16b, v0.16b, 2
2888:
289        TBZ     x1, 0, 9f
290        STR     b0, [x6]
2919:
292        RET
293
294END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
295
296#ifdef __ELF__
297.section ".note.GNU-stack","",%progbits
298#endif
299