xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          (x7)
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# B   x5  v4  v5  v2  v3
32# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
33# temp0  v17 v19 v21 v23
34# x16, x17, x7 temporary a53 gpr load data
35
36
37BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
        # Overview of the code below:
        #   0: per group of 8 output columns - load 8 int32 biases from w.
        #   1: per A pointer (ks loop)       - fetch a0, apply a_offset unless a0 == zero.
        #   2/3/5: k loop                    - 16 bytes of A per step (2 = pipelined main
        #                                      loop, 3 = final 16-byte step), 5 = 8-byte remainder.
        #   4: reduce, requantize through fp32, clamp and store 8 (or fewer, at 6) bytes.
        # Products are formed as int8 x int8 -> int16 (SMULL, then SMLAL for the second
        # 8 bytes of k) and pairwise-accumulated into int32 lanes with SADALP.
38
39        # Load stack arguments and round kc up to a multiple of 8
40        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
41        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
42        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
43        BIC     x2, x2, 7
44
45        .p2align 3
460:
47        # Load initial bias from w into accumulators: 8 x 32-bit values, one per
        # output column, each written to lane 0 of its own accumulator register.
48        LDP     s16, s18, [x5], 8
49        LDP     s20, s22, [x5], 8
50        LDP     s24, s26, [x5], 8
51        LDP     s28, s30, [x5], 8
52        MOV     x9, x3                  // p = ks
53
54        .p2align 3
551:
56        # Load next A pointer
57        LDR     x13, [x4], 8
58        CMP     x13, x12                // if a0 == zero
59        ADD     x13, x13, x8            // a0 += a_offset
60        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
61
62        # Is there at least 16 bytes for epilogue?  If not (kc == 8 after
        # rounding), only the 8-byte remainder at 5 is executed.
63        SUBS    x0, x2, 16              // k = kc - 16
64        B.LO    5f
65
66        # Prologue: load 16 bytes of A0 and the first 5 B vectors
        # (4 directly into NEON registers, 1 into x16 for a later INS)
67        LDP     d0, d6, [x13], 16       // Read A0
68        LDP     d4, d5, [x5]            // Read B
69        LDP     d2, d3, [x5, 64]        // Read B
70        LDR     x16, [x5, 16]           // Read B
71
72        # Is there at least 16 bytes for main loop?
73        SUBS    x0, x0, 16              // k = k - 16
74        B.LO    3f
75
76        # Main loop - 16 bytes of A
77        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
78        # 1 load for A0 = +1 cycle.  Total 41 cycles.
        # Each block handles 2 output columns: v4/v5 hold the first 8 bytes of k
        # for the pair, v2/v3 the second 8 bytes (64 bytes further into w).
        # v4 and v2 are refilled through x16/x17 + INS, v5 and v3 by direct LDR d.
79
80        .p2align 3
812:
82        # BLOCK 0 - 6 cycles
83        SMULL   v17.8h, v4.8b, v0.8b
84        LDR     x17, [x5, 80]
85        SMULL   v19.8h, v5.8b, v0.8b
86        LDR     d5, [x5, 24]
87        INS     v4.d[0], x16
88        SMLAL   v17.8h, v2.8b, v6.8b
89        LDR     x16, [x5, 32]
90        SMLAL   v19.8h, v3.8b, v6.8b
91        LDR     d3, [x5, 88]
92        INS     v2.d[0], x17
93
94        # BLOCK 1 - 10 cycles
95        SMULL   v21.8h, v4.8b, v0.8b
96        LDR     x17, [x5, 96]
97        SMULL   v23.8h, v5.8b, v0.8b
98        SADALP  v16.4s, v17.8h
99        SADALP  v18.4s, v19.8h
100        LDR     d5, [x5, 40]
101        INS     v4.d[0], x16
102        SMLAL   v21.8h, v2.8b, v6.8b
103        LDR     x16, [x5, 48]
104        SMLAL   v23.8h, v3.8b, v6.8b
105        LDR     d3, [x5, 104]
106        INS     v2.d[0], x17
107
108        # BLOCK 2 - 10 cycles
109        SMULL   v17.8h, v4.8b, v0.8b
110        LDR     x17, [x5, 112]
111        SMULL   v19.8h, v5.8b, v0.8b
112        SADALP  v20.4s, v21.8h
113        SADALP  v22.4s, v23.8h
114        LDR     d5, [x5, 56]
115        INS     v4.d[0], x16
116        SMLAL   v17.8h, v2.8b, v6.8b
117        LDR     x16, [x5, 128]          // Read B for the next 16-byte step
118        SMLAL   v19.8h, v3.8b, v6.8b
119        LDR     d3, [x5, 120]
120        INS     v2.d[0], x17
121
122        # BLOCK 3 - 15 cycles
        # Also reloads A0 and the first B vectors for the next iteration; the
        # offsets 128..200 correspond to 0..72 once x5 has been advanced by 128.
123        SMULL   v21.8h, v4.8b, v0.8b
124        LDR     x7, [x13], 8            // Read A0
125        SMULL   v23.8h, v5.8b, v0.8b
126        LDR     x17, [x5, 192]          // Read B
127        SADALP  v24.4s, v17.8h
128        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
129        SADALP  v26.4s, v19.8h
130        LDR     d5, [x5, 136]           // Read B
131        INS     v4.d[0], x16
132        SMLAL   v21.8h, v2.8b, v6.8b
133        LDR     x16, [x5, 144]
134        SMLAL   v23.8h, v3.8b, v6.8b
135        LDR     d6, [x13], 8            // Read A0
136        INS     v0.d[0], x7
137        LDR     d3, [x5, 200]           // Read B
138        INS     v2.d[0], x17
139        SADALP  v28.4s, v21.8h
140        ADD     x5, x5, 128
141        SADALP  v30.4s, v23.8h
142        B.HS    2b
143
144        # Epilogue
145        # Same as main loop except no loads at end of loop
        # (nothing is read from A, and nothing from w at offset 128 or beyond)
146
147        .p2align 3
1483:
149        # BLOCK 0 - 6 cycles
150        SMULL   v17.8h, v4.8b, v0.8b
151        LDR     x17, [x5, 80]
152        SMULL   v19.8h, v5.8b, v0.8b
153        LDR     d5, [x5, 24]
154        INS     v4.d[0], x16
155        SMLAL   v17.8h, v2.8b, v6.8b
156        LDR     x16, [x5, 32]
157        SMLAL   v19.8h, v3.8b, v6.8b
158        LDR     d3, [x5, 88]
159        INS     v2.d[0], x17
160
161        # BLOCK 1 - 10 cycles
162        SMULL   v21.8h, v4.8b, v0.8b
163        LDR     x17, [x5, 96]
164        SMULL   v23.8h, v5.8b, v0.8b
165        SADALP  v16.4s, v17.8h
166        SADALP  v18.4s, v19.8h
167        LDR     d5, [x5, 40]
168        INS     v4.d[0], x16
169        SMLAL   v21.8h, v2.8b, v6.8b
170        LDR     x16, [x5, 48]
171        SMLAL   v23.8h, v3.8b, v6.8b
172        LDR     d3, [x5, 104]
173        INS     v2.d[0], x17
174
175        # BLOCK 2 - 10 cycles
176        SMULL   v17.8h, v4.8b, v0.8b
177        LDR     x17, [x5, 112]
178        SMULL   v19.8h, v5.8b, v0.8b
179        SADALP  v20.4s, v21.8h
180        SADALP  v22.4s, v23.8h
181        LDR     d5, [x5, 56]
182        INS     v4.d[0], x16
183        SMLAL   v17.8h, v2.8b, v6.8b
184        SMLAL   v19.8h, v3.8b, v6.8b
185        LDR     d3, [x5, 120]
186        INS     v2.d[0], x17
187
188        # BLOCK 3 - 12 cycles
189        SMULL   v21.8h, v4.8b, v0.8b
190        SMULL   v23.8h, v5.8b, v0.8b
191        SADALP  v24.4s, v17.8h
192        SADALP  v26.4s, v19.8h
193        SMLAL   v21.8h, v2.8b, v6.8b
194        SMLAL   v23.8h, v3.8b, v6.8b
195        SADALP  v28.4s, v21.8h
196        ADD     x5, x5, 128
197        SADALP  v30.4s, v23.8h
198
199        # Is there a remainder?- 8 bytes of A
        # kc is a multiple of 8 and only multiples of 16 were subtracted from it,
        # so bit 3 of k is set exactly when 8 bytes of k are still unprocessed.
200        TBNZ    x0, 3, 5f
201
202        # ks loop
203        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
204        B.HI    1b
205
2064:
207        # Add columns: fold the 4 partial sums of each accumulator into a single
        # int32 per output column (v0 = columns 0-3, v1 = columns 4-7)
208        ADDP    v16.4s, v16.4s, v18.4s
209        ADDP    v20.4s, v20.4s, v22.4s
210        ADDP    v24.4s, v24.4s, v26.4s
211        ADDP    v28.4s, v28.4s, v30.4s
212        ADDP    v0.4s, v16.4s, v20.4s
213        ADDP    v1.4s, v24.4s, v28.4s
214
215        # Apply params - scale, bias and clamp
        # params is read as: 32-bit fp scale, 16-bit value added after narrowing
        # to int16 (presumably the output zero point - confirm against the params
        # struct), 8-bit lower bound, 8-bit upper bound.
216        SCVTF   v0.4s, v0.4s
217        LD1R    {v4.4s}, [x11], 4
218        SCVTF   v1.4s, v1.4s
219        FMUL    v0.4s, v0.4s, v4.4s
220        FMUL    v1.4s, v1.4s, v4.4s
221
222        FCVTNS  v0.4s, v0.4s            // fp32 -> int32, round to nearest, ties to even
223        FCVTNS  v1.4s, v1.4s
224
225        LD1R    {v5.8h}, [x11], 2
226        SQXTN   v0.4h, v0.4s
227        SQXTN2  v0.8h, v1.4s
228        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
229        SQADD   v0.8h, v0.8h, v5.8h
230        LD1R    {v1.16b}, [x11], 1
231        SQXTN   v0.8b, v0.8h
232        LD1R    {v17.16b}, [x11]
233        SMAX    v0.8b, v0.8b, v1.8b
234        SUB     x11, x11, 7          // rewind params pointer (undo the 4 + 2 + 1 byte post-increments)
235
236        SMIN    v0.8b, v0.8b, v17.8b
237        B.LO    6f                      // fewer than 8 columns left: partial store
238
239        # Store full 1 x 8
240        ST1     {v0.8b}, [x6], x10      // c += cn_stride
241        SUB     x4, x4, x3              // a -= ks
242        B.HI    0b                      // loop while columns remain (nc was > 8)
243        RET
244
245        # Remainder - 8 bytes of A
        # Multiplies one 8-byte A vector against 64 bytes of w (8 columns x 8 bytes),
        # using SMULL only since there is no second half of k to SMLAL.
        # Also the whole k loop when kc == 8.
246        .p2align 3
2475:
248        LDR     d0, [x13], 8
249        LDP     d4, d5, [x5]
250        LDP     d6, d7, [x5, 16]
251        SMULL   v17.8h, v4.8b, v0.8b
252        SMULL   v19.8h, v5.8b, v0.8b
253        SMULL   v21.8h, v6.8b, v0.8b
254        SMULL   v23.8h, v7.8b, v0.8b
255        LDP     d4, d5, [x5, 32]
256        LDP     d6, d7, [x5, 48]
257        SADALP  v16.4s, v17.8h
258        SADALP  v18.4s, v19.8h
259        SADALP  v20.4s, v21.8h
260        SADALP  v22.4s, v23.8h
261        SMULL   v17.8h, v4.8b, v0.8b
262        SMULL   v19.8h, v5.8b, v0.8b
263        SMULL   v21.8h, v6.8b, v0.8b
264        SMULL   v23.8h, v7.8b, v0.8b
265        ADD     x5, x5, 64
266        SADALP  v24.4s, v17.8h
267        SADALP  v26.4s, v19.8h
268        SADALP  v28.4s, v21.8h
269        SADALP  v30.4s, v23.8h
270
271        # ks loop
272        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
273        B.HI    1b
274        B       4b
275
276        # Store odd width
        # x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        # select a 4-, 2- and 1-byte store. EXT rotates the stored bytes out of
        # the low end of v0 before the next store.
277        .p2align 3
2786:
279        TBZ     x1, 2, 7f
280        STR     s0, [x6], 4
281        EXT     v0.16b, v0.16b, v0.16b, 4
282
2837:
284        TBZ     x1, 1, 8f
285        STR     h0, [x6], 2
286        EXT     v0.16b, v0.16b, v0.16b, 2
2878:
288        TBZ     x1, 0, 9f
289        STR     b0, [x6]
2909:
291        RET
292
293END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53
294
295#ifdef __ELF__
296.section ".note.GNU-stack","",%progbits
297#endif
298