// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union ${PARAMS_UNION} params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# B   x5  v4  v5  v2  v3
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# temp0  v17 v19 v21 v23
# x16, x17, x7 temporary a53 gpr load data
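# Note: B data (and A0 in the main loop) is loaded into GPRs and moved into the
# vector registers with INS so that the loads can dual-issue with the NEON
# multiplies on Cortex-A53.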


BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53

        # Clamp C pointers
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        BIC     x2, x2, 7

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     s16, s18, [x5], 8
        LDP     s20, s22, [x5], 8
        LDP     s24, s26, [x5], 8
        LDP     s28, s30, [x5], 8
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next A pointer
        LDR     x13, [x4], 8
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue: load A0 and 4 B's
        LDP     d0, d6, [x13], 16       // Read A0
        LDP     d4, d5, [x5]            // Read B
        LDP     d2, d3, [x5, 64]        // Read B
        LDR     x16, [x5, 16]           // Read B

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 2 mul/mla/adalp + 2 load = 10 cycles.
        # 1 load for A0 = +1 cycle.  Total 41 cycles.

        .p2align 3
2:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v18.4s, v19.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 512]
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 128]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 15 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x7, [x13], 8            // Read A0
        SMULL   v23.8h, v5.8b, v0.8b
        LDR     x17, [x5, 192]          // Read B
        SADALP  v24.4s, v17.8h
        SUBS    x0, x0, 16
        SADALP  v26.4s, v19.8h
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 144]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x7
        LDR     d3, [x5, 200]           // Read B
        INS     v2.d[0], x17
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
3:
        # BLOCK 0 - 6 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 80]
        SMULL   v19.8h, v5.8b, v0.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        LDR     x16, [x5, 32]
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 88]
        INS     v2.d[0], x17

        # BLOCK 1 - 10 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        LDR     x17, [x5, 96]
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v21.8h, v2.8b, v6.8b
        LDR     x16, [x5, 48]
        SMLAL   v23.8h, v3.8b, v6.8b
        LDR     d3, [x5, 104]
        INS     v2.d[0], x17

        # BLOCK 2 - 10 cycles
        SMULL   v17.8h, v4.8b, v0.8b
        LDR     x17, [x5, 112]
        SMULL   v19.8h, v5.8b, v0.8b
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v17.8h, v2.8b, v6.8b
        SMLAL   v19.8h, v3.8b, v6.8b
        LDR     d3, [x5, 120]
        INS     v2.d[0], x17

        # BLOCK 3 - 12 cycles
        SMULL   v21.8h, v4.8b, v0.8b
        SMULL   v23.8h, v5.8b, v0.8b
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SMLAL   v21.8h, v2.8b, v6.8b
        SMLAL   v23.8h, v3.8b, v6.8b
        SADALP  v28.4s, v21.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v23.8h

        # Is there a remainder? - 8 bytes of A
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        $if REQUANTIZATION == "RNDNU":
          LD1R    {v4.4s}, [x11], 4
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        $if REQUANTIZATION == "RNDNU":
          LD1R    {v7.4s}, [x11], 4
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
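        # v0 now holds the int32 accumulators for output channels 0-3, v1 for channels 4-7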

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
          SQSHL   v1.4s, v1.4s, v4.4s
          SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
          SQDMULH v1.4s, v1.4s, v7.4s
          SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
          SRSHL   v1.4s, v1.4s, v5.4s
        $elif REQUANTIZATION == "FP32":
          $if not CHANNELWISE:
            # Apply params - scale, bias and clamp
            SCVTF   v0.4s, v0.4s
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v1.4s, v1.4s
            FMUL    v0.4s, v0.4s, v4.4s
            FMUL    v1.4s, v1.4s, v4.4s
          $else:
            # Load per channel scale values from weights
            SCVTF   v0.4s, v0.4s
            LDR     q4, [x5], 16
            SCVTF   v1.4s, v1.4s
            LDR     q5, [x5], 16
            FMUL    v0.4s, v0.4s, v4.4s
            FMUL    v1.4s, v1.4s, v5.4s

          FCVTNS  v0.4s, v0.4s
          FCVTNS  v1.4s, v1.4s

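        # Add the output zero point, saturate to int8 and clamp to the output min/max from params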
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN2  v0.8h, v1.4s
        SUBS    x1, x1, 8
        SQADD   v0.8h, v0.8h, v5.8h
        LD1R    {v1.16b}, [x11], 1
        SQXTN   v0.8b, v0.8h
        LD1R    {v17.16b}, [x11]
        SMAX    v0.8b, v0.8b, v1.8b
        SUB     x11, x11, ${REWIND_DECREMENT}          // rewind params pointer

        SMIN    v0.8b, v0.8b, v17.8b
        B.LO    6f

        # Store full 1 x 8
        ST1     {v0.8b}, [x6], x10
        SUB     x4, x4, x3              // a -= ks
        B.HI    0b
        RET

        # Remainder - 8 bytes of A
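        # Each output column uses a single SMULL into a temp accumulated with SADALP (no SMLAL pairing for the final 8 bytes)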
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDP     d6, d7, [x5, 16]
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        LDP     d4, d5, [x5, 32]
        LDP     d6, d7, [x5, 48]
        SADALP  v16.4s, v17.8h
        SADALP  v18.4s, v19.8h
        SADALP  v20.4s, v21.8h
        SADALP  v22.4s, v23.8h
        SMULL   v17.8h, v4.8b, v0.8b
        SMULL   v19.8h, v5.8b, v0.8b
        SMULL   v21.8h, v6.8b, v0.8b
        SMULL   v23.8h, v7.8b, v0.8b
        ADD     x5, x5, 64
        SADALP  v24.4s, v17.8h
        SADALP  v26.4s, v19.8h
        SADALP  v28.4s, v21.8h
        SADALP  v30.4s, v23.8h

        # ks loop
        SUBS    x9, x9, 8               // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
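        # Write the nc remainder as 4, 2 and 1 byte pieces, rotating v0 down with EXT after each partial store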
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8c8__aarch64_neon_mlal${"_prfm" if PREFETCH else ""}_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
