xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/4x8c4-aarch64-neondot-cortex-a55.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7
8#include <xnnpack/assembly.h>
9
10$REWIND_DECREMENT = {"RNDNU": 19, "FP32": 11}[REQUANTIZATION]
11# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55(
12#     size_t mr,                 x0
13#     size_t nc,                 x1
14#     size_t kc,                 x2 / x0
15#     size_t ks,                 x3 / x9
16#     const uint8_t**restrict a, x4
17#     const void* restrict w,    x5
18#     uint8_t* restrict c,       x6
19#     size_t cm_stride,          x7
20#     size_t cn_stride,          [sp] -> (x10)
21#     size_t a_offset,           [sp + 8] -> x8
22#     const uint8_t* zero,       [sp + 16] -> x12
23#     const union xnn_qu8_conv_minmax_params* params  [sp + 24] -> (x11)
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x13  v0  v4
29# A1  x14  v1  v5
30# A2  x15  v2  v6
31# A3  x10  v3  v7
32# B    x5 v28 v29 v30 v31
33# C0   x6 v16 v20
34# C1  x16 v17 v21
35# C2  x17 v18 v22
36# C3   x7 v19 v23
37# zero_point v8 v24 v25 v26 v27
38# unused v9 v10 v11 v12 v13 v14 v15
39
40# x11 temp for Cortex-A55 loads
41
42BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55
43
        // Structure of the code below:
        //   entry : clamp the C row pointers for mr < 4, round kc up to a
        //           multiple of 4, save d8, broadcast the zero point into v8.
        //   0:    : nc loop, one pass per 8 output columns.
        //   1:    : ks loop, one pass per group of 4 A row pointers.
        //   2: 3: : main loop and epilogue, 16 bytes of each A row per pass.
        //   5: 6: : remainder of 8 and/or 4 bytes of each A row.
        //   4:    : ks loop tail, then requantize, clamp and store 4 x 8.
        //   7: .. 10: : partial-width store of 4, 2 and 1 columns, then return.
        // v16-v19 hold columns 0-3 and v20-v23 columns 4-7 of rows 0-3.
        // v24-v27 accumulate UDOT(v8, A) for rows 0-3; that per-row sum is
        // subtracted from the accumulators once the ks loop has finished.
44        # Clamp C pointers
45        CMP     x0, 2                   // if mr < 2
46        LDR     x8, [sp, 8]             // Load a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
50        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
51        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        // From here on sp is 16 lower than at entry, so every stack argument
        // is found at its entry offset + 16.
52        STR     d8, [sp, -16]!          // Save d8 on stack
53                                        // if mr <= 2
54        CSEL    x17, x16, x17, LS       //   c2 = c1
55        BIC     x2, x2, 3               // second half of the kc round-up
56        CMP     x0, 4                   // if mr < 4
57        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point
58        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
59        CSEL    x7,  x17, x7, LO        //   c3 = c2
60
61        .p2align 3
620:
63        # Load initial bias from w into accumulators
64        LDP     q16, q20, [x5], 32
65        MOV     v17.16b, v16.16b
66        MOV     v18.16b, v16.16b
67        MOV     v19.16b, v16.16b
68        MOV     v21.16b, v20.16b
69        MOV     v22.16b, v20.16b
70        MOV     v23.16b, v20.16b
        // Clear the per-row UDOT(v8, A) sums for this group of 8 columns.
71        MOVI    v24.16b, 0
72        MOVI    v25.16b, 0
73        MOVI    v26.16b, 0
74        MOVI    v27.16b, 0
75        MOV     x9, x3                  // p = ks
76
77        .p2align 3
781:
79        # Load next 4 A pointers
80        LDP     x13, x14, [x4], 16
81        LDP     x15, x10, [x4], 16
82
        // A pointer equal to `zero` is used as is; any other pointer is
        // advanced by a_offset.
83        CMP     x13, x12                // if a0 == zero
84        ADD     x13, x13, x8            // a0 += a_offset
85        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
86        CMP     x14, x12                // if a1 == zero
87        ADD     x14, x14, x8            // a1 += a_offset
88        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
89        CMP     x15, x12                // if a2 == zero
90        ADD     x15, x15, x8            // a2 += a_offset
91        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
92        CMP     x10, x12                // if a3 == zero
93        ADD     x10, x10, x8            // a3 += a_offset
94        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
95
96        # Is there at least 16 bytes for prologue/epilogue?
97        SUBS    x0, x2, 16              // k = kc - 16
98        B.LO    5f
99
100        # prologue - read A and B values for block 0 and 1
101        LDR     d0, [x13], 8
102        LDR     q28, [x5], 16
103        LDR     d1, [x14], 8
104        LDR     d2, [x15], 8
105        LDR     d3, [x10], 8
106        SUBS    x0, x0, 16              // is there 16 for main loop?
107        LDR     d29, [x5], 8
        // x11 receives the upper 64 bits of the next B vector; the following
        // block moves it into place with INS.  This overwrites the params
        // pointer that x11 held since function entry.
108        LDR     x11, [x5], 8
109        # Is there at least 16 bytes for main loop?
110        B.LO    3f
111
112        # Main loop - 16 bytes of A in 4 groups of 2 blocks
113        # 4 row of 2 vectors wide = 8 UDOT instructions for 4 channels
114        # 4 LD64 for A
115        # 4 LD128 for W. = 2 LD64 + INS.
116        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
117
        // The first half of each pass consumes v0-v3 while loading v4-v7; the
        // second half consumes v4-v7 while reloading v0-v3 for the next pass.
        // B vectors rotate through v28-v31, each filled as LDR d + LDR x11 + INS.
118        .p2align 3
1192:
120        # BLOCK 0
121        UDOT    v16.4s,  v28.16b, v0.4b[0]
122        LDR     d30,  [x5], 8
123        UDOT    v17.4s,  v28.16b, v1.4b[0]
124        INS     v29.d[1], x11
125        UDOT    v18.4s,  v28.16b, v2.4b[0]
126        LDR     x11,  [x5], 8
127        UDOT    v19.4s,  v28.16b, v3.4b[0]
128        LDR     d4, [x13], 8
129
130        # BLOCK 1
131        UDOT    v20.4s,  v29.16b, v0.4b[0]
132        LDR     d31,  [x5], 8
133        UDOT    v21.4s,  v29.16b, v1.4b[0]
134        INS     v30.d[1], x11
135        UDOT    v22.4s,  v29.16b, v2.4b[0]
136        LDR     x11,  [x5], 8
137        UDOT    v23.4s,  v29.16b, v3.4b[0]
138        LDR     d5, [x14], 8
139
140        # BLOCK 0
141        UDOT    v16.4s, v30.16b, v0.4b[1]
142        LDR     d28,  [x5], 8
143        UDOT    v17.4s, v30.16b, v1.4b[1]
144        INS     v31.d[1], x11
145        UDOT    v18.4s, v30.16b, v2.4b[1]
146        LDR     x11,  [x5], 8
147        UDOT    v19.4s, v30.16b, v3.4b[1]
148        LDR     d6, [x15], 8
149
150        # BLOCK 1
151        UDOT    v20.4s, v31.16b, v0.4b[1]
152        LDR     d29,  [x5], 8
153        UDOT    v21.4s, v31.16b, v1.4b[1]
154        INS     v28.d[1], x11
155        UDOT    v22.4s, v31.16b, v2.4b[1]
156        LDR     x11,  [x5], 8
157        UDOT    v23.4s, v31.16b, v3.4b[1]
158        LDR     d7, [x10], 8
159
        // Zero-point sums for the 8 A bytes per row that were just consumed.
160        UDOT    v24.2s, v8.8b, v0.8b
161        UDOT    v25.2s, v8.8b, v1.8b
162        UDOT    v26.2s, v8.8b, v2.8b
163        UDOT    v27.2s, v8.8b, v3.8b
164
165        # BLOCK 0
166        UDOT    v16.4s,  v28.16b, v4.4b[0]
167        LDR     d30,  [x5], 8
168        UDOT    v17.4s,  v28.16b, v5.4b[0]
169        INS     v29.d[1], x11
170        UDOT    v18.4s,  v28.16b, v6.4b[0]
171        LDR     x11,  [x5], 8
172        UDOT    v19.4s,  v28.16b, v7.4b[0]
173        LDR     d0, [x13], 8
174
175        # BLOCK 1
176        UDOT    v20.4s,  v29.16b, v4.4b[0]
177        LDR     d31,  [x5], 8
178        UDOT    v21.4s,  v29.16b, v5.4b[0]
179        INS     v30.d[1], x11
180        UDOT    v22.4s,  v29.16b, v6.4b[0]
181        LDR     x11,  [x5], 8
182        UDOT    v23.4s,  v29.16b, v7.4b[0]
183        LDR     d1, [x14], 8
184
185        # BLOCK 0
186        UDOT    v16.4s, v30.16b, v4.4b[1]
187        LDR     d28,  [x5], 8
188        UDOT    v17.4s, v30.16b, v5.4b[1]
189        INS     v31.d[1], x11
190        UDOT    v18.4s, v30.16b, v6.4b[1]
191        LDR     x11,  [x5], 8
192        UDOT    v19.4s, v30.16b, v7.4b[1]
193        LDR     d2, [x15], 8
194
195        # BLOCK 1
196        UDOT    v20.4s, v31.16b, v4.4b[1]
197        LDR     d29,  [x5], 8
198        UDOT    v21.4s, v31.16b, v5.4b[1]
199        INS     v28.d[1], x11
200        UDOT    v22.4s, v31.16b, v6.4b[1]
201        LDR     x11,  [x5], 8
202        UDOT    v23.4s, v31.16b, v7.4b[1]
203        LDR     d3, [x10], 8
204
205        UDOT    v24.2s, v8.8b, v4.8b
206        UDOT    v25.2s, v8.8b, v5.8b
207        SUBS    x0, x0, 16
208        UDOT    v26.2s, v8.8b, v6.8b
209        UDOT    v27.2s, v8.8b, v7.8b
210
211        B.HS    2b
212
213        # Epilogue.  Same as main loop but no preloads in final group
2143:
215        # BLOCK 0
216        UDOT    v16.4s,  v28.16b, v0.4b[0]
217        LDR     d30,  [x5], 8
218        UDOT    v17.4s,  v28.16b, v1.4b[0]
219        INS     v29.d[1], x11
220        UDOT    v18.4s,  v28.16b, v2.4b[0]
221        LDR     x11,  [x5], 8
222        UDOT    v19.4s,  v28.16b, v3.4b[0]
223        LDR     d4, [x13], 8
224
225        # BLOCK 1
226        UDOT    v20.4s,  v29.16b, v0.4b[0]
227        LDR     d31,  [x5], 8
228        UDOT    v21.4s,  v29.16b, v1.4b[0]
229        INS     v30.d[1], x11
230        UDOT    v22.4s,  v29.16b, v2.4b[0]
231        LDR     x11,  [x5], 8
232        UDOT    v23.4s,  v29.16b, v3.4b[0]
233        LDR     d5, [x14], 8
234
235        # BLOCK 0
236        UDOT    v16.4s, v30.16b, v0.4b[1]
237        LDR     d28,  [x5], 8
238        UDOT    v17.4s, v30.16b, v1.4b[1]
239        INS     v31.d[1], x11
240        UDOT    v18.4s, v30.16b, v2.4b[1]
241        LDR     x11,  [x5], 8
242        UDOT    v19.4s, v30.16b, v3.4b[1]
243        LDR     d6, [x15], 8
244
245        # BLOCK 1
246        UDOT    v20.4s, v31.16b, v0.4b[1]
247        LDR     d29,  [x5], 8
248        UDOT    v21.4s, v31.16b, v1.4b[1]
249        INS     v28.d[1], x11
250        UDOT    v22.4s, v31.16b, v2.4b[1]
251        LDR     x11,  [x5], 8
252        UDOT    v23.4s, v31.16b, v3.4b[1]
253        LDR     d7, [x10], 8
254
255        UDOT    v24.2s, v8.8b, v0.8b
256        UDOT    v25.2s, v8.8b, v1.8b
257        UDOT    v26.2s, v8.8b, v2.8b
258        UDOT    v27.2s, v8.8b, v3.8b
259
        // Second half: v4-v7 are consumed, but no further A loads are issued
        // and only the B vectors v30 and v31 still needed are fetched.
260        # BLOCK 0
261        UDOT    v16.4s,  v28.16b, v4.4b[0]
262        LDR     d30,  [x5], 8
263        UDOT    v17.4s,  v28.16b, v5.4b[0]
264        INS     v29.d[1], x11
265        UDOT    v18.4s,  v28.16b, v6.4b[0]
266        LDR     x11,  [x5], 8
267        UDOT    v19.4s,  v28.16b, v7.4b[0]
268
269        # BLOCK 1
270        UDOT    v20.4s,  v29.16b, v4.4b[0]
271        LDR     d31,  [x5], 8
272        UDOT    v21.4s,  v29.16b, v5.4b[0]
273        INS     v30.d[1], x11
274        UDOT    v22.4s,  v29.16b, v6.4b[0]
275        LDR     x11,  [x5], 8
276        UDOT    v23.4s,  v29.16b, v7.4b[0]
277
278        # BLOCK 0
279        UDOT    v16.4s, v30.16b, v4.4b[1]
280        UDOT    v17.4s, v30.16b, v5.4b[1]
281        INS     v31.d[1], x11
282        UDOT    v18.4s, v30.16b, v6.4b[1]
283        UDOT    v19.4s, v30.16b, v7.4b[1]
284
285        # BLOCK 1
286        UDOT    v20.4s, v31.16b, v4.4b[1]
287        UDOT    v21.4s, v31.16b, v5.4b[1]
288        UDOT    v22.4s, v31.16b, v6.4b[1]
289        UDOT    v23.4s, v31.16b, v7.4b[1]
290
291        AND     x0, x2, 15              // kc remainder 0 to 12
292
293        UDOT    v24.2s, v8.8b, v4.8b
294        UDOT    v25.2s, v8.8b, v5.8b
295        UDOT    v26.2s, v8.8b, v6.8b
296        UDOT    v27.2s, v8.8b, v7.8b
297
298        # Is there a remainder?- 4 to 12 bytes of A
299        CBNZ    x0, 5f
300
301        .p2align 3
3024:
303        # ks loop
304        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
305        B.HI    1b
306
        // Fold the two 32-bit lanes of each row's UDOT(v8, A) sum into one
        // value per row, then broadcast it so it can be subtracted from all
        // 8 columns of that row.
307        ADDP    v0.2s, v24.2s, v25.2s
308        ADDP    v1.2s, v26.2s, v27.2s
        // x11 was used as a load temp above; the params argument is at
        // entry [sp + 24], which is [sp + 40] after the d8 push.
309        LDR     x11, [sp, 40]           // reload params pointer
310        DUP     v24.4s, v0.s[0]
311        DUP     v25.4s, v0.s[1]
312        DUP     v26.4s, v1.s[0]
313        DUP     v27.4s, v1.s[1]
314        ADD     x11, x11, 4             // skip kernel_zero_point, already in v8
315
316        # Subtract zero point from accumulators
317        SUB     v16.4s, v16.4s, v24.4s
318        SUB     v17.4s, v17.4s, v25.4s
319        SUB     v18.4s, v18.4s, v26.4s
320        SUB     v19.4s, v19.4s, v27.4s
321        SUB     v20.4s, v20.4s, v24.4s
322        SUB     v21.4s, v21.4s, v25.4s
323        SUB     v22.4s, v22.4s, v26.4s
324        SUB     v23.4s, v23.4s, v27.4s
325
326        $if REQUANTIZATION == "RNDNU":
327          # Apply params - preshift, scale, postshift, bias and clamp
328          LD1R    {v4.4s}, [x11], 4
329          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
330          SSHL    v17.4s, v17.4s, v4.4s
331          SSHL    v18.4s, v18.4s, v4.4s
332          SSHL    v19.4s, v19.4s, v4.4s
333          LD1R    {v5.4s}, [x11], 4
334          SSHL    v20.4s, v20.4s, v4.4s
335          SSHL    v21.4s, v21.4s, v4.4s
336          SSHL    v22.4s, v22.4s, v4.4s
337          SSHL    v23.4s, v23.4s, v4.4s
338          LD1R    {v6.4s}, [x11], 4
339          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
340          SQDMULH v17.4s, v17.4s, v5.4s
341          SQDMULH v18.4s, v18.4s, v5.4s
342          SQDMULH v19.4s, v19.4s, v5.4s
343          SQDMULH v20.4s, v20.4s, v5.4s
344          SQDMULH v21.4s, v21.4s, v5.4s
345          SQDMULH v22.4s, v22.4s, v5.4s
346          SQDMULH v23.4s, v23.4s, v5.4s
347          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
348          SRSHL   v17.4s, v17.4s, v6.4s
349          SRSHL   v18.4s, v18.4s, v6.4s
350          SRSHL   v19.4s, v19.4s, v6.4s
351          SRSHL   v20.4s, v20.4s, v6.4s
352          SRSHL   v21.4s, v21.4s, v6.4s
353          SRSHL   v22.4s, v22.4s, v6.4s
354          SRSHL   v23.4s, v23.4s, v6.4s
355        $elif REQUANTIZATION == "FP32":
356          # Apply params - scale, bias and clamp
357          SCVTF   v16.4s, v16.4s
358          SCVTF   v17.4s, v17.4s
359          LD1R    {v4.4s}, [x11], 4
360          SCVTF   v18.4s, v18.4s
361          SCVTF   v19.4s, v19.4s
362          SCVTF   v20.4s, v20.4s
363          SCVTF   v21.4s, v21.4s
364          SCVTF   v22.4s, v22.4s
365          SCVTF   v23.4s, v23.4s
366
367          FMUL    v16.4s, v16.4s, v4.4s
368          FMUL    v17.4s, v17.4s, v4.4s
369          FMUL    v18.4s, v18.4s, v4.4s
370          FMUL    v19.4s, v19.4s, v4.4s
371          FMUL    v20.4s, v20.4s, v4.4s
372          FMUL    v21.4s, v21.4s, v4.4s
373          FMUL    v22.4s, v22.4s, v4.4s
374          FMUL    v23.4s, v23.4s, v4.4s
375
376          FCVTNS  v16.4s, v16.4s
377          FCVTNS  v17.4s, v17.4s
378          FCVTNS  v18.4s, v18.4s
379          FCVTNS  v19.4s, v19.4s
380          FCVTNS  v20.4s, v20.4s
381          FCVTNS  v21.4s, v21.4s
382          FCVTNS  v22.4s, v22.4s
383          FCVTNS  v23.4s, v23.4s
384
        // Narrow 32 -> 16 bits with signed saturation; each row's 8 columns
        // end up in a single 8h register (v16-v19).
385        SQXTN   v16.4h, v16.4s
386        SQXTN   v17.4h, v17.4s
387        SQXTN   v18.4h, v18.4s
388        SQXTN   v19.4h, v19.4s
389        LD1R    {v6.8h}, [x11], 2        // add bias
390
391        SQXTN2  v16.8h, v20.4s
392        SQXTN2  v17.8h, v21.4s
393        SQXTN2  v18.8h, v22.4s
394        SQXTN2  v19.8h, v23.4s
395
396        SQADD   v16.8h, v16.8h, v6.8h
397        SQADD   v17.8h, v17.8h, v6.8h
        // cn_stride is the argument at entry [sp], i.e. [sp + 16] after the
        // d8 push.  x10 (the A3 pointer) is free to reuse at this point.
398        LDR     x10, [sp, 16]            // Load cn_stride
399        SQADD   v18.8h, v18.8h, v6.8h
400        SQADD   v19.8h, v19.8h, v6.8h
401        LD1R    {v4.16b}, [x11], 1       // clamp min value
402
        // Narrow 16 -> 8 bits with unsigned saturation.
        // v0 = row 0 (low half) and row 1 (high half), v1 = rows 2 and 3.
403        SQXTUN  v0.8b, v16.8h
404        SQXTUN  v1.8b, v18.8h
405        LD1R    {v5.16b}, [x11]          // clamp max value
406        SQXTUN2 v0.16b, v17.8h
407        SQXTUN2 v1.16b, v19.8h
408
409        UMAX    v0.16b, v0.16b, v4.16b
410        UMAX    v1.16b, v1.16b, v4.16b
411        SUBS    x1, x1, 8
412        UMIN    v0.16b, v0.16b, v5.16b
413        UMIN    v1.16b, v1.16b, v5.16b
414        B.LO    7f
415
416        # Store full 4 x 8
417        ST1     {v1.d}[1],  [x7], x10
418        ST1     {v1.8b}, [x17], x10
419        ST1     {v0.d}[1], [x16], x10
420        ST1     {v0.8b},  [x6], x10
421        SUB     x4, x4, x3              // a -= ks
422
        // The flags tested here are still those of SUBS x1 above: the ST1 and
        // SUB instructions in between do not write the condition flags.
423        # nc loop
424        B.HI    0b
425
426        # Restore d8 from stack
427        LDR     d8, [sp], 16
428        RET
429
        // On entry x0 is either kc - 16 (taken from the B.LO after label 1) or
        // kc & 15 (taken from the CBNZ after the epilogue).  Bits 3 and 2 are
        // the same as those of kc in both cases, and kc is a multiple of 4.
430        # Remainder- 4 to 12 bytes of A
431        .p2align 3
4325:
433        TBZ     x0, 3, 6f               // no 8-byte group: go to the 4-byte tail
434
435        LDR     d0, [x13], 8
436        LDR     q4, [x5], 16
437        LDR     d1, [x14], 8
438        LDR     d2, [x15], 8
439        LDR     d3, [x10], 8
440        LDR     q5, [x5], 16
441        UDOT    v24.2s, v8.8b, v0.8b
442        UDOT    v25.2s, v8.8b, v1.8b
443        UDOT    v26.2s, v8.8b, v2.8b
444        UDOT    v27.2s, v8.8b, v3.8b
445        UDOT    v16.4s, v4.16b, v0.4b[0]
446        UDOT    v17.4s, v4.16b, v1.4b[0]
447        UDOT    v18.4s, v4.16b, v2.4b[0]
448        UDOT    v19.4s, v4.16b, v3.4b[0]
449        LDR     q6, [x5], 16
450        UDOT    v20.4s, v5.16b, v0.4b[0]
451        UDOT    v21.4s, v5.16b, v1.4b[0]
452        UDOT    v22.4s, v5.16b, v2.4b[0]
453        UDOT    v23.4s, v5.16b, v3.4b[0]
454        LDR     q4, [x5], 16
455        UDOT    v16.4s, v6.16b, v0.4b[1]
456        UDOT    v17.4s, v6.16b, v1.4b[1]
457        UDOT    v18.4s, v6.16b, v2.4b[1]
458        UDOT    v19.4s, v6.16b, v3.4b[1]
459        UDOT    v20.4s, v4.16b, v0.4b[1]
460        UDOT    v21.4s, v4.16b, v1.4b[1]
461        UDOT    v22.4s, v4.16b, v2.4b[1]
462        UDOT    v23.4s, v4.16b, v3.4b[1]
463        TBZ     x0, 2, 4b               // no 4-byte tail: back to the ks loop
4646:
        // LDR s<n> clears the rest of the vector register, so the upper lane
        // of the .2s zero-point UDOTs below accumulates 0 for these 4 bytes.
465        LDR     s0, [x13], 4
466        LDR     q4, [x5], 16
467        LDR     s1, [x14], 4
468        LDR     s2, [x15], 4
469        LDR     s3, [x10], 4
470        LDR     q5, [x5], 16
471        UDOT    v24.2s, v8.8b, v0.8b
472        UDOT    v25.2s, v8.8b, v1.8b
473        UDOT    v26.2s, v8.8b, v2.8b
474        UDOT    v27.2s, v8.8b, v3.8b
475        UDOT    v16.4s, v4.16b, v0.4b[0]
476        UDOT    v17.4s, v4.16b, v1.4b[0]
477        UDOT    v18.4s, v4.16b, v2.4b[0]
478        UDOT    v19.4s, v4.16b, v3.4b[0]
479        UDOT    v20.4s, v5.16b, v0.4b[0]
480        UDOT    v21.4s, v5.16b, v1.4b[0]
481        UDOT    v22.4s, v5.16b, v2.4b[0]
482        UDOT    v23.4s, v5.16b, v3.4b[0]
483        B       4b
484
        // x1 is negative here (nc - 8); its bits 2, 1 and 0 select a store of
        // 4, 2 and 1 columns.  After each partial store EXT rotates v0 and v1
        // so the next columns of every row sit at byte 0 and byte 8 again.
485        # Store odd width
486        .p2align 3
4877:
488        TBZ     x1, 2, 8f
489        ST1     {v1.s}[2], [x7], 4
490        STR     s1, [x17], 4
491        ST1     {v0.s}[2], [x16], 4
492        STR     s0, [x6], 4
493        EXT     v0.16b, v0.16b, v0.16b, 4
494        EXT     v1.16b, v1.16b, v1.16b, 4
4958:
496        TBZ     x1, 1, 9f
497        ST1     {v1.h}[4], [x7], 2
498        STR     h1, [x17], 2
499        ST1     {v0.h}[4], [x16], 2
500        STR     h0, [x6], 2
501        EXT     v0.16b, v0.16b, v0.16b, 2
502        EXT     v1.16b, v1.16b, v1.16b, 2
5039:
504        TBZ     x1, 0, 10f
505        ST1     {v1.b}[8], [x7]
506        STR     b1, [x17]
507        ST1     {v0.b}[8], [x16]
508        STR     b0, [x6]
50910:
510        # Restore d8 from stack
511        LDR     d8, [sp], 16
512        RET
513
514END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55
515
516#ifdef __ELF__
517.section ".note.GNU-stack","",%progbits
518#endif
519