// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v28  v29 v30 v31
# C0  x6 v16 v20
# C1  x8 v17 v21
# C2  x9 v18 v22
# C3  x7 v19 v23
# zero_point v24 v25 v26 v27 v8
# unused v9 v10 v11 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

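# Computation sketch (informal, derived from the register notes above): for each
# of up to 4 rows (mr) and an 8-channel output tile,
#   acc[m][n] = bias[n] + sum_k A[m][k] * (W[k][n] - kernel_zero_point)
# v16-v23 accumulate sum_k A*W with UDOT, while v24-v27 accumulate
# kernel_zero_point * sum_k A[m][k], which is subtracted after the K loop
# before the result is requantized and clamped.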
BEGIN_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3

        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
        STR     d8, [sp, -16]!          // Save d8 on stack
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2
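        # When mr < 4, the unused row pointers alias the previous row, so the
        # extra rows read valid data and their stores just repeat that row.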

        .p2align 3
0:
        # Load initial bias from w into accumulators
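        # The first 32 bytes of packed w are eight int32 biases (one per output
        # channel); v16/v20 get channels 0-3/4-7 and are copied to every row.
        # v24-v27 start at zero and will collect the zero-point correction.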
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0

        # Are there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # Prologue - read A and B values for blocks 0 and 1
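        # The K loop is software pipelined: these loads feed the first main-loop
        # iteration. On Cortex-A55, 128-bit B vectors are loaded as two 64-bit
        # halves (LDR d / LDR into x14) and merged with INS, a pattern that
        # issues better on this core than 128-bit loads.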
        LDR     d0,  [x3], 8
        LDR     q28, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d29, [x5], 8
        LDR     x14, [x5], 8
        # Are there at least 16 bytes for the main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups of 2 blocks
        # 4 rows of 2 vectors wide = 8 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W = 2 LD64 + INS each
        # For each 4 UDOT: 1 LD64 for A, 2 LD64 for W + INS
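        # One iteration consumes 16 bytes of A per row in four 4-byte groups.
        # For each group, UDOT adds the dot product of 4 A bytes and 4 W bytes
        # into each of the 8 per-channel accumulators, while v24-v27 gather
        # kernel_zero_point * (sum of the same A bytes) for the later fixup.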

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v0.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v0.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7,  [x4], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v4.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v7.4b[0]
        LDR     d0,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v4.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v7.4b[1]
        LDR     d2, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v5.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v7.4b[1]
        LDR     d3,  [x4], 8

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        SUBS    x0, x0, 16
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v0.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v0.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7,  [x4], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v4.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v4.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        UDOT    v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        UDOT    v21.4s, v31.16b, v5.4b[1]
        UDOT    v22.4s, v31.16b, v6.4b[1]
        UDOT    v23.4s, v31.16b, v7.4b[1]

        AND     x0, x2, 15              // kc remainder 0 to 12

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        # Is there a remainder? - 4 to 12 bytes of A
        CBNZ    x0, 4f

        .p2align 3
3:
        ADDP    v0.2s, v24.2s, v25.2s
        ADDP    v1.2s, v26.2s, v27.2s
        DUP     v24.4s, v0.s[0]
        DUP     v25.4s, v0.s[1]
        DUP     v26.4s, v1.s[0]
        DUP     v27.4s, v1.s[1]
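        # ADDP/DUP above fold the two partial sums in each of v24-v27 into one
        # per-row total and broadcast it, so the subtraction below implements
        #   acc -= kernel_zero_point * sum_k A[m][k]
        # turning sum(A*W) into sum(A*(W - kernel_zero_point)).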

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
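          # Roughly: acc = srshl(sqdmulh(acc << pre_shift, multiplier), post_shift),
          # a saturating doubling multiply-high followed by a rounding shift;
          # the three LD1R loads below fetch pre_shift, multiplier and post_shift.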
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Apply params - scale, bias and clamp
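          # Roughly: acc = fcvtns(float(acc) * scale), i.e. round to nearest with
          # ties to even; the output zero point and min/max clamp are applied
          # after the narrowing below.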
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          LD1R    {v4.4s}, [x11], 4
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s

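        # Narrow int32 -> int16 with saturation, add the output zero point
        # ("bias", v6), narrow to uint8 with unsigned saturation, then clamp
        # to the output min/max loaded below.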
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    6f

        # Store full 4 x 8
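        # Write 8 uint8 results per row, step each c pointer by cn_stride, and
        # rewind the a pointers by kc so the next 8-channel tile re-reads A.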
        ST1     {v0.8b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v0.d}[1], [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v1.8b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v1.d}[1], [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b

        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

        # Remainder - 4 to 12 bytes of A
        # Although the kernel is C4, it is safe to read 16 bytes.
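        # kc was rounded up to a multiple of 4, so the remainder is 4, 8 or 12
        # bytes: bit 3 of x0 selects an 8-byte step here, bit 2 a final 4-byte
        # step at 5f.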
        .p2align 3
4:
        TBZ     x0, 3, 5f

        LDR     d0,  [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDR     q6, [x5], 16
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        LDR     q4, [x5], 16
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 3b
5:
        LDR     s0,  [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3,  [x4], 4
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       3b

        # Store odd width
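        # The nc remainder (1-7) is stored bit by bit: 4 bytes, then 2, then 1
        # per row, with EXT rotating the result vectors so the next piece lands
        # in the lanes the following store expects.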
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x8], 4
        STR     s1, [x9], 4
        ST1     {v1.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x8], 2
        STR     h1, [x9], 2
        ST1     {v1.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x8]
        STR     b1, [x9]
        ST1     {v1.b}[8], [x7]
9:
        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif