xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/4x8c4-aarch64-neondot-ld128.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7
8#include <xnnpack/assembly.h>
9
// REWIND_DECREMENT is the number of bytes x11 (params) advances during one
// requantization pass, so that subtracting it restores x11 for the next nc
// iteration.  RNDNU: three 4-byte loads + 2-byte bias + 1-byte min = 15.
// FP32: one 4-byte scale + 2-byte bias + 1-byte min = 7.  The final max load
// does not post-increment.
10$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
11# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128(
12#     size_t mr,                 x0
13#     size_t nc,                 x1
14#     size_t kc,                 x2 / x0
15#     size_t ks,                 x3 / x9
16#     const uint8_t**restrict a, x4
17#     const void* restrict w,    x5
18#     uint8_t* restrict c,       x6
19#     size_t cm_stride,          x7
20#     size_t cn_stride,          [sp] -> x0
21#     size_t a_offset,           [sp + 8] -> x8
22#     const uint8_t* zero,       [sp + 16] -> x12
23#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x13  v0
29# A1  x14  v1
30# A2  x15  v2
31# A3  x10  v3
32# B    x5  v4  v5  v6
33# C0   x6 v16 v20
34# C1  x16 v17 v21
35# C2  x17 v18 v22
36# C3   x7 v19 v23
37# zero_point v7 v24 v25 v26 v27
38# unused  v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31
39
40BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128
41
        # Produces one 4 row x 8 column tile of 8-bit outputs per nc iteration.
        # Accumulators v16-v23 start from the bias read from w, then gather UDOT
        # products of the A rows with the weights.  v24-v27 gather the dot product
        # of each A row with the kernel zero point, which is subtracted from the
        # accumulators before requantization, clamping and the store.
        # Note that x0 is reused: mr on entry, then the k counter, then cn_stride.

42        # Clamp C pointers
43        CMP     x0, 2                   // if mr < 2
44        LDR     x8, [sp, 8]             // Load a_offset
45        ADD     x16, x6, x7             // c1 = c0 + cm_stride
46        CSEL    x16, x6,  x16, LO       //   c1 = c0
47        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
48
49        ADD     x17, x16, x7            // c2 = c1 + cm_stride
50        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
51                                        // if mr <= 2
52        CSEL    x17, x16, x17, LS       //   c2 = c1
53        BIC     x2, x2, 3               //   (the & ~3 half of the kc round up)
54
55        CMP     x0, 4                   // if mr < 4
56        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
57        CSEL    x7,  x17, x7, LO        //   c3 = c2
58
59        LD1R    {v7.4s}, [x11], 4       // v7 = kernel_zero_point in every lane; params += 4
60
61        .p2align 3
620:
63        # Load initial bias from w into accumulators
64        LDP     q16, q20, [x5], 32      // row 0: columns 0-3 and 4-7
65        MOV     x9, x3                  // p = ks
66        MOVI    v24.16b, 0              // clear zero point sums for rows 0-3
67        MOVI    v25.16b, 0
68        MOVI    v26.16b, 0
69        MOVI    v27.16b, 0
70        MOV     v17.16b, v16.16b        // rows 1-3 start from the same bias
71        MOV     v18.16b, v16.16b
72        MOV     v19.16b, v16.16b
73        MOV     v21.16b, v20.16b
74        MOV     v22.16b, v20.16b
75        MOV     v23.16b, v20.16b
76
77        .p2align 3
781:
79        # Load next 4 A pointers
80        LDP     x13, x14, [x4], 16
81        LDP     x15, x10, [x4], 16
82
        # A pointer equal to the zero pointer is used as is, without a_offset
83        CMP     x13, x12                // if a0 == zero
84        ADD     x13, x13, x8            // a0 += a_offset
85        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
86        CMP     x14, x12                // if a1 == zero
87        ADD     x14, x14, x8            // a1 += a_offset
88        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
89        CMP     x15, x12                // if a2 == zero
90        ADD     x15, x15, x8            // a2 += a_offset
91        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
92        CMP     x10, x12                // if a3 == zero
93        ADD     x10, x10, x8            // a3 += a_offset
94        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
95
96        # Is there at least 16 bytes for main loop?
97        SUBS    x0, x2, 16              // k = kc - 16
98        B.LO    40f
99
100        # Main loop - 16 bytes of A
        # Each pass reads 16 bytes from each of the 4 A rows and 8 x 16 = 128
        # bytes of weights from x5.  The weight loads rotate through v4, v5, v6.
101        .p2align 3
1022:
103        LDR     q0, [x13], 16
104        LDR     q4,  [x5], 16
105        LDR     q1, [x14], 16
106        LDR     q2, [x15], 16
107        LDR     q3, [x10], 16
108        LDR     q5,  [x5], 16
109        UDOT    v24.4s, v7.16b, v0.16b  // update zero point sums (A row dot v7)
110        UDOT    v25.4s, v7.16b, v1.16b
111        UDOT    v26.4s, v7.16b, v2.16b
112        UDOT    v27.4s, v7.16b, v3.16b
113        UDOT    v16.4s, v4.16b, v0.4b[0]  // A bytes 0-3, columns 0-3
114        UDOT    v17.4s, v4.16b, v1.4b[0]
115        LDR     q6,  [x5], 16
116        UDOT    v18.4s, v4.16b, v2.4b[0]
117        UDOT    v19.4s, v4.16b, v3.4b[0]
118        UDOT    v20.4s, v5.16b, v0.4b[0]  // A bytes 0-3, columns 4-7
119        UDOT    v21.4s, v5.16b, v1.4b[0]
120        LDR     q4,  [x5], 16
121        UDOT    v22.4s, v5.16b, v2.4b[0]
122        UDOT    v23.4s, v5.16b, v3.4b[0]
123        UDOT    v16.4s, v6.16b, v0.4b[1]  // A bytes 4-7, columns 0-3
124        UDOT    v17.4s, v6.16b, v1.4b[1]
125        LDR     q5,  [x5], 16
126        UDOT    v18.4s, v6.16b, v2.4b[1]
127        UDOT    v19.4s, v6.16b, v3.4b[1]
128        UDOT    v20.4s, v4.16b, v0.4b[1]  // A bytes 4-7, columns 4-7
129        UDOT    v21.4s, v4.16b, v1.4b[1]
130        LDR     q6,  [x5], 16
131        UDOT    v22.4s, v4.16b, v2.4b[1]
132        UDOT    v23.4s, v4.16b, v3.4b[1]
133        UDOT    v16.4s, v5.16b, v0.4b[2]  // A bytes 8-11, columns 0-3
134        UDOT    v17.4s, v5.16b, v1.4b[2]
135        LDR     q4,  [x5], 16
136        UDOT    v18.4s, v5.16b, v2.4b[2]
137        UDOT    v19.4s, v5.16b, v3.4b[2]
138        UDOT    v20.4s, v6.16b, v0.4b[2]  // A bytes 8-11, columns 4-7
139        UDOT    v21.4s, v6.16b, v1.4b[2]
140        LDR     q5,  [x5], 16
141        UDOT    v22.4s, v6.16b, v2.4b[2]
142        UDOT    v23.4s, v6.16b, v3.4b[2]
143        UDOT    v16.4s, v4.16b, v0.4b[3]  // A bytes 12-15, columns 0-3
144        UDOT    v17.4s, v4.16b, v1.4b[3]
145        UDOT    v18.4s, v4.16b, v2.4b[3]
146        UDOT    v19.4s, v4.16b, v3.4b[3]
147        SUBS    x0, x0, 16              // k -= 16
148        UDOT    v20.4s, v5.16b, v0.4b[3]  // A bytes 12-15, columns 4-7
149        UDOT    v21.4s, v5.16b, v1.4b[3]
150        UDOT    v22.4s, v5.16b, v2.4b[3]
151        UDOT    v23.4s, v5.16b, v3.4b[3]
152        B.HS    2b                      // repeat while 16 more bytes of A remain
153
        # x0 is now negative; its low bits still hold the leftover byte count
154        # Is there a remainder?- 8 bytes of A
155        TBNZ    x0, 3, 4f
156        # Is there a remainder?- 4 bytes of A
157        TBNZ    x0, 2, 5f
158
1593:
160        # ks loop
161        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
162        B.HI    1b
163
        # Reduce each zero point accumulator to a single sum replicated in all
        # 4 lanes (two pairwise adds)
164        ADDP    v0.4s, v24.4s, v24.4s
165        ADDP    v1.4s, v25.4s, v25.4s
166        ADDP    v2.4s, v26.4s, v26.4s
167        ADDP    v3.4s, v27.4s, v27.4s
168        ADDP    v24.4s, v0.4s, v0.4s
169        ADDP    v25.4s, v1.4s, v1.4s
170        ADDP    v26.4s, v2.4s, v2.4s
171        ADDP    v27.4s, v3.4s, v3.4s
172
173        # Subtract zero point from accumulators
174        SUB     v16.4s, v16.4s, v24.4s
175        SUB     v17.4s, v17.4s, v25.4s
176        SUB     v18.4s, v18.4s, v26.4s
177        SUB     v19.4s, v19.4s, v27.4s
178        SUB     v20.4s, v20.4s, v24.4s
179        SUB     v21.4s, v21.4s, v25.4s
180        SUB     v22.4s, v22.4s, v26.4s
181        SUB     v23.4s, v23.4s, v27.4s
182
183        $if REQUANTIZATION == "RNDNU":
184          # Apply params - preshift, scale, postshift, bias and clamp
185          LD1R    {v4.4s}, [x11], 4       // v4 = preshift
186          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
187          SSHL    v17.4s, v17.4s, v4.4s
188          SSHL    v18.4s, v18.4s, v4.4s
189          SSHL    v19.4s, v19.4s, v4.4s
190          LD1R    {v5.4s}, [x11], 4       // v5 = scale
191          SSHL    v20.4s, v20.4s, v4.4s
192          SSHL    v21.4s, v21.4s, v4.4s
193          SSHL    v22.4s, v22.4s, v4.4s
194          SSHL    v23.4s, v23.4s, v4.4s
195          LD1R    {v6.4s}, [x11], 4       // v6 = postshift
196          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
197          SQDMULH v17.4s, v17.4s, v5.4s
198          SQDMULH v18.4s, v18.4s, v5.4s
199          SQDMULH v19.4s, v19.4s, v5.4s
200          SQDMULH v20.4s, v20.4s, v5.4s
201          SQDMULH v21.4s, v21.4s, v5.4s
202          SQDMULH v22.4s, v22.4s, v5.4s
203          SQDMULH v23.4s, v23.4s, v5.4s
204          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
205          SRSHL   v17.4s, v17.4s, v6.4s
206          SRSHL   v18.4s, v18.4s, v6.4s
207          SRSHL   v19.4s, v19.4s, v6.4s
208          SRSHL   v20.4s, v20.4s, v6.4s
209          SRSHL   v21.4s, v21.4s, v6.4s
210          SRSHL   v22.4s, v22.4s, v6.4s
211          SRSHL   v23.4s, v23.4s, v6.4s
212        $elif REQUANTIZATION == "FP32":
213          # Apply params - scale, bias and clamp
214          SCVTF   v16.4s, v16.4s          // int32 accumulators -> float
215          SCVTF   v17.4s, v17.4s
216          LD1R    {v4.4s}, [x11], 4       // v4 = scale
217          SCVTF   v18.4s, v18.4s
218          SCVTF   v19.4s, v19.4s
219          SCVTF   v20.4s, v20.4s
220          SCVTF   v21.4s, v21.4s
221          SCVTF   v22.4s, v22.4s
222          SCVTF   v23.4s, v23.4s
223
224          FMUL    v16.4s, v16.4s, v4.4s   // multiply by scale
225          FMUL    v17.4s, v17.4s, v4.4s
226          FMUL    v18.4s, v18.4s, v4.4s
227          FMUL    v19.4s, v19.4s, v4.4s
228          FMUL    v20.4s, v20.4s, v4.4s
229          FMUL    v21.4s, v21.4s, v4.4s
230          FMUL    v22.4s, v22.4s, v4.4s
231          FMUL    v23.4s, v23.4s, v4.4s
232
233          FCVTNS  v16.4s, v16.4s          // float -> int32, round to nearest (ties to even)
234          FCVTNS  v17.4s, v17.4s
235          FCVTNS  v18.4s, v18.4s
236          FCVTNS  v19.4s, v19.4s
237          FCVTNS  v20.4s, v20.4s
238          FCVTNS  v21.4s, v21.4s
239          FCVTNS  v22.4s, v22.4s
240          FCVTNS  v23.4s, v23.4s
241
242        SQXTN   v16.4h, v16.4s           // saturating narrow to int16: columns 0-3
243        SQXTN   v17.4h, v17.4s
244        SQXTN   v18.4h, v18.4s
245        SQXTN   v19.4h, v19.4s
246        LD1R    {v6.8h}, [x11], 2        // load bias
247
248        SQXTN2  v16.8h, v20.4s           // columns 4-7 into the upper half
249        SQXTN2  v17.8h, v21.4s
250        SQXTN2  v18.8h, v22.4s
251        SQXTN2  v19.8h, v23.4s
252        LDR     x0, [sp]                 // Load cn_stride
253
254        SQADD   v16.8h, v16.8h, v6.8h    // add bias (saturating)
255        SQADD   v17.8h, v17.8h, v6.8h
256        SQADD   v18.8h, v18.8h, v6.8h
257        SQADD   v19.8h, v19.8h, v6.8h
258        LD1R    {v4.16b}, [x11], 1       // clamp min value
259
260        SQXTUN  v0.8b, v16.8h            // saturating narrow to uint8: v0 low = row 0
261        SQXTUN  v1.8b, v18.8h            //   v1 low  = row 2
262        LD1R    {v5.16b}, [x11]          // clamp max value
263        SQXTUN2 v0.16b, v17.8h           //   v0 high = row 1
264        SQXTUN2 v1.16b, v19.8h           //   v1 high = row 3
265        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
266
267        UMAX    v0.16b, v0.16b, v4.16b
268        UMAX    v1.16b, v1.16b, v4.16b
269        SUBS    x1, x1, 8                // nc -= 8; flags are consumed by B.LO and B.HI below
270        UMIN    v0.16b, v0.16b, v5.16b
271        UMIN    v1.16b, v1.16b, v5.16b
272        B.LO    6f                       // fewer than 8 columns left
273
274        # Store full 4 x 8
275        ST1     {v1.d}[1],  [x7], x0     // row 3 -> c3, c3 += cn_stride
276        ST1     {v1.8b},   [x17], x0     // row 2 -> c2, c2 += cn_stride
277        ST1     {v0.d}[1], [x16], x0     // row 1 -> c1, c1 += cn_stride
278        ST1     {v0.8b},    [x6], x0     // row 0 -> c0, c0 += cn_stride
279        SUB     x4, x4, x3              // a -= ks
280
281        # nc loop
282        B.HI    0b                       // flags still from SUBS x1: more columns remain
283        RET
284
285        # Remainder- 4-12 bytes of A
286        .p2align 3
28740:     TBZ     x0, 3, 5f
2884:
        # 8 bytes of A per row, 64 bytes of weights.  The d register loads leave
        # the upper 8 bytes of v0-v3 zero, so the zero point UDOTs below only
        # add the 8 loaded bytes.
289        LDR     d0, [x13], 8
290        LDR     q4,  [x5]
291        LDR     d1, [x14], 8
292        LDR     d2, [x15], 8
293        LDR     d3, [x10], 8
294        LDR     q5,  [x5, 16]
295        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
296        UDOT    v25.4s, v7.16b, v1.16b
297        UDOT    v26.4s, v7.16b, v2.16b
298        UDOT    v27.4s, v7.16b, v3.16b
299        UDOT    v16.4s, v4.16b, v0.4b[0]
300        UDOT    v17.4s, v4.16b, v1.4b[0]
301        LDR     q6,  [x5, 32]
302        UDOT    v18.4s, v4.16b, v2.4b[0]
303        UDOT    v19.4s, v4.16b, v3.4b[0]
304        UDOT    v20.4s, v5.16b, v0.4b[0]
305        UDOT    v21.4s, v5.16b, v1.4b[0]
306        LDR     q4,  [x5, 48]
307        UDOT    v22.4s, v5.16b, v2.4b[0]
308        UDOT    v23.4s, v5.16b, v3.4b[0]
309        UDOT    v16.4s, v6.16b, v0.4b[1]
310        UDOT    v17.4s, v6.16b, v1.4b[1]
311        UDOT    v18.4s, v6.16b, v2.4b[1]
312        UDOT    v19.4s, v6.16b, v3.4b[1]
313        ADD     x5, x5, 64              // w += 64, past the four weight vectors read above
314        UDOT    v20.4s, v4.16b, v0.4b[1]
315        UDOT    v21.4s, v4.16b, v1.4b[1]
316        UDOT    v22.4s, v4.16b, v2.4b[1]
317        UDOT    v23.4s, v4.16b, v3.4b[1]
318        TBZ     x0, 2, 3b               // no 4 byte remainder: back to the ks loop
3195:
        # 4 bytes of A per row, 32 bytes of weights
320        LDR     s0, [x13], 4
321        LDR     q4, [x5], 16
322        LDR     s1, [x14], 4
323        LDR     s2, [x15], 4
324        LDR     s3, [x10], 4
325        LDR     q5, [x5], 16
326        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
327        UDOT    v25.4s, v7.16b, v1.16b
328        UDOT    v26.4s, v7.16b, v2.16b
329        UDOT    v27.4s, v7.16b, v3.16b
330        UDOT    v16.4s, v4.16b, v0.4b[0]
331        UDOT    v17.4s, v4.16b, v1.4b[0]
332        UDOT    v18.4s, v4.16b, v2.4b[0]
333        UDOT    v19.4s, v4.16b, v3.4b[0]
334        UDOT    v20.4s, v5.16b, v0.4b[0]
335        UDOT    v21.4s, v5.16b, v1.4b[0]
336        UDOT    v22.4s, v5.16b, v2.4b[0]
337        UDOT    v23.4s, v5.16b, v3.4b[0]
338        B       3b
339
340        # Store odd width
        # Bits 2, 1, 0 of x1 select 4, 2 and 1 column stores.  Rows 0 and 2 sit
        # in the low halves of v0 and v1, rows 1 and 3 in the high halves, so
        # each EXT rotates the next unstored columns of both halves into place.
341        .p2align 3
3426:
343        TBZ     x1, 2, 7f
344        ST1     {v1.s}[2], [x7], 4      // row 3
345        STR     s1, [x17], 4            // row 2
346        ST1     {v0.s}[2], [x16], 4     // row 1
347        STR     s0, [x6], 4             // row 0
348        EXT     v0.16b, v0.16b, v0.16b, 4
349        EXT     v1.16b, v1.16b, v1.16b, 4
3507:
351        TBZ     x1, 1, 8f
352        ST1     {v1.h}[4], [x7], 2      // row 3
353        STR     h1, [x17], 2            // row 2
354        ST1     {v0.h}[4], [x16], 2     // row 1
355        STR     h0, [x6], 2             // row 0
356        EXT     v0.16b, v0.16b, v0.16b, 2
357        EXT     v1.16b, v1.16b, v1.16b, 2
3588:
359        TBZ     x1, 0, 9f
360        ST1     {v1.b}[8], [x7]         // row 3
361        STR     b1, [x17]               // row 2
362        ST1     {v0.b}[8], [x16]        // row 1
363        STR     b0, [x6]                // row 0
3649:
365        RET
366
367END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128
368
369#ifdef __ELF__
// Emit an empty .note.GNU-stack section on ELF targets; GNU toolchains read
// this as a request for a non-executable stack.
370.section ".note.GNU-stack","",%progbits
371#endif
372