xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-gemm/4x8c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const uint8_t* restrict a, x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qu8_conv_minmax_params params) [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0
29# A1 x15  v1
30# A2 x13  v2
31# A3  x4  v3
32# B   x5  v4  v5  v6
33# C0  x6 v16 v20
34# C1  x8 v17 v21
35# C2  x9 v18 v22
36# C3  x7 v19 v23
37# zero_point v7 v24 v25 v26 v27
38# unused v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31
39
40BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128
41        // Each pass of the outer loop (label 0) computes up to 4 rows x 8 columns of 8-bit output.
42        # Clamp A and C pointers
43        CMP     x0, 2                   // if mr < 2
44        ADD     x2, x2, 3               // kc = (kc + 3) & ~3 (the BIC below applies the mask)
45        ADD     x15, x3, x4             // a1 = a0 + a_stride
46        ADD     x8, x6, x7              // c1 = c0 + cm_stride
47        CSEL    x15, x3, x15, LO        //   a1 = a0
48        CSEL    x8, x6,  x8, LO         //   c1 = c0
49        BIC     x2, x2, 3               // kc is now rounded up to a multiple of 4
50
51        LDP     x12, x11, [sp]          // cn_stride, params
52
53        ADD     x13, x15, x4            // a2 = a1 + a_stride
54        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
55                                        // if mr <= 2 (flags are still those of CMP x0, 2 above)
56        CSEL    x13, x15, x13, LS       //   a2 = a1
57        CSEL    x9,  x8,  x9, LS        //   c2 = c1
58
59        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point, replicated to all four 32-bit lanes
60
61        CMP     x0, 4                   // if mr < 4
62        ADD     x4, x13, x4             // a3 = a2 + a_stride
63        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
64        CSEL    x4, x13, x4, LO         //   a3 = a2
65        CSEL    x7,  x9, x7, LO         //   c3 = c2
66
67        .p2align 3
680:
69        # Load initial bias from w into accumulators
70        LDP     q16, q20, [x5], 32      // 8 x 32-bit bias: columns 0-3 -> v16, columns 4-7 -> v20
71        SUBS    x0, x2, 16              // k = kc - 16
72        MOV     v17.16b, v16.16b        // rows 1-3 start from the same bias as row 0
73        MOV     v18.16b, v16.16b
74        MOV     v19.16b, v16.16b
75        MOV     v21.16b, v20.16b
76        MOV     v22.16b, v20.16b
77        MOV     v23.16b, v20.16b
78        MOVI    v24.16b, 0              // clear the per-row zero-point sums (v24-v27)
79        MOVI    v25.16b, 0
80        MOVI    v26.16b, 0
81        MOVI    v27.16b, 0
82
83        # Is there at least 16 bytes?
84        B.LO    30f                     // kc < 16: only the remainder code runs
85
86        # Main loop - 16 bytes of A
87        .p2align 3
881:
89        LDR     q0,  [x3], 16
90        LDR     q4,  [x5], 16
91        LDR     q1, [x15], 16
92        LDR     q2, [x13], 16
93        LDR     q3,  [x4], 16
94        LDR     q5,  [x5], 16
95        UDOT    v24.4s, v7.16b, v0.16b  // update zero point: each v24 lane += dot(v7 bytes, 4 bytes of A0)
96        UDOT    v25.4s, v7.16b, v1.16b
97        UDOT    v26.4s, v7.16b, v2.16b
98        UDOT    v27.4s, v7.16b, v3.16b
99        UDOT    v16.4s, v4.16b, v0.4b[0]        // A bytes 0-3 x B columns 0-3
100        UDOT    v17.4s, v4.16b, v1.4b[0]
101        LDR     q6,  [x5], 16
102        UDOT    v18.4s, v4.16b, v2.4b[0]
103        UDOT    v19.4s, v4.16b, v3.4b[0]
104        UDOT    v20.4s, v5.16b, v0.4b[0]        // A bytes 0-3 x B columns 4-7
105        UDOT    v21.4s, v5.16b, v1.4b[0]
106        LDR     q4,  [x5], 16
107        UDOT    v22.4s, v5.16b, v2.4b[0]
108        UDOT    v23.4s, v5.16b, v3.4b[0]
109        UDOT    v16.4s, v6.16b, v0.4b[1]        // A bytes 4-7
110        UDOT    v17.4s, v6.16b, v1.4b[1]
111        LDR     q5,  [x5], 16
112        UDOT    v18.4s, v6.16b, v2.4b[1]
113        UDOT    v19.4s, v6.16b, v3.4b[1]
114        UDOT    v20.4s, v4.16b, v0.4b[1]
115        UDOT    v21.4s, v4.16b, v1.4b[1]
116        LDR     q6,  [x5], 16
117        UDOT    v22.4s, v4.16b, v2.4b[1]
118        UDOT    v23.4s, v4.16b, v3.4b[1]
119        UDOT    v16.4s, v5.16b, v0.4b[2]        // A bytes 8-11
120        UDOT    v17.4s, v5.16b, v1.4b[2]
121        LDR     q4,  [x5], 16
122        UDOT    v18.4s, v5.16b, v2.4b[2]
123        UDOT    v19.4s, v5.16b, v3.4b[2]
124        UDOT    v20.4s, v6.16b, v0.4b[2]
125        UDOT    v21.4s, v6.16b, v1.4b[2]
126        LDR     q5,  [x5], 16
127        UDOT    v22.4s, v6.16b, v2.4b[2]
128        UDOT    v23.4s, v6.16b, v3.4b[2]
129        UDOT    v16.4s, v4.16b, v0.4b[3]        // A bytes 12-15
130        UDOT    v17.4s, v4.16b, v1.4b[3]
131        UDOT    v18.4s, v4.16b, v2.4b[3]
132        UDOT    v19.4s, v4.16b, v3.4b[3]
133        SUBS    x0, x0, 16              // k -= 16
134        UDOT    v20.4s, v5.16b, v0.4b[3]
135        UDOT    v21.4s, v5.16b, v1.4b[3]
136        UDOT    v22.4s, v5.16b, v2.4b[3]
137        UDOT    v23.4s, v5.16b, v3.4b[3]
138        B.HS    1b                      // repeat while at least 16 more bytes of A remain
139
140        # Is there a remainder?- 8 bytes of A
141        TBNZ    x0, 3, 3f               // x0 = remaining - 16, so bits 2 and 3 match the remaining byte count
142        # Is there a remainder?- 4 bytes of A
143        TBNZ    x0, 2, 4f
144
1452:
146        ADDP    v0.4s, v24.4s, v24.4s   // two rounds of pairwise adds leave the sum of all 4 lanes in every lane
147        ADDP    v1.4s, v25.4s, v25.4s
148        ADDP    v2.4s, v26.4s, v26.4s
149        ADDP    v3.4s, v27.4s, v27.4s
150        ADDP    v24.4s, v0.4s, v0.4s
151        ADDP    v25.4s, v1.4s, v1.4s
152        ADDP    v26.4s, v2.4s, v2.4s
153        ADDP    v27.4s, v3.4s, v3.4s
154
155        # Subtract zero point from accumulators
156        SUB     v16.4s, v16.4s, v24.4s  // the same per-row sum is removed from both column halves
157        SUB     v17.4s, v17.4s, v25.4s
158        SUB     v18.4s, v18.4s, v26.4s
159        SUB     v19.4s, v19.4s, v27.4s
160        SUB     v20.4s, v20.4s, v24.4s
161        SUB     v21.4s, v21.4s, v25.4s
162        SUB     v22.4s, v22.4s, v26.4s
163        SUB     v23.4s, v23.4s, v27.4s
164
165        # Apply params - preshift, scale, postshift, bias and clamp
166        LD1R    {v4.4s}, [x11], 4       // shift operand for the SSHL below
167        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
168        SSHL    v17.4s, v17.4s, v4.4s
169        SSHL    v18.4s, v18.4s, v4.4s
170        SSHL    v19.4s, v19.4s, v4.4s
171        LD1R    {v5.4s}, [x11], 4       // multiplier for the SQDMULH below
172        SSHL    v20.4s, v20.4s, v4.4s
173        SSHL    v21.4s, v21.4s, v4.4s
174        SSHL    v22.4s, v22.4s, v4.4s
175        SSHL    v23.4s, v23.4s, v4.4s
176        LD1R    {v6.4s}, [x11], 4       // shift operand for the SRSHL below
177        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
178        SQDMULH v17.4s, v17.4s, v5.4s
179        SQDMULH v18.4s, v18.4s, v5.4s
180        SQDMULH v19.4s, v19.4s, v5.4s
181        SQDMULH v20.4s, v20.4s, v5.4s
182        SQDMULH v21.4s, v21.4s, v5.4s
183        SQDMULH v22.4s, v22.4s, v5.4s
184        SQDMULH v23.4s, v23.4s, v5.4s
185        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
186        SRSHL   v17.4s, v17.4s, v6.4s
187        SRSHL   v18.4s, v18.4s, v6.4s
188        SRSHL   v19.4s, v19.4s, v6.4s
189        SRSHL   v20.4s, v20.4s, v6.4s
190        SRSHL   v21.4s, v21.4s, v6.4s
191        SRSHL   v22.4s, v22.4s, v6.4s
192        SRSHL   v23.4s, v23.4s, v6.4s
193
194        SQXTN   v16.4h, v16.4s          // saturating narrow to 16 bits: columns 0-3
195        SQXTN   v17.4h, v17.4s
196        SQXTN   v18.4h, v18.4s
197        SQXTN   v19.4h, v19.4s
198        LD1R    {v6.8h}, [x11], 2       // add bias
199
200        SQXTN2  v16.8h, v20.4s          // columns 4-7 go in the upper half
201        SQXTN2  v17.8h, v21.4s
202        SQXTN2  v18.8h, v22.4s
203        SQXTN2  v19.8h, v23.4s
204
205        SQADD   v16.8h, v16.8h, v6.8h
206        SQADD   v17.8h, v17.8h, v6.8h
207        SQADD   v18.8h, v18.8h, v6.8h
208        SQADD   v19.8h, v19.8h, v6.8h
209        LD1R    {v4.16b}, [x11], 1      // clamp min value
210
211        SQXTUN  v0.8b, v16.8h           // saturating narrow to unsigned 8 bits: row 0 -> low half of v0
212        SQXTUN  v1.8b, v18.8h           // row 2 -> low half of v1
213        LD1R    {v5.16b}, [x11]         // clamp max value
214        SQXTUN2 v0.16b, v17.8h          // row 1 -> high half of v0
215        SQXTUN2 v1.16b, v19.8h          // row 3 -> high half of v1
216        SUB     x11, x11, 15             // rewind params pointer to just after kernel_zero_point (4+4+4+2+1 bytes)
217
218        UMAX    v0.16b, v0.16b, v4.16b
219        UMAX    v1.16b, v1.16b, v4.16b
220        SUBS    x1, x1, 8               // nc -= 8
221        UMIN    v0.16b, v0.16b, v5.16b
222        UMIN    v1.16b, v1.16b, v5.16b
223        B.LO    5f                      // fewer than 8 columns were left: store a partial tile
224
225        # Store full 4 x 8
226        ST1     {v0.8b}, [x6], x12      // row 0; c0 += cn_stride
227        SUB     x3,  x3, x2             // a0 -= kc
228        ST1     {v0.d}[1], [x8], x12    // row 1; c1 += cn_stride
229        SUB     x15, x15, x2            // a1 -= kc
230        ST1     {v1.8b}, [x9], x12      // row 2; c2 += cn_stride
231        SUB     x13, x13, x2            // a2 -= kc
232        ST1     {v1.d}[1], [x7], x12    // row 3; c3 += cn_stride
233        SUB     x4,  x4, x2             // a3 -= kc
234        B.NE    0b                      // flags are still those of SUBS x1 above: next tile while nc != 0
235        RET
236
237        # Remainder- 4-12 bytes of A
238        .p2align 3
23930:     TBZ     x0, 3, 4f               // bit 3 clear: no 8-byte step, go to the 4-byte step
240
2413:
242        LDR     d0,  [x3], 8            // 8 bytes of A0; LDR to a D register zeroes the upper half of v0
243        LDR     q4,  [x5]               // w is read at fixed offsets here; x5 is advanced once below
244        LDR     d1, [x15], 8
245        LDR     d2, [x13], 8
246        LDR     d3,  [x4], 8
247        LDR     q5,  [x5, 16]
248        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
249        UDOT    v25.4s, v7.16b, v1.16b
250        UDOT    v26.4s, v7.16b, v2.16b
251        UDOT    v27.4s, v7.16b, v3.16b
252        UDOT    v16.4s, v4.16b, v0.4b[0]
253        UDOT    v17.4s, v4.16b, v1.4b[0]
254        LDR     q6,  [x5, 32]
255        UDOT    v18.4s, v4.16b, v2.4b[0]
256        UDOT    v19.4s, v4.16b, v3.4b[0]
257        UDOT    v20.4s, v5.16b, v0.4b[0]
258        UDOT    v21.4s, v5.16b, v1.4b[0]
259        LDR     q4,  [x5, 48]
260        UDOT    v22.4s, v5.16b, v2.4b[0]
261        UDOT    v23.4s, v5.16b, v3.4b[0]
262        UDOT    v16.4s, v6.16b, v0.4b[1]
263        UDOT    v17.4s, v6.16b, v1.4b[1]
264        UDOT    v18.4s, v6.16b, v2.4b[1]
265        UDOT    v19.4s, v6.16b, v3.4b[1]
266        ADD     x5, x5, 64              // step w past the 4 x 16 bytes read above
267        UDOT    v20.4s, v4.16b, v0.4b[1]
268        UDOT    v21.4s, v4.16b, v1.4b[1]
269        UDOT    v22.4s, v4.16b, v2.4b[1]
270        UDOT    v23.4s, v4.16b, v3.4b[1]
271        TBZ     x0, 2, 2b               // bit 2 clear: no 4-byte step left, go requantize
272
2734:
274        LDR     s0,  [x3], 4            // 4 bytes of A0; LDR to an S register zeroes the rest of v0
275        LDR     q4, [x5], 16
276        LDR     s1, [x15], 4
277        LDR     s2, [x13], 4
278        LDR     s3,  [x4], 4
279        LDR     q5, [x5], 16
280        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
281        UDOT    v25.4s, v7.16b, v1.16b
282        UDOT    v26.4s, v7.16b, v2.16b
283        UDOT    v27.4s, v7.16b, v3.16b
284        UDOT    v16.4s, v4.16b, v0.4b[0]
285        UDOT    v17.4s, v4.16b, v1.4b[0]
286        UDOT    v18.4s, v4.16b, v2.4b[0]
287        UDOT    v19.4s, v4.16b, v3.4b[0]
288        UDOT    v20.4s, v5.16b, v0.4b[0]
289        UDOT    v21.4s, v5.16b, v1.4b[0]
290        UDOT    v22.4s, v5.16b, v2.4b[0]
291        UDOT    v23.4s, v5.16b, v3.4b[0]
292        B       2b
293
294        # Store odd width
295        .p2align 3
2965:
297        TBZ     x1, 2, 6f               // x1 = nc - 8 here; its low 3 bits equal the remaining column count
298        STR     s0, [x6], 4             // 4 columns of row 0
299        ST1     {v0.s}[2], [x8], 4      // row 1 starts at byte 8 of v0
300        STR     s1, [x9], 4             // 4 columns of row 2
301        ST1     {v1.s}[2], [x7], 4      // row 3 starts at byte 8 of v1
302        EXT     v0.16b, v0.16b, v0.16b, 4       // rotate by 4 bytes so the next columns sit at bytes 0 and 8
303        EXT     v1.16b, v1.16b, v1.16b, 4
3046:
305        TBZ     x1, 1, 7f
306        STR     h0, [x6], 2             // 2 columns per row
307        ST1     {v0.h}[4], [x8], 2
308        STR     h1, [x9], 2
309        ST1     {v1.h}[4], [x7], 2
310        EXT     v0.16b, v0.16b, v0.16b, 2       // rotate by 2 bytes for the final single-column store
311        EXT     v1.16b, v1.16b, v1.16b, 2
3127:
313        TBZ     x1, 0, 8f
314        STR     b0, [x6]                // last single column per row
315        ST1     {v0.b}[8], [x8]
316        STR     b1, [x9]
317        ST1     {v1.b}[8], [x7]
3188:
319        RET
320
321END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128
322
323#ifdef __ELF__
324.section ".note.GNU-stack","",%progbits
325#endif
326