xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-igemm/4x8c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t**restrict a, x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x0
23#     size_t a_offset,           [sp + 8] -> x8
24#     const uint8_t* zero,       [sp + 16] -> x12
25#     const union xnn_qu8_conv_minmax_params* params) [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x10  v3
34# B    x5  v4  v5  v6
35# C0   x6 v16 v20
36# C1  x16 v17 v21
37# C2  x17 v18 v22
38# C3   x7 v19 v23
39# zero_point v7 v24 v25 v26 v27
40# unused  v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31
41
42BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128
        # Overview (derived from the code below):
        #   Label 0  - nc loop: seeds the 4x8 accumulators v16-v23 from the eight
        #              32-bit values at the head of the packed weights (x5) and
        #              clears the zero-point accumulators v24-v27.
        #   Label 1  - ks loop: reads 4 A row pointers from the indirection
        #              buffer (x4) and applies a_offset unless a pointer equals zero.
        #   Label 2  - main loop: 16 bytes of each A row against 128 bytes of weights.
        #   Label 40 - entry to the remainder code when kc is below 16.
        #   Labels 4 and 5 - 8-byte and 4-byte K remainders.
        #   Label 3  - end of the ks loop, followed by zero-point correction,
        #              requantization, clamping and the 4x8 store.
        #   Labels 6 to 9 - partial-width store when fewer than 8 columns remain.
        # Only x0-x17, v0-v7 and v16-v27 are written, so nothing is spilled.
        # NOTE(review): every line carries a leading decimal number that looks like a
        # listing line number fused into the text, so local labels read as 640:, 801:,
        # and so on while branches target 0b, 1b, ... - confirm against upstream
        # before assembling.
43
44        # Clamp C pointers: a row at or beyond mr reuses the previous row pointer
45        CMP     x0, 2                   // if mr < 2
46        LDR     x8, [sp, 8]             // Load a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
50
51        ADD     x17, x16, x7            // c2 = c1 + cm_stride
52        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
53                                        // if mr <= 2 (flags still from CMP x0, 2)
54        CSEL    x17, x16, x17, LS       //   c2 = c1
55        BIC     x2, x2, 3               // completes the kc round-up to a multiple of 4
56
57        CMP     x0, 4                   // if mr < 4
58        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
59        CSEL    x7,  x17, x7, LO        //   c3 = c2
60
        # One 32-bit word from params is replicated to all lanes of v7 and later
        # consumed byte-wise by UDOT; presumably the word holds the kernel zero
        # point in each of its 4 bytes (verify against the params layout).
        # x11 is left pointing 4 bytes into params for the rest of the function.
61        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point
62
63        .p2align 3
640:
65        # Load initial bias from w into accumulators
66        LDP     q16, q20, [x5], 32      // v16 = columns 0-3, v20 = columns 4-7 (row 0)
67        MOV     x9, x3                  // p = ks
68        MOVI    v24.16b, 0              // clear zero-point accumulators, one per row
69        MOVI    v25.16b, 0
70        MOVI    v26.16b, 0
71        MOVI    v27.16b, 0
72        MOV     v17.16b, v16.16b        // rows 1-3 start from the same bias as row 0
73        MOV     v18.16b, v16.16b
74        MOV     v19.16b, v16.16b
75        MOV     v21.16b, v20.16b
76        MOV     v22.16b, v20.16b
77        MOV     v23.16b, v20.16b
78
79        .p2align 3
801:
81        # Load next 4 A pointers
82        LDP     x13, x14, [x4], 16
83        LDP     x15, x10, [x4], 16
84
        # A pointer equal to the zero buffer is used as is; any other pointer
        # gets a_offset added.
85        CMP     x13, x12                // if a0 == zero
86        ADD     x13, x13, x8            // a0 += a_offset
87        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
88        CMP     x14, x12                // if a1 == zero
89        ADD     x14, x14, x8            // a1 += a_offset
90        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
91        CMP     x15, x12                // if a2 == zero
92        ADD     x15, x15, x8            // a2 += a_offset
93        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
94        CMP     x10, x12                // if a3 == zero
95        ADD     x10, x10, x8            // a3 += a_offset
96        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset
97
98        # Is there at least 16 bytes for main loop?
99        SUBS    x0, x2, 16              // k = kc - 16
100        B.LO    40f
101
102        # Main loop - 16 bytes of A
        # Each pass consumes 16 bytes from every A row and 128 bytes of weights
        # (eight 16-byte loads from x5). v4/v5/v6 are rotated as weight registers.
103        .p2align 3
1042:
105        LDR     q0, [x13], 16
106        LDR     q4,  [x5], 16
107        LDR     q1, [x14], 16
108        LDR     q2, [x15], 16
109        LDR     q3, [x10], 16
110        LDR     q5,  [x5], 16
111        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
112        UDOT    v25.4s, v7.16b, v1.16b
113        UDOT    v26.4s, v7.16b, v2.16b
114        UDOT    v27.4s, v7.16b, v3.16b
115        UDOT    v16.4s, v4.16b, v0.4b[0]
116        UDOT    v17.4s, v4.16b, v1.4b[0]
117        LDR     q6,  [x5], 16
118        UDOT    v18.4s, v4.16b, v2.4b[0]
119        UDOT    v19.4s, v4.16b, v3.4b[0]
120        UDOT    v20.4s, v5.16b, v0.4b[0]
121        UDOT    v21.4s, v5.16b, v1.4b[0]
122        LDR     q4,  [x5], 16
123        UDOT    v22.4s, v5.16b, v2.4b[0]
124        UDOT    v23.4s, v5.16b, v3.4b[0]
125        UDOT    v16.4s, v6.16b, v0.4b[1]
126        UDOT    v17.4s, v6.16b, v1.4b[1]
127        LDR     q5,  [x5], 16
128        UDOT    v18.4s, v6.16b, v2.4b[1]
129        UDOT    v19.4s, v6.16b, v3.4b[1]
130        UDOT    v20.4s, v4.16b, v0.4b[1]
131        UDOT    v21.4s, v4.16b, v1.4b[1]
132        LDR     q6,  [x5], 16
133        UDOT    v22.4s, v4.16b, v2.4b[1]
134        UDOT    v23.4s, v4.16b, v3.4b[1]
135        UDOT    v16.4s, v5.16b, v0.4b[2]
136        UDOT    v17.4s, v5.16b, v1.4b[2]
137        LDR     q4,  [x5], 16
138        UDOT    v18.4s, v5.16b, v2.4b[2]
139        UDOT    v19.4s, v5.16b, v3.4b[2]
140        UDOT    v20.4s, v6.16b, v0.4b[2]
141        UDOT    v21.4s, v6.16b, v1.4b[2]
142        LDR     q5,  [x5], 16
143        UDOT    v22.4s, v6.16b, v2.4b[2]
144        UDOT    v23.4s, v6.16b, v3.4b[2]
145        UDOT    v16.4s, v4.16b, v0.4b[3]
146        UDOT    v17.4s, v4.16b, v1.4b[3]
147        UDOT    v18.4s, v4.16b, v2.4b[3]
148        UDOT    v19.4s, v4.16b, v3.4b[3]
149        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
150        UDOT    v20.4s, v5.16b, v0.4b[3]
151        UDOT    v21.4s, v5.16b, v1.4b[3]
152        UDOT    v22.4s, v5.16b, v2.4b[3]
153        UDOT    v23.4s, v5.16b, v3.4b[3]
154        B.HS    2b
155
        # x0 is now negative; since kc was rounded to a multiple of 4, bits 3 and 2
        # of x0 still give the number of leftover bytes (8 and/or 4).
156        # Is there a remainder?- 8 bytes of A
157        TBNZ    x0, 3, 4f
158        # Is there a remainder?- 4 bytes of A
159        TBNZ    x0, 2, 5f
160
1613:
162        # ks loop
163        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
164        B.HI    1b
165
        # Two pairwise adds leave the sum of all 4 lanes in every lane, giving one
        # zero-point correction value per row.
166        ADDP    v0.4s, v24.4s, v24.4s
167        ADDP    v1.4s, v25.4s, v25.4s
168        ADDP    v2.4s, v26.4s, v26.4s
169        ADDP    v3.4s, v27.4s, v27.4s
170        ADDP    v24.4s, v0.4s, v0.4s
171        ADDP    v25.4s, v1.4s, v1.4s
172        ADDP    v26.4s, v2.4s, v2.4s
173        ADDP    v27.4s, v3.4s, v3.4s
174
175        # Subtract zero point from accumulators
176        SUB     v16.4s, v16.4s, v24.4s
177        SUB     v17.4s, v17.4s, v25.4s
178        SUB     v18.4s, v18.4s, v26.4s
179        SUB     v19.4s, v19.4s, v27.4s
180        SUB     v20.4s, v20.4s, v24.4s
181        SUB     v21.4s, v21.4s, v25.4s
182        SUB     v22.4s, v22.4s, v26.4s
183        SUB     v23.4s, v23.4s, v27.4s
184
185        # Apply params - preshift, scale, postshift, bias and clamp
        # The params are read sequentially through x11: three 32-bit values
        # (v4 pre-shift, v5 multiplier, v6 post-shift), one 16-bit bias and two
        # 8-bit clamp bounds.
186        LD1R    {v4.4s}, [x11], 4
187        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
188        SSHL    v17.4s, v17.4s, v4.4s
189        SSHL    v18.4s, v18.4s, v4.4s
190        SSHL    v19.4s, v19.4s, v4.4s
191        LD1R    {v5.4s}, [x11], 4
192        SSHL    v20.4s, v20.4s, v4.4s
193        SSHL    v21.4s, v21.4s, v4.4s
194        SSHL    v22.4s, v22.4s, v4.4s
195        SSHL    v23.4s, v23.4s, v4.4s
196        LD1R    {v6.4s}, [x11], 4
197        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
198        SQDMULH v17.4s, v17.4s, v5.4s
199        SQDMULH v18.4s, v18.4s, v5.4s
200        SQDMULH v19.4s, v19.4s, v5.4s
201        SQDMULH v20.4s, v20.4s, v5.4s
202        SQDMULH v21.4s, v21.4s, v5.4s
203        SQDMULH v22.4s, v22.4s, v5.4s
204        SQDMULH v23.4s, v23.4s, v5.4s
205        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
206        SRSHL   v17.4s, v17.4s, v6.4s
207        SRSHL   v18.4s, v18.4s, v6.4s
208        SRSHL   v19.4s, v19.4s, v6.4s
209        SRSHL   v20.4s, v20.4s, v6.4s
210        SRSHL   v21.4s, v21.4s, v6.4s
211        SRSHL   v22.4s, v22.4s, v6.4s
212        SRSHL   v23.4s, v23.4s, v6.4s
213
        # Saturating narrow 32 -> 16 bits; columns 0-3 land in the low half and
        # columns 4-7 in the high half of v16-v19.
214        SQXTN   v16.4h, v16.4s
215        SQXTN   v17.4h, v17.4s
216        SQXTN   v18.4h, v18.4s
217        SQXTN   v19.4h, v19.4s
218        LD1R    {v6.8h}, [x11], 2        // load 16-bit bias (added below)
219
220        SQXTN2  v16.8h, v20.4s
221        SQXTN2  v17.8h, v21.4s
222        SQXTN2  v18.8h, v22.4s
223        SQXTN2  v19.8h, v23.4s
224        LDR     x0, [sp]                 // Load cn_stride
225
226        SQADD   v16.8h, v16.8h, v6.8h
227        SQADD   v17.8h, v17.8h, v6.8h
228        SQADD   v18.8h, v18.8h, v6.8h
229        SQADD   v19.8h, v19.8h, v6.8h
230        LD1R    {v4.16b}, [x11], 1       // clamp min value
231
        # Saturating narrow to unsigned 8 bits: v0 = row 0 (low) | row 1 (high),
        # v1 = row 2 (low) | row 3 (high).
232        SQXTUN  v0.8b, v16.8h
233        SQXTUN  v1.8b, v18.8h
234        LD1R    {v5.16b}, [x11]          // clamp max value
235        SQXTUN2 v0.16b, v17.8h
236        SQXTUN2 v1.16b, v19.8h
237        SUB     x11, x11, 15             // rewind params pointer (4+4+4+2+1 bytes advanced above)
238
239        UMAX    v0.16b, v0.16b, v4.16b
240        UMAX    v1.16b, v1.16b, v4.16b
241        SUBS    x1, x1, 8                // nc -= 8; flags used by B.LO and B.HI below
242        UMIN    v0.16b, v0.16b, v5.16b
243        UMIN    v1.16b, v1.16b, v5.16b
244        B.LO    6f
245
246        # Store full 4 x 8
        # Rows are written from row 3 down to row 0, so when row pointers were
        # clamped to alias a lower row, the lowest aliased row is written last.
247        ST1     {v1.d}[1],  [x7], x0
248        ST1     {v1.8b},   [x17], x0
249        ST1     {v0.d}[1], [x16], x0
250        ST1     {v0.8b},    [x6], x0
251        SUB     x4, x4, x3              // a -= ks
252
253        # nc loop
254        B.HI    0b
255        RET
256
257        # Remainder- 4-12 bytes of A
        # Entered at label 40 with x0 = kc - 16 (negative) when kc is below 16, or
        # at labels 4 and 5 from the bit tests after the main loop.
        # NOTE(review): the full-width zero-point UDOTs below rely on LDR d/s
        # clearing the upper bytes of v0-v3 (A64 scalar load behaviour) - confirm.
258        .p2align 3
25940:     TBZ     x0, 3, 5f
2604:
261        LDR     d0, [x13], 8
262        LDR     q4,  [x5]
263        LDR     d1, [x14], 8
264        LDR     d2, [x15], 8
265        LDR     d3, [x10], 8
266        LDR     q5,  [x5, 16]
267        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
268        UDOT    v25.4s, v7.16b, v1.16b
269        UDOT    v26.4s, v7.16b, v2.16b
270        UDOT    v27.4s, v7.16b, v3.16b
271        UDOT    v16.4s, v4.16b, v0.4b[0]
272        UDOT    v17.4s, v4.16b, v1.4b[0]
273        LDR     q6,  [x5, 32]
274        UDOT    v18.4s, v4.16b, v2.4b[0]
275        UDOT    v19.4s, v4.16b, v3.4b[0]
276        UDOT    v20.4s, v5.16b, v0.4b[0]
277        UDOT    v21.4s, v5.16b, v1.4b[0]
278        LDR     q4,  [x5, 48]
279        UDOT    v22.4s, v5.16b, v2.4b[0]
280        UDOT    v23.4s, v5.16b, v3.4b[0]
281        UDOT    v16.4s, v6.16b, v0.4b[1]
282        UDOT    v17.4s, v6.16b, v1.4b[1]
283        UDOT    v18.4s, v6.16b, v2.4b[1]
284        UDOT    v19.4s, v6.16b, v3.4b[1]
285        ADD     x5, x5, 64              // skip the 64 bytes of weights read above
286        UDOT    v20.4s, v4.16b, v0.4b[1]
287        UDOT    v21.4s, v4.16b, v1.4b[1]
288        UDOT    v22.4s, v4.16b, v2.4b[1]
289        UDOT    v23.4s, v4.16b, v3.4b[1]
290        TBZ     x0, 2, 3b               // no 4-byte tail: back to the ks loop
2915:
292        LDR     s0, [x13], 4
293        LDR     q4, [x5], 16
294        LDR     s1, [x14], 4
295        LDR     s2, [x15], 4
296        LDR     s3, [x10], 4
297        LDR     q5, [x5], 16
298        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
299        UDOT    v25.4s, v7.16b, v1.16b
300        UDOT    v26.4s, v7.16b, v2.16b
301        UDOT    v27.4s, v7.16b, v3.16b
302        UDOT    v16.4s, v4.16b, v0.4b[0]
303        UDOT    v17.4s, v4.16b, v1.4b[0]
304        UDOT    v18.4s, v4.16b, v2.4b[0]
305        UDOT    v19.4s, v4.16b, v3.4b[0]
306        UDOT    v20.4s, v5.16b, v0.4b[0]
307        UDOT    v21.4s, v5.16b, v1.4b[0]
308        UDOT    v22.4s, v5.16b, v2.4b[0]
309        UDOT    v23.4s, v5.16b, v3.4b[0]
310        B       3b
311
312        # Store odd width
        # Reached with x1 = nc - 8 (negative). Subtracting 8 leaves bits 0-2
        # unchanged, so they still encode how many columns (4, 2, 1) remain.
        # After each partial store EXT rotates the stored bytes out, so the next
        # columns of the even rows sit at lane 0 and those of the odd rows at the
        # start of the upper 64 bits.
313        .p2align 3
3146:
315        TBZ     x1, 2, 7f
316        ST1     {v1.s}[2], [x7], 4
317        STR     s1, [x17], 4
318        ST1     {v0.s}[2], [x16], 4
319        STR     s0, [x6], 4
320        EXT     v0.16b, v0.16b, v0.16b, 4
321        EXT     v1.16b, v1.16b, v1.16b, 4
3227:
323        TBZ     x1, 1, 8f
324        ST1     {v1.h}[4], [x7], 2
325        STR     h1, [x17], 2
326        ST1     {v0.h}[4], [x16], 2
327        STR     h0, [x6], 2
328        EXT     v0.16b, v0.16b, v0.16b, 2
329        EXT     v1.16b, v1.16b, v1.16b, 2
3308:
331        TBZ     x1, 0, 9f
332        ST1     {v1.b}[8], [x7]
333        STR     b1, [x17]
334        ST1     {v0.b}[8], [x16]
335        STR     b0, [x6]
3369:
337        RET
338
339END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128
340
// Emit an empty .note.GNU-stack section when building for ELF targets.
// NOTE(review): by GNU toolchain convention this marks the object as not
// requiring an executable stack - confirm against the linker documentation.
341#ifdef __ELF__
342.section ".note.GNU-stack","",%progbits
343#endif
344