// xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3               // round kc down to a multiple of 4

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes?
        B.LO    3f

        # Main loop - 8 bytes of A
        .p2align 3
1:
        LDR     d0,  [x3], 8
        LDR     q4,  [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        LDR     q5,  [x5], 16
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b,  v0.4b[1]
        SDOT    v17.4s, v4.16b,  v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[1]
        SDOT    v19.4s, v4.16b,  v3.4b[1]
        SDOT    v20.4s, v5.16b,  v0.4b[1]
        SDOT    v21.4s, v5.16b,  v1.4b[1]
        SDOT    v22.4s, v5.16b,  v2.4b[1]
        SDOT    v23.4s, v5.16b,  v3.4b[1]
        SDOT    v24.4s, v6.16b,  v0.4b[1]
        SDOT    v25.4s, v6.16b,  v1.4b[1]
        SDOT    v26.4s, v6.16b,  v2.4b[1]
        SDOT    v27.4s, v6.16b,  v3.4b[1]
        SDOT    v28.4s, v7.16b,  v0.4b[1]
        SDOT    v29.4s, v7.16b,  v1.4b[1]
        SDOT    v30.4s, v7.16b,  v2.4b[1]
        SUBS    x0, x0, 8
        SDOT    v31.4s, v7.16b,  v3.4b[1]
        B.HS    1b

        # Is there a remainder?- 4 bytes of A
        TBNZ    x0, 2, 3f

2:
        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        # Narrow 32-bit accumulators to 16 bits with saturation
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16-bit values to 8 bits with saturation
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    4f                      // partial tile of C remains

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b
        RET


        # Remainder- 4 bytes of A
        .p2align 3
3:
        LDR     s0,  [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3,  [x4], 4
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
// Mark the stack as non-executable (required so the linker does not
// fall back to an executable-stack default for hand-written assembly).
.section ".note.GNU-stack","",%progbits
#endif