xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3 v0
29# A1 x15 v1
30# A2 x13 v2
31# A3  x4 v3
32# B   x5 v4  v5  v6  v7
33# C0  x6 v16 v20 v24 v28
34# C1  x8 v17 v21 v25 v29
35# C2  x9 v18 v22 v26 v30
36# C3  x7 v19 v23 v27 v31
37# unused v8 v9 v10 v11 v12 v13 v14 v15
38
39BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
        // Overview (derived from the instruction stream below):
        //   * The outer loop (label 0) produces 16 int8 output columns for up to
        //     4 rows per pass. Rows that do not exist (row index >= mr) reuse the
        //     previous row's A and C pointers, so they recompute and re-store
        //     that row instead of touching memory outside the caller's buffers.
        //   * Accumulators are int32, 4 columns per register:
        //       v16-v19 = columns  0-3  of rows 0-3
        //       v20-v23 = columns  4-7  of rows 0-3
        //       v24-v27 = columns  8-11 of rows 0-3
        //       v28-v31 = columns 12-15 of rows 0-3
        //   * The inner loop (label 1) consumes 8 bytes of A per row and 128 bytes
        //     of w per iteration; label 3 handles one trailing 4-byte group.
        //   * Label 2 requantizes: int32 -> float, multiply by per-channel scales
        //     read from w, convert back to int32, saturating-narrow to int16, add
        //     a 16-bit value from params, saturating-narrow to int8, clamp.
        //   * Only x0-x9, x11-x13, x15, v0-v7 and v16-v31 are written and nothing
        //     is stored to the stack, so no callee-saved state needs preserving.
40
41        # Clamp A and C pointers
        // ADD, BIC and CSEL do not modify the condition flags, so the flags set
        // by "CMP x0, 2" feed both the LO selects (mr < 2) and the later LS
        // selects (mr <= 2).
42        CMP     x0, 2                   // if mr < 2
43        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
44        ADD     x15, x3, x4             // a1 = a0 + a_stride
45        ADD     x8, x6, x7              // c1 = c0 + cm_stride
46        CSEL    x15, x3, x15, LO        //   a1 = a0
47        CSEL    x8, x6,  x8, LO         //   c1 = c0
        // Second half of the round-up started by "ADD x2, x2, 3": clear the two
        // low bits so x2 is a multiple of 4.
48        BIC     x2, x2, 3
49
50        ADD     x13, x15, x4            // a2 = a1 + a_stride
51        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
52                                        // if mr <= 2
53        CSEL    x13, x15, x13, LS       //   a2 = a1
54        CSEL    x9,  x8,  x9, LS        //   c2 = c1
55
        // The last two arguments are read from [sp] and [sp + 8]. x11 is used
        // below as an address (LD1R ... [x11]), i.e. params is accessed through
        // a pointer.
56        LDP     x12, x11, [sp]          // cn_stride, params
57
        // From here on x4 holds a3 (a_stride is no longer needed) and x7 holds
        // c3 (cm_stride is no longer needed).
58        CMP     x0, 4                   // if mr < 4
59        ADD     x4, x13, x4             // a3 = a2 + a_stride
60        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
61        CSEL    x4, x13, x4, LO         //   a3 = a2
62        CSEL    x7,  x9, x7, LO         //   c3 = c2
63
        // Outer loop over output columns: one pass per 16 columns (branch
        // target "0b" of the B.NE after the full-width store).
64        .p2align 3
650:
66        # Load initial bias from w into accumulators
        // 64 bytes (16 x int32) are read from w and the pointer x5 advances past
        // them. Each loaded register is copied into the three other rows'
        // accumulators, so all four rows start from the same per-column bias.
67        LDP     q16, q20, [x5], 32
68        MOV     v17.16b, v16.16b
69        MOV     v18.16b, v16.16b
70        LDP     q24, q28, [x5], 32
71        MOV     v19.16b, v16.16b
72        MOV     v21.16b, v20.16b
73        MOV     v22.16b, v20.16b
74        MOV     v23.16b, v20.16b
75        MOV     v25.16b, v24.16b
76        MOV     v26.16b, v24.16b
        // x0 (mr) is dead after the pointer clamping above and is reused as the
        // K counter. The vector MOVs that follow do not touch the flags, so the
        // B.LO below still sees the result of this SUBS.
77        SUBS    x0, x2, 8               // k = kc - 8
78        MOV     v27.16b, v24.16b
79        MOV     v29.16b, v28.16b
80        MOV     v30.16b, v28.16b
81        MOV     v31.16b, v28.16b
82        # Is there at least 8 bytes?
        // Rounded kc < 8: skip the 8-byte loop and go to the 4-byte path at 3.
83        B.LO    3f
84
85        # Main loop - 8 bytes of A
        // Per iteration: 8 bytes of A from each row (two 4-byte groups, selected
        // as lanes [0] and [1] of v0-v3) against 128 bytes of w. Every q register
        // of w holds 4 columns x 4 K-bytes; each SDOT adds, per 32-bit lane, the
        // dot product of 4 signed bytes of w with the selected 4 bytes of A.
        // Loads of w are interleaved so a register is reloaded only after the
        // last SDOT that reads its previous contents.
86        .p2align 3
871:
88        LDR     d0,  [x3], 8
89        LDR     q4,  [x5], 16
90        LDR     d1, [x15], 8
91        LDR     d2, [x13], 8
92        LDR     d3,  [x4], 8
93        LDR     q5,  [x5], 16
94        SDOT    v16.4s, v4.16b,  v0.4b[0]
95        SDOT    v17.4s, v4.16b,  v1.4b[0]
96        LDP     q6, q7, [x5], 32
97        SDOT    v18.4s, v4.16b,  v2.4b[0]
98        SDOT    v19.4s, v4.16b,  v3.4b[0]
99        SDOT    v20.4s, v5.16b,  v0.4b[0]
100        SDOT    v21.4s, v5.16b,  v1.4b[0]
101        SDOT    v22.4s, v5.16b,  v2.4b[0]
102        SDOT    v23.4s, v5.16b,  v3.4b[0]
103        SDOT    v24.4s, v6.16b, v0.4b[0]
104        SDOT    v25.4s, v6.16b, v1.4b[0]
        // v4/v5 are free again: reload them with columns 0-7 of the second
        // 4-byte K-group.
105        LDP     q4, q5, [x5], 32
106        SDOT    v26.4s, v6.16b, v2.4b[0]
107        SDOT    v27.4s, v6.16b, v3.4b[0]
108        SDOT    v28.4s, v7.16b, v0.4b[0]
109        SDOT    v29.4s, v7.16b, v1.4b[0]
110        SDOT    v30.4s, v7.16b, v2.4b[0]
111        SDOT    v31.4s, v7.16b, v3.4b[0]
112        SDOT    v16.4s, v4.16b,  v0.4b[1]
113        SDOT    v17.4s, v4.16b,  v1.4b[1]
        // v6/v7 are free again: reload them with columns 8-15 of the second
        // 4-byte K-group.
114        LDP     q6, q7, [x5], 32
115        SDOT    v18.4s, v4.16b,  v2.4b[1]
116        SDOT    v19.4s, v4.16b,  v3.4b[1]
117        SDOT    v20.4s, v5.16b,  v0.4b[1]
118        SDOT    v21.4s, v5.16b,  v1.4b[1]
119        SDOT    v22.4s, v5.16b,  v2.4b[1]
120        SDOT    v23.4s, v5.16b,  v3.4b[1]
121        SDOT    v24.4s, v6.16b,  v0.4b[1]
122        SDOT    v25.4s, v6.16b,  v1.4b[1]
123        SDOT    v26.4s, v6.16b,  v2.4b[1]
124        SDOT    v27.4s, v6.16b,  v3.4b[1]
125        SDOT    v28.4s, v7.16b,  v0.4b[1]
126        SDOT    v29.4s, v7.16b,  v1.4b[1]
127        SDOT    v30.4s, v7.16b,  v2.4b[1]
        // k -= 8; the SDOT in between does not touch the flags, so B.HS keeps
        // looping while the subtraction did not borrow (k >= 8 before it).
128        SUBS    x0, x0, 8
129        SDOT    v31.4s, v7.16b,  v3.4b[1]
130        B.HS    1b
131
132        # Is there a remainder?- 4 bytes of A
        // x2 is a multiple of 4, so on loop exit x0 is either -8 (bit 2 clear,
        // nothing left) or -4 (bit 2 set, one 4-byte group left).
133        TBNZ    x0, 2, 3f
134
        // Requantization and store. Reached by fall-through from the main loop
        // and by the "B 2b" at the end of the 4-byte remainder path.
1352:
        // int32 accumulators -> float32.
136        SCVTF   v16.4s, v16.4s
137        SCVTF   v17.4s, v17.4s
138        # Load per channel scale values from weights
        // Four 16-byte loads from x5 in total (16 x float32, one per column):
        // v4 = columns 0-3, v5 = columns 4-7, v6 = columns 8-11, and v4 is
        // reloaded with columns 12-15 after its last use for columns 0-3.
139        LDR     q4, [x5], 16
140        SCVTF   v18.4s, v18.4s
141        SCVTF   v19.4s, v19.4s
142        LDR     q5, [x5], 16
143        SCVTF   v20.4s, v20.4s
144        SCVTF   v21.4s, v21.4s
145        SCVTF   v22.4s, v22.4s
146        SCVTF   v23.4s, v23.4s
147        SCVTF   v24.4s, v24.4s
148        SCVTF   v25.4s, v25.4s
149        SCVTF   v26.4s, v26.4s
150        SCVTF   v27.4s, v27.4s
151        SCVTF   v28.4s, v28.4s
152        SCVTF   v29.4s, v29.4s
153        SCVTF   v30.4s, v30.4s
154        SCVTF   v31.4s, v31.4s
155
        // Multiply every row's accumulators by the scale of their columns.
156        LDR     q6, [x5], 16
157        FMUL    v16.4s, v16.4s, v4.4s
158        FMUL    v17.4s, v17.4s, v4.4s
159        FMUL    v18.4s, v18.4s, v4.4s
160        FMUL    v19.4s, v19.4s, v4.4s
161        FMUL    v20.4s, v20.4s, v5.4s
162        LDR     q4, [x5], 16
163        FMUL    v21.4s, v21.4s, v5.4s
164        FMUL    v22.4s, v22.4s, v5.4s
165        FMUL    v23.4s, v23.4s, v5.4s
166        FMUL    v24.4s, v24.4s, v6.4s
167        FMUL    v25.4s, v25.4s, v6.4s
168        FMUL    v26.4s, v26.4s, v6.4s
169        FMUL    v27.4s, v27.4s, v6.4s
170        FMUL    v28.4s, v28.4s, v4.4s
171        FMUL    v29.4s, v29.4s, v4.4s
172        FMUL    v30.4s, v30.4s, v4.4s
173        FMUL    v31.4s, v31.4s, v4.4s
174
        // float32 -> int32 (FCVTNS rounds to nearest, ties to even).
175        FCVTNS  v16.4s, v16.4s
176        FCVTNS  v17.4s, v17.4s
177        FCVTNS  v18.4s, v18.4s
178        FCVTNS  v19.4s, v19.4s
179        FCVTNS  v20.4s, v20.4s
180        FCVTNS  v21.4s, v21.4s
181        FCVTNS  v22.4s, v22.4s
182        FCVTNS  v23.4s, v23.4s
183        FCVTNS  v24.4s, v24.4s
184        FCVTNS  v25.4s, v25.4s
185        FCVTNS  v26.4s, v26.4s
186        FCVTNS  v27.4s, v27.4s
187        FCVTNS  v28.4s, v28.4s
188        FCVTNS  v29.4s, v29.4s
189        FCVTNS  v30.4s, v30.4s
190        FCVTNS  v31.4s, v31.4s
191
        // Saturating narrow int32 -> int16. The low halves take columns 0-3
        // (v16-v19) and 8-11 (v24-v27); the SQXTN2 group below fills the high
        // halves with columns 4-7 and 12-15, leaving v16-v19 = columns 0-7 and
        // v24-v27 = columns 8-15 of rows 0-3.
192        SQXTN   v16.4h, v16.4s
193        SQXTN   v17.4h, v17.4s
194        SQXTN   v18.4h, v18.4s
195        SQXTN   v19.4h, v19.4s
196        SQXTN   v24.4h, v24.4s
197        SQXTN   v25.4h, v25.4s
198        SQXTN   v26.4h, v26.4s
199        SQXTN   v27.4h, v27.4s
        // Loads one 16-bit value from params, replicated to all 8 lanes, and
        // advances x11 by 2.
        // NOTE(review): this "bias" is presumably the output zero point; confirm
        // against the params layout.
200        LD1R    {v6.8h}, [x11], 2       // add bias
201
202        SQXTN2  v16.8h, v20.4s
203        SQXTN2  v17.8h, v21.4s
204        SQXTN2  v18.8h, v22.4s
205        SQXTN2  v19.8h, v23.4s
206        SQXTN2  v24.8h, v28.4s
207        SQXTN2  v25.8h, v29.4s
208        SQXTN2  v26.8h, v30.4s
209        SQXTN2  v27.8h, v31.4s
210
        // Saturating int16 add of the replicated 16-bit params value.
211        SQADD   v16.8h, v16.8h, v6.8h
212        SQADD   v17.8h, v17.8h, v6.8h
213        SQADD   v18.8h, v18.8h, v6.8h
214        SQADD   v19.8h, v19.8h, v6.8h
215        SQADD   v24.8h, v24.8h, v6.8h
216        SQADD   v25.8h, v25.8h, v6.8h
217        SQADD   v26.8h, v26.8h, v6.8h
218        SQADD   v27.8h, v27.8h, v6.8h
219        LD1R    {v4.16b}, [x11], 1      // clamp min value
220
        // Saturating narrow int16 -> int8: v0-v3 end up holding the 16 output
        // bytes of rows 0-3 (columns 0-7 in the low half, 8-15 in the high half).
221        SQXTN   v0.8b, v16.8h
222        SQXTN   v1.8b, v17.8h
223        SQXTN   v2.8b, v18.8h
224        SQXTN   v3.8b, v19.8h
225        LD1R    {v5.16b}, [x11]         // clamp max value
226        SQXTN2  v0.16b, v24.8h
227        SQXTN2  v1.16b, v25.8h
228        SQXTN2  v2.16b, v26.8h
229        SQXTN2  v3.16b, v27.8h
        // Undo the two post-increments above (2 + 1 bytes) so x11 points at the
        // start of the params data again for the next pass.
230        SUB     x11, x11, 3            // rewind params pointer
231
        // Clamp to [min, max] as signed bytes.
232        SMAX    v0.16b, v0.16b, v4.16b
233        SMAX    v1.16b, v1.16b, v4.16b
234        SMAX    v2.16b, v2.16b, v4.16b
235        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. None of SMIN, ST1 or the non-flag-setting SUBs below change
        // the flags, so this one SUBS drives both the B.LO (fewer than 16
        // columns were left) and the B.NE after the full-width store.
236        SUBS    x1, x1, 16
237        SMIN    v0.16b, v0.16b, v5.16b
238        SMIN    v1.16b, v1.16b, v5.16b
239        SMIN    v2.16b, v2.16b, v5.16b
240        SMIN    v3.16b, v3.16b, v5.16b
241        B.LO    4f
242
243        # Store full 4 x 16
        // Each ST1 post-increments its C row pointer by cn_stride (x12). The A
        // row pointers are moved back by the rounded kc in x2, which is the total
        // distance the loads above advanced them during this pass.
244        ST1     {v0.16b}, [x6], x12
245        SUB     x3,  x3, x2             // a0 -= kc
246        ST1     {v1.16b}, [x8], x12
247        SUB     x15, x15, x2            // a1 -= kc
248        ST1     {v2.16b}, [x9], x12
249        SUB     x13, x13, x2            // a2 -= kc
250        ST1     {v3.16b}, [x7], x12
251        SUB     x4,  x4, x2             // a3 -= kc
        // Another pass while columns remain (nc != 0 after the SUBS above).
252        B.NE    0b
253        RET
254
255
256        # Remainder- 4 bytes of A
        // One 4-byte K-group: 4 bytes of A per row (lane [0] of v0-v3) against
        // 64 bytes of w, then branch back to the requantization code at 2.
257        .p2align 3
2583:
259        LDR     s0,  [x3], 4
260        LDR     q4, [x5], 16
261        LDR     s1, [x15], 4
262        LDR     s2, [x13], 4
263        LDR     s3,  [x4], 4
264        SDOT    v16.4s, v4.16b,  v0.4b[0]
265        LDR     q5, [x5], 16
266        SDOT    v17.4s, v4.16b,  v1.4b[0]
267        SDOT    v18.4s, v4.16b,  v2.4b[0]
268        SDOT    v19.4s, v4.16b,  v3.4b[0]
269        SDOT    v20.4s, v5.16b,  v0.4b[0]
270        LDP     q6, q7, [x5], 32
271        SDOT    v21.4s, v5.16b,  v1.4b[0]
272        SDOT    v22.4s, v5.16b,  v2.4b[0]
273        SDOT    v23.4s, v5.16b,  v3.4b[0]
274        SDOT    v24.4s, v6.16b, v0.4b[0]
275        SDOT    v25.4s, v6.16b, v1.4b[0]
276        SDOT    v26.4s, v6.16b, v2.4b[0]
277        SDOT    v27.4s, v6.16b, v3.4b[0]
278        SDOT    v28.4s, v7.16b, v0.4b[0]
279        SDOT    v29.4s, v7.16b, v1.4b[0]
280        SDOT    v30.4s, v7.16b, v2.4b[0]
281        SDOT    v31.4s, v7.16b, v3.4b[0]
282        B       2b
283
284        # Store odd width
        // Reached when fewer than 16 columns were left. x1 now holds (remaining
        // columns - 16); subtracting 16 leaves the low four bits unchanged, so
        // bits 3..0 of x1 still encode the remaining column count. For each set
        // bit, store 8/4/2/1 bytes per row, then use DUP to move the next
        // not-yet-stored bytes down into the low lanes of v0-v3.
285        .p2align 3
2864:
286        TBZ     x1, 3, 5f
287        STR     d0, [x6], 8
288        STR     d1, [x8], 8
289        DUP     d0, v0.d[1]
290        DUP     d1, v1.d[1]
291        STR     d2, [x9], 8
292        STR     d3, [x7], 8
293        DUP     d2, v2.d[1]
294        DUP     d3, v3.d[1]
2955:
296        TBZ     x1, 2, 6f
297        STR     s0, [x6], 4
298        STR     s1, [x8], 4
299        DUP     s0, v0.s[1]
300        DUP     s1, v1.s[1]
301        STR     s2, [x9], 4
302        STR     s3, [x7], 4
303        DUP     s2, v2.s[1]
304        DUP     s3, v3.s[1]
3056:
306        TBZ     x1, 1, 7f
307        STR     h0, [x6], 2
308        STR     h1, [x8], 2
309        DUP     h0, v0.h[1]
310        DUP     h1, v1.h[1]
311        STR     h2, [x9], 2
312        STR     h3, [x7], 2
313        DUP     h2, v2.h[1]
314        DUP     h3, v3.h[1]
3157:
        // Final single byte per row; no pointer update because the function
        // returns right after.
316        TBZ     x1, 0, 8f
317        STR     b0, [x6]
318        STR     b1, [x8]
319        STR     b2, [x9]
320        STR     b3, [x7]
3218:
322        RET
323
324END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
326
// On ELF targets only, emit an empty .note.GNU-stack section (no flags).
// GNU toolchains conventionally read this marker as "this object does not
// need an executable stack".
327#ifdef __ELF__
328.section ".note.GNU-stack","",%progbits
329#endif
330