// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
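#
# Editor's annotation, a hedged C-like sketch of what the kernel computes
# (not part of the generated source; round_up/round/clamp and the w indexing
# are shorthand, not the actual packed-weight layout):
#
#   for (size_t m = 0; m < mr; m++)
#     for (size_t n = 0; n < 16; n++) {
#       int32_t acc = bias[n];                    // bias is the prefix of w
#       for (size_t k = 0; k < round_up(kc, 4); k++)
#         acc += (int32_t) a[m][k] * (int32_t) w[k][n];
#       int32_t q = round(scale * (float) acc) + output_zero_point;
#       c[m][n] = (int8_t) clamp(q, output_min, output_max);
#     }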

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
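#
# (Editor's note: 4x16c4 names a 4-row x 16-column output tile with K
# consumed in groups of 4 int8 values per SDOT; "ld32" refers to the 32-bit
# loads of A in the main loop.)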

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2
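        # When mr < 4 the pointers of the absent rows alias the last valid
        # row, so the extra rows are computed redundantly but every load and
        # store stays on valid addresses.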

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
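        # w begins with 16 int32 biases, one per output channel; q16/q20/
        # q24/q28 each take four of them and are copied into the other
        # rows' accumulators.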

        # Main loop - 4 bytes of A
        .p2align 3
1:
        LD1R    {v0.4s},  [x3], 4
        LDR     q4, [x5], 16
        LD1R    {v1.4s}, [x15], 4
        LD1R    {v2.4s}, [x13], 4
        LD1R    {v3.4s},  [x4], 4
        SDOT    v16.4s, v4.16b, v0.16b
        SDOT    v17.4s, v4.16b, v1.16b
        LDR     q5, [x5], 16
        SDOT    v18.4s, v4.16b, v2.16b
        SDOT    v19.4s, v4.16b, v3.16b
        LDR     q6, [x5], 16
        SDOT    v20.4s, v5.16b, v0.16b
        SDOT    v21.4s, v5.16b, v1.16b
        LDR     q7, [x5], 16
        SDOT    v22.4s, v5.16b, v2.16b
        SDOT    v23.4s, v5.16b, v3.16b
        SUBS    x0, x0, 4
        SDOT    v24.4s, v6.16b, v0.16b
        SDOT    v25.4s, v6.16b, v1.16b
        SDOT    v26.4s, v6.16b, v2.16b
        SDOT    v27.4s, v6.16b, v3.16b
        SDOT    v28.4s, v7.16b, v0.16b
        SDOT    v29.4s, v7.16b, v1.16b
        SDOT    v30.4s, v7.16b, v2.16b
        SDOT    v31.4s, v7.16b, v3.16b
        B.HI    1b
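        # LD1R broadcasts one 4-byte group of A to all four 32-bit lanes;
        # each SDOT then adds a 4-element int8 dot product per lane, roughly
        # v16.s[j] += dot(a0[k..k+3], b[j][k..k+3]) with b the packed
        # weights in v4..v7 (the indexing is the editor's shorthand).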

        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

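        # This is the "fp32" requantization path named in the kernel: the
        # int32 accumulators become floats, are scaled by v4, and FCVTNS
        # rounds back to int32 with round-to-nearest, ties to even.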
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

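        # SQXTN/SQXTN2 above packed the int32 results into saturated int16;
        # SQADD below adds the replicated output zero point held in v6,
        # also with saturation.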
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 7             // rewind params pointer
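        # (Editor's note: the loads above imply the params layout: fp32
        # scale [4B], int16 output zero point [2B], int8 min [1B], int8 max
        # [1B]; 7 bytes were consumed, hence the rewind by 7.)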

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    2f

        # Store full 4 x 16
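        # The a pointers rewind by kc and the c pointers advance by
        # cn_stride, so the next 16-column strip restarts at 0b while
        # nc != 0.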
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b
        RET

        # Store odd width
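        # x1 holds nc - 16 here; its low four bits equal the remaining
        # column count, so TBZ on bits 3..0 selects 8-, 4-, 2- and 1-byte
        # stores per row, with DUP shifting the next lanes down in between.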
        .p2align 3
2:
        TBZ     x1, 3, 3f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
3:
        TBZ     x1, 2, 4f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
4:
        TBZ     x1, 1, 5f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
5:
        TBZ     x1, 0, 6f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
6:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif