xref: /aosp_15_r20/external/XNNPACK/src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3 v0
29# A1 x15 v1
30# A2 x13 v2
31# A3  x4 v3
32# B   x5 v4  v5  v6  v7
33# C0  x6 v16 v20 v24 v28
34# C1  x8 v17 v21 v25 v29
35# C2  x9 v18 v22 v26 v30
36# C3  x7 v19 v23 v27 v31
37# unused v8 v9 v10 v11 v12 v13 v14 v15
38
39BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
40
41        # Clamp A and C pointers
42        CMP     x0, 2                   // if mr < 2
43        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
44        ADD     x15, x3, x4             // a1 = a0 + a_stride
45        ADD     x8, x6, x7              // c1 = c0 + cm_stride
46        CSEL    x15, x3, x15, LO        //   a1 = a0
47        CSEL    x8, x6,  x8, LO         //   c1 = c0
48        BIC     x2, x2, 3
49
50        ADD     x13, x15, x4            // a2 = a1 + a_stride
51        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
52                                        // if mr <= 2
53        CSEL    x13, x15, x13, LS       //   a2 = a1
54        CSEL    x9,  x8,  x9, LS        //   c2 = c1
55
56        LDP     x12, x11, [sp]          // cn_stride, params
57
58        CMP     x0, 4                   // if mr < 4
59        ADD     x4, x13, x4             // a3 = a2 + a_stride
60        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
61        CSEL    x4, x13, x4, LO         //   a3 = a2
62        CSEL    x7,  x9, x7, LO         //   c3 = c2
63
64        .p2align 3
650:
66        # Load initial bias from w into accumulators
67        LDP     q16, q20, [x5], 32
68        MOV     v17.16b, v16.16b
69        MOV     v18.16b, v16.16b
70        LDP     q24, q28, [x5], 32
71        MOV     v19.16b, v16.16b
72        MOV     v21.16b, v20.16b
73        MOV     v22.16b, v20.16b
74        MOV     v23.16b, v20.16b
75        MOV     v25.16b, v24.16b
76        MOV     v26.16b, v24.16b
77        MOV     x0, x2                  // k = kc.  assumes kc > 0
78        MOV     v27.16b, v24.16b
79        MOV     v29.16b, v28.16b
80        MOV     v30.16b, v28.16b
81        MOV     v31.16b, v28.16b
82
83        # Main loop - 4 bytes of A
84        .p2align 3
851:
86        LD1R    {v0.4s},  [x3], 4
87        LDR     q4, [x5], 16
88        LD1R    {v1.4s}, [x15], 4
89        LD1R    {v2.4s}, [x13], 4
90        LD1R    {v3.4s},  [x4], 4
91        SDOT    v16.4s, v4.16b, v0.16b
92        SDOT    v17.4s, v4.16b, v1.16b
93        LDR     q5, [x5], 16
94        SDOT    v18.4s, v4.16b, v2.16b
95        SDOT    v19.4s, v4.16b, v3.16b
96        LDR     q6, [x5], 16
97        SDOT    v20.4s, v5.16b, v0.16b
98        SDOT    v21.4s, v5.16b, v1.16b
99        LDR     q7, [x5], 16
100        SDOT    v22.4s, v5.16b, v2.16b
101        SDOT    v23.4s, v5.16b, v3.16b
102        SUBS    x0, x0, 4
103        SDOT    v24.4s, v6.16b, v0.16b
104        SDOT    v25.4s, v6.16b, v1.16b
105        SDOT    v26.4s, v6.16b, v2.16b
106        SDOT    v27.4s, v6.16b, v3.16b
107        SDOT    v28.4s, v7.16b, v0.16b
108        SDOT    v29.4s, v7.16b, v1.16b
109        SDOT    v30.4s, v7.16b, v2.16b
110        SDOT    v31.4s, v7.16b, v3.16b
111        B.HI    1b
112
113        SCVTF   v16.4s, v16.4s
114        SCVTF   v17.4s, v17.4s
115        # Load per channel scale values from weights
116        LDR     q4, [x5], 16
117        SCVTF   v18.4s, v18.4s
118        SCVTF   v19.4s, v19.4s
119        LDR     q5, [x5], 16
120        SCVTF   v20.4s, v20.4s
121        SCVTF   v21.4s, v21.4s
122        SCVTF   v22.4s, v22.4s
123        SCVTF   v23.4s, v23.4s
124        SCVTF   v24.4s, v24.4s
125        SCVTF   v25.4s, v25.4s
126        SCVTF   v26.4s, v26.4s
127        SCVTF   v27.4s, v27.4s
128        SCVTF   v28.4s, v28.4s
129        SCVTF   v29.4s, v29.4s
130        SCVTF   v30.4s, v30.4s
131        SCVTF   v31.4s, v31.4s
132
133        LDR     q6, [x5], 16
134        FMUL    v16.4s, v16.4s, v4.4s
135        FMUL    v17.4s, v17.4s, v4.4s
136        FMUL    v18.4s, v18.4s, v4.4s
137        FMUL    v19.4s, v19.4s, v4.4s
138        FMUL    v20.4s, v20.4s, v5.4s
139        LDR     q4, [x5], 16
140        FMUL    v21.4s, v21.4s, v5.4s
141        FMUL    v22.4s, v22.4s, v5.4s
142        FMUL    v23.4s, v23.4s, v5.4s
143        FMUL    v24.4s, v24.4s, v6.4s
144        FMUL    v25.4s, v25.4s, v6.4s
145        FMUL    v26.4s, v26.4s, v6.4s
146        FMUL    v27.4s, v27.4s, v6.4s
147        FMUL    v28.4s, v28.4s, v4.4s
148        FMUL    v29.4s, v29.4s, v4.4s
149        FMUL    v30.4s, v30.4s, v4.4s
150        FMUL    v31.4s, v31.4s, v4.4s
151
152        FCVTNS  v16.4s, v16.4s
153        FCVTNS  v17.4s, v17.4s
154        FCVTNS  v18.4s, v18.4s
155        FCVTNS  v19.4s, v19.4s
156        FCVTNS  v20.4s, v20.4s
157        FCVTNS  v21.4s, v21.4s
158        FCVTNS  v22.4s, v22.4s
159        FCVTNS  v23.4s, v23.4s
160        FCVTNS  v24.4s, v24.4s
161        FCVTNS  v25.4s, v25.4s
162        FCVTNS  v26.4s, v26.4s
163        FCVTNS  v27.4s, v27.4s
164        FCVTNS  v28.4s, v28.4s
165        FCVTNS  v29.4s, v29.4s
166        FCVTNS  v30.4s, v30.4s
167        FCVTNS  v31.4s, v31.4s
168
169        SQXTN   v16.4h, v16.4s
170        SQXTN   v17.4h, v17.4s
171        SQXTN   v18.4h, v18.4s
172        SQXTN   v19.4h, v19.4s
173        SQXTN   v24.4h, v24.4s
174        SQXTN   v25.4h, v25.4s
175        SQXTN   v26.4h, v26.4s
176        SQXTN   v27.4h, v27.4s
177        LD1R    {v6.8h}, [x11], 2       // add bias
178
179        SQXTN2  v16.8h, v20.4s
180        SQXTN2  v17.8h, v21.4s
181        SQXTN2  v18.8h, v22.4s
182        SQXTN2  v19.8h, v23.4s
183        SQXTN2  v24.8h, v28.4s
184        SQXTN2  v25.8h, v29.4s
185        SQXTN2  v26.8h, v30.4s
186        SQXTN2  v27.8h, v31.4s
187
188        SQADD   v16.8h, v16.8h, v6.8h
189        SQADD   v17.8h, v17.8h, v6.8h
190        SQADD   v18.8h, v18.8h, v6.8h
191        SQADD   v19.8h, v19.8h, v6.8h
192        SQADD   v24.8h, v24.8h, v6.8h
193        SQADD   v25.8h, v25.8h, v6.8h
194        SQADD   v26.8h, v26.8h, v6.8h
195        SQADD   v27.8h, v27.8h, v6.8h
196        LD1R    {v4.16b}, [x11], 1      // clamp min value
197
198        SQXTN   v0.8b, v16.8h
199        SQXTN   v1.8b, v17.8h
200        SQXTN   v2.8b, v18.8h
201        SQXTN   v3.8b, v19.8h
202        LD1R    {v5.16b}, [x11]         // clamp max value
203        SQXTN2  v0.16b, v24.8h
204        SQXTN2  v1.16b, v25.8h
205        SQXTN2  v2.16b, v26.8h
206        SQXTN2  v3.16b, v27.8h
207        SUB     x11, x11, 3            // rewind params pointer
208
209        SMAX    v0.16b, v0.16b, v4.16b
210        SMAX    v1.16b, v1.16b, v4.16b
211        SMAX    v2.16b, v2.16b, v4.16b
212        SMAX    v3.16b, v3.16b, v4.16b
213        SUBS    x1, x1, 16
214        SMIN    v0.16b, v0.16b, v5.16b
215        SMIN    v1.16b, v1.16b, v5.16b
216        SMIN    v2.16b, v2.16b, v5.16b
217        SMIN    v3.16b, v3.16b, v5.16b
218        B.LO    2f
219
220        # Store full 4 x 16
221        ST1     {v0.16b}, [x6], x12
222        SUB     x3,  x3, x2             // a0 -= kc
223        ST1     {v1.16b}, [x8], x12
224        SUB     x15, x15, x2            // a1 -= kc
225        ST1     {v2.16b}, [x9], x12
226        SUB     x13, x13, x2            // a2 -= kc
227        ST1     {v3.16b}, [x7], x12
228        SUB     x4,  x4, x2             // a3 -= kc
229        B.NE    0b
230        RET
231
232        # Store odd width
233        .p2align 3
2342:
235        TBZ     x1, 3, 3f
236        STR     d0, [x6], 8
237        STR     d1, [x8], 8
238        DUP     d0, v0.d[1]
239        DUP     d1, v1.d[1]
240        STR     d2, [x9], 8
241        STR     d3, [x7], 8
242        DUP     d2, v2.d[1]
243        DUP     d3, v3.d[1]
2443:
245        TBZ     x1, 2, 4f
246        STR     s0, [x6], 4
247        STR     s1, [x8], 4
248        DUP     s0, v0.s[1]
249        DUP     s1, v1.s[1]
250        STR     s2, [x9], 4
251        STR     s3, [x7], 4
252        DUP     s2, v2.s[1]
253        DUP     s3, v3.s[1]
2544:
255        TBZ     x1, 1, 5f
256        STR     h0, [x6], 2
257        STR     h1, [x8], 2
258        DUP     h0, v0.h[1]
259        DUP     h1, v1.h[1]
260        STR     h2, [x9], 2
261        STR     h3, [x7], 2
262        DUP     h2, v2.h[1]
263        DUP     h3, v3.h[1]
2645:
265        TBZ     x1, 0, 6f
266        STR     b0, [x6]
267        STR     b1, [x8]
268        STR     b2, [x9]
269        STR     b3, [x7]
2706:
271        RET
272
273END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
274
275#ifdef __ELF__
276.section ".note.GNU-stack","",%progbits
277#endif
278