// xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

        // Indirect GEMM micro-kernel producing a tile of 4 rows x 16 columns
        // of int8 output per pass of the nc loop. Products are accumulated in
        // int32 with SDOT (4 bytes of K per 32-bit lane), then scaled in fp32
        // with per-column scales read from w, converted back to integers,
        // narrowed with saturation and clamped.
        //
        // Numeric local labels:
        //   0:    nc loop  - one 16-column tile per iteration
        //   1:    ks loop  - one group of 4 A row pointers per iteration
        //   2:    kc loop  - 8 bytes of K per iteration
        //   4:    kc tail  - the last 4 bytes of K
        //   3:    requantize, clamp and store the tile
        //   5-9:  store of a tile narrower than 16 columns
        //
        // Only x0-x17, v0-v7 and v16-v31 are written and sp is never
        // adjusted, so there is no prologue or epilogue.

        # Clamp C pointers
        // A row whose index is >= mr gets the same pointer as the row before
        // it. The kc round-up and the stack-argument loads are interleaved
        // with the pointer setup.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags are still those of CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // clear the low 2 bits: kc is now a multiple of 4

        CMP     x0, 4                   // if mr < 4
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL    x7,  x17, x7, LO        //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 16 int32 values, one per output column; all four rows start from
        // the same values. w (x5) advances by 64 bytes.
        LDP     q16, q20, [x5], 32      // row 0: columns 0-3, 4-7
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32      // row 0: columns 8-11, 12-15
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16      // a0, a1
        LDP     x15, x10, [x4], 16      // a2, a3

        // A pointer equal to the zero pointer is used unchanged; any other
        // pointer is advanced by a_offset. The ADD is done unconditionally
        // and CSEL discards its result when the pointers compared equal.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 8 bytes for main loop?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f                      // kc < 8: go straight to the 4-byte tail

        # Main loop - 8 bytes of A
        // Each iteration reads 8 bytes from every A row and 128 bytes of B.
        // v4-v7 each hold 4 columns x 4 bytes of K. Lane [0] of v0-v3 is the
        // first 4 bytes of K of a row, lane [1] the next 4 bytes.
        // B loads are interleaved with the SDOTs that consume earlier loads.
        .p2align 3
2:
        LDR     d0, [x13], 8            // 8 bytes of row 0
        LDR     q4,  [x5], 16           // B: columns 0-3, K bytes 0-3
        LDR     d1, [x14], 8            // 8 bytes of row 1
        LDR     d2, [x15], 8            // 8 bytes of row 2
        LDR     d3, [x10], 8            // 8 bytes of row 3
        LDR     q5,  [x5], 16           // B: columns 4-7, K bytes 0-3
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32        // B: columns 8-11 and 12-15, K bytes 0-3
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b,  v0.4b[0]
        SDOT    v25.4s, v6.16b,  v1.4b[0]
        LDP     q4, q5, [x5], 32        // B: columns 0-3 and 4-7, K bytes 4-7
        SDOT    v26.4s, v6.16b,  v2.4b[0]
        SDOT    v27.4s, v6.16b,  v3.4b[0]
        SDOT    v28.4s, v7.16b,  v0.4b[0]
        SDOT    v29.4s, v7.16b,  v1.4b[0]
        SDOT    v30.4s, v7.16b,  v2.4b[0]
        SDOT    v31.4s, v7.16b,  v3.4b[0]
        SDOT    v16.4s, v4.16b,  v0.4b[1]
        SDOT    v17.4s, v4.16b,  v1.4b[1]
        LDP     q6, q7, [x5], 32        // B: columns 8-11 and 12-15, K bytes 4-7
        SDOT    v18.4s, v4.16b,  v2.4b[1]
        SDOT    v19.4s, v4.16b,  v3.4b[1]
        SDOT    v20.4s, v5.16b,  v0.4b[1]
        SDOT    v21.4s, v5.16b,  v1.4b[1]
        SDOT    v22.4s, v5.16b,  v2.4b[1]
        SDOT    v23.4s, v5.16b,  v3.4b[1]
        SDOT    v24.4s, v6.16b,  v0.4b[1]
        SDOT    v25.4s, v6.16b,  v1.4b[1]
        SDOT    v26.4s, v6.16b,  v2.4b[1]
        SDOT    v27.4s, v6.16b,  v3.4b[1]
        SDOT    v28.4s, v7.16b,  v0.4b[1]
        SDOT    v29.4s, v7.16b,  v1.4b[1]
        SDOT    v30.4s, v7.16b,  v2.4b[1]
        SUBS    x0, x0, 8               // k -= 8
        SDOT    v31.4s, v7.16b,  v3.4b[1]
        B.HS    2b                      // loop while the subtraction did not borrow

        # Is there a remainder?- 4 bytes of A
        // kc is a multiple of 4, so x0 is -4 when 4 bytes of K are left and
        // -8 when none are. Bit 2 is set only for -4.
        TBNZ    x0, 2, 4f

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

3:
        // Requantize: int32 -> fp32, multiply by the per-column scale,
        // convert back to int32, narrow to int16, add a 16-bit value from
        // params, narrow to int8, clamp to [min, max].
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 is free again)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // fp32 -> int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow to int16. After the SQXTN2 group v16-v19 hold
        // columns 0-7 of rows 0-3 and v24-v27 hold columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2        // add bias (presumably the output zero point - confirm against the params layout)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1       // clamp min value

        // Saturating narrow to int8: v0-v3 = 16 output bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]          // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                 // cn_stride (x0 is no longer needed as the k counter)
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 3          // rewind params pointer (undo the +2 and +1 post-increments)
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16               // nc -= 16; these flags drive both B.LO 5f and B.HI 0b
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                       // fewer than 16 columns were left

        # Store full 4 x 16
        // Each row pointer is post-incremented by cn_stride.
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        // SUB, not SUBS: the flags from SUBS x1 must survive until B.HI.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // columns remain: next tile
        RET

        # Remainder- 4 bytes of A
        // Same computation as the first half of the main loop, reading
        // 4 bytes from every A row and 64 bytes of B.
        .p2align 3
4:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b,  v0.4b[0]
        SDOT    v25.4s, v6.16b,  v1.4b[0]
        SDOT    v26.4s, v6.16b,  v2.4b[0]
        SDOT    v27.4s, v6.16b,  v3.4b[0]
        SDOT    v28.4s, v7.16b,  v0.4b[0]
        SDOT    v29.4s, v7.16b,  v1.4b[0]
        SDOT    v30.4s, v7.16b,  v2.4b[0]
        SDOT    v31.4s, v7.16b,  v3.4b[0]

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       3b                      // all A pointer groups consumed: requantize

        # Store odd width
        // x1 holds nc - 16 here, so its low 4 bits equal nc & 15. Bits 3, 2,
        // 1 and 0 select stores of 8, 4, 2 and 1 bytes per row. After each
        // store the DUPs move the next unwritten bytes down to the bottom of
        // v0-v3. The function returns from here; the nc loop is not re-entered.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
