xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> (x0)
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params* params  [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x10  v3
34# B    x5  v4  v5  v6  v7
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused v8 v9 v10 v11 v12 v13 v14 v15
40
41BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
        // Indirect GEMM micro-kernel producing up to 4 rows x 16 columns of
        // 8-bit output per pass of the nc loop (label 0).
        //   * Accumulators v16-v31 are seeded from w, then updated with SDOT
        //     products of A bytes (rows reached through the pointer array a)
        //     and the weight bytes that follow in w.
        //   * At label 3 the 32-bit sums are converted to float, multiplied by a
        //     scale read from params, converted back to integers, narrowed with
        //     saturation, offset by a 16-bit value from params, clamped to the
        //     min/max bytes from params and stored through c0..c3.
        // Only x0-x17, v0-v7 and v16-v31 are written, so nothing is saved on
        // the stack and there is no prologue/epilogue.
42
43        # Clamp C pointers
44        CMP     x0, 2                   // if mr < 2
45        LDR     x8, [sp, 8]             // Load a_offset
46        ADD     x16, x6, x7             // c1 = c0 + cm_stride
47        CSEL    x16, x6,  x16, LO       //   c1 = c0
48        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
49
50        ADD     x17, x16, x7            // c2 = c1 + cm_stride
51        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
52                                        // if mr <= 2
        // The LS condition below still refers to the flags of "CMP x0, 2"
        // above; the intervening LDR/ADD/LDP do not write the flags.
53        CSEL    x17, x16, x17, LS       //   c2 = c1
54        BIC     x2, x2, 3               // clear low 2 bits: completes the round-up of kc to a multiple of 4
55
56        CMP     x0, 4                   // if mr < 4
57        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
58        CSEL    x7,  x17, x7, LO        //   c3 = c2
59
60        .p2align 3
        // Label 0: top of the nc loop (re-entered from "B.HI 0b" after a full
        // 16-column store).
610:
62        # Load initial bias from w into accumulators
        // 64 bytes (16 x 32-bit values) are read from w into v16/v20/v24/v28
        // (row 0) and copied to the matching accumulators of rows 1-3.
63        LDP     q16, q20, [x5], 32
64        MOV     v17.16b, v16.16b
65        MOV     v18.16b, v16.16b
66        LDP     q24, q28, [x5], 32
67        MOV     v19.16b, v16.16b
68        MOV     v21.16b, v20.16b
69        MOV     v22.16b, v20.16b
70        MOV     v23.16b, v20.16b
71        MOV     v25.16b, v24.16b
72        MOV     v26.16b, v24.16b
73        MOV     v27.16b, v24.16b
74        MOV     v29.16b, v28.16b
75        MOV     v30.16b, v28.16b
76        MOV     v31.16b, v28.16b
77        MOV     x9, x3                  // p = ks
78
79        .p2align 3
        // Label 1: top of the ks loop; each pass consumes 4 pointers (32 bytes)
        // from the indirection array a.
801:
81        # Load next 4 A pointers
82        LDP     x13, x14, [x4], 16
83        LDP     x15, x10, [x4], 16
84
        // A pointer equal to `zero` is used as-is; any other pointer gets
        // a_offset added. The ADD sits between CMP and CSEL but does not
        // modify the flags.
85        CMP     x13, x12                // if a0 == zero
86        ADD     x13, x13, x8            // a0 += a_offset
87        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
88        CMP     x14, x12                // if a1 == zero
89        ADD     x14, x14, x8            // a1 += a_offset
90        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
91        CMP     x15, x12                // if a2 == zero
92        ADD     x15, x15, x8            // a2 += a_offset
93        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
94        CMP     x10, x12                // if a3 == zero
95        ADD     x10, x10, x8            // a3 += a_offset
96        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
97
98        # Is there at least 8 bytes for main loop?
99        SUBS    x0, x2, 8               // k = kc - 8
100        B.LO    4f                      // kc < 8: go straight to the 4-byte remainder
101
102        # Main loop - 8 bytes of A
        // Each iteration reads 8 bytes from each of the 4 A rows (d0-d3) and
        // 128 bytes of w. Element [0] of v0-v3 (A bytes 0-3) is multiplied
        // against the first 64 bytes of w, element [1] (A bytes 4-7) against
        // the second 64 bytes. v4-v7 are reloaded part-way through, so the
        // position of each LDP relative to the SDOTs that read v4-v7 matters.
103        .p2align 3
1042:
105        LDR     d0, [x13], 8
106        LDR     q4,  [x5], 16
107        LDR     d1, [x14], 8
108        LDR     d2, [x15], 8
109        LDR     d3, [x10], 8
110        LDR     q5,  [x5], 16
111        SDOT    v16.4s, v4.16b,  v0.4b[0]
112        SDOT    v17.4s, v4.16b,  v1.4b[0]
113        LDP     q6, q7, [x5], 32
114        SDOT    v18.4s, v4.16b,  v2.4b[0]
115        SDOT    v19.4s, v4.16b,  v3.4b[0]
116        SDOT    v20.4s, v5.16b,  v0.4b[0]
117        SDOT    v21.4s, v5.16b,  v1.4b[0]
118        SDOT    v22.4s, v5.16b,  v2.4b[0]
119        SDOT    v23.4s, v5.16b,  v3.4b[0]
120        SDOT    v24.4s, v6.16b, v0.4b[0]
121        SDOT    v25.4s, v6.16b, v1.4b[0]
122        LDP     q4, q5, [x5], 32
123        SDOT    v26.4s, v6.16b, v2.4b[0]
124        SDOT    v27.4s, v6.16b, v3.4b[0]
125        SDOT    v28.4s, v7.16b, v0.4b[0]
126        SDOT    v29.4s, v7.16b, v1.4b[0]
127        SDOT    v30.4s, v7.16b, v2.4b[0]
128        SDOT    v31.4s, v7.16b, v3.4b[0]
129        SDOT    v16.4s, v4.16b,  v0.4b[1]
130        SDOT    v17.4s, v4.16b,  v1.4b[1]
131        LDP     q6, q7, [x5], 32
132        SDOT    v18.4s, v4.16b,  v2.4b[1]
133        SDOT    v19.4s, v4.16b,  v3.4b[1]
134        SDOT    v20.4s, v5.16b,  v0.4b[1]
135        SDOT    v21.4s, v5.16b,  v1.4b[1]
136        SDOT    v22.4s, v5.16b,  v2.4b[1]
137        SDOT    v23.4s, v5.16b,  v3.4b[1]
138        SDOT    v24.4s, v6.16b,  v0.4b[1]
139        SDOT    v25.4s, v6.16b,  v1.4b[1]
140        SDOT    v26.4s, v6.16b,  v2.4b[1]
141        SDOT    v27.4s, v6.16b,  v3.4b[1]
142        SDOT    v28.4s, v7.16b,  v0.4b[1]
143        SDOT    v29.4s, v7.16b,  v1.4b[1]
144        SDOT    v30.4s, v7.16b,  v2.4b[1]
145        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below (SDOT leaves them alone)
146        SDOT    v31.4s, v7.16b,  v3.4b[1]
147        B.HS    2b                      // repeat while the subtraction did not borrow
148
149        # Is there a remainder?- 4 bytes of A
        // kc is a multiple of 4, so on leaving the main loop x0 is -8 (nothing
        // left, bit 2 clear) or -4 (4 bytes left, bit 2 set).
150        TBNZ    x0, 2, 4f
151
152        # ks loop
153        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
154        B.HI    1b
155
        // Label 3: accumulation finished for this 4x16 tile; convert the
        // 32-bit sums to 8-bit outputs.
1563:
157        SCVTF   v16.4s, v16.4s
158        SCVTF   v17.4s, v17.4s
159        # Apply params - scale, bias and clamp
        // params is read sequentially through x11: a 32-bit scale, a 16-bit
        // value added after narrowing, then the min byte and the max byte.
        // The three post-incremented loads advance x11 by 4 + 2 + 1 = 7 bytes.
160        LD1R    {v4.4s}, [x11], 4
161        SCVTF   v18.4s, v18.4s
162        SCVTF   v19.4s, v19.4s
163        SCVTF   v20.4s, v20.4s
164        SCVTF   v21.4s, v21.4s
165        SCVTF   v22.4s, v22.4s
166        SCVTF   v23.4s, v23.4s
167        SCVTF   v24.4s, v24.4s
168        SCVTF   v25.4s, v25.4s
169        SCVTF   v26.4s, v26.4s
170        SCVTF   v27.4s, v27.4s
171        SCVTF   v28.4s, v28.4s
172        SCVTF   v29.4s, v29.4s
173        SCVTF   v30.4s, v30.4s
174        SCVTF   v31.4s, v31.4s
175
        // Multiply every accumulator by the broadcast scale in v4.
176        FMUL    v16.4s, v16.4s, v4.4s
177        FMUL    v17.4s, v17.4s, v4.4s
178        FMUL    v18.4s, v18.4s, v4.4s
179        FMUL    v19.4s, v19.4s, v4.4s
180        FMUL    v20.4s, v20.4s, v4.4s
181        FMUL    v21.4s, v21.4s, v4.4s
182        FMUL    v22.4s, v22.4s, v4.4s
183        FMUL    v23.4s, v23.4s, v4.4s
184        FMUL    v24.4s, v24.4s, v4.4s
185        FMUL    v25.4s, v25.4s, v4.4s
186        FMUL    v26.4s, v26.4s, v4.4s
187        FMUL    v27.4s, v27.4s, v4.4s
188        FMUL    v28.4s, v28.4s, v4.4s
189        FMUL    v29.4s, v29.4s, v4.4s
190        FMUL    v30.4s, v30.4s, v4.4s
191        FMUL    v31.4s, v31.4s, v4.4s
192
        // Float -> signed 32-bit integer (FCVTNS rounds to nearest, ties to even).
193        FCVTNS  v16.4s, v16.4s
194        FCVTNS  v17.4s, v17.4s
195        FCVTNS  v18.4s, v18.4s
196        FCVTNS  v19.4s, v19.4s
197        FCVTNS  v20.4s, v20.4s
198        FCVTNS  v21.4s, v21.4s
199        FCVTNS  v22.4s, v22.4s
200        FCVTNS  v23.4s, v23.4s
201        FCVTNS  v24.4s, v24.4s
202        FCVTNS  v25.4s, v25.4s
203        FCVTNS  v26.4s, v26.4s
204        FCVTNS  v27.4s, v27.4s
205        FCVTNS  v28.4s, v28.4s
206        FCVTNS  v29.4s, v29.4s
207        FCVTNS  v30.4s, v30.4s
208        FCVTNS  v31.4s, v31.4s
209
        // Saturating narrow 32 -> 16 bits. After the SQXTN/SQXTN2 pairs,
        // v16-v19 hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
210        SQXTN   v16.4h, v16.4s
211        SQXTN   v17.4h, v17.4s
212        SQXTN   v18.4h, v18.4s
213        SQXTN   v19.4h, v19.4s
214        SQXTN   v24.4h, v24.4s
215        SQXTN   v25.4h, v25.4s
216        SQXTN   v26.4h, v26.4s
217        SQXTN   v27.4h, v27.4s
218        LD1R    {v6.8h}, [x11], 2        // add bias
219
220        SQXTN2  v16.8h, v20.4s
221        SQXTN2  v17.8h, v21.4s
222        SQXTN2  v18.8h, v22.4s
223        SQXTN2  v19.8h, v23.4s
224        SQXTN2  v24.8h, v28.4s
225        SQXTN2  v25.8h, v29.4s
226        SQXTN2  v26.8h, v30.4s
227        SQXTN2  v27.8h, v31.4s
228
        // Saturating add of the broadcast 16-bit value in v6.
229        SQADD   v16.8h, v16.8h, v6.8h
230        SQADD   v17.8h, v17.8h, v6.8h
231        SQADD   v18.8h, v18.8h, v6.8h
232        SQADD   v19.8h, v19.8h, v6.8h
233        SQADD   v24.8h, v24.8h, v6.8h
234        SQADD   v25.8h, v25.8h, v6.8h
235        SQADD   v26.8h, v26.8h, v6.8h
236        SQADD   v27.8h, v27.8h, v6.8h
237        LD1R    {v4.16b}, [x11], 1       // clamp min value
238
        // Saturating narrow 16 -> 8 bits: v0-v3 end up with the 16 output
        // bytes of rows 0-3.
239        SQXTN   v0.8b, v16.8h
240        SQXTN   v1.8b, v17.8h
241        SQXTN   v2.8b, v18.8h
242        SQXTN   v3.8b, v19.8h
243        LD1R    {v5.16b}, [x11]          // clamp max value
244        SQXTN2  v0.16b, v24.8h
245        SQXTN2  v1.16b, v25.8h
246        SQXTN2  v2.16b, v26.8h
247        SQXTN2  v3.16b, v27.8h
248        LDR     x0, [sp]                 // cn_stride
249        SMAX    v0.16b, v0.16b, v4.16b
250        SMAX    v1.16b, v1.16b, v4.16b
251        SUB     x11, x11, 7          // rewind params pointer
252        SMAX    v2.16b, v2.16b, v4.16b
253        SMAX    v3.16b, v3.16b, v4.16b
254        SUBS    x1, x1, 16               // nc -= 16; flags feed both B.LO 5f and the later B.HI 0b
255        SMIN    v0.16b, v0.16b, v5.16b
256        SMIN    v1.16b, v1.16b, v5.16b
257        SMIN    v2.16b, v2.16b, v5.16b
258        SMIN    v3.16b, v3.16b, v5.16b
259        B.LO    5f                       // fewer than 16 columns left: partial store
260
261        # Store full 4 x 16
        // Each C pointer is post-incremented by cn_stride (x0).
262        ST1     {v3.16b},  [x7], x0
263        ST1     {v2.16b}, [x17], x0
264        ST1     {v1.16b}, [x16], x0
265        ST1     {v0.16b},  [x6], x0
266
267        SUB     x4, x4, x3              // a -= ks
268
269        # nc loop
        // The flags are still those of "SUBS x1, x1, 16": ST1 and SUB do not
        // write them. HI means columns remain after this tile.
270        B.HI    0b
271        RET
272
273        # Remainder- 4 bytes of A
        // Reads 4 bytes from each A row (s0-s3) and 64 bytes of w; only
        // element [0] of v0-v3 is used.
274        .p2align 3
2754:
276        LDR     s0, [x13], 4
277        LDR     q4, [x5], 16
278        LDR     s1, [x14], 4
279        LDR     s2, [x15], 4
280        LDR     s3, [x10], 4
281        LDR     q5, [x5], 16
282        SDOT    v16.4s, v4.16b,  v0.4b[0]
283        SDOT    v17.4s, v4.16b,  v1.4b[0]
284        LDP     q6, q7, [x5], 32
285        SDOT    v18.4s, v4.16b,  v2.4b[0]
286        SDOT    v19.4s, v4.16b,  v3.4b[0]
287        SDOT    v20.4s, v5.16b,  v0.4b[0]
288        SDOT    v21.4s, v5.16b,  v1.4b[0]
289        SDOT    v22.4s, v5.16b,  v2.4b[0]
290        SDOT    v23.4s, v5.16b,  v3.4b[0]
291        SDOT    v24.4s, v6.16b, v0.4b[0]
292        SDOT    v25.4s, v6.16b, v1.4b[0]
293        SDOT    v26.4s, v6.16b, v2.4b[0]
294        SDOT    v27.4s, v6.16b, v3.4b[0]
295        SDOT    v28.4s, v7.16b, v0.4b[0]
296        SDOT    v29.4s, v7.16b, v1.4b[0]
297        SDOT    v30.4s, v7.16b, v2.4b[0]
298        SDOT    v31.4s, v7.16b, v3.4b[0]
299
300        # ks loop
301        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
302        B.HI    1b
303        B       3b
304
305        # Store odd width
        // Reached with x1 = (remaining nc) - 16, which is negative; subtracting
        // 16 leaves the low four bits unchanged, so bits 3..0 of x1 select
        // stores of 8, 4, 2 and 1 bytes per row. After each partial store the
        // DUP moves the next not-yet-stored bytes down to the bottom of the
        // register for the following step.
306        .p2align 3
3075:
308        TBZ     x1, 3, 6f
309        STR     d3, [x7], 8
310        STR     d2, [x17], 8
311        DUP     d3, v3.d[1]
312        DUP     d2, v2.d[1]
313        STR     d1, [x16], 8
314        STR     d0, [x6], 8
315        DUP     d1, v1.d[1]
316        DUP     d0, v0.d[1]
3176:
318        TBZ     x1, 2, 7f
319        STR     s3, [x7], 4
320        STR     s2, [x17], 4
321        DUP     s3, v3.s[1]
322        DUP     s2, v2.s[1]
323        STR     s1, [x16], 4
324        STR     s0, [x6], 4
325        DUP     s1, v1.s[1]
326        DUP     s0, v0.s[1]
3277:
328        TBZ     x1, 1, 8f
329        STR     h3, [x7], 2
330        STR     h2, [x17], 2
331        DUP     h3, v3.h[1]
332        DUP     h2, v2.h[1]
333        STR     h1, [x16], 2
334        STR     h0, [x6], 2
335        DUP     h1, v1.h[1]
336        DUP     h0, v0.h[1]
3378:
338        TBZ     x1, 0, 9f
339        STR     b3, [x7]
340        STR     b2, [x17]
341        STR     b1, [x16]
342        STR     b0, [x6]
3439:
344        RET
345
346END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
347
348#ifdef __ELF__
349.section ".note.GNU-stack","",%progbits
350#endif
351