// xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

// Indirect GEMM micro-kernel: each pass of the nc loop produces a tile of up
// to 4 rows x 16 columns of int8 output.
//
// Per nc iteration the code:
//   1. loads 16 int32 accumulator seeds (64 bytes) from w, shared by all rows;
//   2. for every group of 4 A pointers read from the indirection buffer a
//      (ks is consumed 32 bytes at a time), accumulates int8 x int8 dot
//      products with SDOT over kc bytes (kc is rounded up to a multiple of 4);
//   3. loads 16 per-column float scales (64 bytes) from w, converts the
//      accumulators int32 -> float, multiplies, and converts back to int32
//      with FCVTNS;
//   4. narrows with saturation to int16, adds the 16-bit value at params+0,
//      narrows with saturation to int8, and clamps to the int8 bounds at
//      params+2 (min) and params+3 (max);
//   5. stores 16 bytes per row, or the nc % 16 tail bytes on the last pass.
//
// w is read strictly sequentially (x5 is never rewound). Each 4-byte k-block
// consumes 64 bytes of w: 4 vectors, each holding 4 columns x 4 k-bytes.
//
// An A pointer equal to zero is used as-is; every other A pointer has
// a_offset added before use.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers are touched here, and sp is only read (for the
// stack-passed arguments), so the function has no prologue or epilogue.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
//
// Scalars:
//   x0   mr on entry; then k (remaining kc bytes); then cn_stride for stores
//   x1   nc remaining
//   x2   kc rounded up to a multiple of 4
//   x3   ks (kept to reload x9 and to rewind x4)
//   x4   indirection buffer cursor
//   x8   a_offset
//   x9   p, remaining ks bytes in the current nc iteration
//   x11  params pointer
//   x12  zero pointer

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp C pointers
        // Rows at index >= mr alias the previous row pointer, so stores for
        // them land on memory of a valid row.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3

        CMP     x0, 4                   // if mr < 4
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL    x7,  x17, x7, LO        //   c3 = c2

        // nc loop head: one pass per group of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 16 x int32, one per output column, replicated into all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        // ks loop head: one pass per group of 4 A pointers.
        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop - 16 bytes of A
        // Each pass consumes 16 bytes from every A row and 256 bytes of w.
        // v4/v5/v6/v7 hold columns 0-3 / 4-7 / 8-11 / 12-15 of the current
        // 4-byte k-block; the loads for the next k-block are interleaved
        // with the SDOTs of the current one.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4,  [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3,  [x10], 16
        LDR     q5,  [x5], 16
        // k-block 0: A bytes 0-3
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        // k-block 1: A bytes 4-7
        SDOT    v16.4s, v4.16b,  v0.4b[1]
        SDOT    v17.4s, v4.16b,  v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[1]
        SDOT    v19.4s, v4.16b,  v3.4b[1]
        SDOT    v20.4s, v5.16b,  v0.4b[1]
        SDOT    v21.4s, v5.16b,  v1.4b[1]
        SDOT    v22.4s, v5.16b,  v2.4b[1]
        SDOT    v23.4s, v5.16b,  v3.4b[1]
        SDOT    v24.4s, v6.16b,  v0.4b[1]
        SDOT    v25.4s, v6.16b,  v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b,  v2.4b[1]
        SDOT    v27.4s, v6.16b,  v3.4b[1]
        SDOT    v28.4s, v7.16b,  v0.4b[1]
        SDOT    v29.4s, v7.16b,  v1.4b[1]
        SDOT    v30.4s, v7.16b,  v2.4b[1]
        SDOT    v31.4s, v7.16b,  v3.4b[1]

        // k-block 2: A bytes 8-11
        SDOT    v16.4s, v4.16b,  v0.4b[2]
        SDOT    v17.4s, v4.16b,  v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[2]
        SDOT    v19.4s, v4.16b,  v3.4b[2]
        SDOT    v20.4s, v5.16b,  v0.4b[2]
        SDOT    v21.4s, v5.16b,  v1.4b[2]
        SDOT    v22.4s, v5.16b,  v2.4b[2]
        SDOT    v23.4s, v5.16b,  v3.4b[2]
        SDOT    v24.4s, v6.16b,  v0.4b[2]
        SDOT    v25.4s, v6.16b,  v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b,  v2.4b[2]
        SDOT    v27.4s, v6.16b,  v3.4b[2]
        SDOT    v28.4s, v7.16b,  v0.4b[2]
        SDOT    v29.4s, v7.16b,  v1.4b[2]
        SDOT    v30.4s, v7.16b,  v2.4b[2]
        SDOT    v31.4s, v7.16b,  v3.4b[2]

        // k-block 3: A bytes 12-15
        SDOT    v16.4s, v4.16b,  v0.4b[3]
        SDOT    v17.4s, v4.16b,  v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[3]
        SDOT    v19.4s, v4.16b,  v3.4b[3]
        SDOT    v20.4s, v5.16b,  v0.4b[3]
        SDOT    v21.4s, v5.16b,  v1.4b[3]
        SDOT    v22.4s, v5.16b,  v2.4b[3]
        SDOT    v23.4s, v5.16b,  v3.4b[3]
        SDOT    v24.4s, v6.16b,  v0.4b[3]
        SDOT    v25.4s, v6.16b,  v1.4b[3]
        SDOT    v26.4s, v6.16b,  v2.4b[3]
        SDOT    v27.4s, v6.16b,  v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        SDOT    v28.4s, v7.16b,  v0.4b[3]
        SDOT    v29.4s, v7.16b,  v1.4b[3]
        SDOT    v30.4s, v7.16b,  v2.4b[3]
        SDOT    v31.4s, v7.16b,  v3.4b[3]
        B.HS    2b

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 has gone below zero here, but its low 4 bits still equal
        // kc mod 16 because only multiples of 16 were subtracted.
        TST     x0, 15
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantize: int32 -> float, multiply by the per-column scale,
        // float -> int32 (FCVTNS rounds to nearest, ties to even).
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 is free again)
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16: columns 0-7 of each row end up
        // in v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // Broadcast the int16 at params+0; it is added to every narrowed
        // result below (presumably the output zero point - confirm against
        // the params struct layout).
        LD1R    {v6.8h}, [x11], 2

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8: v0-v3 hold rows 0-3, 16 columns each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // cn_stride

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 3             // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // None of the instructions since SUBS x1 write the flags, so this
        // still tests the updated nc.
        B.HI    0b
        RET

        # Remainder- 8 bytes of A
        .p2align 3
4:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 5f

        // Two k-blocks: 8 bytes from every A row, 128 bytes of w.
        LDR     d0, [x13], 8
        LDR     q4,  [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3,  [x10], 8
        LDR     q5,  [x5], 16
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b,  v0.4b[1]
        SDOT    v17.4s, v4.16b,  v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[1]
        SDOT    v19.4s, v4.16b,  v3.4b[1]
        SDOT    v20.4s, v5.16b,  v0.4b[1]
        SDOT    v21.4s, v5.16b,  v1.4b[1]
        SDOT    v22.4s, v5.16b,  v2.4b[1]
        SDOT    v23.4s, v5.16b,  v3.4b[1]
        SDOT    v24.4s, v6.16b,  v0.4b[1]
        SDOT    v25.4s, v6.16b,  v1.4b[1]
        SDOT    v26.4s, v6.16b,  v2.4b[1]
        SDOT    v27.4s, v6.16b,  v3.4b[1]
        SDOT    v28.4s, v7.16b,  v0.4b[1]
        SDOT    v29.4s, v7.16b,  v1.4b[1]
        SDOT    v30.4s, v7.16b,  v2.4b[1]
        SDOT    v31.4s, v7.16b,  v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 3b

        # Remainder- 4 bytes of A
5:
        // One k-block: 4 bytes from every A row, 64 bytes of w.
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3,  [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Store odd width
        // Reached with x1 = nc - 16 < 0; its low 4 bits equal the number of
        // columns left. Bits 3, 2, 1 and 0 select an 8-, 4-, 2- and 1-byte
        // store; after each partial store the remaining bytes are shifted
        // down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

// On ELF targets, emit an empty .note.GNU-stack section with no flags set.
// This is the GNU toolchain marker that the object does not require an
// executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
