xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t**restrict a, x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> (x0)
23#     size_t a_offset,           [sp + 8] -> x8
24#     const uint8_t* zero,       [sp + 16] -> x12
25#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x10  v3
34# B    x5  v4  v5  v6  v7
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# zero_point v8 v12 v13 v14 v15
40# unused v9 v10 v11
41
42BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
43
        // Computes one 4-row x 16-column output tile per nc iteration of an
        // indirect GEMM over unsigned 8-bit data:
        //  - the accumulators of every row start from the four 32-bit vectors at
        //    the head of the current w block,
        //  - UDOT accumulates 4-byte groups of each A row against the weights,
        //  - dot(v8, A) is tracked per row in v12-v15 and subtracted at the end,
        //  - the result is converted to fp32, scaled, rounded back to int32,
        //    narrowed with saturation to 8 bits and clamped before being stored.
        // Loop nesting: label 0 = nc loop, label 1 = ks loop, label 2 = kc main loop.
44        # Clamp C pointers so that rows at or beyond mr alias the previous row
45        CMP     x0, 2                   // if mr < 2
46        LDR     x8, [sp, 8]             // Load a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
50        ADD     x2, x2, 3               // kc = (kc + 3) & ~3 (the BIC below completes it)
51        ADD     x17, x16, x7            // c2 = c1 + cm_stride
52                                        // if mr <= 2 (flags still from CMP x0, 2)
53        # Save d8,d12-d15 on stack (48-byte frame; incoming stack arguments are 48 bytes higher from here on)
54        STR     d8,  [sp, -48]!
55        CSEL    x17, x16, x17, LS       //   c2 = c1
56        BIC     x2, x2, 3               // kc &= ~3
57        STP     d12, d13, [sp, 16]
58        CMP     x0, 4                   // if mr < 4
59        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
60        STP     d14, d15, [sp, 32]
61        CSEL    x7,  x17, x7, LO        //   c3 = c2
62        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point word replicated to 4 lanes; x11 now points past it
63
64        .p2align 3
650:
66        # nc loop: load initial bias from w into the row 0 accumulators (columns 0-7)
67        LDP     q16, q20, [x5], 32
68
        // v12-v15 accumulate dot(v8, A) for rows 0-3; restart from zero for every tile
69        MOVI    v12.4s, 0
70        MOVI    v13.4s, 0
71        MOVI    v14.4s, 0
72        MOVI    v15.4s, 0
73
        // Rows 1-3 start from the same bias values as row 0
74        MOV     v17.16b, v16.16b
75        MOV     v18.16b, v16.16b
76        LDP     q24, q28, [x5], 32      // bias for columns 8-15
77        MOV     v19.16b, v16.16b
78        MOV     v21.16b, v20.16b
79        MOV     v22.16b, v20.16b
80        MOV     v23.16b, v20.16b
81        MOV     v25.16b, v24.16b
82        MOV     v26.16b, v24.16b
83        MOV     v27.16b, v24.16b
84        MOV     v29.16b, v28.16b
85        MOV     v30.16b, v28.16b
86        MOV     v31.16b, v28.16b
87
88        MOV     x9, x3                  // p = ks
89
90        .p2align 3
911:
92        # ks loop: load next 4 A pointers (one per row)
93        LDP     x13, x14, [x4], 16
94        LDP     x15, x10, [x4], 16
95
        // A pointer equal to `zero` is used as-is; any other pointer gets a_offset added.
        // ADD does not set flags, so each CSEL still sees the result of its CMP.
96        CMP     x13, x12                // if a0 == zero
97        ADD     x13, x13, x8            // a0 += a_offset
98        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
99        CMP     x14, x12                // if a1 == zero
100        ADD     x14, x14, x8            // a1 += a_offset
101        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
102        CMP     x15, x12                // if a2 == zero
103        ADD     x15, x15, x8            // a2 += a_offset
104        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
105        CMP     x10, x12                // if a3 == zero
106        ADD     x10, x10, x8            // a3 += a_offset
107        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
108
109        # Is there at least 16 bytes for main loop?
110        SUBS    x0, x2, 16              // k = kc - 16
111        B.LO    4f                      // kc < 16: only the remainder code runs
112
113        # Main loop - 16 bytes of A per row, 256 bytes of weights per iteration
        // Weight loads for the next 4-byte K group are interleaved with the UDOTs
        // of the current group.
114        .p2align 3
1152:
116        LDR     q0, [x13], 16
117        LDR     q4,  [x5], 16
118        LDR     q1, [x14], 16
119        LDR     q2, [x15], 16
120        LDR     q3, [x10], 16
121        LDR     q5,  [x5], 16
122
123        UDOT    v12.4s, v8.16b,  v0.16b  // zero point sums += dot(v8, A), one 4-byte group per lane
124        UDOT    v13.4s, v8.16b,  v1.16b
125        UDOT    v14.4s, v8.16b,  v2.16b
126        UDOT    v15.4s, v8.16b,  v3.16b
127
        // K bytes 0-3 of each row (lane 0 of v0-v3) against 16 columns of weights in v4-v7
128        UDOT    v16.4s, v4.16b,  v0.4b[0]
129        UDOT    v17.4s, v4.16b,  v1.4b[0]
130        LDP     q6, q7, [x5], 32
131        UDOT    v18.4s, v4.16b,  v2.4b[0]
132        UDOT    v19.4s, v4.16b,  v3.4b[0]
133        UDOT    v20.4s, v5.16b,  v0.4b[0]
134        UDOT    v21.4s, v5.16b,  v1.4b[0]
135        UDOT    v22.4s, v5.16b,  v2.4b[0]
136        UDOT    v23.4s, v5.16b,  v3.4b[0]
137        UDOT    v24.4s, v6.16b, v0.4b[0]
138        UDOT    v25.4s, v6.16b, v1.4b[0]
139        LDP     q4, q5, [x5], 32
140        UDOT    v26.4s, v6.16b, v2.4b[0]
141        UDOT    v27.4s, v6.16b, v3.4b[0]
142        UDOT    v28.4s, v7.16b, v0.4b[0]
143        UDOT    v29.4s, v7.16b, v1.4b[0]
144        UDOT    v30.4s, v7.16b, v2.4b[0]
145        UDOT    v31.4s, v7.16b, v3.4b[0]
146
        // K bytes 4-7 of each row (lane 1 of v0-v3)
147        UDOT    v16.4s, v4.16b,  v0.4b[1]
148        UDOT    v17.4s, v4.16b,  v1.4b[1]
149        LDP     q6, q7, [x5], 32
150        UDOT    v18.4s, v4.16b,  v2.4b[1]
151        UDOT    v19.4s, v4.16b,  v3.4b[1]
152        UDOT    v20.4s, v5.16b,  v0.4b[1]
153        UDOT    v21.4s, v5.16b,  v1.4b[1]
154        UDOT    v22.4s, v5.16b,  v2.4b[1]
155        UDOT    v23.4s, v5.16b,  v3.4b[1]
156        UDOT    v24.4s, v6.16b,  v0.4b[1]
157        UDOT    v25.4s, v6.16b,  v1.4b[1]
158        LDP     q4, q5, [x5], 32
159        UDOT    v26.4s, v6.16b,  v2.4b[1]
160        UDOT    v27.4s, v6.16b,  v3.4b[1]
161        UDOT    v28.4s, v7.16b,  v0.4b[1]
162        UDOT    v29.4s, v7.16b,  v1.4b[1]
163        UDOT    v30.4s, v7.16b,  v2.4b[1]
164        UDOT    v31.4s, v7.16b,  v3.4b[1]
165
        // K bytes 8-11 of each row (lane 2 of v0-v3)
166        UDOT    v16.4s, v4.16b,  v0.4b[2]
167        UDOT    v17.4s, v4.16b,  v1.4b[2]
168        LDP     q6, q7, [x5], 32
169        UDOT    v18.4s, v4.16b,  v2.4b[2]
170        UDOT    v19.4s, v4.16b,  v3.4b[2]
171        UDOT    v20.4s, v5.16b,  v0.4b[2]
172        UDOT    v21.4s, v5.16b,  v1.4b[2]
173        UDOT    v22.4s, v5.16b,  v2.4b[2]
174        UDOT    v23.4s, v5.16b,  v3.4b[2]
175        UDOT    v24.4s, v6.16b,  v0.4b[2]
176        UDOT    v25.4s, v6.16b,  v1.4b[2]
177        LDP     q4, q5, [x5], 32
178        UDOT    v26.4s, v6.16b,  v2.4b[2]
179        UDOT    v27.4s, v6.16b,  v3.4b[2]
180        UDOT    v28.4s, v7.16b,  v0.4b[2]
181        UDOT    v29.4s, v7.16b,  v1.4b[2]
182        UDOT    v30.4s, v7.16b,  v2.4b[2]
183        UDOT    v31.4s, v7.16b,  v3.4b[2]
184
        // K bytes 12-15 of each row (lane 3 of v0-v3)
185        UDOT    v16.4s, v4.16b,  v0.4b[3]
186        UDOT    v17.4s, v4.16b,  v1.4b[3]
187        LDP     q6, q7, [x5], 32
188        UDOT    v18.4s, v4.16b,  v2.4b[3]
189        UDOT    v19.4s, v4.16b,  v3.4b[3]
190        UDOT    v20.4s, v5.16b,  v0.4b[3]
191        UDOT    v21.4s, v5.16b,  v1.4b[3]
192        UDOT    v22.4s, v5.16b,  v2.4b[3]
193        UDOT    v23.4s, v5.16b,  v3.4b[3]
194        UDOT    v24.4s, v6.16b,  v0.4b[3]
195        UDOT    v25.4s, v6.16b,  v1.4b[3]
196        UDOT    v26.4s, v6.16b,  v2.4b[3]
197        UDOT    v27.4s, v6.16b,  v3.4b[3]
198        SUBS    x0, x0, 16              // k -= 16
199        UDOT    v28.4s, v7.16b,  v0.4b[3]
200        UDOT    v29.4s, v7.16b,  v1.4b[3]
201        UDOT    v30.4s, v7.16b,  v2.4b[3]
202        UDOT    v31.4s, v7.16b,  v3.4b[3]
203        B.HS    2b                      // another full 16 bytes available
204
205        # Is there a remainder? - 4 to 12 bytes of A (held in the low 4 bits of k)
206        TST     x0, 15
207        B.NE    4f
208
2093:
210        # ks loop
211        SUBS    x9, x9, 32              // p -= MR * sizeof(void*), i.e. 4 pointers of 8 bytes
212        B.HI    1b
213
        // Reduce the 4 partial zero point sums of each row to a single total;
        // after the second ADDP every lane of v12-v15 holds that row's total.
214        ADDP    v0.4s, v12.4s, v12.4s
215        ADDP    v1.4s, v13.4s, v13.4s
216        ADDP    v2.4s, v14.4s, v14.4s
217        ADDP    v3.4s, v15.4s, v15.4s
218        ADDP    v12.4s, v0.4s, v0.4s
219        ADDP    v13.4s, v1.4s, v1.4s
220        ADDP    v14.4s, v2.4s, v2.4s
221        ADDP    v15.4s, v3.4s, v3.4s
222
223        # Subtract each row's zero point total from all of that row's accumulators
224        SUB     v16.4s, v16.4s, v12.4s
225        SUB     v17.4s, v17.4s, v13.4s
226        SUB     v18.4s, v18.4s, v14.4s
227        SUB     v19.4s, v19.4s, v15.4s
228        SUB     v20.4s, v20.4s, v12.4s
229        SUB     v21.4s, v21.4s, v13.4s
230        SUB     v22.4s, v22.4s, v14.4s
231        SUB     v23.4s, v23.4s, v15.4s
232        SUB     v24.4s, v24.4s, v12.4s
233        SUB     v25.4s, v25.4s, v13.4s
234        SUB     v26.4s, v26.4s, v14.4s
235        SUB     v27.4s, v27.4s, v15.4s
236        SUB     v28.4s, v28.4s, v12.4s
237        SUB     v29.4s, v29.4s, v13.4s
238        SUB     v30.4s, v30.4s, v14.4s
239        SUB     v31.4s, v31.4s, v15.4s
240
        // Convert the int32 accumulators to fp32
241        SCVTF   v16.4s, v16.4s
242        SCVTF   v17.4s, v17.4s
243        # Apply params - scale, bias and clamp
244        LD1R    {v4.4s}, [x11], 4       // fp32 scale, replicated to all lanes
245        SCVTF   v18.4s, v18.4s
246        SCVTF   v19.4s, v19.4s
247        SCVTF   v20.4s, v20.4s
248        SCVTF   v21.4s, v21.4s
249        SCVTF   v22.4s, v22.4s
250        SCVTF   v23.4s, v23.4s
251        SCVTF   v24.4s, v24.4s
252        SCVTF   v25.4s, v25.4s
253        SCVTF   v26.4s, v26.4s
254        SCVTF   v27.4s, v27.4s
255        SCVTF   v28.4s, v28.4s
256        SCVTF   v29.4s, v29.4s
257        SCVTF   v30.4s, v30.4s
258        SCVTF   v31.4s, v31.4s
259
        // Multiply every accumulator by the scale
260        FMUL    v16.4s, v16.4s, v4.4s
261        FMUL    v17.4s, v17.4s, v4.4s
262        FMUL    v18.4s, v18.4s, v4.4s
263        FMUL    v19.4s, v19.4s, v4.4s
264        FMUL    v20.4s, v20.4s, v4.4s
265        FMUL    v21.4s, v21.4s, v4.4s
266        FMUL    v22.4s, v22.4s, v4.4s
267        FMUL    v23.4s, v23.4s, v4.4s
268        FMUL    v24.4s, v24.4s, v4.4s
269        FMUL    v25.4s, v25.4s, v4.4s
270        FMUL    v26.4s, v26.4s, v4.4s
271        FMUL    v27.4s, v27.4s, v4.4s
272        FMUL    v28.4s, v28.4s, v4.4s
273        FMUL    v29.4s, v29.4s, v4.4s
274        FMUL    v30.4s, v30.4s, v4.4s
275        FMUL    v31.4s, v31.4s, v4.4s
276
        // Convert back to int32, rounding to nearest with ties to even
277        FCVTNS  v16.4s, v16.4s
278        FCVTNS  v17.4s, v17.4s
279        FCVTNS  v18.4s, v18.4s
280        FCVTNS  v19.4s, v19.4s
281        FCVTNS  v20.4s, v20.4s
282        FCVTNS  v21.4s, v21.4s
283        FCVTNS  v22.4s, v22.4s
284        FCVTNS  v23.4s, v23.4s
285        FCVTNS  v24.4s, v24.4s
286        FCVTNS  v25.4s, v25.4s
287        FCVTNS  v26.4s, v26.4s
288        FCVTNS  v27.4s, v27.4s
289        FCVTNS  v28.4s, v28.4s
290        FCVTNS  v29.4s, v29.4s
291        FCVTNS  v30.4s, v30.4s
292        FCVTNS  v31.4s, v31.4s
293
        // Saturating narrow int32 -> int16: columns 0-3 and 8-11 into the low halves
294        SQXTN   v16.4h, v16.4s
295        SQXTN   v17.4h, v17.4s
296        SQXTN   v18.4h, v18.4s
297        SQXTN   v19.4h, v19.4s
298        SQXTN   v24.4h, v24.4s
299        SQXTN   v25.4h, v25.4s
300        SQXTN   v26.4h, v26.4s
301        SQXTN   v27.4h, v27.4s
302        LD1R    {v6.8h}, [x11], 2        // add bias (16-bit value; presumably the output zero point - confirm against params layout)
303
        // ... and columns 4-7 and 12-15 into the high halves
304        SQXTN2  v16.8h, v20.4s
305        SQXTN2  v17.8h, v21.4s
306        SQXTN2  v18.8h, v22.4s
307        SQXTN2  v19.8h, v23.4s
308        SQXTN2  v24.8h, v28.4s
309        SQXTN2  v25.8h, v29.4s
310        SQXTN2  v26.8h, v30.4s
311        SQXTN2  v27.8h, v31.4s
312
        // Saturating add of the 16-bit bias
313        SQADD   v16.8h, v16.8h, v6.8h
314        SQADD   v17.8h, v17.8h, v6.8h
315        SQADD   v18.8h, v18.8h, v6.8h
316        SQADD   v19.8h, v19.8h, v6.8h
317        SQADD   v24.8h, v24.8h, v6.8h
318        SQADD   v25.8h, v25.8h, v6.8h
319        SQADD   v26.8h, v26.8h, v6.8h
320        SQADD   v27.8h, v27.8h, v6.8h
321        LD1R    {v4.16b}, [x11], 1      // clamp min value
322
        // Saturating narrow int16 -> unsigned 8-bit; v0-v3 each end up holding one row of 16 outputs
323        SQXTUN  v0.8b, v16.8h
324        SQXTUN  v1.8b, v17.8h
325        SQXTUN  v2.8b, v18.8h
326        SQXTUN  v3.8b, v19.8h
327        LD1R    {v5.16b}, [x11]         // clamp max value
328        SQXTUN2 v0.16b, v24.8h
329        SQXTUN2 v1.16b, v25.8h
330        SQXTUN2 v2.16b, v26.8h
331        SQXTUN2 v3.16b, v27.8h
332        LDR     x0, [sp, 48]            // Load cn_stride (caller's [sp], above the 48-byte save area)
333
334        UMAX    v0.16b, v0.16b, v4.16b
335        UMAX    v1.16b, v1.16b, v4.16b
336        SUB     x11, x11, 7             // rewind params pointer (undo the 4 + 2 + 1 byte post-increments)
337        UMAX    v2.16b, v2.16b, v4.16b
338        UMAX    v3.16b, v3.16b, v4.16b
339        SUBS    x1, x1, 16              // nc -= 16
340        UMIN    v0.16b, v0.16b, v5.16b
341        UMIN    v1.16b, v1.16b, v5.16b
342        UMIN    v2.16b, v2.16b, v5.16b
343        UMIN    v3.16b, v3.16b, v5.16b
344        B.LO    6f                      // fewer than 16 columns were left
345
346        # Store full 4 x 16; each C pointer then advances by cn_stride
347        ST1     {v3.16b},  [x7], x0
348        ST1     {v2.16b}, [x17], x0
349        ST1     {v1.16b}, [x16], x0
350        ST1     {v0.16b},  [x6], x0
351
352        SUB     x4, x4, x3              // a -= ks
353
354        # nc loop (flags are still those of SUBS x1 above; ST1 and SUB do not set them)
355        B.HI    0b
356
357        # Restore d8,d12-d15 from stack
358        LDP     d14, d15, [sp, 32]
359        LDP     d12, d13, [sp, 16]
360        LDR     d8,  [sp], 48
361        RET
362
363        # Remainder- 8 bytes of A
364        .p2align 3
3654:
366        # Is there a remainder?- 8 bytes of A (bit 3 of k)
367        TBZ     x0, 3, 5f
368
        // LDR d clears the upper 64 bits of v0-v3, so the full-width zero point
        // UDOTs below add nothing for the lanes that were not loaded.
369        LDR     d0, [x13], 8
370        LDR     q4,  [x5], 16
371        LDR     d1, [x14], 8
372        LDR     d2, [x15], 8
373        LDR     d3, [x10], 8
374        LDR     q5,  [x5], 16
375
376        UDOT    v12.4s, v8.16b,  v0.16b  // zero point sums += dot(v8, A)
377        UDOT    v13.4s, v8.16b,  v1.16b
378        UDOT    v14.4s, v8.16b,  v2.16b
379        UDOT    v15.4s, v8.16b,  v3.16b
380
        // K bytes 0-3 (lane 0), then K bytes 4-7 (lane 1)
381        UDOT    v16.4s, v4.16b,  v0.4b[0]
382        UDOT    v17.4s, v4.16b,  v1.4b[0]
383        LDP     q6, q7, [x5], 32
384        UDOT    v18.4s, v4.16b,  v2.4b[0]
385        UDOT    v19.4s, v4.16b,  v3.4b[0]
386        UDOT    v20.4s, v5.16b,  v0.4b[0]
387        UDOT    v21.4s, v5.16b,  v1.4b[0]
388        UDOT    v22.4s, v5.16b,  v2.4b[0]
389        UDOT    v23.4s, v5.16b,  v3.4b[0]
390        UDOT    v24.4s, v6.16b, v0.4b[0]
391        UDOT    v25.4s, v6.16b, v1.4b[0]
392        LDP     q4, q5, [x5], 32
393        UDOT    v26.4s, v6.16b, v2.4b[0]
394        UDOT    v27.4s, v6.16b, v3.4b[0]
395        UDOT    v28.4s, v7.16b, v0.4b[0]
396        UDOT    v29.4s, v7.16b, v1.4b[0]
397        UDOT    v30.4s, v7.16b, v2.4b[0]
398        UDOT    v31.4s, v7.16b, v3.4b[0]
399        UDOT    v16.4s, v4.16b,  v0.4b[1]
400        UDOT    v17.4s, v4.16b,  v1.4b[1]
401        LDP     q6, q7, [x5], 32
402        UDOT    v18.4s, v4.16b,  v2.4b[1]
403        UDOT    v19.4s, v4.16b,  v3.4b[1]
404        UDOT    v20.4s, v5.16b,  v0.4b[1]
405        UDOT    v21.4s, v5.16b,  v1.4b[1]
406        UDOT    v22.4s, v5.16b,  v2.4b[1]
407        UDOT    v23.4s, v5.16b,  v3.4b[1]
408        UDOT    v24.4s, v6.16b,  v0.4b[1]
409        UDOT    v25.4s, v6.16b,  v1.4b[1]
410        UDOT    v26.4s, v6.16b,  v2.4b[1]
411        UDOT    v27.4s, v6.16b,  v3.4b[1]
412        UDOT    v28.4s, v7.16b,  v0.4b[1]
413        UDOT    v29.4s, v7.16b,  v1.4b[1]
414        UDOT    v30.4s, v7.16b,  v2.4b[1]
415        UDOT    v31.4s, v7.16b,  v3.4b[1]
416        # Is there a remainder?- 4 bytes of A (bit 2 of k); if not, back to the ks loop
417        TBZ     x0, 2, 3b
418
419        # Remainder- 4 bytes of A (LDR s clears the rest of v0-v3)
4205:
421        LDR     s0, [x13], 4
422        LDR     q4, [x5], 16
423        LDR     s1, [x14], 4
424        LDR     s2, [x15], 4
425        LDR     s3, [x10], 4
426        LDR     q5, [x5], 16
427
428        UDOT    v12.4s, v8.16b,  v0.16b  // zero point sums += dot(v8, A)
429        UDOT    v13.4s, v8.16b,  v1.16b
430        UDOT    v14.4s, v8.16b,  v2.16b
431        UDOT    v15.4s, v8.16b,  v3.16b
432
433        UDOT    v16.4s, v4.16b,  v0.4b[0]
434        UDOT    v17.4s, v4.16b,  v1.4b[0]
435        UDOT    v18.4s, v4.16b,  v2.4b[0]
436        UDOT    v19.4s, v4.16b,  v3.4b[0]
437        LDP     q6, q7, [x5], 32
438        UDOT    v20.4s, v5.16b,  v0.4b[0]
439        UDOT    v21.4s, v5.16b,  v1.4b[0]
440        UDOT    v22.4s, v5.16b,  v2.4b[0]
441        UDOT    v23.4s, v5.16b,  v3.4b[0]
442        UDOT    v24.4s, v6.16b, v0.4b[0]
443        UDOT    v25.4s, v6.16b, v1.4b[0]
444        UDOT    v26.4s, v6.16b, v2.4b[0]
445        UDOT    v27.4s, v6.16b, v3.4b[0]
446        UDOT    v28.4s, v7.16b, v0.4b[0]
447        UDOT    v29.4s, v7.16b, v1.4b[0]
448        UDOT    v30.4s, v7.16b, v2.4b[0]
449        UDOT    v31.4s, v7.16b, v3.4b[0]
450        B       3b
451
452        # Store odd width: the low 4 bits of x1 (nc - 16) equal the remaining column count.
        // Each step stores 8/4/2/1 bytes per row if the matching bit is set, then
        // moves the next unstored bytes down to the bottom of v0-v3.
453        .p2align 3
4546:
455        TBZ     x1, 3, 7f
456        STR     d3, [x7], 8
457        STR     d2, [x17], 8
458        DUP     d3, v3.d[1]
459        DUP     d2, v2.d[1]
460        STR     d1, [x16], 8
461        STR     d0, [x6], 8
462        DUP     d1, v1.d[1]
463        DUP     d0, v0.d[1]
4647:
465        TBZ     x1, 2, 8f
466        STR     s3, [x7], 4
467        STR     s2, [x17], 4
468        DUP     s3, v3.s[1]
469        DUP     s2, v2.s[1]
470        STR     s1, [x16], 4
471        STR     s0, [x6], 4
472        DUP     s1, v1.s[1]
473        DUP     s0, v0.s[1]
4748:
475        TBZ     x1, 1, 9f
476        STR     h3, [x7], 2
477        STR     h2, [x17], 2
478        DUP     h3, v3.h[1]
479        DUP     h2, v2.h[1]
480        STR     h1, [x16], 2
481        STR     h0, [x6], 2
482        DUP     h1, v1.h[1]
483        DUP     h0, v0.h[1]
4849:
485        TBZ     x1, 0, 10f
486        STR     b3, [x7]
487        STR     b2, [x17]
488        STR     b1, [x16]
489        STR     b0, [x6]
49010:
491        # Restore d8,d12-d15 from stack
492        LDP     d14, d15, [sp, 32]
493        LDP     d12, d13, [sp, 16]
494        LDR     d8,  [sp], 48
495        RET
496
497END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
498
499#ifdef __ELF__
500.section ".note.GNU-stack","",%progbits
501#endif
502