xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const uint8_t* restrict a, x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
24
25# params structure is 12 bytes
26#  struct {
27#    uint8_t kernel_zero_point[4];
28#    float scale;
29#    int16_t output_zero_point;
30#    uint8_t output_min;
31#    uint8_t output_max;
32#  } fp32_neonv8;
33
34# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
35
36# Register usage
37# A0  x3 v0
38# A1 x15 v1
39# A2 x13 v2
40# A3  x4 v3
41# B   x5 v4  v5  v6  v7
42# C0  x6 v16 v20 v24 v28
43# C1  x8 v17 v21 v25 v29
44# C2  x9 v18 v22 v26 v30
45# C3  x7 v19 v23 v27 v31
46# zero_point v8 v12 v13 v14 v15
47# unused v9 v10 v11
48
49BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
50
        # Overview (derived from the instructions below):
        #   - Each pass starting at label 0 produces one block of 16 output
        #     columns for 4 row pointers; rows at or beyond mr alias the
        #     previous row (see the CSELs below).
        #   - v16-v31 start from values loaded from w and accumulate UDOT
        #     products of 4-byte groups of A with the packed weights.
        #   - v12-v15 accumulate kernel_zero_point * A for rows 0-3; the reduced
        #     sum is subtracted from the accumulators at label 2.
        #   - Results are converted to float, multiplied by scale, converted
        #     back with FCVTNS, narrowed with saturation, offset by
        #     output_zero_point, clamped with UMAX/UMIN and stored as bytes.
51        # Clamp A and C pointers
        # Note: the stack arguments are read from [sp] before sp is moved by
        # the STR below.
52        CMP     x0, 2                   // if mr < 2
53        LDP     x12, x11, [sp]          // cn_stride, params
54        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
55        ADD     x15, x3, x4             // a1 = a0 + a_stride
56        ADD     x8, x6, x7              // c1 = c0 + cm_stride
57
58        # Save d8,d12-d15 on stack
59        STR     d8,  [sp, -48]!         // pre-decrement sp by 48; d8 stored at new [sp]
60        CSEL    x15, x3, x15, LO        //   a1 = a0
61        CSEL    x8, x6,  x8, LO         //   c1 = c0
62        BIC     x2, x2, 3               // second half of the kc round-up to a multiple of 4
63
64        STP     d12, d13, [sp, 16]
65        ADD     x13, x15, x4            // a2 = a1 + a_stride
66        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
67                                        // if mr <= 2
                                        // (flags are still those of CMP x0, 2 above)
68        CSEL    x13, x15, x13, LS       //   a2 = a1
69        CSEL    x9,  x8,  x9, LS        //   c2 = c1
70
71        STP     d14, d15, [sp, 32]
72        CMP     x0, 4                   // if mr < 4
73        ADD     x4, x13, x4             // a3 = a2 + a_stride
74        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
75
        # Load the 4 kernel_zero_point bytes as one 32-bit element replicated
        # into all 4 lanes of v8. The post-increment leaves x11 pointing at
        # the scale field for the rest of the function.
76        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point
77
78        CSEL    x4, x13, x4, LO         //   a3 = a2
79        CSEL    x7,  x9, x7, LO         //   c3 = c2
80
        # Label 0: start of one 16-column output block. Re-entered through
        # B.NE 0b after a full 4 x 16 store while columns remain.
81        .p2align 3
820:
83        # Load initial bias from w into accumulators
84        LDP     q16, q20, [x5], 32
85
86        MOVI    v12.4s, 0               // clear the zero point accumulators (rows 0-3)
87        MOVI    v13.4s, 0
88        MOVI    v14.4s, 0
89        MOVI    v15.4s, 0
90
        # Copy the row 0 initial values to the accumulators of rows 1-3.
91        MOV     v17.16b, v16.16b
92        MOV     v18.16b, v16.16b
93        LDP     q24, q28, [x5], 32
94        MOV     v19.16b, v16.16b
95        MOV     v21.16b, v20.16b
96        SUBS    x0, x2, 16              // k = kc - 16
97        MOV     v22.16b, v20.16b
98        MOV     v23.16b, v20.16b
99        MOV     v25.16b, v24.16b
100        MOV     v26.16b, v24.16b
101        MOV     v27.16b, v24.16b
102        MOV     v29.16b, v28.16b
103        MOV     v30.16b, v28.16b
104        MOV     v31.16b, v28.16b
105
106        # Is there at least 16 bytes?
107        B.LO    3f
108
109        # Main loop - 16 bytes of A
        # Each iteration reads 16 bytes from every A row and 256 bytes of w
        # (two 16-byte loads plus seven 32-byte LDPs), alternating between the
        # v4/v5 and v6/v7 register pairs for the weights.
110        .p2align 3
1111:
112        LDR     q0,  [x3], 16
113        LDR     q4,  [x5], 16
114        LDR     q1, [x15], 16
115        LDR     q2, [x13], 16
116        LDR     q3,  [x4], 16
117        LDR     q5,  [x5], 16
118
119        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
120        UDOT    v13.4s, v8.16b,  v1.16b
121        UDOT    v14.4s, v8.16b,  v2.16b
122        UDOT    v15.4s, v8.16b,  v3.16b
123
124        UDOT    v16.4s, v4.16b,  v0.4b[0]
125        UDOT    v17.4s, v4.16b,  v1.4b[0]
126        LDP     q6, q7, [x5], 32
127        UDOT    v18.4s, v4.16b,  v2.4b[0]
128        UDOT    v19.4s, v4.16b,  v3.4b[0]
129        UDOT    v20.4s, v5.16b,  v0.4b[0]
130        UDOT    v21.4s, v5.16b,  v1.4b[0]
131        UDOT    v22.4s, v5.16b,  v2.4b[0]
132        UDOT    v23.4s, v5.16b,  v3.4b[0]
133        UDOT    v24.4s, v6.16b, v0.4b[0]
134        UDOT    v25.4s, v6.16b, v1.4b[0]
135        LDP     q4, q5, [x5], 32
136        UDOT    v26.4s, v6.16b, v2.4b[0]
137        UDOT    v27.4s, v6.16b, v3.4b[0]
138        UDOT    v28.4s, v7.16b, v0.4b[0]
139        UDOT    v29.4s, v7.16b, v1.4b[0]
140        UDOT    v30.4s, v7.16b, v2.4b[0]
141        UDOT    v31.4s, v7.16b, v3.4b[0]
142
143        UDOT    v16.4s, v4.16b,  v0.4b[1]
144        UDOT    v17.4s, v4.16b,  v1.4b[1]
145        LDP     q6, q7, [x5], 32
146        UDOT    v18.4s, v4.16b,  v2.4b[1]
147        UDOT    v19.4s, v4.16b,  v3.4b[1]
148        UDOT    v20.4s, v5.16b,  v0.4b[1]
149        UDOT    v21.4s, v5.16b,  v1.4b[1]
150        UDOT    v22.4s, v5.16b,  v2.4b[1]
151        UDOT    v23.4s, v5.16b,  v3.4b[1]
152        UDOT    v24.4s, v6.16b,  v0.4b[1]
153        UDOT    v25.4s, v6.16b,  v1.4b[1]
154        LDP     q4, q5, [x5], 32
155        UDOT    v26.4s, v6.16b,  v2.4b[1]
156        UDOT    v27.4s, v6.16b,  v3.4b[1]
157        UDOT    v28.4s, v7.16b,  v0.4b[1]
158        UDOT    v29.4s, v7.16b,  v1.4b[1]
159        UDOT    v30.4s, v7.16b,  v2.4b[1]
160        UDOT    v31.4s, v7.16b,  v3.4b[1]
161
162        UDOT    v16.4s, v4.16b,  v0.4b[2]
163        UDOT    v17.4s, v4.16b,  v1.4b[2]
164        LDP     q6, q7, [x5], 32
165        UDOT    v18.4s, v4.16b,  v2.4b[2]
166        UDOT    v19.4s, v4.16b,  v3.4b[2]
167        UDOT    v20.4s, v5.16b,  v0.4b[2]
168        UDOT    v21.4s, v5.16b,  v1.4b[2]
169        UDOT    v22.4s, v5.16b,  v2.4b[2]
170        UDOT    v23.4s, v5.16b,  v3.4b[2]
171        UDOT    v24.4s, v6.16b,  v0.4b[2]
172        UDOT    v25.4s, v6.16b,  v1.4b[2]
173        LDP     q4, q5, [x5], 32
174        UDOT    v26.4s, v6.16b,  v2.4b[2]
175        UDOT    v27.4s, v6.16b,  v3.4b[2]
176        UDOT    v28.4s, v7.16b,  v0.4b[2]
177        UDOT    v29.4s, v7.16b,  v1.4b[2]
178        UDOT    v30.4s, v7.16b,  v2.4b[2]
179        UDOT    v31.4s, v7.16b,  v3.4b[2]
180
181        UDOT    v16.4s, v4.16b,  v0.4b[3]
182        UDOT    v17.4s, v4.16b,  v1.4b[3]
183        LDP     q6, q7, [x5], 32
184        UDOT    v18.4s, v4.16b,  v2.4b[3]
185        UDOT    v19.4s, v4.16b,  v3.4b[3]
186        UDOT    v20.4s, v5.16b,  v0.4b[3]
187        UDOT    v21.4s, v5.16b,  v1.4b[3]
188        UDOT    v22.4s, v5.16b,  v2.4b[3]
189        UDOT    v23.4s, v5.16b,  v3.4b[3]
190        UDOT    v24.4s, v6.16b,  v0.4b[3]
191        UDOT    v25.4s, v6.16b,  v1.4b[3]
192        UDOT    v26.4s, v6.16b,  v2.4b[3]
193        UDOT    v27.4s, v6.16b,  v3.4b[3]
194        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
195        UDOT    v28.4s, v7.16b,  v0.4b[3]
196        UDOT    v29.4s, v7.16b,  v1.4b[3]
197        UDOT    v30.4s, v7.16b,  v2.4b[3]
198        UDOT    v31.4s, v7.16b,  v3.4b[3]
199        B.HS    1b
200
201        # Is there a remainder?- 4 to 12 bytes of A
        # Here x0 has gone below zero, but its low 4 bits still equal those of
        # kc because only multiples of 16 were subtracted.
202        TST     x0, 15
203        B.NE    3f
204
        # Label 2: all of K has been consumed. Two pairwise-add passes leave the
        # sum of all 4 lanes of each zero point accumulator in every lane.
2052:
206        ADDP    v0.4s, v12.4s, v12.4s
207        ADDP    v1.4s, v13.4s, v13.4s
208        ADDP    v2.4s, v14.4s, v14.4s
209        ADDP    v3.4s, v15.4s, v15.4s
210        ADDP    v12.4s, v0.4s, v0.4s
211        ADDP    v13.4s, v1.4s, v1.4s
212        ADDP    v14.4s, v2.4s, v2.4s
213        ADDP    v15.4s, v3.4s, v3.4s
214
215        # Subtract zero point from accumulators
216        SUB     v16.4s, v16.4s, v12.4s
217        SUB     v17.4s, v17.4s, v13.4s
218        SUB     v18.4s, v18.4s, v14.4s
219        SUB     v19.4s, v19.4s, v15.4s
220        SUB     v20.4s, v20.4s, v12.4s
221        SUB     v21.4s, v21.4s, v13.4s
222        SUB     v22.4s, v22.4s, v14.4s
223        SUB     v23.4s, v23.4s, v15.4s
224        SUB     v24.4s, v24.4s, v12.4s
225        SUB     v25.4s, v25.4s, v13.4s
226        SUB     v26.4s, v26.4s, v14.4s
227        SUB     v27.4s, v27.4s, v15.4s
228        SUB     v28.4s, v28.4s, v12.4s
229        SUB     v29.4s, v29.4s, v13.4s
230        SUB     v30.4s, v30.4s, v14.4s
231        SUB     v31.4s, v31.4s, v15.4s
232
233        SCVTF   v16.4s, v16.4s
234        SCVTF   v17.4s, v17.4s
235        # Apply params - scale, bias and clamp
236        LD1R    {v4.4s}, [x11], 4       // scale
237        SCVTF   v18.4s, v18.4s
238        SCVTF   v19.4s, v19.4s
239        SCVTF   v20.4s, v20.4s
240        SCVTF   v21.4s, v21.4s
241        SCVTF   v22.4s, v22.4s
242        SCVTF   v23.4s, v23.4s
243        SCVTF   v24.4s, v24.4s
244        SCVTF   v25.4s, v25.4s
245        SCVTF   v26.4s, v26.4s
246        SCVTF   v27.4s, v27.4s
247        SCVTF   v28.4s, v28.4s
248        SCVTF   v29.4s, v29.4s
249        SCVTF   v30.4s, v30.4s
250        SCVTF   v31.4s, v31.4s
251
252        FMUL    v16.4s, v16.4s, v4.4s
253        FMUL    v17.4s, v17.4s, v4.4s
254        FMUL    v18.4s, v18.4s, v4.4s
255        FMUL    v19.4s, v19.4s, v4.4s
256        FMUL    v20.4s, v20.4s, v4.4s
257        FMUL    v21.4s, v21.4s, v4.4s
258        FMUL    v22.4s, v22.4s, v4.4s
259        FMUL    v23.4s, v23.4s, v4.4s
260        FMUL    v24.4s, v24.4s, v4.4s
261        FMUL    v25.4s, v25.4s, v4.4s
262        FMUL    v26.4s, v26.4s, v4.4s
263        FMUL    v27.4s, v27.4s, v4.4s
264        FMUL    v28.4s, v28.4s, v4.4s
265        FMUL    v29.4s, v29.4s, v4.4s
266        FMUL    v30.4s, v30.4s, v4.4s
267        FMUL    v31.4s, v31.4s, v4.4s
268
        # Convert the scaled floats back to signed 32-bit integers (FCVTNS
        # rounds to nearest, ties to even).
269        FCVTNS  v16.4s, v16.4s
270        FCVTNS  v17.4s, v17.4s
271        FCVTNS  v18.4s, v18.4s
272        FCVTNS  v19.4s, v19.4s
273        FCVTNS  v20.4s, v20.4s
274        FCVTNS  v21.4s, v21.4s
275        FCVTNS  v22.4s, v22.4s
276        FCVTNS  v23.4s, v23.4s
277        FCVTNS  v24.4s, v24.4s
278        FCVTNS  v25.4s, v25.4s
279        FCVTNS  v26.4s, v26.4s
280        FCVTNS  v27.4s, v27.4s
281        FCVTNS  v28.4s, v28.4s
282        FCVTNS  v29.4s, v29.4s
283        FCVTNS  v30.4s, v30.4s
284        FCVTNS  v31.4s, v31.4s
285
        # Narrow 32 -> 16 bits with signed saturation. Columns 0-3 and 4-7 are
        # packed into v16-v19, columns 8-11 and 12-15 into v24-v27.
286        SQXTN   v16.4h, v16.4s
287        SQXTN   v17.4h, v17.4s
288        SQXTN   v18.4h, v18.4s
289        SQXTN   v19.4h, v19.4s
290        SQXTN   v24.4h, v24.4s
291        SQXTN   v25.4h, v25.4s
292        SQXTN   v26.4h, v26.4s
293        SQXTN   v27.4h, v27.4s
294        LD1R    {v6.8h}, [x11], 2       // output_zero_point (added below)
295
296        SQXTN2  v16.8h, v20.4s
297        SQXTN2  v17.8h, v21.4s
298        SQXTN2  v18.8h, v22.4s
299        SQXTN2  v19.8h, v23.4s
300        SQXTN2  v24.8h, v28.4s
301        SQXTN2  v25.8h, v29.4s
302        SQXTN2  v26.8h, v30.4s
303        SQXTN2  v27.8h, v31.4s
304
305        SQADD   v16.8h, v16.8h, v6.8h
306        SQADD   v17.8h, v17.8h, v6.8h
307        SQADD   v18.8h, v18.8h, v6.8h
308        SQADD   v19.8h, v19.8h, v6.8h
309        SQADD   v24.8h, v24.8h, v6.8h
310        SQADD   v25.8h, v25.8h, v6.8h
311        SQADD   v26.8h, v26.8h, v6.8h
312        SQADD   v27.8h, v27.8h, v6.8h
313        LD1R    {v4.16b}, [x11], 1      // clamp min value
314
        # Narrow 16 -> 8 bits with unsigned saturation; v0-v3 end up holding
        # the 16 output bytes of rows 0-3.
315        SQXTUN  v0.8b, v16.8h
316        SQXTUN  v1.8b, v17.8h
317        SQXTUN  v2.8b, v18.8h
318        SQXTUN  v3.8b, v19.8h
319        LD1R    {v5.16b}, [x11]         // clamp max value
320        SQXTUN2 v0.16b, v24.8h
321        SQXTUN2 v1.16b, v25.8h
322        SQXTUN2 v2.16b, v26.8h
323        SQXTUN2 v3.16b, v27.8h
324
325        SUB     x11, x11, 7             // rewind params pointer to scale (4 + 2 + 1 bytes)
326
327        UMAX    v0.16b, v0.16b, v4.16b
328        UMAX    v1.16b, v1.16b, v4.16b
329        UMAX    v2.16b, v2.16b, v4.16b
330        UMAX    v3.16b, v3.16b, v4.16b
331        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO and B.NE below
332        UMIN    v0.16b, v0.16b, v5.16b
333        UMIN    v1.16b, v1.16b, v5.16b
334        UMIN    v2.16b, v2.16b, v5.16b
335        UMIN    v3.16b, v3.16b, v5.16b
336        B.LO    5f
337
338        # Store full 4 x 16
        # Each C pointer advances by cn_stride (x12); each A pointer is moved
        # back by the rounded kc so the next column block re-reads the same rows.
339        ST1     {v0.16b}, [x6], x12
340        SUB     x3,  x3, x2             // a0 -= kc
341        ST1     {v1.16b}, [x8], x12
342        SUB     x15, x15, x2            // a1 -= kc
343        ST1     {v2.16b}, [x9], x12
344        SUB     x13, x13, x2            // a2 -= kc
345        ST1     {v3.16b}, [x7], x12
346        SUB     x4,  x4, x2             // a3 -= kc
347        B.NE    0b                      // columns remain: process the next block
348
349        # Restore d8,d12-d15 from stack
350        LDP     d14, d15, [sp, 32]
351        LDP     d12, d13, [sp, 16]
352        LDR     d8,  [sp], 48
353        RET
354
355        # Remainder- 8 bytes of A
        # Reached with x0 below zero (k - 16). Because kc was rounded to a
        # multiple of 4, bits 3 and 2 of x0 say whether an 8-byte and/or a
        # 4-byte chunk of A is still pending.
356        .p2align 3
3573:
358        # Is there a remainder?- 8 bytes of A
359        TBZ     x0, 3, 4f
360
        # LDR d-form loads 8 bytes and zeroes the upper half of the vector, so
        # the full-width zero point UDOTs below add nothing for the upper lanes.
361        LDR     d0,  [x3], 8
362        LDR     q4,  [x5], 16
363        LDR     d1, [x15], 8
364        LDR     d2, [x13], 8
365        LDR     d3,  [x4], 8
366        LDR     q5,  [x5], 16
367
368        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
369        UDOT    v13.4s, v8.16b,  v1.16b
370        UDOT    v14.4s, v8.16b,  v2.16b
371        UDOT    v15.4s, v8.16b,  v3.16b
372
373        UDOT    v16.4s, v4.16b,  v0.4b[0]
374        UDOT    v17.4s, v4.16b,  v1.4b[0]
375        LDP     q6, q7, [x5], 32
376        UDOT    v18.4s, v4.16b,  v2.4b[0]
377        UDOT    v19.4s, v4.16b,  v3.4b[0]
378        UDOT    v20.4s, v5.16b,  v0.4b[0]
379        UDOT    v21.4s, v5.16b,  v1.4b[0]
380        UDOT    v22.4s, v5.16b,  v2.4b[0]
381        UDOT    v23.4s, v5.16b,  v3.4b[0]
382        UDOT    v24.4s, v6.16b, v0.4b[0]
383        UDOT    v25.4s, v6.16b, v1.4b[0]
384        LDP     q4, q5, [x5], 32
385        UDOT    v26.4s, v6.16b, v2.4b[0]
386        UDOT    v27.4s, v6.16b, v3.4b[0]
387        UDOT    v28.4s, v7.16b, v0.4b[0]
388        UDOT    v29.4s, v7.16b, v1.4b[0]
389        UDOT    v30.4s, v7.16b, v2.4b[0]
390        UDOT    v31.4s, v7.16b, v3.4b[0]
391        UDOT    v16.4s, v4.16b,  v0.4b[1]
392        UDOT    v17.4s, v4.16b,  v1.4b[1]
393        LDP     q6, q7, [x5], 32
394        UDOT    v18.4s, v4.16b,  v2.4b[1]
395        UDOT    v19.4s, v4.16b,  v3.4b[1]
396        UDOT    v20.4s, v5.16b,  v0.4b[1]
397        UDOT    v21.4s, v5.16b,  v1.4b[1]
398        UDOT    v22.4s, v5.16b,  v2.4b[1]
399        UDOT    v23.4s, v5.16b,  v3.4b[1]
400        UDOT    v24.4s, v6.16b,  v0.4b[1]
401        UDOT    v25.4s, v6.16b,  v1.4b[1]
402        UDOT    v26.4s, v6.16b,  v2.4b[1]
403        UDOT    v27.4s, v6.16b,  v3.4b[1]
404        UDOT    v28.4s, v7.16b,  v0.4b[1]
405        UDOT    v29.4s, v7.16b,  v1.4b[1]
406        UDOT    v30.4s, v7.16b,  v2.4b[1]
407        UDOT    v31.4s, v7.16b,  v3.4b[1]
408        # Is there a remainder?- 4 bytes of A
409        TBZ     x0, 2, 2b
410
411        # Remainder- 4 bytes of A
        # LDR s-form loads 4 bytes and zeroes the rest of the vector; only
        # element [0] of each A register is used for the weight products.
4124:
413        LDR     s0,  [x3], 4
414        LDR     q4,  [x5], 16
415        LDR     s1, [x15], 4
416        LDR     s2, [x13], 4
417        LDR     s3,  [x4], 4
418        LDR     q5, [x5], 16
419
420        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
421        UDOT    v13.4s, v8.16b,  v1.16b
422        UDOT    v14.4s, v8.16b,  v2.16b
423        UDOT    v15.4s, v8.16b,  v3.16b
424
425        UDOT    v16.4s, v4.16b,  v0.4b[0]
426        UDOT    v17.4s, v4.16b,  v1.4b[0]
427        UDOT    v18.4s, v4.16b,  v2.4b[0]
428        UDOT    v19.4s, v4.16b,  v3.4b[0]
429        LDP     q6, q7, [x5], 32
430        UDOT    v20.4s, v5.16b,  v0.4b[0]
431        UDOT    v21.4s, v5.16b,  v1.4b[0]
432        UDOT    v22.4s, v5.16b,  v2.4b[0]
433        UDOT    v23.4s, v5.16b,  v3.4b[0]
434        UDOT    v24.4s, v6.16b, v0.4b[0]
435        UDOT    v25.4s, v6.16b, v1.4b[0]
436        UDOT    v26.4s, v6.16b, v2.4b[0]
437        UDOT    v27.4s, v6.16b, v3.4b[0]
438        UDOT    v28.4s, v7.16b, v0.4b[0]
439        UDOT    v29.4s, v7.16b, v1.4b[0]
440        UDOT    v30.4s, v7.16b, v2.4b[0]
441        UDOT    v31.4s, v7.16b, v3.4b[0]
442        B       2b
443
444        # Store odd width
        # Reached when fewer than 16 columns remained, so x1 (nc - 16) is below
        # zero and its low 4 bits equal the number of columns left. For each
        # set bit 8, 4, 2 or 1 bytes are stored per row, and DUP then moves the
        # next unstored bytes down to the bottom of v0-v3.
445        .p2align 3
4465:
447        TBZ     x1, 3, 6f
448        STR     d0, [x6], 8
449        STR     d1, [x8], 8
450        DUP     d0, v0.d[1]
451        DUP     d1, v1.d[1]
452        STR     d2, [x9], 8
453        STR     d3, [x7], 8
454        DUP     d2, v2.d[1]
455        DUP     d3, v3.d[1]
4566:
457        TBZ     x1, 2, 7f
458        STR     s0, [x6], 4
459        STR     s1, [x8], 4
460        DUP     s0, v0.s[1]
461        DUP     s1, v1.s[1]
462        STR     s2, [x9], 4
463        STR     s3, [x7], 4
464        DUP     s2, v2.s[1]
465        DUP     s3, v3.s[1]
4667:
467        TBZ     x1, 1, 8f
468        STR     h0, [x6], 2
469        STR     h1, [x8], 2
470        DUP     h0, v0.h[1]
471        DUP     h1, v1.h[1]
472        STR     h2, [x9], 2
473        STR     h3, [x7], 2
474        DUP     h2, v2.h[1]
475        DUP     h3, v3.h[1]
4768:
477        TBZ     x1, 0, 9f
478        STR     b0, [x6]
479        STR     b1, [x8]
480        STR     b2, [x9]
481        STR     b3, [x7]
4829:
483        # Restore d8,d12-d15 from stack
484        LDP     d14, d15, [sp, 32]
485        LDP     d12, d13, [sp, 16]
486        LDR     d8,  [sp], 48           // post-increment releases the 48-byte frame
487        RET
488
489END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
490
// On ELF targets only, emit an empty .note.GNU-stack section (no flags).
// NOTE(review): by GNU toolchain convention this marks the object as not
// needing an executable stack - confirm for the toolchains in use.
491#ifdef __ELF__
492.section ".note.GNU-stack","",%progbits
493#endif
494