xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# params structure is 8 bytes
26#  struct {
27#    float scale;
28#    int16_t output_zero_point;
29#    int8_t output_min;
30#    int8_t output_max;
31#  } fp32_neonv8;
32
33# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
34
35# Register usage
36# A0  x3 v0
37# A1 x15 v1
38# A2 x13 v2
39# A3  x4 v3
40# B   x5 v4  v5  v6  v7
41# C0  x6 v16 v20 v24 v28
42# C1  x8 v17 v21 v25 v29
43# C2  x9 v18 v22 v26 v30
44# C3  x7 v19 v23 v27 v31
45# unused v8 v9 v10 v11 v12 v13 v14 v15
46
47BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
48
        # Overview: for every block of 16 output columns, seed 4x16 int32
        # accumulators from w, accumulate SDOT products over kc bytes of each
        # A row, then scale in fp32, round, add output_zero_point, clamp to
        # [output_min, output_max] and store up to 4 rows of int8.
        # No call is made, sp is only read (stack arguments), and only
        # x0-x9, x11-x13, x15, v0-v7 and v16-v31 are written.

49        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they compute and store
        # the same values as the row they alias.
        # The flags set by the first CMP feed both the LO (mr < 2) and the
        # LS (mr <= 2) selects; the ADD/BIC in between leave flags untouched.
50        CMP     x0, 2                   // if mr < 2
51        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
52        ADD     x15, x3, x4             // a1 = a0 + a_stride
53        ADD     x8, x6, x7              // c1 = c0 + cm_stride
54        CSEL    x15, x3, x15, LO        //   a1 = a0
55        CSEL    x8, x6,  x8, LO         //   c1 = c0
56        BIC     x2, x2, 3               // kc is now rounded up to a multiple of 4
57
58        ADD     x13, x15, x4            // a2 = a1 + a_stride
59        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
60                                        // if mr <= 2
61        CSEL    x13, x15, x13, LS       //   a2 = a1
62        CSEL    x9,  x8,  x9, LS        //   c2 = c1
63
64        LDP     x12, x11, [sp]          // x12 = cn_stride, x11 = params pointer
65
66        CMP     x0, 4                   // if mr < 4
67        ADD     x4, x13, x4             // a3 = a2 + a_stride
68        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
69        CSEL    x4, x13, x4, LO         //   a3 = a2
70        CSEL    x7,  x9, x7, LO         //   c3 = c2
71
        # Outer loop: one pass per block of 16 output columns.
72        .p2align 3
730:
74        # Load initial bias from w into accumulators
        # The 64 bytes read here are 16 int32 values for row 0: v16, v20, v24
        # and v28 hold columns 0-3, 4-7, 8-11 and 12-15.  The MOVs replicate
        # them into the accumulators of rows 1-3.
75        LDP     q16, q20, [x5], 32
76        MOV     v17.16b, v16.16b
77        MOV     v18.16b, v16.16b
78        LDP     q24, q28, [x5], 32
79        MOV     v19.16b, v16.16b
80        MOV     v21.16b, v20.16b
81        MOV     v22.16b, v20.16b
82        MOV     v23.16b, v20.16b
83        MOV     v25.16b, v24.16b
84        MOV     v26.16b, v24.16b
85        SUBS    x0, x2, 16              // k = kc - 16
86        MOV     v27.16b, v24.16b
87        MOV     v29.16b, v28.16b
88        MOV     v30.16b, v28.16b
89        MOV     v31.16b, v28.16b
90        # Is there at least 16 bytes?
        # The MOVs above do not alter the flags produced by the SUBS.
91        B.LO    3f
92
93        # Main loop - 16 bytes of A
        # Each pass reads 16 bytes from every A row and 256 bytes of w, handled
        # as four groups of 4 k-bytes (A lanes [0] to [3]).  Within a group
        # each of v4-v7 supplies 4 output columns x 4 k-bytes, and every SDOT
        # adds one 4-byte dot product to each int32 lane of an accumulator.
        # The next group is loaded into v4/v5 while v6/v7 are still being
        # consumed, and into v6/v7 while v4/v5 are being consumed.
94        .p2align 3
951:
96        LDR     q0,  [x3], 16
97        LDR     q4,  [x5], 16
98        LDR     q1, [x15], 16
99        LDR     q2, [x13], 16
100        LDR     q3,  [x4], 16
101        LDR     q5,  [x5], 16
102        SDOT    v16.4s, v4.16b,  v0.4b[0]
103        SDOT    v17.4s, v4.16b,  v1.4b[0]
104        LDP     q6, q7, [x5], 32
105        SDOT    v18.4s, v4.16b,  v2.4b[0]
106        SDOT    v19.4s, v4.16b,  v3.4b[0]
107        SDOT    v20.4s, v5.16b,  v0.4b[0]
108        SDOT    v21.4s, v5.16b,  v1.4b[0]
109        SDOT    v22.4s, v5.16b,  v2.4b[0]
110        SDOT    v23.4s, v5.16b,  v3.4b[0]
111        SDOT    v24.4s, v6.16b, v0.4b[0]
112        SDOT    v25.4s, v6.16b, v1.4b[0]
113        LDP     q4, q5, [x5], 32
114        SDOT    v26.4s, v6.16b, v2.4b[0]
115        SDOT    v27.4s, v6.16b, v3.4b[0]
116        SDOT    v28.4s, v7.16b, v0.4b[0]
117        SDOT    v29.4s, v7.16b, v1.4b[0]
118        SDOT    v30.4s, v7.16b, v2.4b[0]
119        SDOT    v31.4s, v7.16b, v3.4b[0]
120
121        SDOT    v16.4s, v4.16b,  v0.4b[1]
122        SDOT    v17.4s, v4.16b,  v1.4b[1]
123        LDP     q6, q7, [x5], 32
124        SDOT    v18.4s, v4.16b,  v2.4b[1]
125        SDOT    v19.4s, v4.16b,  v3.4b[1]
126        SDOT    v20.4s, v5.16b,  v0.4b[1]
127        SDOT    v21.4s, v5.16b,  v1.4b[1]
128        SDOT    v22.4s, v5.16b,  v2.4b[1]
129        SDOT    v23.4s, v5.16b,  v3.4b[1]
130        SDOT    v24.4s, v6.16b,  v0.4b[1]
131        SDOT    v25.4s, v6.16b,  v1.4b[1]
132        LDP     q4, q5, [x5], 32
133        SDOT    v26.4s, v6.16b,  v2.4b[1]
134        SDOT    v27.4s, v6.16b,  v3.4b[1]
135        SDOT    v28.4s, v7.16b,  v0.4b[1]
136        SDOT    v29.4s, v7.16b,  v1.4b[1]
137        SDOT    v30.4s, v7.16b,  v2.4b[1]
138        SDOT    v31.4s, v7.16b,  v3.4b[1]
139
140        SDOT    v16.4s, v4.16b,  v0.4b[2]
141        SDOT    v17.4s, v4.16b,  v1.4b[2]
142        LDP     q6, q7, [x5], 32
143        SDOT    v18.4s, v4.16b,  v2.4b[2]
144        SDOT    v19.4s, v4.16b,  v3.4b[2]
145        SDOT    v20.4s, v5.16b,  v0.4b[2]
146        SDOT    v21.4s, v5.16b,  v1.4b[2]
147        SDOT    v22.4s, v5.16b,  v2.4b[2]
148        SDOT    v23.4s, v5.16b,  v3.4b[2]
149        SDOT    v24.4s, v6.16b,  v0.4b[2]
150        SDOT    v25.4s, v6.16b,  v1.4b[2]
151        LDP     q4, q5, [x5], 32
152        SDOT    v26.4s, v6.16b,  v2.4b[2]
153        SDOT    v27.4s, v6.16b,  v3.4b[2]
154        SDOT    v28.4s, v7.16b,  v0.4b[2]
155        SDOT    v29.4s, v7.16b,  v1.4b[2]
156        SDOT    v30.4s, v7.16b,  v2.4b[2]
157        SDOT    v31.4s, v7.16b,  v3.4b[2]
158
159        SDOT    v16.4s, v4.16b,  v0.4b[3]
160        SDOT    v17.4s, v4.16b,  v1.4b[3]
161        LDP     q6, q7, [x5], 32
162        SDOT    v18.4s, v4.16b,  v2.4b[3]
163        SDOT    v19.4s, v4.16b,  v3.4b[3]
164        SDOT    v20.4s, v5.16b,  v0.4b[3]
165        SDOT    v21.4s, v5.16b,  v1.4b[3]
166        SDOT    v22.4s, v5.16b,  v2.4b[3]
167        SDOT    v23.4s, v5.16b,  v3.4b[3]
168        SDOT    v24.4s, v6.16b,  v0.4b[3]
169        SDOT    v25.4s, v6.16b,  v1.4b[3]
170        SDOT    v26.4s, v6.16b,  v2.4b[3]
171        SDOT    v27.4s, v6.16b,  v3.4b[3]
172        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
173        SDOT    v28.4s, v7.16b,  v0.4b[3]
174        SDOT    v29.4s, v7.16b,  v1.4b[3]
175        SDOT    v30.4s, v7.16b,  v2.4b[3]
176        SDOT    v31.4s, v7.16b,  v3.4b[3]
177        B.HS    1b                      // loop while another 16 bytes of A remain
178
179        # Is there a remainder? - 4 to 12 bytes of A
        # Here x0 = (bytes left) - 16, so its low 4 bits equal the bytes left;
        # kc is a multiple of 4, which leaves 0, 4, 8 or 12.
180        TST     x0, 15
181        B.NE    3f
182
        # Requantize the 4x16 int32 accumulators to int8.
1832:
184        SCVTF   v16.4s, v16.4s          // int32 -> fp32
185        SCVTF   v17.4s, v17.4s
186        # Apply params - scale, bias and clamp
        # The params fields are read in struct order with post-incrementing
        # loads; x11 is rewound before the store so the next pass starts over.
187        LD1R    {v4.4s}, [x11], 4       // broadcast scale
188        SCVTF   v18.4s, v18.4s
189        SCVTF   v19.4s, v19.4s
190        SCVTF   v20.4s, v20.4s
191        SCVTF   v21.4s, v21.4s
192        SCVTF   v22.4s, v22.4s
193        SCVTF   v23.4s, v23.4s
194        SCVTF   v24.4s, v24.4s
195        SCVTF   v25.4s, v25.4s
196        SCVTF   v26.4s, v26.4s
197        SCVTF   v27.4s, v27.4s
198        SCVTF   v28.4s, v28.4s
199        SCVTF   v29.4s, v29.4s
200        SCVTF   v30.4s, v30.4s
201        SCVTF   v31.4s, v31.4s
202
        # Multiply every accumulator by the scale.
203        FMUL    v16.4s, v16.4s, v4.4s
204        FMUL    v17.4s, v17.4s, v4.4s
205        FMUL    v18.4s, v18.4s, v4.4s
206        FMUL    v19.4s, v19.4s, v4.4s
207        FMUL    v20.4s, v20.4s, v4.4s
208        FMUL    v21.4s, v21.4s, v4.4s
209        FMUL    v22.4s, v22.4s, v4.4s
210        FMUL    v23.4s, v23.4s, v4.4s
211        FMUL    v24.4s, v24.4s, v4.4s
212        FMUL    v25.4s, v25.4s, v4.4s
213        FMUL    v26.4s, v26.4s, v4.4s
214        FMUL    v27.4s, v27.4s, v4.4s
215        FMUL    v28.4s, v28.4s, v4.4s
216        FMUL    v29.4s, v29.4s, v4.4s
217        FMUL    v30.4s, v30.4s, v4.4s
218        FMUL    v31.4s, v31.4s, v4.4s
219
        # Convert back to int32, rounding to nearest with ties to even.
220        FCVTNS  v16.4s, v16.4s
221        FCVTNS  v17.4s, v17.4s
222        FCVTNS  v18.4s, v18.4s
223        FCVTNS  v19.4s, v19.4s
224        FCVTNS  v20.4s, v20.4s
225        FCVTNS  v21.4s, v21.4s
226        FCVTNS  v22.4s, v22.4s
227        FCVTNS  v23.4s, v23.4s
228        FCVTNS  v24.4s, v24.4s
229        FCVTNS  v25.4s, v25.4s
230        FCVTNS  v26.4s, v26.4s
231        FCVTNS  v27.4s, v27.4s
232        FCVTNS  v28.4s, v28.4s
233        FCVTNS  v29.4s, v29.4s
234        FCVTNS  v30.4s, v30.4s
235        FCVTNS  v31.4s, v31.4s
236
        # Saturating narrow to int16: per row, columns 0-7 are packed into
        # v16-v19 and columns 8-15 into v24-v27.
237        SQXTN   v16.4h, v16.4s
238        SQXTN   v17.4h, v17.4s
239        SQXTN   v18.4h, v18.4s
240        SQXTN   v19.4h, v19.4s
241        SQXTN   v24.4h, v24.4s
242        SQXTN   v25.4h, v25.4s
243        SQXTN   v26.4h, v26.4s
244        SQXTN   v27.4h, v27.4s
245        LD1R    {v6.8h}, [x11], 2       // broadcast output_zero_point (added below)
246
247        SQXTN2  v16.8h, v20.4s
248        SQXTN2  v17.8h, v21.4s
249        SQXTN2  v18.8h, v22.4s
250        SQXTN2  v19.8h, v23.4s
251        SQXTN2  v24.8h, v28.4s
252        SQXTN2  v25.8h, v29.4s
253        SQXTN2  v26.8h, v30.4s
254        SQXTN2  v27.8h, v31.4s
255
        # Saturating add of output_zero_point in int16.
256        SQADD   v16.8h, v16.8h, v6.8h
257        SQADD   v17.8h, v17.8h, v6.8h
258        SQADD   v18.8h, v18.8h, v6.8h
259        SQADD   v19.8h, v19.8h, v6.8h
260        SQADD   v24.8h, v24.8h, v6.8h
261        SQADD   v25.8h, v25.8h, v6.8h
262        SQADD   v26.8h, v26.8h, v6.8h
263        SQADD   v27.8h, v27.8h, v6.8h
264        LD1R    {v4.16b}, [x11], 1      // clamp min value
265
        # Saturating narrow to int8: v0-v3 end up holding the 16 outputs of
        # rows 0-3.
266        SQXTN   v0.8b, v16.8h
267        SQXTN   v1.8b, v17.8h
268        SQXTN   v2.8b, v18.8h
269        SQXTN   v3.8b, v19.8h
270        LD1R    {v5.16b}, [x11]         // clamp max value
271        SQXTN2  v0.16b, v24.8h
272        SQXTN2  v1.16b, v25.8h
273        SQXTN2  v2.16b, v26.8h
274        SQXTN2  v3.16b, v27.8h
275        SUB     x11, x11, 7            // rewind params pointer (4 + 2 + 1 bytes of post-increment)
276
277        SMAX    v0.16b, v0.16b, v4.16b
278        SMAX    v1.16b, v1.16b, v4.16b
279        SMAX    v2.16b, v2.16b, v4.16b
280        SMAX    v3.16b, v3.16b, v4.16b
281        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.NE below
282        SMIN    v0.16b, v0.16b, v5.16b
283        SMIN    v1.16b, v1.16b, v5.16b
284        SMIN    v2.16b, v2.16b, v5.16b
285        SMIN    v3.16b, v3.16b, v5.16b
286        B.LO    5f                      // fewer than 16 columns were left
287
288        # Store full 4 x 16
        # Each ST1 post-increments its C pointer by cn_stride (x12).  The A
        # pointers advanced by the rounded kc during accumulation and are
        # rewound for the next column block.  ST1 and SUB leave the flags of
        # the SUBS on x1 intact for the final B.NE.
289        ST1     {v0.16b}, [x6], x12
290        SUB     x3,  x3, x2             // a0 -= kc
291        ST1     {v1.16b}, [x8], x12
292        SUB     x15, x15, x2            // a1 -= kc
293        ST1     {v2.16b}, [x9], x12
294        SUB     x13, x13, x2            // a2 -= kc
295        ST1     {v3.16b}, [x7], x12
296        SUB     x4,  x4, x2             // a3 -= kc
297        B.NE    0b                      // more columns remain (nc != 0)
298        RET
299
300        # Remainder - 4, 8 or 12 bytes of A
        # Reached with x0 = (bytes left) - 16: bit 3 selects the 8-byte step
        # and bit 2 the 4-byte step.
        # NOTE(review): when entered from the kc < 16 test with a rounded kc
        # of 0, both bits are clear and the 4-byte step still runs; presumably
        # callers guarantee kc != 0 - confirm.
301        .p2align 3
3023:
303        # Is there a remainder?- 8 bytes of A
304        TBZ     x0, 3, 4f
305
        # 8 bytes from every A row and 128 bytes of w: two groups of 4 k-bytes.
306        LDR     d0,  [x3], 8
307        LDR     q4,  [x5], 16
308        LDR     d1, [x15], 8
309        LDR     d2, [x13], 8
310        LDR     d3,  [x4], 8
311        LDR     q5,  [x5], 16
312        SDOT    v16.4s, v4.16b,  v0.4b[0]
313        SDOT    v17.4s, v4.16b,  v1.4b[0]
314        LDP     q6, q7, [x5], 32
315        SDOT    v18.4s, v4.16b,  v2.4b[0]
316        SDOT    v19.4s, v4.16b,  v3.4b[0]
317        SDOT    v20.4s, v5.16b,  v0.4b[0]
318        SDOT    v21.4s, v5.16b,  v1.4b[0]
319        SDOT    v22.4s, v5.16b,  v2.4b[0]
320        SDOT    v23.4s, v5.16b,  v3.4b[0]
321        SDOT    v24.4s, v6.16b, v0.4b[0]
322        SDOT    v25.4s, v6.16b, v1.4b[0]
323        LDP     q4, q5, [x5], 32
324        SDOT    v26.4s, v6.16b, v2.4b[0]
325        SDOT    v27.4s, v6.16b, v3.4b[0]
326        SDOT    v28.4s, v7.16b, v0.4b[0]
327        SDOT    v29.4s, v7.16b, v1.4b[0]
328        SDOT    v30.4s, v7.16b, v2.4b[0]
329        SDOT    v31.4s, v7.16b, v3.4b[0]
330        SDOT    v16.4s, v4.16b,  v0.4b[1]
331        SDOT    v17.4s, v4.16b,  v1.4b[1]
332        LDP     q6, q7, [x5], 32
333        SDOT    v18.4s, v4.16b,  v2.4b[1]
334        SDOT    v19.4s, v4.16b,  v3.4b[1]
335        SDOT    v20.4s, v5.16b,  v0.4b[1]
336        SDOT    v21.4s, v5.16b,  v1.4b[1]
337        SDOT    v22.4s, v5.16b,  v2.4b[1]
338        SDOT    v23.4s, v5.16b,  v3.4b[1]
339        SDOT    v24.4s, v6.16b,  v0.4b[1]
340        SDOT    v25.4s, v6.16b,  v1.4b[1]
341        SDOT    v26.4s, v6.16b,  v2.4b[1]
342        SDOT    v27.4s, v6.16b,  v3.4b[1]
343        SDOT    v28.4s, v7.16b,  v0.4b[1]
344        SDOT    v29.4s, v7.16b,  v1.4b[1]
345        SDOT    v30.4s, v7.16b,  v2.4b[1]
346        SDOT    v31.4s, v7.16b,  v3.4b[1]
347        # Is there a remainder?- 4 bytes of A
348        TBZ     x0, 2, 2b               // none left: go requantize
349
350        # Remainder- 4 bytes of A
        # 4 bytes from every A row and 64 bytes of w: one group of 4 k-bytes.
3514:
352        LDR     s0,  [x3], 4
353        LDR     q4, [x5], 16
354        LDR     s1, [x15], 4
355        LDR     s2, [x13], 4
356        LDR     s3,  [x4], 4
357        SDOT    v16.4s, v4.16b,  v0.4b[0]
358        LDR     q5, [x5], 16
359        SDOT    v17.4s, v4.16b,  v1.4b[0]
360        SDOT    v18.4s, v4.16b,  v2.4b[0]
361        SDOT    v19.4s, v4.16b,  v3.4b[0]
362        SDOT    v20.4s, v5.16b,  v0.4b[0]
363        LDP     q6, q7, [x5], 32
364        SDOT    v21.4s, v5.16b,  v1.4b[0]
365        SDOT    v22.4s, v5.16b,  v2.4b[0]
366        SDOT    v23.4s, v5.16b,  v3.4b[0]
367        SDOT    v24.4s, v6.16b, v0.4b[0]
368        SDOT    v25.4s, v6.16b, v1.4b[0]
369        SDOT    v26.4s, v6.16b, v2.4b[0]
370        SDOT    v27.4s, v6.16b, v3.4b[0]
371        SDOT    v28.4s, v7.16b, v0.4b[0]
372        SDOT    v29.4s, v7.16b, v1.4b[0]
373        SDOT    v30.4s, v7.16b, v2.4b[0]
374        SDOT    v31.4s, v7.16b, v3.4b[0]
375        B       2b
376
377        # Store odd width
        # Here x1 = (columns left) - 16, so its low 4 bits equal the number of
        # columns left.  Bits 3, 2, 1 and 0 select stores of 8, 4, 2 and 1
        # bytes per row; after each store the DUPs move the next unstored
        # bytes down to lane 0.
378        .p2align 3
3795:
380        TBZ     x1, 3, 6f
381        STR     d0, [x6], 8
382        STR     d1, [x8], 8
383        DUP     d0, v0.d[1]
384        DUP     d1, v1.d[1]
385        STR     d2, [x9], 8
386        STR     d3, [x7], 8
387        DUP     d2, v2.d[1]
388        DUP     d3, v3.d[1]
3896:
390        TBZ     x1, 2, 7f
391        STR     s0, [x6], 4
392        STR     s1, [x8], 4
393        DUP     s0, v0.s[1]
394        DUP     s1, v1.s[1]
395        STR     s2, [x9], 4
396        STR     s3, [x7], 4
397        DUP     s2, v2.s[1]
398        DUP     s3, v3.s[1]
3997:
400        TBZ     x1, 1, 8f
401        STR     h0, [x6], 2
402        STR     h1, [x8], 2
403        DUP     h0, v0.h[1]
404        DUP     h1, v1.h[1]
405        STR     h2, [x9], 2
406        STR     h3, [x7], 2
407        DUP     h2, v2.h[1]
408        DUP     h3, v3.h[1]
4098:
410        TBZ     x1, 0, 9f
411        STR     b0, [x6]                // final byte: no post-increment
412        STR     b1, [x8]
413        STR     b2, [x9]
414        STR     b3, [x7]
4159:
416        RET
417
418END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
419
420#ifdef __ELF__
421.section ".note.GNU-stack","",%progbits
422#endif
423