xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_conv_minmax_params* params)  [sp + 8] -> x11 (pointer, dereferenced below)
24
25# params structure is 16 bytes
26#  struct {
27#    int32_t right_pre_shift;
28#    int32_t multiplier;
29#    int32_t right_post_shift;
30#    int16_t output_zero_point;
31#    int8_t output_min;
32#    int8_t output_max;
33#  } rndnu_neon;
34
35# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
36
37# Register usage
38# A0  x3 v0
39# A1 x15 v1
40# A2 x13 v2
41# A3  x4 v3
42# B   x5 v4  v5  v6  v7
43# C0  x6 v16 v20 v24 v28
44# C1  x8 v17 v21 v25 v29
45# C2  x9 v18 v22 v26 v30
46# C3  x7 v19 v23 v27 v31
47# unused v8 v9 v10 v11 v12 v13 v14 v15
48
49BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128
        // Overview (as implemented below): each pass through label 0 produces
        // one tile of up to 4 rows x 16 columns of int8 output. Sixteen int32x4
        // accumulators (v16-v31) are seeded from w, SDOT products of the A rows
        // and the packed B data at x5 are accumulated over kc bytes, and the
        // result is requantized with the params at x11, clamped and stored
        // through c0-c3. The pass repeats while nc has columns left.
        // x0 (mr) is only needed for the pointer clamping and is then reused
        // as the k counter.
50
51        # Clamp A and C pointers
        // Rows beyond mr alias the previous row, so all four rows can always be
        // loaded and stored without touching memory outside the valid rows.
52        CMP     x0, 2                   // if mr < 2
53        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
54        ADD     x15, x3, x4             // a1 = a0 + a_stride
55        ADD     x8, x6, x7              // c1 = c0 + cm_stride
56        CSEL    x15, x3, x15, LO        //   a1 = a0
57        CSEL    x8, x6,  x8, LO         //   c1 = c0
58        BIC     x2, x2, 3               // completes the kc round-up started by the ADD above
59
        // The flags from "CMP x0, 2" are still live here: ADD, CSEL and BIC do
        // not write NZCV, so LS below tests mr <= 2.
60        ADD     x13, x15, x4            // a2 = a1 + a_stride
61        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
62                                        // if mr <= 2
63        CSEL    x13, x15, x13, LS       //   a2 = a1
64        CSEL    x9,  x8,  x9, LS        //   c2 = c1
65
        // Fetch the two stack-passed arguments; sp itself is not modified.
66        LDP     x12, x11, [sp]          // cn_stride, params
67
        // From here on x4 holds a3 (a_stride is no longer needed) and x7 holds
        // c3 (cm_stride is no longer needed).
68        CMP     x0, 4                   // if mr < 4
69        ADD     x4, x13, x4             // a3 = a2 + a_stride
70        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
71        CSEL    x4, x13, x4, LO         //   a3 = a2
72        CSEL    x7,  x9, x7, LO         //   c3 = c2
73
74        .p2align 3
        // Label 0: start of one 16-column output tile. Re-entered through
        // "B.NE 0b" after a full-width store while nc is non-zero.
750:
76        # Load initial bias from w into accumulators
        // Row 0 accumulators are loaded from w (64 bytes = 16 int32 values):
        // v16 = columns 0-3, v20 = columns 4-7, v24 = columns 8-11,
        // v28 = columns 12-15. The MOVs copy the same values to rows 1-3.
77        LDP     q16, q20, [x5], 32
78        MOV     v17.16b, v16.16b
79        MOV     v18.16b, v16.16b
80        LDP     q24, q28, [x5], 32
81        MOV     v19.16b, v16.16b
82        MOV     v21.16b, v20.16b
83        MOV     v22.16b, v20.16b
84        MOV     v23.16b, v20.16b
85        MOV     v25.16b, v24.16b
86        MOV     v26.16b, v24.16b
87        SUBS    x0, x2, 16              // k = kc - 16
88        MOV     v27.16b, v24.16b
89        MOV     v29.16b, v28.16b
90        MOV     v30.16b, v28.16b
91        MOV     v31.16b, v28.16b
92        # Is there at least 16 bytes?
        // MOV (vector) does not write NZCV, so this still tests the SUBS above:
        // kc < 16 skips the main loop and goes straight to the remainder code.
93        B.LO    3f
94
95        # Main loop - 16 bytes of A
        // Per iteration: 16 bytes are read from each of the four A rows
        // (v0-v3) and 256 bytes of B from x5 (16 q-register loads). Each
        // group of 16 SDOTs consumes one 4-byte lane [i] of every A row;
        // v4/v5/v6/v7 hold the B data for columns 0-3 / 4-7 / 8-11 / 12-15.
        // The B loads for the next group are interleaved with the SDOTs of the
        // current one.
96        .p2align 3
971:
98        LDR     q0,  [x3], 16
99        LDR     q4,  [x5], 16
100        LDR     q1, [x15], 16
101        LDR     q2, [x13], 16
102        LDR     q3,  [x4], 16
103        LDR     q5,  [x5], 16
104        SDOT    v16.4s, v4.16b,  v0.4b[0]
105        SDOT    v17.4s, v4.16b,  v1.4b[0]
106        LDP     q6, q7, [x5], 32
107        SDOT    v18.4s, v4.16b,  v2.4b[0]
108        SDOT    v19.4s, v4.16b,  v3.4b[0]
109        SDOT    v20.4s, v5.16b,  v0.4b[0]
110        SDOT    v21.4s, v5.16b,  v1.4b[0]
111        SDOT    v22.4s, v5.16b,  v2.4b[0]
112        SDOT    v23.4s, v5.16b,  v3.4b[0]
113        SDOT    v24.4s, v6.16b, v0.4b[0]
114        SDOT    v25.4s, v6.16b, v1.4b[0]
115        LDP     q4, q5, [x5], 32
116        SDOT    v26.4s, v6.16b, v2.4b[0]
117        SDOT    v27.4s, v6.16b, v3.4b[0]
118        SDOT    v28.4s, v7.16b, v0.4b[0]
119        SDOT    v29.4s, v7.16b, v1.4b[0]
120        SDOT    v30.4s, v7.16b, v2.4b[0]
121        SDOT    v31.4s, v7.16b, v3.4b[0]
122
        // Second 4-byte lane of A (bytes 4-7 of this iteration).
123        SDOT    v16.4s, v4.16b,  v0.4b[1]
124        SDOT    v17.4s, v4.16b,  v1.4b[1]
125        LDP     q6, q7, [x5], 32
126        SDOT    v18.4s, v4.16b,  v2.4b[1]
127        SDOT    v19.4s, v4.16b,  v3.4b[1]
128        SDOT    v20.4s, v5.16b,  v0.4b[1]
129        SDOT    v21.4s, v5.16b,  v1.4b[1]
130        SDOT    v22.4s, v5.16b,  v2.4b[1]
131        SDOT    v23.4s, v5.16b,  v3.4b[1]
132        SDOT    v24.4s, v6.16b,  v0.4b[1]
133        SDOT    v25.4s, v6.16b,  v1.4b[1]
134        LDP     q4, q5, [x5], 32
135        SDOT    v26.4s, v6.16b,  v2.4b[1]
136        SDOT    v27.4s, v6.16b,  v3.4b[1]
137        SDOT    v28.4s, v7.16b,  v0.4b[1]
138        SDOT    v29.4s, v7.16b,  v1.4b[1]
139        SDOT    v30.4s, v7.16b,  v2.4b[1]
140        SDOT    v31.4s, v7.16b,  v3.4b[1]
141
        // Third 4-byte lane of A (bytes 8-11 of this iteration).
142        SDOT    v16.4s, v4.16b,  v0.4b[2]
143        SDOT    v17.4s, v4.16b,  v1.4b[2]
144        LDP     q6, q7, [x5], 32
145        SDOT    v18.4s, v4.16b,  v2.4b[2]
146        SDOT    v19.4s, v4.16b,  v3.4b[2]
147        SDOT    v20.4s, v5.16b,  v0.4b[2]
148        SDOT    v21.4s, v5.16b,  v1.4b[2]
149        SDOT    v22.4s, v5.16b,  v2.4b[2]
150        SDOT    v23.4s, v5.16b,  v3.4b[2]
151        SDOT    v24.4s, v6.16b,  v0.4b[2]
152        SDOT    v25.4s, v6.16b,  v1.4b[2]
153        LDP     q4, q5, [x5], 32
154        SDOT    v26.4s, v6.16b,  v2.4b[2]
155        SDOT    v27.4s, v6.16b,  v3.4b[2]
156        SDOT    v28.4s, v7.16b,  v0.4b[2]
157        SDOT    v29.4s, v7.16b,  v1.4b[2]
158        SDOT    v30.4s, v7.16b,  v2.4b[2]
159        SDOT    v31.4s, v7.16b,  v3.4b[2]
160
        // Fourth 4-byte lane of A (bytes 12-15 of this iteration). No B is
        // preloaded past this group; the next iteration starts with its own
        // loads.
161        SDOT    v16.4s, v4.16b,  v0.4b[3]
162        SDOT    v17.4s, v4.16b,  v1.4b[3]
163        LDP     q6, q7, [x5], 32
164        SDOT    v18.4s, v4.16b,  v2.4b[3]
165        SDOT    v19.4s, v4.16b,  v3.4b[3]
166        SDOT    v20.4s, v5.16b,  v0.4b[3]
167        SDOT    v21.4s, v5.16b,  v1.4b[3]
168        SDOT    v22.4s, v5.16b,  v2.4b[3]
169        SDOT    v23.4s, v5.16b,  v3.4b[3]
170        SDOT    v24.4s, v6.16b,  v0.4b[3]
171        SDOT    v25.4s, v6.16b,  v1.4b[3]
172        SDOT    v26.4s, v6.16b,  v2.4b[3]
173        SDOT    v27.4s, v6.16b,  v3.4b[3]
        // k -= 16. The flags set here are consumed by B.HS below; the SDOTs in
        // between do not write NZCV.
174        SUBS    x0, x0, 16
175        SDOT    v28.4s, v7.16b,  v0.4b[3]
176        SDOT    v29.4s, v7.16b,  v1.4b[3]
177        SDOT    v30.4s, v7.16b,  v2.4b[3]
178        SDOT    v31.4s, v7.16b,  v3.4b[3]
        // No borrow means at least 16 more bytes of A remain: iterate again.
179        B.HS    1b
180
181        # Is there a remainder?- 4 to 12 bytes of A
        // x0 is now (bytes left - 16); subtracting 16 leaves the low four bits
        // untouched, so they equal the leftover byte count (0, 4, 8 or 12
        // because kc was rounded up to a multiple of 4).
182        TST     x0, 15
183        B.NE    3f
184
        // Label 2: requantization and store. Reached by fall-through when no
        // remainder is left, and from the remainder code via "TBZ ... 2b" /
        // "B 2b".
1852:
186        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks the params struct with post-incrementing broadcast loads.
        // Per the struct layout in the file header the fields are read in
        // order: right_pre_shift (v4), multiplier (v5), right_post_shift (v6),
        // output_zero_point (v6.8h), output_min (v4.16b), output_max (v5.16b).
187        LD1R    {v4.4s}, [x11], 4
188        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
189        SQSHL   v17.4s, v17.4s, v4.4s
190        SQSHL   v18.4s, v18.4s, v4.4s
191        SQSHL   v19.4s, v19.4s, v4.4s
192        SQSHL   v20.4s, v20.4s, v4.4s
193        SQSHL   v21.4s, v21.4s, v4.4s
194        SQSHL   v22.4s, v22.4s, v4.4s
195        SQSHL   v23.4s, v23.4s, v4.4s
196        LD1R    {v5.4s}, [x11], 4
197        SQSHL   v24.4s, v24.4s, v4.4s
198        SQSHL   v25.4s, v25.4s, v4.4s
199        SQSHL   v26.4s, v26.4s, v4.4s
200        SQSHL   v27.4s, v27.4s, v4.4s
201        SQSHL   v28.4s, v28.4s, v4.4s
202        SQSHL   v29.4s, v29.4s, v4.4s
203        SQSHL   v30.4s, v30.4s, v4.4s
204        SQSHL   v31.4s, v31.4s, v4.4s
205        LD1R    {v6.4s}, [x11], 4
206        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
207        SQDMULH v17.4s, v17.4s, v5.4s
208        SQDMULH v18.4s, v18.4s, v5.4s
209        SQDMULH v19.4s, v19.4s, v5.4s
210        SQDMULH v20.4s, v20.4s, v5.4s
211        SQDMULH v21.4s, v21.4s, v5.4s
212        SQDMULH v22.4s, v22.4s, v5.4s
213        SQDMULH v23.4s, v23.4s, v5.4s
214        SQDMULH v24.4s, v24.4s, v5.4s
215        SQDMULH v25.4s, v25.4s, v5.4s
216        SQDMULH v26.4s, v26.4s, v5.4s
217        SQDMULH v27.4s, v27.4s, v5.4s
218        SQDMULH v28.4s, v28.4s, v5.4s
219        SQDMULH v29.4s, v29.4s, v5.4s
220        SQDMULH v30.4s, v30.4s, v5.4s
221        SQDMULH v31.4s, v31.4s, v5.4s
222        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
223        SRSHL   v17.4s, v17.4s, v6.4s
224        SRSHL   v18.4s, v18.4s, v6.4s
225        SRSHL   v19.4s, v19.4s, v6.4s
226        SRSHL   v20.4s, v20.4s, v6.4s
227        SRSHL   v21.4s, v21.4s, v6.4s
228        SRSHL   v22.4s, v22.4s, v6.4s
229        SRSHL   v23.4s, v23.4s, v6.4s
230        SRSHL   v24.4s, v24.4s, v6.4s
231        SRSHL   v25.4s, v25.4s, v6.4s
232        SRSHL   v26.4s, v26.4s, v6.4s
233        SRSHL   v27.4s, v27.4s, v6.4s
234        SRSHL   v28.4s, v28.4s, v6.4s
235        SRSHL   v29.4s, v29.4s, v6.4s
236        SRSHL   v30.4s, v30.4s, v6.4s
237        SRSHL   v31.4s, v31.4s, v6.4s
238
        // Saturating narrow int32 -> int16. SQXTN fills the low half with
        // columns 0-3 (v16-v19) and 8-11 (v24-v27); SQXTN2 below fills the
        // high half with columns 4-7 and 12-15.
239        SQXTN   v16.4h, v16.4s
240        SQXTN   v17.4h, v17.4s
241        SQXTN   v18.4h, v18.4s
242        SQXTN   v19.4h, v19.4s
243        SQXTN   v24.4h, v24.4s
244        SQXTN   v25.4h, v25.4s
245        SQXTN   v26.4h, v26.4s
246        SQXTN   v27.4h, v27.4s
247        LD1R    {v6.8h}, [x11], 2       // add bias
248
249        SQXTN2  v16.8h, v20.4s
250        SQXTN2  v17.8h, v21.4s
251        SQXTN2  v18.8h, v22.4s
252        SQXTN2  v19.8h, v23.4s
253        SQXTN2  v24.8h, v28.4s
254        SQXTN2  v25.8h, v29.4s
255        SQXTN2  v26.8h, v30.4s
256        SQXTN2  v27.8h, v31.4s
257
        // Saturating add of the broadcast int16 loaded into v6.8h above.
258        SQADD   v16.8h, v16.8h, v6.8h
259        SQADD   v17.8h, v17.8h, v6.8h
260        SQADD   v18.8h, v18.8h, v6.8h
261        SQADD   v19.8h, v19.8h, v6.8h
262        SQADD   v24.8h, v24.8h, v6.8h
263        SQADD   v25.8h, v25.8h, v6.8h
264        SQADD   v26.8h, v26.8h, v6.8h
265        SQADD   v27.8h, v27.8h, v6.8h
266        LD1R    {v4.16b}, [x11], 1      // clamp min value
267
        // Saturating narrow int16 -> int8: v0-v3 end up holding the 16 output
        // bytes of rows 0-3 (columns 0-7 in the low half, 8-15 in the high).
268        SQXTN   v0.8b, v16.8h
269        SQXTN   v1.8b, v17.8h
270        SQXTN   v2.8b, v18.8h
271        SQXTN   v3.8b, v19.8h
272        LD1R    {v5.16b}, [x11]         // clamp max value
273        SQXTN2  v0.16b, v24.8h
274        SQXTN2  v1.16b, v25.8h
275        SQXTN2  v2.16b, v26.8h
276        SQXTN2  v3.16b, v27.8h
        // Undo the 4 + 4 + 4 + 2 + 1 bytes of post-increment applied to x11 so
        // the next tile reads the params from the start again.
277        SUB     x11, x11, 15            // rewind params pointer
278
279        SMAX    v0.16b, v0.16b, v4.16b
280        SMAX    v1.16b, v1.16b, v4.16b
281        SMAX    v2.16b, v2.16b, v4.16b
282        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. These flags feed both "B.LO 5f" (fewer than 16 columns
        // were left) and, after the full-width stores, "B.NE 0b"; SMIN, ST1
        // and SUB in between do not write NZCV.
283        SUBS    x1, x1, 16
284        SMIN    v0.16b, v0.16b, v5.16b
285        SMIN    v1.16b, v1.16b, v5.16b
286        SMIN    v2.16b, v2.16b, v5.16b
287        SMIN    v3.16b, v3.16b, v5.16b
288        B.LO    5f
289
290        # Store full 4 x 16
        // Each ST1 post-increments its C pointer by cn_stride (x12); each SUB
        // rewinds the matching A pointer by the rounded kc so the next column
        // tile re-reads the same A rows.
291        ST1     {v0.16b}, [x6], x12
292        SUB     x3,  x3, x2             // a0 -= kc
293        ST1     {v1.16b}, [x8], x12
294        SUB     x15, x15, x2            // a1 -= kc
295        ST1     {v2.16b}, [x9], x12
296        SUB     x13, x13, x2            // a2 -= kc
297        ST1     {v3.16b}, [x7], x12
298        SUB     x4,  x4, x2             // a3 -= kc
        // Columns remain (nc != 0 after the SUBS above): process the next tile.
299        B.NE    0b
300        RET
301
302        # Remainder- 8 bytes of A
        // Label 3 is the common entry for every remainder size (4, 8 or 12
        // bytes): bit 3 of x0 selects the 8-byte step and bit 2 the 4-byte
        // step at label 4.
303        .p2align 3
3043:
305        # Is there a remainder?- 8 bytes of A
306        TBZ     x0, 3, 4f
307
        // 8-byte step: 8 bytes per A row (lanes [0] and [1] of v0-v3) and
        // 128 bytes of B (8 q-register loads).
308        LDR     d0,  [x3], 8
309        LDR     q4,  [x5], 16
310        LDR     d1, [x15], 8
311        LDR     d2, [x13], 8
312        LDR     d3,  [x4], 8
313        LDR     q5,  [x5], 16
314        SDOT    v16.4s, v4.16b,  v0.4b[0]
315        SDOT    v17.4s, v4.16b,  v1.4b[0]
316        LDP     q6, q7, [x5], 32
317        SDOT    v18.4s, v4.16b,  v2.4b[0]
318        SDOT    v19.4s, v4.16b,  v3.4b[0]
319        SDOT    v20.4s, v5.16b,  v0.4b[0]
320        SDOT    v21.4s, v5.16b,  v1.4b[0]
321        SDOT    v22.4s, v5.16b,  v2.4b[0]
322        SDOT    v23.4s, v5.16b,  v3.4b[0]
323        SDOT    v24.4s, v6.16b, v0.4b[0]
324        SDOT    v25.4s, v6.16b, v1.4b[0]
325        LDP     q4, q5, [x5], 32
326        SDOT    v26.4s, v6.16b, v2.4b[0]
327        SDOT    v27.4s, v6.16b, v3.4b[0]
328        SDOT    v28.4s, v7.16b, v0.4b[0]
329        SDOT    v29.4s, v7.16b, v1.4b[0]
330        SDOT    v30.4s, v7.16b, v2.4b[0]
331        SDOT    v31.4s, v7.16b, v3.4b[0]
332        SDOT    v16.4s, v4.16b,  v0.4b[1]
333        SDOT    v17.4s, v4.16b,  v1.4b[1]
334        LDP     q6, q7, [x5], 32
335        SDOT    v18.4s, v4.16b,  v2.4b[1]
336        SDOT    v19.4s, v4.16b,  v3.4b[1]
337        SDOT    v20.4s, v5.16b,  v0.4b[1]
338        SDOT    v21.4s, v5.16b,  v1.4b[1]
339        SDOT    v22.4s, v5.16b,  v2.4b[1]
340        SDOT    v23.4s, v5.16b,  v3.4b[1]
341        SDOT    v24.4s, v6.16b,  v0.4b[1]
342        SDOT    v25.4s, v6.16b,  v1.4b[1]
343        SDOT    v26.4s, v6.16b,  v2.4b[1]
344        SDOT    v27.4s, v6.16b,  v3.4b[1]
345        SDOT    v28.4s, v7.16b,  v0.4b[1]
346        SDOT    v29.4s, v7.16b,  v1.4b[1]
347        SDOT    v30.4s, v7.16b,  v2.4b[1]
348        SDOT    v31.4s, v7.16b,  v3.4b[1]
349        # Is there a remainder?- 4 bytes of A
        // Bit 2 clear: nothing left after the 8-byte step, go requantize.
350        TBZ     x0, 2, 2b
351
352        # Remainder- 4 bytes of A
        // 4-byte step: 4 bytes per A row (lane [0] of v0-v3) and 64 bytes of B
        // (4 q-register loads).
3534:
354        LDR     s0,  [x3], 4
355        LDR     q4, [x5], 16
356        LDR     s1, [x15], 4
357        LDR     s2, [x13], 4
358        LDR     s3,  [x4], 4
359        SDOT    v16.4s, v4.16b,  v0.4b[0]
360        LDR     q5, [x5], 16
361        SDOT    v17.4s, v4.16b,  v1.4b[0]
362        SDOT    v18.4s, v4.16b,  v2.4b[0]
363        SDOT    v19.4s, v4.16b,  v3.4b[0]
364        SDOT    v20.4s, v5.16b,  v0.4b[0]
365        LDP     q6, q7, [x5], 32
366        SDOT    v21.4s, v5.16b,  v1.4b[0]
367        SDOT    v22.4s, v5.16b,  v2.4b[0]
368        SDOT    v23.4s, v5.16b,  v3.4b[0]
369        SDOT    v24.4s, v6.16b, v0.4b[0]
370        SDOT    v25.4s, v6.16b, v1.4b[0]
371        SDOT    v26.4s, v6.16b, v2.4b[0]
372        SDOT    v27.4s, v6.16b, v3.4b[0]
373        SDOT    v28.4s, v7.16b, v0.4b[0]
374        SDOT    v29.4s, v7.16b, v1.4b[0]
375        SDOT    v30.4s, v7.16b, v2.4b[0]
376        SDOT    v31.4s, v7.16b, v3.4b[0]
377        B       2b
378
379        # Store odd width
        // Reached when fewer than 16 columns were left. x1 holds (nc - 16),
        // whose low four bits equal those of the remaining column count, so
        // bits 3/2/1/0 select an 8/4/2/1-byte store per row. After each
        // partial store DUP moves the next unsaved element down to lane 0 for
        // the following, narrower store. This path always ends in RET.
380        .p2align 3
3815:
382        TBZ     x1, 3, 6f
383        STR     d0, [x6], 8
384        STR     d1, [x8], 8
385        DUP     d0, v0.d[1]
386        DUP     d1, v1.d[1]
387        STR     d2, [x9], 8
388        STR     d3, [x7], 8
389        DUP     d2, v2.d[1]
390        DUP     d3, v3.d[1]
3916:
392        TBZ     x1, 2, 7f
393        STR     s0, [x6], 4
394        STR     s1, [x8], 4
395        DUP     s0, v0.s[1]
396        DUP     s1, v1.s[1]
397        STR     s2, [x9], 4
398        STR     s3, [x7], 4
399        DUP     s2, v2.s[1]
400        DUP     s3, v3.s[1]
4017:
402        TBZ     x1, 1, 8f
403        STR     h0, [x6], 2
404        STR     h1, [x8], 2
405        DUP     h0, v0.h[1]
406        DUP     h1, v1.h[1]
407        STR     h2, [x9], 2
408        STR     h3, [x7], 2
409        DUP     h2, v2.h[1]
410        DUP     h3, v3.h[1]
4118:
412        TBZ     x1, 0, 9f
        // Final single byte per row; no post-increment because nothing follows.
413        STR     b0, [x6]
414        STR     b1, [x8]
415        STR     b2, [x9]
416        STR     b3, [x7]
4179:
418        RET
419
420END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128
421
422#ifdef __ELF__
423.section ".note.GNU-stack","",%progbits
424#endif
425