// xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld32.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32

        # Clamp A and C pointers: rows beyond mr alias the previous row,
        # so out-of-range rows compute (and overwrite) duplicate results safely.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3               // round kc up to a multiple of 4

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators.
        # Each of the 4 bias vectors is broadcast down the 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Main loop - 4 bytes of A per row per iteration.
        # LD1R broadcasts each row's 4 A bytes to all lanes; each SDOT
        # accumulates a 4-element int8 dot product per 32-bit lane.
        .p2align 3
1:
        LD1R    {v0.4s},  [x3], 4
        LDR     q4, [x5], 16
        LD1R    {v1.4s}, [x15], 4
        LD1R    {v2.4s}, [x13], 4
        LD1R    {v3.4s},  [x4], 4
        SDOT    v16.4s, v4.16b, v0.16b
        SDOT    v17.4s, v4.16b, v1.16b
        LDR     q5, [x5], 16
        SDOT    v18.4s, v4.16b, v2.16b
        SDOT    v19.4s, v4.16b, v3.16b
        LDR     q6, [x5], 16
        SDOT    v20.4s, v5.16b, v0.16b
        SDOT    v21.4s, v5.16b, v1.16b
        LDR     q7, [x5], 16
        SDOT    v22.4s, v5.16b, v2.16b
        SDOT    v23.4s, v5.16b, v3.16b
        SUBS    x0, x0, 4
        SDOT    v24.4s, v6.16b, v0.16b
        SDOT    v25.4s, v6.16b, v1.16b
        SDOT    v26.4s, v6.16b, v2.16b
        SDOT    v27.4s, v6.16b, v3.16b
        SDOT    v28.4s, v7.16b, v0.16b
        SDOT    v29.4s, v7.16b, v1.16b
        SDOT    v30.4s, v7.16b, v2.16b
        SDOT    v31.4s, v7.16b, v3.16b
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        # Saturating narrow 32-bit -> 16-bit, pairing the low/high 8 columns.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Saturating narrow 16-bit -> 8-bit: one 16-byte row per register.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    2f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b
        RET

        # Store odd width: emit 8/4/2/1-byte tails per the low bits of nc,
        # shifting the remaining lanes down after each partial store.
        .p2align 3
2:
        TBZ     x1, 3, 3f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
3:
        TBZ     x1, 2, 4f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
4:
        TBZ     x1, 1, 5f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
5:
        TBZ     x1, 0, 6f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
6:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32

#ifdef __ELF__
// Mark the stack as non-executable on ELF targets.
.section ".note.GNU-stack","",%progbits
#endif
