xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7
8#include <xnnpack/assembly.h>
9
// REWIND_DECREMENT is the number of bytes by which the params pointer (x11)
// is advanced by the post-indexed LD1R loads in the requantization code
// (RNDNU: 4 + 4 + 4 + 2 + 1, FP32: 4 + 2 + 1). The kernel subtracts it again
// so that the next pass of the nc loop re-reads the same parameters.
10$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
11# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
12#     size_t mr,                 x0
13#     size_t nc,                 x1
14#     size_t kc,                 x2 / x0
15#     size_t ks,                 x3 / x9
16#     const uint8_t**restrict a, x4
17#     const void* restrict w,    x5
18#     uint8_t* restrict c,       x6
19#     size_t cm_stride,          x7
20#     size_t cn_stride,          [sp] -> (x0)
21#     size_t a_offset,           [sp + 8] -> x8
22#     const uint8_t* zero,       [sp + 16] -> x12
23#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x13  v0
29# A1  x14  v1
30# A2  x15  v2
31# A3  x10  v3
32# B    x5  v4  v5  v6  v7
33# C0   x6 v16 v20 v24 v28
34# C1  x16 v17 v21 v25 v29
35# C2  x17 v18 v22 v26 v30
36# C3   x7 v19 v23 v27 v31
37# zero_point v8 v12 v13 v14 v15
38# unused v9 v10 v11
39
40BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
41
        // Overview
        //   Each pass of the nc loop (label 0) produces one tile of 4 rows by
        //   16 columns of 8-bit results:
        //   1. v16-v31 start from 16 32-bit values read from w, copied to all
        //      4 rows.
        //   2. For every group of 4 A pointers (ks loop, label 1) the K loop
        //      accumulates unsigned dot products (UDOT) of 4-byte groups of A
        //      against 16-byte groups of w: 16 bytes of A per pass of the
        //      main loop (label 2), then 8 and/or 4 bytes in the remainder
        //      code (labels 4 and 5).
        //   3. v12-v15 accumulate UDOT of the same A bytes against v8 (kernel
        //      zero point); the per-row totals are subtracted from the
        //      accumulators after the ks loop.
        //   4. The accumulators are requantized, narrowed to 8 bits with
        //      saturation, clamped with UMAX/UMIN and stored.
        //   The loop repeats while more than 16 columns remain; a final tile
        //   of fewer than 16 columns is stored piecewise at label 6.

42        # Clamp C pointers
43        CMP     x0, 2                   // if mr < 2
44        LDR     x8, [sp, 8]             // Load a_offset
45        ADD     x16, x6, x7             // c1 = c0 + cm_stride
46        CSEL    x16, x6,  x16, LO       //   c1 = c0
47        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
48        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
49        ADD     x17, x16, x7            // c2 = c1 + cm_stride
50                                        // if mr <= 2
51        # Save d8,d12-d15 on stack
52        STR     d8,  [sp, -48]!         // sp -= 48; stack arguments are now at [sp + 48] and up
53        CSEL    x17, x16, x17, LS       //   c2 = c1 (flags still from CMP x0, 2)
54        BIC     x2, x2, 3               // second half of the kc round-up to a multiple of 4
55        STP     d12, d13, [sp, 16]
56        CMP     x0, 4                   // if mr < 4
57        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
58        STP     d14, d15, [sp, 32]
59        CSEL    x7,  x17, x7, LO        //   c3 = c2
60        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point: first 4 bytes of params, x11 now points past them
61
62        .p2align 3
630:
64        # Load initial bias from w into accumulators
65        LDP     q16, q20, [x5], 32      // row 0, columns 0-3 and 4-7
66
67        MOVI    v12.4s, 0               // clear zero point sums for rows 0-3
68        MOVI    v13.4s, 0
69        MOVI    v14.4s, 0
70        MOVI    v15.4s, 0
71
72        MOV     v17.16b, v16.16b        // copy the row 0 bias to rows 1-3
73        MOV     v18.16b, v16.16b
74        LDP     q24, q28, [x5], 32      // row 0, columns 8-11 and 12-15
75        MOV     v19.16b, v16.16b
76        MOV     v21.16b, v20.16b
77        MOV     v22.16b, v20.16b
78        MOV     v23.16b, v20.16b
79        MOV     v25.16b, v24.16b
80        MOV     v26.16b, v24.16b
81        MOV     v27.16b, v24.16b
82        MOV     v29.16b, v28.16b
83        MOV     v30.16b, v28.16b
84        MOV     v31.16b, v28.16b
85
86        MOV     x9, x3                  // p = ks
87
88        .p2align 3
891:
90        # Load next 4 A pointers
91        LDP     x13, x14, [x4], 16
92        LDP     x15, x10, [x4], 16
93
94        CMP     x13, x12                // if a0 == zero
95        ADD     x13, x13, x8            // a0 += a_offset
96        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
97        CMP     x14, x12                // if a1 == zero
98        ADD     x14, x14, x8            // a1 += a_offset
99        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
100        CMP     x15, x12                // if a2 == zero
101        ADD     x15, x15, x8            // a2 += a_offset
102        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
103        CMP     x10, x12                // if a3 == zero
104        ADD     x10, x10, x8            // a3 += a_offset
105        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
106
107        # Is there at least 16 bytes for main loop?
108        SUBS    x0, x2, 16              // k = kc - 16
109        B.LO    4f                      // kc < 16: only the remainder code runs
110
111        # Main loop - 16 bytes of A
112        .p2align 3
1132:
114        LDR     q0, [x13], 16           // 16 bytes of A0
115        LDR     q4,  [x5], 16           // w for columns 0-3
116        LDR     q1, [x14], 16           // 16 bytes of A1
117        LDR     q2, [x15], 16           // 16 bytes of A2
118        LDR     q3, [x10], 16           // 16 bytes of A3
119        LDR     q5,  [x5], 16           // w for columns 4-7
120
121        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
122        UDOT    v13.4s, v8.16b,  v1.16b
123        UDOT    v14.4s, v8.16b,  v2.16b
124        UDOT    v15.4s, v8.16b,  v3.16b
125
126        UDOT    v16.4s, v4.16b,  v0.4b[0]  // A bytes 0-3
127        UDOT    v17.4s, v4.16b,  v1.4b[0]
128        LDP     q6, q7, [x5], 32           // w for columns 8-11 and 12-15
129        UDOT    v18.4s, v4.16b,  v2.4b[0]
130        UDOT    v19.4s, v4.16b,  v3.4b[0]
131        UDOT    v20.4s, v5.16b,  v0.4b[0]
132        UDOT    v21.4s, v5.16b,  v1.4b[0]
133        UDOT    v22.4s, v5.16b,  v2.4b[0]
134        UDOT    v23.4s, v5.16b,  v3.4b[0]
135        UDOT    v24.4s, v6.16b, v0.4b[0]
136        UDOT    v25.4s, v6.16b, v1.4b[0]
137        LDP     q4, q5, [x5], 32           // w for the next 4 bytes of A, columns 0-7
138        UDOT    v26.4s, v6.16b, v2.4b[0]
139        UDOT    v27.4s, v6.16b, v3.4b[0]
140        UDOT    v28.4s, v7.16b, v0.4b[0]
141        UDOT    v29.4s, v7.16b, v1.4b[0]
142        UDOT    v30.4s, v7.16b, v2.4b[0]
143        UDOT    v31.4s, v7.16b, v3.4b[0]
144
145        UDOT    v16.4s, v4.16b,  v0.4b[1]  // A bytes 4-7
146        UDOT    v17.4s, v4.16b,  v1.4b[1]
147        LDP     q6, q7, [x5], 32
148        UDOT    v18.4s, v4.16b,  v2.4b[1]
149        UDOT    v19.4s, v4.16b,  v3.4b[1]
150        UDOT    v20.4s, v5.16b,  v0.4b[1]
151        UDOT    v21.4s, v5.16b,  v1.4b[1]
152        UDOT    v22.4s, v5.16b,  v2.4b[1]
153        UDOT    v23.4s, v5.16b,  v3.4b[1]
154        UDOT    v24.4s, v6.16b,  v0.4b[1]
155        UDOT    v25.4s, v6.16b,  v1.4b[1]
156        LDP     q4, q5, [x5], 32
157        UDOT    v26.4s, v6.16b,  v2.4b[1]
158        UDOT    v27.4s, v6.16b,  v3.4b[1]
159        UDOT    v28.4s, v7.16b,  v0.4b[1]
160        UDOT    v29.4s, v7.16b,  v1.4b[1]
161        UDOT    v30.4s, v7.16b,  v2.4b[1]
162        UDOT    v31.4s, v7.16b,  v3.4b[1]
163
164        UDOT    v16.4s, v4.16b,  v0.4b[2]  // A bytes 8-11
165        UDOT    v17.4s, v4.16b,  v1.4b[2]
166        LDP     q6, q7, [x5], 32
167        UDOT    v18.4s, v4.16b,  v2.4b[2]
168        UDOT    v19.4s, v4.16b,  v3.4b[2]
169        UDOT    v20.4s, v5.16b,  v0.4b[2]
170        UDOT    v21.4s, v5.16b,  v1.4b[2]
171        UDOT    v22.4s, v5.16b,  v2.4b[2]
172        UDOT    v23.4s, v5.16b,  v3.4b[2]
173        UDOT    v24.4s, v6.16b,  v0.4b[2]
174        UDOT    v25.4s, v6.16b,  v1.4b[2]
175        LDP     q4, q5, [x5], 32
176        UDOT    v26.4s, v6.16b,  v2.4b[2]
177        UDOT    v27.4s, v6.16b,  v3.4b[2]
178        UDOT    v28.4s, v7.16b,  v0.4b[2]
179        UDOT    v29.4s, v7.16b,  v1.4b[2]
180        UDOT    v30.4s, v7.16b,  v2.4b[2]
181        UDOT    v31.4s, v7.16b,  v3.4b[2]
182
183        UDOT    v16.4s, v4.16b,  v0.4b[3]  // A bytes 12-15
184        UDOT    v17.4s, v4.16b,  v1.4b[3]
185        LDP     q6, q7, [x5], 32
186        UDOT    v18.4s, v4.16b,  v2.4b[3]
187        UDOT    v19.4s, v4.16b,  v3.4b[3]
188        UDOT    v20.4s, v5.16b,  v0.4b[3]
189        UDOT    v21.4s, v5.16b,  v1.4b[3]
190        UDOT    v22.4s, v5.16b,  v2.4b[3]
191        UDOT    v23.4s, v5.16b,  v3.4b[3]
192        UDOT    v24.4s, v6.16b,  v0.4b[3]
193        UDOT    v25.4s, v6.16b,  v1.4b[3]
194        UDOT    v26.4s, v6.16b,  v2.4b[3]
195        UDOT    v27.4s, v6.16b,  v3.4b[3]
196        SUBS    x0, x0, 16                 // k -= 16; flags consumed by B.HS below
197        UDOT    v28.4s, v7.16b,  v0.4b[3]
198        UDOT    v29.4s, v7.16b,  v1.4b[3]
199        UDOT    v30.4s, v7.16b,  v2.4b[3]
200        UDOT    v31.4s, v7.16b,  v3.4b[3]
201        B.HS    2b                         // loop while at least 16 more bytes of A remain
202
203        # Is there a remainder?- 4 to 12 bytes of A
204        TST     x0, 15                  // low 4 bits of k = bytes of A left over (kc is a multiple of 4)
205        B.NE    4f
206
2073:
208        # ks loop
209        SUBS    x9, x9, 32              // p -= 4 pointers * 8 bytes
210        B.HI    1b
211
212        ADDP    v0.4s, v12.4s, v12.4s   // pairwise add of the zero point sums
213        ADDP    v1.4s, v13.4s, v13.4s
214        ADDP    v2.4s, v14.4s, v14.4s
215        ADDP    v3.4s, v15.4s, v15.4s
216        ADDP    v12.4s, v0.4s, v0.4s    // every lane now holds the total for its row
217        ADDP    v13.4s, v1.4s, v1.4s
218        ADDP    v14.4s, v2.4s, v2.4s
219        ADDP    v15.4s, v3.4s, v3.4s
220
221        # Subtract zero point from accumulators
222        SUB     v16.4s, v16.4s, v12.4s
223        SUB     v17.4s, v17.4s, v13.4s
224        SUB     v18.4s, v18.4s, v14.4s
225        SUB     v19.4s, v19.4s, v15.4s
226        SUB     v20.4s, v20.4s, v12.4s
227        SUB     v21.4s, v21.4s, v13.4s
228        SUB     v22.4s, v22.4s, v14.4s
229        SUB     v23.4s, v23.4s, v15.4s
230        SUB     v24.4s, v24.4s, v12.4s
231        SUB     v25.4s, v25.4s, v13.4s
232        SUB     v26.4s, v26.4s, v14.4s
233        SUB     v27.4s, v27.4s, v15.4s
234        SUB     v28.4s, v28.4s, v12.4s
235        SUB     v29.4s, v29.4s, v13.4s
236        SUB     v30.4s, v30.4s, v14.4s
237        SUB     v31.4s, v31.4s, v15.4s
238
239        $if REQUANTIZATION == "RNDNU":
240          # Apply params - preshift, scale, postshift, bias and clamp
241          LD1R    {v4.4s}, [x11], 4       // shift amount used by SSHL (pre-shift)
242          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
243          SSHL    v17.4s, v17.4s, v4.4s
244          SSHL    v18.4s, v18.4s, v4.4s
245          SSHL    v19.4s, v19.4s, v4.4s
246          SSHL    v20.4s, v20.4s, v4.4s
247          SSHL    v21.4s, v21.4s, v4.4s
248          SSHL    v22.4s, v22.4s, v4.4s
249          SSHL    v23.4s, v23.4s, v4.4s
250          LD1R    {v5.4s}, [x11], 4       // multiplier used by SQDMULH
251          SSHL    v24.4s, v24.4s, v4.4s
252          SSHL    v25.4s, v25.4s, v4.4s
253          SSHL    v26.4s, v26.4s, v4.4s
254          SSHL    v27.4s, v27.4s, v4.4s
255          SSHL    v28.4s, v28.4s, v4.4s
256          SSHL    v29.4s, v29.4s, v4.4s
257          SSHL    v30.4s, v30.4s, v4.4s
258          SSHL    v31.4s, v31.4s, v4.4s
259          LD1R    {v6.4s}, [x11], 4       // shift amount used by SRSHL (post-shift)
260          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
261          SQDMULH v17.4s, v17.4s, v5.4s
262          SQDMULH v18.4s, v18.4s, v5.4s
263          SQDMULH v19.4s, v19.4s, v5.4s
264          SQDMULH v20.4s, v20.4s, v5.4s
265          SQDMULH v21.4s, v21.4s, v5.4s
266          SQDMULH v22.4s, v22.4s, v5.4s
267          SQDMULH v23.4s, v23.4s, v5.4s
268          SQDMULH v24.4s, v24.4s, v5.4s
269          SQDMULH v25.4s, v25.4s, v5.4s
270          SQDMULH v26.4s, v26.4s, v5.4s
271          SQDMULH v27.4s, v27.4s, v5.4s
272          SQDMULH v28.4s, v28.4s, v5.4s
273          SQDMULH v29.4s, v29.4s, v5.4s
274          SQDMULH v30.4s, v30.4s, v5.4s
275          SQDMULH v31.4s, v31.4s, v5.4s
276          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
277          SRSHL   v17.4s, v17.4s, v6.4s
278          SRSHL   v18.4s, v18.4s, v6.4s
279          SRSHL   v19.4s, v19.4s, v6.4s
280          SRSHL   v20.4s, v20.4s, v6.4s
281          SRSHL   v21.4s, v21.4s, v6.4s
282          SRSHL   v22.4s, v22.4s, v6.4s
283          SRSHL   v23.4s, v23.4s, v6.4s
284          SRSHL   v24.4s, v24.4s, v6.4s
285          SRSHL   v25.4s, v25.4s, v6.4s
286          SRSHL   v26.4s, v26.4s, v6.4s
287          SRSHL   v27.4s, v27.4s, v6.4s
288          SRSHL   v28.4s, v28.4s, v6.4s
289          SRSHL   v29.4s, v29.4s, v6.4s
290          SRSHL   v30.4s, v30.4s, v6.4s
291          SRSHL   v31.4s, v31.4s, v6.4s
292        $elif REQUANTIZATION == "FP32":
293          SCVTF   v16.4s, v16.4s          // int32 -> float
294          SCVTF   v17.4s, v17.4s
295          # Apply params - scale, bias and clamp
296          LD1R    {v4.4s}, [x11], 4       // scale used by FMUL
297          SCVTF   v18.4s, v18.4s
298          SCVTF   v19.4s, v19.4s
299          SCVTF   v20.4s, v20.4s
300          SCVTF   v21.4s, v21.4s
301          SCVTF   v22.4s, v22.4s
302          SCVTF   v23.4s, v23.4s
303          SCVTF   v24.4s, v24.4s
304          SCVTF   v25.4s, v25.4s
305          SCVTF   v26.4s, v26.4s
306          SCVTF   v27.4s, v27.4s
307          SCVTF   v28.4s, v28.4s
308          SCVTF   v29.4s, v29.4s
309          SCVTF   v30.4s, v30.4s
310          SCVTF   v31.4s, v31.4s
311
312          FMUL    v16.4s, v16.4s, v4.4s
313          FMUL    v17.4s, v17.4s, v4.4s
314          FMUL    v18.4s, v18.4s, v4.4s
315          FMUL    v19.4s, v19.4s, v4.4s
316          FMUL    v20.4s, v20.4s, v4.4s
317          FMUL    v21.4s, v21.4s, v4.4s
318          FMUL    v22.4s, v22.4s, v4.4s
319          FMUL    v23.4s, v23.4s, v4.4s
320          FMUL    v24.4s, v24.4s, v4.4s
321          FMUL    v25.4s, v25.4s, v4.4s
322          FMUL    v26.4s, v26.4s, v4.4s
323          FMUL    v27.4s, v27.4s, v4.4s
324          FMUL    v28.4s, v28.4s, v4.4s
325          FMUL    v29.4s, v29.4s, v4.4s
326          FMUL    v30.4s, v30.4s, v4.4s
327          FMUL    v31.4s, v31.4s, v4.4s
328
329          FCVTNS  v16.4s, v16.4s          // float -> int32, round to nearest (ties to even)
330          FCVTNS  v17.4s, v17.4s
331          FCVTNS  v18.4s, v18.4s
332          FCVTNS  v19.4s, v19.4s
333          FCVTNS  v20.4s, v20.4s
334          FCVTNS  v21.4s, v21.4s
335          FCVTNS  v22.4s, v22.4s
336          FCVTNS  v23.4s, v23.4s
337          FCVTNS  v24.4s, v24.4s
338          FCVTNS  v25.4s, v25.4s
339          FCVTNS  v26.4s, v26.4s
340          FCVTNS  v27.4s, v27.4s
341          FCVTNS  v28.4s, v28.4s
342          FCVTNS  v29.4s, v29.4s
343          FCVTNS  v30.4s, v30.4s
344          FCVTNS  v31.4s, v31.4s
345
346        SQXTN   v16.4h, v16.4s          // saturating narrow int32 -> int16, columns 0-3
347        SQXTN   v17.4h, v17.4s
348        SQXTN   v18.4h, v18.4s
349        SQXTN   v19.4h, v19.4s
350        SQXTN   v24.4h, v24.4s          // columns 8-11
351        SQXTN   v25.4h, v25.4s
352        SQXTN   v26.4h, v26.4s
353        SQXTN   v27.4h, v27.4s
354        LD1R    {v6.8h}, [x11], 2        // 16-bit value added by SQADD below (presumably the output zero point)
355
356        SQXTN2  v16.8h, v20.4s          // columns 4-7 into the upper half
357        SQXTN2  v17.8h, v21.4s
358        SQXTN2  v18.8h, v22.4s
359        SQXTN2  v19.8h, v23.4s
360        SQXTN2  v24.8h, v28.4s          // columns 12-15 into the upper half
361        SQXTN2  v25.8h, v29.4s
362        SQXTN2  v26.8h, v30.4s
363        SQXTN2  v27.8h, v31.4s
364
365        SQADD   v16.8h, v16.8h, v6.8h
366        SQADD   v17.8h, v17.8h, v6.8h
367        SQADD   v18.8h, v18.8h, v6.8h
368        SQADD   v19.8h, v19.8h, v6.8h
369        SQADD   v24.8h, v24.8h, v6.8h
370        SQADD   v25.8h, v25.8h, v6.8h
371        SQADD   v26.8h, v26.8h, v6.8h
372        SQADD   v27.8h, v27.8h, v6.8h
373        LD1R    {v4.16b}, [x11], 1      // clamp min value
374
375        SQXTUN  v0.8b, v16.8h           // saturating narrow int16 -> uint8, row 0 columns 0-7
376        SQXTUN  v1.8b, v17.8h
377        SQXTUN  v2.8b, v18.8h
378        SQXTUN  v3.8b, v19.8h
379        LD1R    {v5.16b}, [x11]         // clamp max value
380        SQXTUN2 v0.16b, v24.8h          // row 0 columns 8-15
381        SQXTUN2 v1.16b, v25.8h
382        SQXTUN2 v2.16b, v26.8h
383        SQXTUN2 v3.16b, v27.8h
384        LDR     x0, [sp, 48]            // Load cn_stride
385
386        UMAX    v0.16b, v0.16b, v4.16b
387        UMAX    v1.16b, v1.16b, v4.16b
388        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
389        UMAX    v2.16b, v2.16b, v4.16b
390        UMAX    v3.16b, v3.16b, v4.16b
391        SUBS    x1, x1, 16              // nc -= 16; flags consumed by B.LO and B.HI below
392        UMIN    v0.16b, v0.16b, v5.16b
393        UMIN    v1.16b, v1.16b, v5.16b
394        UMIN    v2.16b, v2.16b, v5.16b
395        UMIN    v3.16b, v3.16b, v5.16b
396        B.LO    6f                      // fewer than 16 columns left: partial store
397
398        # Store full 4 x 16
399        ST1     {v3.16b},  [x7], x0     // row 3, then c3 += cn_stride
400        ST1     {v2.16b}, [x17], x0     // row 2, then c2 += cn_stride
401        ST1     {v1.16b}, [x16], x0     // row 1, then c1 += cn_stride
402        ST1     {v0.16b},  [x6], x0     // row 0, then c0 += cn_stride
403
404        SUB     x4, x4, x3              // a -= ks
405
406        # nc loop
407        B.HI    0b                      // flags still from SUBS x1 above: loop if columns remain
408
409        # Restore d8,d12-d15 from stack
410        LDP     d14, d15, [sp, 32]
411        LDP     d12, d13, [sp, 16]
412        LDR     d8,  [sp], 48
413        RET
414
415        # Remainder- 8 bytes of A
416        .p2align 3
4174:
418        # Is there a remainder?- 8 bytes of A
419        TBZ     x0, 3, 5f               // bit 3 of k clear: no 8-byte piece, go to the 4-byte piece
420
421        LDR     d0, [x13], 8            // 8 bytes of A0; LDR d clears the upper half of v0
422        LDR     q4,  [x5], 16
423        LDR     d1, [x14], 8
424        LDR     d2, [x15], 8
425        LDR     d3, [x10], 8
426        LDR     q5,  [x5], 16
427
428        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
429        UDOT    v13.4s, v8.16b,  v1.16b
430        UDOT    v14.4s, v8.16b,  v2.16b
431        UDOT    v15.4s, v8.16b,  v3.16b
432
433        UDOT    v16.4s, v4.16b,  v0.4b[0]  // A bytes 0-3
434        UDOT    v17.4s, v4.16b,  v1.4b[0]
435        LDP     q6, q7, [x5], 32
436        UDOT    v18.4s, v4.16b,  v2.4b[0]
437        UDOT    v19.4s, v4.16b,  v3.4b[0]
438        UDOT    v20.4s, v5.16b,  v0.4b[0]
439        UDOT    v21.4s, v5.16b,  v1.4b[0]
440        UDOT    v22.4s, v5.16b,  v2.4b[0]
441        UDOT    v23.4s, v5.16b,  v3.4b[0]
442        UDOT    v24.4s, v6.16b, v0.4b[0]
443        UDOT    v25.4s, v6.16b, v1.4b[0]
444        LDP     q4, q5, [x5], 32
445        UDOT    v26.4s, v6.16b, v2.4b[0]
446        UDOT    v27.4s, v6.16b, v3.4b[0]
447        UDOT    v28.4s, v7.16b, v0.4b[0]
448        UDOT    v29.4s, v7.16b, v1.4b[0]
449        UDOT    v30.4s, v7.16b, v2.4b[0]
450        UDOT    v31.4s, v7.16b, v3.4b[0]
451        UDOT    v16.4s, v4.16b,  v0.4b[1]  // A bytes 4-7
452        UDOT    v17.4s, v4.16b,  v1.4b[1]
453        LDP     q6, q7, [x5], 32
454        UDOT    v18.4s, v4.16b,  v2.4b[1]
455        UDOT    v19.4s, v4.16b,  v3.4b[1]
456        UDOT    v20.4s, v5.16b,  v0.4b[1]
457        UDOT    v21.4s, v5.16b,  v1.4b[1]
458        UDOT    v22.4s, v5.16b,  v2.4b[1]
459        UDOT    v23.4s, v5.16b,  v3.4b[1]
460        UDOT    v24.4s, v6.16b,  v0.4b[1]
461        UDOT    v25.4s, v6.16b,  v1.4b[1]
462        UDOT    v26.4s, v6.16b,  v2.4b[1]
463        UDOT    v27.4s, v6.16b,  v3.4b[1]
464        UDOT    v28.4s, v7.16b,  v0.4b[1]
465        UDOT    v29.4s, v7.16b,  v1.4b[1]
466        UDOT    v30.4s, v7.16b,  v2.4b[1]
467        UDOT    v31.4s, v7.16b,  v3.4b[1]
468        # Is there a remainder?- 4 bytes of A
469        TBZ     x0, 2, 3b               // bit 2 of k clear: nothing left, back to the ks loop
470
471        # Remainder- 4 bytes of A
4725:
473        LDR     s0, [x13], 4            // 4 bytes of A0; LDR s clears the rest of v0
474        LDR     q4, [x5], 16
475        LDR     s1, [x14], 4
476        LDR     s2, [x15], 4
477        LDR     s3, [x10], 4
478        LDR     q5, [x5], 16
479
480        UDOT    v12.4s, v8.16b,  v0.16b  // update zero point
481        UDOT    v13.4s, v8.16b,  v1.16b
482        UDOT    v14.4s, v8.16b,  v2.16b
483        UDOT    v15.4s, v8.16b,  v3.16b
484
485        UDOT    v16.4s, v4.16b,  v0.4b[0]
486        UDOT    v17.4s, v4.16b,  v1.4b[0]
487        UDOT    v18.4s, v4.16b,  v2.4b[0]
488        UDOT    v19.4s, v4.16b,  v3.4b[0]
489        LDP     q6, q7, [x5], 32
490        UDOT    v20.4s, v5.16b,  v0.4b[0]
491        UDOT    v21.4s, v5.16b,  v1.4b[0]
492        UDOT    v22.4s, v5.16b,  v2.4b[0]
493        UDOT    v23.4s, v5.16b,  v3.4b[0]
494        UDOT    v24.4s, v6.16b, v0.4b[0]
495        UDOT    v25.4s, v6.16b, v1.4b[0]
496        UDOT    v26.4s, v6.16b, v2.4b[0]
497        UDOT    v27.4s, v6.16b, v3.4b[0]
498        UDOT    v28.4s, v7.16b, v0.4b[0]
499        UDOT    v29.4s, v7.16b, v1.4b[0]
500        UDOT    v30.4s, v7.16b, v2.4b[0]
501        UDOT    v31.4s, v7.16b, v3.4b[0]
502        B       3b
503
504        # Store odd width
505        .p2align 3
5066:
507        TBZ     x1, 3, 7f               // bit 3 of nc set: store 8 columns
508        STR     d3, [x7], 8
509        STR     d2, [x17], 8
510        DUP     d3, v3.d[1]             // move the not yet stored columns to the bottom of the register
511        DUP     d2, v2.d[1]
512        STR     d1, [x16], 8
513        STR     d0, [x6], 8
514        DUP     d1, v1.d[1]
515        DUP     d0, v0.d[1]
5167:
517        TBZ     x1, 2, 8f               // bit 2 of nc set: store 4 columns
518        STR     s3, [x7], 4
519        STR     s2, [x17], 4
520        DUP     s3, v3.s[1]
521        DUP     s2, v2.s[1]
522        STR     s1, [x16], 4
523        STR     s0, [x6], 4
524        DUP     s1, v1.s[1]
525        DUP     s0, v0.s[1]
5268:
527        TBZ     x1, 1, 9f               // bit 1 of nc set: store 2 columns
528        STR     h3, [x7], 2
529        STR     h2, [x17], 2
530        DUP     h3, v3.h[1]
531        DUP     h2, v2.h[1]
532        STR     h1, [x16], 2
533        STR     h0, [x6], 2
534        DUP     h1, v1.h[1]
535        DUP     h0, v0.h[1]
5369:
537        TBZ     x1, 0, 10f              // bit 0 of nc set: store the last column
538        STR     b3, [x7]
539        STR     b2, [x17]
540        STR     b1, [x16]
541        STR     b0, [x6]
54210:
543        # Restore d8,d12-d15 from stack
544        LDP     d14, d15, [sp, 32]
545        LDP     d12, d13, [sp, 16]
546        LDR     d8,  [sp], 48
547        RET
548
549END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
550
551#ifdef __ELF__
552.section ".note.GNU-stack","",%progbits
553#endif
554