xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
// Template constraints: REQUANTIZATION must be "FP32" or "RNDNU", and
// CHANNELWISE may only be combined with "FP32".
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7$assert not CHANNELWISE or REQUANTIZATION == "FP32"
8
9#include <xnnpack/assembly.h>
10
// DATATYPE is substituted into the kernel name; PARAMS_UNION into the
// signature comment. REWIND_DECREMENT is the total number of bytes by which
// the post-indexed parameter loads advance the params pointer (x11); the
// kernel subtracts it again so the same parameters are reread for the next
// block of output columns.
11$DATATYPE = "qc8" if CHANNELWISE else "qs8"
12$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
13$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
14# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
15#     size_t mr,                 x0
16#     size_t nc,                 x1
17#     size_t kc,                 x2 / x0
18#     size_t ks,                 x3 / x9
19#     const int8_t**restrict a,  x4
20#     const int8_t* restrict w,  x5
21#     int8_t* restrict c,        x6
22#     size_t cm_stride,          x7
23#     size_t cn_stride,                  [sp] -> (x0)
24#     size_t a_offset,                   [sp + 8] -> x8
25#     const int8_t* zero,                [sp + 16] -> x12
26#     const union ${PARAMS_UNION} params [sp + 24] -> x11
27
28# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
29
30# Register usage
31# A0  x13  v0
32# A1  x14  v1
33# A2  x15  v2
34# A3  x10  v3
35# B    x5  v4  v5  v6  v7
36# C0   x6 v16 v20 v24 v28
37# C1  x16 v17 v21 v25 v29
38# C2  x17 v18 v22 v26 v30
39# C3   x7 v19 v23 v27 v31
40# unused v8 v9 v10 v11 v12 v13 v14 v15
41
# Kernel overview (as implemented below): for each block of up to 16 output
# columns, the 16 accumulators v16-v31 (4 rows x 4 vectors of 4 x int32) are
# seeded from w, SDOT products are accumulated for every group of 4 A pointers
# (ks loop) over kc bytes each, and the result is requantized to int8, clamped
# and stored through the four row pointers c0..c3.
42BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
43
44        # Clamp C pointers
        # Rows at or beyond mr reuse the previous row's pointer (c1 = c0, c2 = c1,
        # c3 = c2). The kc round-up and the stack argument loads are interleaved
        # with the pointer setup.
45        CMP     x0, 2                   // if mr < 2
46        LDR     x8, [sp, 8]             // Load a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
50
51        ADD     x17, x16, x7            // c2 = c1 + cm_stride
52        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
53                                        // if mr <= 2 (flags still from CMP x0, 2)
54        CSEL    x17, x16, x17, LS       //   c2 = c1
55        BIC     x2, x2, 3               //   kc is now a multiple of 4
56
57        CMP     x0, 4                   // if mr < 4
58        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
59        CSEL    x7,  x17, x7, LO        //   c3 = c2
60
61        .p2align 3
        # nc loop entry: one pass per block of 16 output columns
620:
63        # Load initial bias from w into accumulators
        # 16 x int32 are read from w; every row starts from the same values.
64        LDP     q16, q20, [x5], 32
65        MOV     v17.16b, v16.16b
66        MOV     v18.16b, v16.16b
67        LDP     q24, q28, [x5], 32
68        MOV     v19.16b, v16.16b
69        MOV     v21.16b, v20.16b
70        MOV     v22.16b, v20.16b
71        MOV     v23.16b, v20.16b
72        MOV     v25.16b, v24.16b
73        MOV     v26.16b, v24.16b
74        MOV     v27.16b, v24.16b
75        MOV     v29.16b, v28.16b
76        MOV     v30.16b, v28.16b
77        MOV     v31.16b, v28.16b
78        MOV     x9, x3                  // p = ks
79
80        .p2align 3
        # ks loop entry: one pass per group of 4 A pointers
811:
82        # Load next 4 A pointers
83        LDP     x13, x14, [x4], 16
84        LDP     x15, x10, [x4], 16
85
        # A pointer equal to `zero` is used unchanged; any other pointer gets
        # a_offset added. The ADD is done unconditionally and CSEL picks the result.
86        CMP     x13, x12                // if a0 == zero
87        ADD     x13, x13, x8            // a0 += a_offset
88        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
89        CMP     x14, x12                // if a1 == zero
90        ADD     x14, x14, x8            // a1 += a_offset
91        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
92        CMP     x15, x12                // if a2 == zero
93        ADD     x15, x15, x8            // a2 += a_offset
94        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
95        CMP     x10, x12                 // if a3 == zero
96        ADD     x10, x10, x8             // a3 += a_offset
97        CSEL    x10, x12, x10, EQ         //   a3 = zero, else a3 + a_offset
98
99        # Is there at least 16 bytes for main loop?
100        SUBS    x0, x2, 16              // k = kc - 16
101        B.LO    4f                      // kc < 16: only the remainder code runs
102
103        # Main loop - 16 bytes of A
        # Each pass reads 16 bytes from each of the 4 A rows and 256 bytes of w
        # (16 q-register loads). Weight loads alternate between v4/v5 and v6/v7
        # and are interleaved with the SDOTs that consume the other pair.
104        .p2align 3
1052:
106        LDR     q0, [x13], 16
107        LDR     q4,  [x5], 16
108        LDR     q1, [x14], 16
109        LDR     q2, [x15], 16
110        LDR     q3,  [x10], 16
111        LDR     q5,  [x5], 16
        # A bytes 0-3 (lane 0 of v0-v3)
112        SDOT    v16.4s, v4.16b,  v0.4b[0]
113        SDOT    v17.4s, v4.16b,  v1.4b[0]
114        LDP     q6, q7, [x5], 32
115        SDOT    v18.4s, v4.16b,  v2.4b[0]
116        SDOT    v19.4s, v4.16b,  v3.4b[0]
117        SDOT    v20.4s, v5.16b,  v0.4b[0]
118        SDOT    v21.4s, v5.16b,  v1.4b[0]
119        SDOT    v22.4s, v5.16b,  v2.4b[0]
120        SDOT    v23.4s, v5.16b,  v3.4b[0]
121        SDOT    v24.4s, v6.16b, v0.4b[0]
122        SDOT    v25.4s, v6.16b, v1.4b[0]
123        LDP     q4, q5, [x5], 32
124        SDOT    v26.4s, v6.16b, v2.4b[0]
125        SDOT    v27.4s, v6.16b, v3.4b[0]
126        SDOT    v28.4s, v7.16b, v0.4b[0]
127        SDOT    v29.4s, v7.16b, v1.4b[0]
128        SDOT    v30.4s, v7.16b, v2.4b[0]
129        SDOT    v31.4s, v7.16b, v3.4b[0]
130
        # A bytes 4-7 (lane 1 of v0-v3)
131        SDOT    v16.4s, v4.16b,  v0.4b[1]
132        SDOT    v17.4s, v4.16b,  v1.4b[1]
133        LDP     q6, q7, [x5], 32
134        SDOT    v18.4s, v4.16b,  v2.4b[1]
135        SDOT    v19.4s, v4.16b,  v3.4b[1]
136        SDOT    v20.4s, v5.16b,  v0.4b[1]
137        SDOT    v21.4s, v5.16b,  v1.4b[1]
138        SDOT    v22.4s, v5.16b,  v2.4b[1]
139        SDOT    v23.4s, v5.16b,  v3.4b[1]
140        SDOT    v24.4s, v6.16b,  v0.4b[1]
141        SDOT    v25.4s, v6.16b,  v1.4b[1]
142        LDP     q4, q5, [x5], 32
143        SDOT    v26.4s, v6.16b,  v2.4b[1]
144        SDOT    v27.4s, v6.16b,  v3.4b[1]
145        SDOT    v28.4s, v7.16b,  v0.4b[1]
146        SDOT    v29.4s, v7.16b,  v1.4b[1]
147        SDOT    v30.4s, v7.16b,  v2.4b[1]
148        SDOT    v31.4s, v7.16b,  v3.4b[1]
149
        # A bytes 8-11 (lane 2 of v0-v3)
150        SDOT    v16.4s, v4.16b,  v0.4b[2]
151        SDOT    v17.4s, v4.16b,  v1.4b[2]
152        LDP     q6, q7, [x5], 32
153        SDOT    v18.4s, v4.16b,  v2.4b[2]
154        SDOT    v19.4s, v4.16b,  v3.4b[2]
155        SDOT    v20.4s, v5.16b,  v0.4b[2]
156        SDOT    v21.4s, v5.16b,  v1.4b[2]
157        SDOT    v22.4s, v5.16b,  v2.4b[2]
158        SDOT    v23.4s, v5.16b,  v3.4b[2]
159        SDOT    v24.4s, v6.16b,  v0.4b[2]
160        SDOT    v25.4s, v6.16b,  v1.4b[2]
161        LDP     q4, q5, [x5], 32
162        SDOT    v26.4s, v6.16b,  v2.4b[2]
163        SDOT    v27.4s, v6.16b,  v3.4b[2]
164        SDOT    v28.4s, v7.16b,  v0.4b[2]
165        SDOT    v29.4s, v7.16b,  v1.4b[2]
166        SDOT    v30.4s, v7.16b,  v2.4b[2]
167        SDOT    v31.4s, v7.16b,  v3.4b[2]
168
        # A bytes 12-15 (lane 3 of v0-v3); the loop counter update is folded in
169        SDOT    v16.4s, v4.16b,  v0.4b[3]
170        SDOT    v17.4s, v4.16b,  v1.4b[3]
171        LDP     q6, q7, [x5], 32
172        SDOT    v18.4s, v4.16b,  v2.4b[3]
173        SDOT    v19.4s, v4.16b,  v3.4b[3]
174        SDOT    v20.4s, v5.16b,  v0.4b[3]
175        SDOT    v21.4s, v5.16b,  v1.4b[3]
176        SDOT    v22.4s, v5.16b,  v2.4b[3]
177        SDOT    v23.4s, v5.16b,  v3.4b[3]
178        SDOT    v24.4s, v6.16b,  v0.4b[3]
179        SDOT    v25.4s, v6.16b,  v1.4b[3]
180        SDOT    v26.4s, v6.16b,  v2.4b[3]
181        SDOT    v27.4s, v6.16b,  v3.4b[3]
182        SUBS    x0, x0, 16              // k -= 16
183        SDOT    v28.4s, v7.16b,  v0.4b[3]
184        SDOT    v29.4s, v7.16b,  v1.4b[3]
185        SDOT    v30.4s, v7.16b,  v2.4b[3]
186        SDOT    v31.4s, v7.16b,  v3.4b[3]
187        B.HS    2b                      // no borrow: another full 16 bytes remain
188
189        # Is there a remainder?- 4 to 12 bytes of A
        # x0 has gone below zero here; its low 4 bits equal the leftover byte
        # count (a multiple of 4, since kc was rounded up to a multiple of 4).
190        TST     x0, 15
191        B.NE    4f
192
1933:
194        # ks loop
195        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
196        B.HI    1b                      // more A pointer groups to process
197
198        $if REQUANTIZATION == "RNDNU":
199          # Apply params - preshift, scale, postshift, bias and clamp
200          LD1R    {v4.4s}, [x11], 4       // load pre-shift (used by SQSHL)
201          SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
202          SQSHL   v17.4s, v17.4s, v4.4s
203          SQSHL   v18.4s, v18.4s, v4.4s
204          SQSHL   v19.4s, v19.4s, v4.4s
205          SQSHL   v20.4s, v20.4s, v4.4s
206          SQSHL   v21.4s, v21.4s, v4.4s
207          SQSHL   v22.4s, v22.4s, v4.4s
208          SQSHL   v23.4s, v23.4s, v4.4s
209          LD1R    {v5.4s}, [x11], 4       // load multiplier (used by SQDMULH)
210          SQSHL   v24.4s, v24.4s, v4.4s
211          SQSHL   v25.4s, v25.4s, v4.4s
212          SQSHL   v26.4s, v26.4s, v4.4s
213          SQSHL   v27.4s, v27.4s, v4.4s
214          SQSHL   v28.4s, v28.4s, v4.4s
215          SQSHL   v29.4s, v29.4s, v4.4s
216          SQSHL   v30.4s, v30.4s, v4.4s
217          SQSHL   v31.4s, v31.4s, v4.4s
218          LD1R    {v6.4s}, [x11], 4       // load post-shift (used by SRSHL)
219          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
220          SQDMULH v17.4s, v17.4s, v5.4s
221          SQDMULH v18.4s, v18.4s, v5.4s
222          SQDMULH v19.4s, v19.4s, v5.4s
223          SQDMULH v20.4s, v20.4s, v5.4s
224          SQDMULH v21.4s, v21.4s, v5.4s
225          SQDMULH v22.4s, v22.4s, v5.4s
226          SQDMULH v23.4s, v23.4s, v5.4s
227          SQDMULH v24.4s, v24.4s, v5.4s
228          SQDMULH v25.4s, v25.4s, v5.4s
229          SQDMULH v26.4s, v26.4s, v5.4s
230          SQDMULH v27.4s, v27.4s, v5.4s
231          SQDMULH v28.4s, v28.4s, v5.4s
232          SQDMULH v29.4s, v29.4s, v5.4s
233          SQDMULH v30.4s, v30.4s, v5.4s
234          SQDMULH v31.4s, v31.4s, v5.4s
235          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
236          SRSHL   v17.4s, v17.4s, v6.4s
237          SRSHL   v18.4s, v18.4s, v6.4s
238          SRSHL   v19.4s, v19.4s, v6.4s
239          SRSHL   v20.4s, v20.4s, v6.4s
240          SRSHL   v21.4s, v21.4s, v6.4s
241          SRSHL   v22.4s, v22.4s, v6.4s
242          SRSHL   v23.4s, v23.4s, v6.4s
243          SRSHL   v24.4s, v24.4s, v6.4s
244          SRSHL   v25.4s, v25.4s, v6.4s
245          SRSHL   v26.4s, v26.4s, v6.4s
246          SRSHL   v27.4s, v27.4s, v6.4s
247          SRSHL   v28.4s, v28.4s, v6.4s
248          SRSHL   v29.4s, v29.4s, v6.4s
249          SRSHL   v30.4s, v30.4s, v6.4s
250          SRSHL   v31.4s, v31.4s, v6.4s
251        $elif REQUANTIZATION == "FP32":
252          SCVTF   v16.4s, v16.4s          // int32 accumulators -> float
253          SCVTF   v17.4s, v17.4s
254          $if not CHANNELWISE:
255            # Apply params - scale, bias and clamp
256            LD1R    {v4.4s}, [x11], 4     // one scale, broadcast to all lanes
257            SCVTF   v18.4s, v18.4s
258            SCVTF   v19.4s, v19.4s
259          $else:
260            # Load per channel scale values from weights
261            LDR     q4, [x5], 16          // scales for columns 0-3
262            SCVTF   v18.4s, v18.4s
263            SCVTF   v19.4s, v19.4s
264            LDR     q5, [x5], 16          // scales for columns 4-7
265          SCVTF   v20.4s, v20.4s
266          SCVTF   v21.4s, v21.4s
267          SCVTF   v22.4s, v22.4s
268          SCVTF   v23.4s, v23.4s
269          SCVTF   v24.4s, v24.4s
270          SCVTF   v25.4s, v25.4s
271          SCVTF   v26.4s, v26.4s
272          SCVTF   v27.4s, v27.4s
273          SCVTF   v28.4s, v28.4s
274          SCVTF   v29.4s, v29.4s
275          SCVTF   v30.4s, v30.4s
276          SCVTF   v31.4s, v31.4s
277
278          $if CHANNELWISE:
279            LDR     q6, [x5], 16          // scales for columns 8-11
280            FMUL    v16.4s, v16.4s, v4.4s
281            FMUL    v17.4s, v17.4s, v4.4s
282            FMUL    v18.4s, v18.4s, v4.4s
283            FMUL    v19.4s, v19.4s, v4.4s
284            FMUL    v20.4s, v20.4s, v5.4s
285            LDR     q4, [x5], 16          // scales for columns 12-15 (v4 is free again)
286            FMUL    v21.4s, v21.4s, v5.4s
287            FMUL    v22.4s, v22.4s, v5.4s
288            FMUL    v23.4s, v23.4s, v5.4s
289            FMUL    v24.4s, v24.4s, v6.4s
290            FMUL    v25.4s, v25.4s, v6.4s
291            FMUL    v26.4s, v26.4s, v6.4s
292            FMUL    v27.4s, v27.4s, v6.4s
293            FMUL    v28.4s, v28.4s, v4.4s
294            FMUL    v29.4s, v29.4s, v4.4s
295            FMUL    v30.4s, v30.4s, v4.4s
296            FMUL    v31.4s, v31.4s, v4.4s
297          $else:
298            FMUL    v16.4s, v16.4s, v4.4s
299            FMUL    v17.4s, v17.4s, v4.4s
300            FMUL    v18.4s, v18.4s, v4.4s
301            FMUL    v19.4s, v19.4s, v4.4s
302            FMUL    v20.4s, v20.4s, v4.4s
303            FMUL    v21.4s, v21.4s, v4.4s
304            FMUL    v22.4s, v22.4s, v4.4s
305            FMUL    v23.4s, v23.4s, v4.4s
306            FMUL    v24.4s, v24.4s, v4.4s
307            FMUL    v25.4s, v25.4s, v4.4s
308            FMUL    v26.4s, v26.4s, v4.4s
309            FMUL    v27.4s, v27.4s, v4.4s
310            FMUL    v28.4s, v28.4s, v4.4s
311            FMUL    v29.4s, v29.4s, v4.4s
312            FMUL    v30.4s, v30.4s, v4.4s
313            FMUL    v31.4s, v31.4s, v4.4s
314
315          FCVTNS  v16.4s, v16.4s          // float -> int32, round to nearest (ties to even)
316          FCVTNS  v17.4s, v17.4s
317          FCVTNS  v18.4s, v18.4s
318          FCVTNS  v19.4s, v19.4s
319          FCVTNS  v20.4s, v20.4s
320          FCVTNS  v21.4s, v21.4s
321          FCVTNS  v22.4s, v22.4s
322          FCVTNS  v23.4s, v23.4s
323          FCVTNS  v24.4s, v24.4s
324          FCVTNS  v25.4s, v25.4s
325          FCVTNS  v26.4s, v26.4s
326          FCVTNS  v27.4s, v27.4s
327          FCVTNS  v28.4s, v28.4s
328          FCVTNS  v29.4s, v29.4s
329          FCVTNS  v30.4s, v30.4s
330          FCVTNS  v31.4s, v31.4s
331
332        SQXTN   v16.4h, v16.4s          // saturating narrow int32 -> int16 (columns 0-3)
333        SQXTN   v17.4h, v17.4s
334        SQXTN   v18.4h, v18.4s
335        SQXTN   v19.4h, v19.4s
336        SQXTN   v24.4h, v24.4s          // columns 8-11
337        SQXTN   v25.4h, v25.4s
338        SQXTN   v26.4h, v26.4s
339        SQXTN   v27.4h, v27.4s
340        LD1R    {v6.8h}, [x11], 2        // load 16-bit bias (added by SQADD below)
341
342        SQXTN2  v16.8h, v20.4s          // columns 4-7 into the upper half
343        SQXTN2  v17.8h, v21.4s
344        SQXTN2  v18.8h, v22.4s
345        SQXTN2  v19.8h, v23.4s
346        SQXTN2  v24.8h, v28.4s          // columns 12-15 into the upper half
347        SQXTN2  v25.8h, v29.4s
348        SQXTN2  v26.8h, v30.4s
349        SQXTN2  v27.8h, v31.4s
350
351        SQADD   v16.8h, v16.8h, v6.8h   // saturating add of the bias
352        SQADD   v17.8h, v17.8h, v6.8h
353        SQADD   v18.8h, v18.8h, v6.8h
354        SQADD   v19.8h, v19.8h, v6.8h
355        SQADD   v24.8h, v24.8h, v6.8h
356        SQADD   v25.8h, v25.8h, v6.8h
357        SQADD   v26.8h, v26.8h, v6.8h
358        SQADD   v27.8h, v27.8h, v6.8h
359        LD1R    {v4.16b}, [x11], 1      // clamp min value
360
361        SQXTN   v0.8b, v16.8h           // saturating narrow int16 -> int8 (columns 0-7)
362        SQXTN   v1.8b, v17.8h
363        SQXTN   v2.8b, v18.8h
364        SQXTN   v3.8b, v19.8h
365        LD1R    {v5.16b}, [x11]         // clamp max value
366        SQXTN2  v0.16b, v24.8h          // columns 8-15 into the upper half
367        SQXTN2  v1.16b, v25.8h
368        SQXTN2  v2.16b, v26.8h
369        SQXTN2  v3.16b, v27.8h
370        LDR     x0, [sp]                // cn_stride
371
372        SMAX    v0.16b, v0.16b, v4.16b
373        SMAX    v1.16b, v1.16b, v4.16b
374        SUB     x11, x11, ${REWIND_DECREMENT}          // rewind params pointer
375        SMAX    v2.16b, v2.16b, v4.16b
376        SMAX    v3.16b, v3.16b, v4.16b
377        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.HI below
378        SMIN    v0.16b, v0.16b, v5.16b
379        SMIN    v1.16b, v1.16b, v5.16b
380        SMIN    v2.16b, v2.16b, v5.16b
381        SMIN    v3.16b, v3.16b, v5.16b
382        B.LO    6f                      // fewer than 16 columns were left
383
384        # Store full 4 x 16
        # Each row pointer is post-incremented by cn_stride (x0).
385        ST1     {v3.16b},  [x7], x0
386        ST1     {v2.16b}, [x17], x0
387        ST1     {v1.16b}, [x16], x0
388        ST1     {v0.16b},  [x6], x0
389
390        SUB     x4, x4, x3              // a -= ks
391
392        # nc loop
        # Flags still come from SUBS x1 above (nothing in between sets them).
393        B.HI    0b                      // columns remain: next block
394        RET
395
396        # Remainder- 8 bytes of A
397        .p2align 3
3984:
399        # Is there a remainder?- 8 bytes of A
400        TBZ     x0, 3, 5f               // bit 3 clear: no 8-byte chunk
401
402        LDR     d0, [x13], 8
403        LDR     q4,  [x5], 16
404        LDR     d1, [x14], 8
405        LDR     d2, [x15], 8
406        LDR     d3,  [x10], 8
407        LDR     q5,  [x5], 16
        # A bytes 0-3 (lane 0)
408        SDOT    v16.4s, v4.16b,  v0.4b[0]
409        SDOT    v17.4s, v4.16b,  v1.4b[0]
410        LDP     q6, q7, [x5], 32
411        SDOT    v18.4s, v4.16b,  v2.4b[0]
412        SDOT    v19.4s, v4.16b,  v3.4b[0]
413        SDOT    v20.4s, v5.16b,  v0.4b[0]
414        SDOT    v21.4s, v5.16b,  v1.4b[0]
415        SDOT    v22.4s, v5.16b,  v2.4b[0]
416        SDOT    v23.4s, v5.16b,  v3.4b[0]
417        SDOT    v24.4s, v6.16b, v0.4b[0]
418        SDOT    v25.4s, v6.16b, v1.4b[0]
419        LDP     q4, q5, [x5], 32
420        SDOT    v26.4s, v6.16b, v2.4b[0]
421        SDOT    v27.4s, v6.16b, v3.4b[0]
422        SDOT    v28.4s, v7.16b, v0.4b[0]
423        SDOT    v29.4s, v7.16b, v1.4b[0]
424        SDOT    v30.4s, v7.16b, v2.4b[0]
425        SDOT    v31.4s, v7.16b, v3.4b[0]
        # A bytes 4-7 (lane 1)
426        SDOT    v16.4s, v4.16b,  v0.4b[1]
427        SDOT    v17.4s, v4.16b,  v1.4b[1]
428        LDP     q6, q7, [x5], 32
429        SDOT    v18.4s, v4.16b,  v2.4b[1]
430        SDOT    v19.4s, v4.16b,  v3.4b[1]
431        SDOT    v20.4s, v5.16b,  v0.4b[1]
432        SDOT    v21.4s, v5.16b,  v1.4b[1]
433        SDOT    v22.4s, v5.16b,  v2.4b[1]
434        SDOT    v23.4s, v5.16b,  v3.4b[1]
435        SDOT    v24.4s, v6.16b,  v0.4b[1]
436        SDOT    v25.4s, v6.16b,  v1.4b[1]
437        SDOT    v26.4s, v6.16b,  v2.4b[1]
438        SDOT    v27.4s, v6.16b,  v3.4b[1]
439        SDOT    v28.4s, v7.16b,  v0.4b[1]
440        SDOT    v29.4s, v7.16b,  v1.4b[1]
441        SDOT    v30.4s, v7.16b,  v2.4b[1]
442        SDOT    v31.4s, v7.16b,  v3.4b[1]
443        # Is there a remainder?- 4 bytes of A
444        TBZ     x0, 2, 3b               // bit 2 clear: done, back to the ks loop
445
446        # Remainder- 4 bytes of A
4475:
448        LDR     s0, [x13], 4
449        LDR     q4, [x5], 16
450        LDR     s1, [x14], 4
451        LDR     s2, [x15], 4
452        LDR     s3,  [x10], 4
453        LDR     q5, [x5], 16
454        SDOT    v16.4s, v4.16b,  v0.4b[0]
455        SDOT    v17.4s, v4.16b,  v1.4b[0]
456        LDP     q6, q7, [x5], 32
457        SDOT    v18.4s, v4.16b,  v2.4b[0]
458        SDOT    v19.4s, v4.16b,  v3.4b[0]
459        SDOT    v20.4s, v5.16b,  v0.4b[0]
460        SDOT    v21.4s, v5.16b,  v1.4b[0]
461        SDOT    v22.4s, v5.16b,  v2.4b[0]
462        SDOT    v23.4s, v5.16b,  v3.4b[0]
463        SDOT    v24.4s, v6.16b, v0.4b[0]
464        SDOT    v25.4s, v6.16b, v1.4b[0]
465        SDOT    v26.4s, v6.16b, v2.4b[0]
466        SDOT    v27.4s, v6.16b, v3.4b[0]
467        SDOT    v28.4s, v7.16b, v0.4b[0]
468        SDOT    v29.4s, v7.16b, v1.4b[0]
469        SDOT    v30.4s, v7.16b, v2.4b[0]
470        SDOT    v31.4s, v7.16b, v3.4b[0]
471        B       3b                      // back to the ks loop
472
473        # Store odd width
        # Bits 3..0 of x1 select 8-, 4-, 2- and 1-column stores in turn. After
        # each partial store the not-yet-stored columns are moved down to the
        # bottom of v0-v3 so the next store can use the low element again.
474        .p2align 3
4756:
476        TBZ     x1, 3, 7f               // skip unless 8 columns are to be stored
477        STR     d3, [x7], 8
478        STR     d2, [x17], 8
479        DUP     d3, v3.d[1]
480        DUP     d2, v2.d[1]
481        STR     d1, [x16], 8
482        STR     d0, [x6], 8
483        DUP     d1, v1.d[1]
484        DUP     d0, v0.d[1]
4857:
486        TBZ     x1, 2, 8f               // skip unless 4 columns are to be stored
487        STR     s3, [x7], 4
488        STR     s2, [x17], 4
489        DUP     s3, v3.s[1]
490        DUP     s2, v2.s[1]
491        STR     s1, [x16], 4
492        STR     s0, [x6], 4
493        DUP     s1, v1.s[1]
494        DUP     s0, v0.s[1]
4958:
496        TBZ     x1, 1, 9f               // skip unless 2 columns are to be stored
497        STR     h3, [x7], 2
498        STR     h2, [x17], 2
499        DUP     h3, v3.h[1]
500        DUP     h2, v2.h[1]
501        STR     h1, [x16], 2
502        STR     h0, [x6], 2
503        DUP     h1, v1.h[1]
504        DUP     h0, v0.h[1]
5059:
506        TBZ     x1, 0, 10f              // skip unless 1 column is to be stored
507        STR     b3, [x7]
508        STR     b2, [x17]
509        STR     b1, [x16]
510        STR     b0, [x6]
51110:
512        RET
513
514END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
515
// On ELF targets only, emit an empty .note.GNU-stack section. GNU toolchains
// conventionally read this marker as "this object does not need an executable
// stack" (see the GNU ld / as documentation).
516#ifdef __ELF__
517.section ".note.GNU-stack","",%progbits
518#endif
519