xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t** restrict a, x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x20  v3
34# B    x5  v4  v5
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
40
41BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
42
        // Control-flow overview (numeric local labels):
        //   0: start of a 16-column output tile; seeds the 16 int32 accumulators
        //      (v16-v31) from 64 bytes read at w (x5).
        //   1: start of one ks step; loads 4 A row pointers and fixes them up.
        //   2: main loop; consumes 8 bytes of every A row and 8 x 16 bytes of w.
        //   4: remainder; consumes the last kc & 7 bytes of every A row.
        //   3: ks loop tail, then requantization (int32 -> fp32 -> int32 -> int16
        //      -> int8), clamp, and the full 4x16 store.
        //   5-9: partial store used when fewer than 16 columns remain.
        //
        // Accumulator layout (per row r = 0..3): v(16+r) = columns 0-3,
        // v(20+r) = columns 4-7, v(24+r) = columns 8-11, v(28+r) = columns 12-15.
        //
        // x20 is the only callee-saved register used; it is spilled below and
        // reloaded before each RET. Both stack-argument loads are issued before
        // sp is moved, so their [sp]-relative offsets match the header comment.

43        # Clamp C pointers
44        CMP     x0, 2                   // if mr < 2
45        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
46        ADD     x16, x6, x7             // c1 = c0 + cm_stride
47        CSEL    x16, x6,  x16, LO       //   c1 = c0
48
49        ADD     x17, x16, x7            // c2 = c1 + cm_stride
50        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
51                                        // if mr <= 2 (flags still from CMP x0, 2; ADD/LDP leave NZCV alone)
52        CSEL    x17, x16, x17, LS       //   c2 = c1
53
54        CMP     x0, 4                   // if mr < 4
55        STR     x20, [sp, -16]!         // Save x20 on stack (pre-decrement sp by 16)
56        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
57        CSEL    x7,  x17, x7, LO        //   c3 = c2
58
59
60        .p2align 3
610:
62        # Load initial bias from w into accumulators
        // Row 0 receives the 16 int32 values directly; rows 1-3 are copies, so
        // every row starts from the same per-column values.
63        LDP     q16, q20, [x5], 32
64        MOV     v17.16b, v16.16b
65        MOV     v18.16b, v16.16b
66        LDP     q24, q28, [x5], 32
67        MOV     v19.16b, v16.16b
68        MOV     v21.16b, v20.16b
69        MOV     v22.16b, v20.16b
70        MOV     v23.16b, v20.16b
71        MOV     v25.16b, v24.16b
72        MOV     v26.16b, v24.16b
73        MOV     v27.16b, v24.16b
74        MOV     v29.16b, v28.16b
75        MOV     v30.16b, v28.16b
76        MOV     v31.16b, v28.16b
77        MOV     x9, x3                  // p = ks
78
79        .p2align 3
801:
81        # Load next 4 A pointers
82        LDP     x13, x14, [x4], 16
83        LDP     x15, x20, [x4], 16
84
        // A pointer equal to `zero` (x12) is used as-is; any other pointer is
        // advanced by a_offset (x8). The ADD runs unconditionally and CSEL
        // discards its result when the pointer matched `zero`.
85        CMP     x13, x12                // if a0 == zero
86        ADD     x13, x13, x8            // a0 += a_offset
87        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
88        CMP     x14, x12                // if a1 == zero
89        ADD     x14, x14, x8            // a1 += a_offset
90        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
91        CMP     x15, x12                // if a2 == zero
92        ADD     x15, x15, x8            // a2 += a_offset
93        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
94        CMP     x20, x12                // if a3 == zero
95        ADD     x20, x20, x8            // a3 += a_offset
96        CSEL    x20, x12, x20, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
97
98        # Is there at least 8 bytes for main loop?
        // x0 (mr) is no longer needed and is reused as the k counter.
        // NOTE(review): with kc == 0 this branch is taken and the remainder
        // path still accumulates element 0 - presumably callers guarantee
        // kc >= 1; confirm.
99        SUBS    x0, x2, 8               // k = kc - 8
100        B.LO    4f
101
102        # Main loop - 8 bytes of A
        // Each pass loads 8 int8 from every A row (v0-v3) and, for each of the
        // 8 k positions, 16 int8 weights (d4 = columns 0-7, d5 = columns 8-15).
        // SXTL widens int8 to int16; SMLAL/SMLAL2 multiply the low/high 4
        // weight lanes by one A lane (h[k]) and accumulate into int32.
103        .p2align 3
1042:
105        LD1     {v0.8b}, [x13], 8
106        LDP     d4, d5, [x5], 16         // weights for k = 0
107        LD1     {v1.8b}, [x14], 8
108        LD1     {v2.8b}, [x15], 8
109        LD1     {v3.8b}, [x20], 8
110        SXTL    v0.8h, v0.8b
111        SXTL    v4.8h, v4.8b
112        SXTL    v5.8h, v5.8b
113        SXTL    v1.8h, v1.8b
114        SXTL    v2.8h, v2.8b
115        SXTL    v3.8h, v3.8b
116        SMLAL   v16.4s, v4.4h, v0.h[0]
117        SMLAL2  v20.4s, v4.8h, v0.h[0]
118        SMLAL   v24.4s, v5.4h, v0.h[0]
119        SMLAL2  v28.4s, v5.8h, v0.h[0]
120        SMLAL   v17.4s, v4.4h, v1.h[0]
121        SMLAL2  v21.4s, v4.8h, v1.h[0]
122        SMLAL   v25.4s, v5.4h, v1.h[0]
123        SMLAL2  v29.4s, v5.8h, v1.h[0]
124        SMLAL   v18.4s, v4.4h, v2.h[0]
125        SMLAL2  v22.4s, v4.8h, v2.h[0]
126        SMLAL   v26.4s, v5.4h, v2.h[0]
127        SMLAL2  v30.4s, v5.8h, v2.h[0]
128        SMLAL   v19.4s, v4.4h, v3.h[0]
129        SMLAL2  v23.4s, v4.8h, v3.h[0]
130        SMLAL   v27.4s, v5.4h, v3.h[0]
131        SMLAL2  v31.4s, v5.8h, v3.h[0]
132
133        LDP     d4, d5, [x5], 16         // weights for k = 1
134        SXTL    v4.8h, v4.8b
135        SXTL    v5.8h, v5.8b
136        SMLAL   v16.4s, v4.4h, v0.h[1]
137        SMLAL2  v20.4s, v4.8h, v0.h[1]
138        SMLAL   v24.4s, v5.4h, v0.h[1]
139        SMLAL2  v28.4s, v5.8h, v0.h[1]
140        SMLAL   v17.4s, v4.4h, v1.h[1]
141        SMLAL2  v21.4s, v4.8h, v1.h[1]
142        SMLAL   v25.4s, v5.4h, v1.h[1]
143        SMLAL2  v29.4s, v5.8h, v1.h[1]
144        SMLAL   v18.4s, v4.4h, v2.h[1]
145        SMLAL2  v22.4s, v4.8h, v2.h[1]
146        SMLAL   v26.4s, v5.4h, v2.h[1]
147        SMLAL2  v30.4s, v5.8h, v2.h[1]
148        SMLAL   v19.4s, v4.4h, v3.h[1]
149        SMLAL2  v23.4s, v4.8h, v3.h[1]
150        SMLAL   v27.4s, v5.4h, v3.h[1]
151        SMLAL2  v31.4s, v5.8h, v3.h[1]
152
153        LDP     d4, d5, [x5], 16         // weights for k = 2
154        SXTL    v4.8h, v4.8b
155        SXTL    v5.8h, v5.8b
156        SMLAL   v16.4s, v4.4h, v0.h[2]
157        SMLAL2  v20.4s, v4.8h, v0.h[2]
158        SMLAL   v24.4s, v5.4h, v0.h[2]
159        SMLAL2  v28.4s, v5.8h, v0.h[2]
160        SMLAL   v17.4s, v4.4h, v1.h[2]
161        SMLAL2  v21.4s, v4.8h, v1.h[2]
162        SMLAL   v25.4s, v5.4h, v1.h[2]
163        SMLAL2  v29.4s, v5.8h, v1.h[2]
164        SMLAL   v18.4s, v4.4h, v2.h[2]
165        SMLAL2  v22.4s, v4.8h, v2.h[2]
166        SMLAL   v26.4s, v5.4h, v2.h[2]
167        SMLAL2  v30.4s, v5.8h, v2.h[2]
168        SMLAL   v19.4s, v4.4h, v3.h[2]
169        SMLAL2  v23.4s, v4.8h, v3.h[2]
170        SMLAL   v27.4s, v5.4h, v3.h[2]
171        SMLAL2  v31.4s, v5.8h, v3.h[2]
172
173        LDP     d4, d5, [x5], 16         // weights for k = 3
174        SXTL    v4.8h, v4.8b
175        SXTL    v5.8h, v5.8b
176        SMLAL   v16.4s, v4.4h, v0.h[3]
177        SMLAL2  v20.4s, v4.8h, v0.h[3]
178        SMLAL   v24.4s, v5.4h, v0.h[3]
179        SMLAL2  v28.4s, v5.8h, v0.h[3]
180        SMLAL   v17.4s, v4.4h, v1.h[3]
181        SMLAL2  v21.4s, v4.8h, v1.h[3]
182        SMLAL   v25.4s, v5.4h, v1.h[3]
183        SMLAL2  v29.4s, v5.8h, v1.h[3]
184        SMLAL   v18.4s, v4.4h, v2.h[3]
185        SMLAL2  v22.4s, v4.8h, v2.h[3]
186        SMLAL   v26.4s, v5.4h, v2.h[3]
187        SMLAL2  v30.4s, v5.8h, v2.h[3]
188        SMLAL   v19.4s, v4.4h, v3.h[3]
189        SMLAL2  v23.4s, v4.8h, v3.h[3]
190        SMLAL   v27.4s, v5.4h, v3.h[3]
191        SMLAL2  v31.4s, v5.8h, v3.h[3]
192
193        LDP     d4, d5, [x5], 16         // weights for k = 4
194        SXTL    v4.8h, v4.8b
195        SXTL    v5.8h, v5.8b
196        SMLAL   v16.4s, v4.4h, v0.h[4]
197        SMLAL2  v20.4s, v4.8h, v0.h[4]
198        SMLAL   v24.4s, v5.4h, v0.h[4]
199        SMLAL2  v28.4s, v5.8h, v0.h[4]
200        SMLAL   v17.4s, v4.4h, v1.h[4]
201        SMLAL2  v21.4s, v4.8h, v1.h[4]
202        SMLAL   v25.4s, v5.4h, v1.h[4]
203        SMLAL2  v29.4s, v5.8h, v1.h[4]
204        SMLAL   v18.4s, v4.4h, v2.h[4]
205        SMLAL2  v22.4s, v4.8h, v2.h[4]
206        SMLAL   v26.4s, v5.4h, v2.h[4]
207        SMLAL2  v30.4s, v5.8h, v2.h[4]
208        SMLAL   v19.4s, v4.4h, v3.h[4]
209        SMLAL2  v23.4s, v4.8h, v3.h[4]
210        SMLAL   v27.4s, v5.4h, v3.h[4]
211        SMLAL2  v31.4s, v5.8h, v3.h[4]
212
213        LDP     d4, d5, [x5], 16         // weights for k = 5
214        SXTL    v4.8h, v4.8b
215        SXTL    v5.8h, v5.8b
216        SMLAL   v16.4s, v4.4h, v0.h[5]
217        SMLAL2  v20.4s, v4.8h, v0.h[5]
218        SMLAL   v24.4s, v5.4h, v0.h[5]
219        SMLAL2  v28.4s, v5.8h, v0.h[5]
220        SMLAL   v17.4s, v4.4h, v1.h[5]
221        SMLAL2  v21.4s, v4.8h, v1.h[5]
222        SMLAL   v25.4s, v5.4h, v1.h[5]
223        SMLAL2  v29.4s, v5.8h, v1.h[5]
224        SMLAL   v18.4s, v4.4h, v2.h[5]
225        SMLAL2  v22.4s, v4.8h, v2.h[5]
226        SMLAL   v26.4s, v5.4h, v2.h[5]
227        SMLAL2  v30.4s, v5.8h, v2.h[5]
228        SMLAL   v19.4s, v4.4h, v3.h[5]
229        SMLAL2  v23.4s, v4.8h, v3.h[5]
230        SMLAL   v27.4s, v5.4h, v3.h[5]
231        SMLAL2  v31.4s, v5.8h, v3.h[5]
232
233        LDP     d4, d5, [x5], 16         // weights for k = 6
234        SXTL    v4.8h, v4.8b
235        SXTL    v5.8h, v5.8b
236        SMLAL   v16.4s, v4.4h, v0.h[6]
237        SMLAL2  v20.4s, v4.8h, v0.h[6]
238        SMLAL   v24.4s, v5.4h, v0.h[6]
239        SMLAL2  v28.4s, v5.8h, v0.h[6]
240        SMLAL   v17.4s, v4.4h, v1.h[6]
241        SMLAL2  v21.4s, v4.8h, v1.h[6]
242        SMLAL   v25.4s, v5.4h, v1.h[6]
243        SMLAL2  v29.4s, v5.8h, v1.h[6]
244        SMLAL   v18.4s, v4.4h, v2.h[6]
245        SMLAL2  v22.4s, v4.8h, v2.h[6]
246        SMLAL   v26.4s, v5.4h, v2.h[6]
247        SMLAL2  v30.4s, v5.8h, v2.h[6]
248        SMLAL   v19.4s, v4.4h, v3.h[6]
249        SMLAL2  v23.4s, v4.8h, v3.h[6]
250        SMLAL   v27.4s, v5.4h, v3.h[6]
251        SMLAL2  v31.4s, v5.8h, v3.h[6]
252
253        LDP     d4, d5, [x5], 16         // weights for k = 7
254        SXTL    v4.8h, v4.8b
255        SXTL    v5.8h, v5.8b
256        SMLAL   v16.4s, v4.4h, v0.h[7]
257        SMLAL2  v20.4s, v4.8h, v0.h[7]
258        SMLAL   v24.4s, v5.4h, v0.h[7]
259        SMLAL2  v28.4s, v5.8h, v0.h[7]
260        SMLAL   v17.4s, v4.4h, v1.h[7]
261        SMLAL2  v21.4s, v4.8h, v1.h[7]
262        SMLAL   v25.4s, v5.4h, v1.h[7]
263        SMLAL2  v29.4s, v5.8h, v1.h[7]
264        SMLAL   v18.4s, v4.4h, v2.h[7]
265        SMLAL2  v22.4s, v4.8h, v2.h[7]
266        SMLAL   v26.4s, v5.4h, v2.h[7]
267        SMLAL2  v30.4s, v5.8h, v2.h[7]
268        SMLAL   v19.4s, v4.4h, v3.h[7]
269        SMLAL2  v23.4s, v4.8h, v3.h[7]
270        SMLAL   v27.4s, v5.4h, v3.h[7]
271        SMLAL2  v31.4s, v5.8h, v3.h[7]
272
273        SUBS    x0, x0, 8               // k -= 8
274        B.HS    2b                      // loop while at least 8 more bytes of A remain
275
276        AND     x0, x2, 7               // kc remainder 0 to 7
277        # Is there a remainder?- 1 to 7 bytes of A
278        CBNZ    x0, 4f
279
2803:
281        # ks loop
        // 32 = 4 pointers x 8 bytes, matching the 32 bytes x4 advances at label 1.
        // NOTE(review): B.HI only exits when x9 reaches exactly 0, so this
        // relies on ks being a non-zero multiple of 32 - confirm with callers.
282        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
283        B.HI    1b
284
        // Requantization: int32 accumulators -> fp32, multiplied by per-column
        // scales read from w (x5) right after the weights, then converted back
        // to int32 with round-to-nearest (FCVTNS).
285        SCVTF   v16.4s, v16.4s
286        SCVTF   v17.4s, v17.4s
287        # Load per channel scale values from weights
288        LDR     q4, [x5], 16            // scales for columns 0-3
289        SCVTF   v18.4s, v18.4s
290        SCVTF   v19.4s, v19.4s
291        LDR     q5, [x5], 16            // scales for columns 4-7
292        SCVTF   v20.4s, v20.4s
293        SCVTF   v21.4s, v21.4s
294        SCVTF   v22.4s, v22.4s
295        SCVTF   v23.4s, v23.4s
296        SCVTF   v24.4s, v24.4s
297        SCVTF   v25.4s, v25.4s
298        SCVTF   v26.4s, v26.4s
299        SCVTF   v27.4s, v27.4s
300        SCVTF   v28.4s, v28.4s
301        SCVTF   v29.4s, v29.4s
302        SCVTF   v30.4s, v30.4s
303        SCVTF   v31.4s, v31.4s
304
305        LDR     q6, [x5], 16            // scales for columns 8-11
306        FMUL    v16.4s, v16.4s, v4.4s
307        FMUL    v17.4s, v17.4s, v4.4s
308        FMUL    v18.4s, v18.4s, v4.4s
309        FMUL    v19.4s, v19.4s, v4.4s
310        FMUL    v20.4s, v20.4s, v5.4s
311        LDR     q4, [x5], 16            // scales for columns 12-15 (v4 is free again here)
312        FMUL    v21.4s, v21.4s, v5.4s
313        FMUL    v22.4s, v22.4s, v5.4s
314        FMUL    v23.4s, v23.4s, v5.4s
315        FMUL    v24.4s, v24.4s, v6.4s
316        FMUL    v25.4s, v25.4s, v6.4s
317        FMUL    v26.4s, v26.4s, v6.4s
318        FMUL    v27.4s, v27.4s, v6.4s
319        FMUL    v28.4s, v28.4s, v4.4s
320        FMUL    v29.4s, v29.4s, v4.4s
321        FMUL    v30.4s, v30.4s, v4.4s
322        FMUL    v31.4s, v31.4s, v4.4s
323
324        FCVTNS  v16.4s, v16.4s
325        FCVTNS  v17.4s, v17.4s
326        FCVTNS  v18.4s, v18.4s
327        FCVTNS  v19.4s, v19.4s
328        FCVTNS  v20.4s, v20.4s
329        FCVTNS  v21.4s, v21.4s
330        FCVTNS  v22.4s, v22.4s
331        FCVTNS  v23.4s, v23.4s
332        FCVTNS  v24.4s, v24.4s
333        FCVTNS  v25.4s, v25.4s
334        FCVTNS  v26.4s, v26.4s
335        FCVTNS  v27.4s, v27.4s
336        FCVTNS  v28.4s, v28.4s
337        FCVTNS  v29.4s, v29.4s
338        FCVTNS  v30.4s, v30.4s
339        FCVTNS  v31.4s, v31.4s
340
        // Saturating narrow int32 -> int16. After SQXTN/SQXTN2, v16-v19 hold
        // columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        // params (x11) is read as: bytes 0-1 = 16-bit addend, byte 2 = clamp
        // min, byte 3 = clamp max; x11 is rewound by 3 afterwards so the next
        // column tile reads the same values.
        // NOTE(review): the 16-bit addend is presumably the output zero
        // point - confirm against the params struct definition.
341        SQXTN   v16.4h, v16.4s
342        SQXTN   v17.4h, v17.4s
343        SQXTN   v18.4h, v18.4s
344        SQXTN   v19.4h, v19.4s
345        SQXTN   v24.4h, v24.4s
346        SQXTN   v25.4h, v25.4s
347        SQXTN   v26.4h, v26.4s
348        SQXTN   v27.4h, v27.4s
349        LD1R    {v6.8h}, [x11], 2        // add bias
350
351        SQXTN2  v16.8h, v20.4s
352        SQXTN2  v17.8h, v21.4s
353        SQXTN2  v18.8h, v22.4s
354        SQXTN2  v19.8h, v23.4s
355        SQXTN2  v24.8h, v28.4s
356        SQXTN2  v25.8h, v29.4s
357        SQXTN2  v26.8h, v30.4s
358        SQXTN2  v27.8h, v31.4s
359
360        SQADD   v16.8h, v16.8h, v6.8h
361        SQADD   v17.8h, v17.8h, v6.8h
362        SQADD   v18.8h, v18.8h, v6.8h
363        SQADD   v19.8h, v19.8h, v6.8h
364        SQADD   v24.8h, v24.8h, v6.8h
365        SQADD   v25.8h, v25.8h, v6.8h
366        SQADD   v26.8h, v26.8h, v6.8h
367        SQADD   v27.8h, v27.8h, v6.8h
368        LD1R    {v4.16b}, [x11], 1       // clamp min value
369
        // Saturating narrow int16 -> int8: v0-v3 = the 16 output bytes of rows 0-3.
370        SQXTN   v0.8b, v16.8h
371        SQXTN   v1.8b, v17.8h
372        SQXTN   v2.8b, v18.8h
373        SQXTN   v3.8b, v19.8h
374        LD1R    {v5.16b}, [x11]          // clamp max value
375        SQXTN2  v0.16b, v24.8h
376        SQXTN2  v1.16b, v25.8h
377        SQXTN2  v2.16b, v26.8h
378        SQXTN2  v3.16b, v27.8h
379        SUB     x11, x11, 3             // rewind params pointer
380
381        SMAX    v0.16b, v0.16b, v4.16b
382        SMAX    v1.16b, v1.16b, v4.16b
383        SMAX    v2.16b, v2.16b, v4.16b
384        SMAX    v3.16b, v3.16b, v4.16b
385        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.HI below
386        SMIN    v0.16b, v0.16b, v5.16b
387        SMIN    v1.16b, v1.16b, v5.16b
388        SMIN    v2.16b, v2.16b, v5.16b
389        SMIN    v3.16b, v3.16b, v5.16b
390        B.LO    5f                      // fewer than 16 columns left: partial store
391
392        # Store full 4 x 16
        // Rows are written highest-first: when mr < 4 the clamped c pointers
        // alias a lower row, so the lower row's data is written last and wins.
        // Each pointer then advances by cn_stride (x10).
393        ST1     {v3.16b},  [x7], x10
394        ST1     {v2.16b}, [x17], x10
395        ST1     {v1.16b}, [x16], x10
396        ST1     {v0.16b},  [x6], x10
397
398        SUB     x4, x4, x3              // a -= ks
399
400        # nc loop
        // Flags still come from SUBS x1, x1, 16: SMIN, ST1 and the plain SUB
        // above do not write NZCV.
401        B.HI    0b
402
403        # Restore x20 from stack
404        LDR     x20, [sp], 16
405        RET
406
407        # Remainder- 1 to 7 bytes of A
        // Each LD1 reads a full 8 bytes but only advances the row pointer by
        // the remainder count (x0); lanes past the remainder are never used
        // by the multiply ladder below.
        // NOTE(review): this reads up to 7 bytes past the last valid A byte -
        // presumably the buffers are padded to allow it; confirm.
        // Weights are consumed 16 bytes per k position actually processed.
408        .p2align 3
4094:
410        AND     x0, x2, 7               // kc remainder 1 to 7
411
412        LD1     {v0.8b}, [x13], x0
413        LDP     d4, d5, [x5], 16
414        LD1     {v1.8b}, [x14], x0
415        LD1     {v2.8b}, [x15], x0
416        LD1     {v3.8b}, [x20], x0
417        SXTL    v0.8h, v0.8b
418        SXTL    v4.8h, v4.8b
419        SXTL    v5.8h, v5.8b
420        SXTL    v1.8h, v1.8b
421        SXTL    v2.8h, v2.8b
422        SXTL    v3.8h, v3.8b
423        SMLAL   v16.4s, v4.4h, v0.h[0]
424        SMLAL2  v20.4s, v4.8h, v0.h[0]
425        SMLAL   v24.4s, v5.4h, v0.h[0]
426        SMLAL2  v28.4s, v5.8h, v0.h[0]
427        SMLAL   v17.4s, v4.4h, v1.h[0]
428        SMLAL2  v21.4s, v4.8h, v1.h[0]
429        SMLAL   v25.4s, v5.4h, v1.h[0]
430        SMLAL2  v29.4s, v5.8h, v1.h[0]
431        SMLAL   v18.4s, v4.4h, v2.h[0]
432        SMLAL2  v22.4s, v4.8h, v2.h[0]
433        SMLAL   v26.4s, v5.4h, v2.h[0]
434        SMLAL2  v30.4s, v5.8h, v2.h[0]
435        SMLAL   v19.4s, v4.4h, v3.h[0]
436        SMLAL2  v23.4s, v4.8h, v3.h[0]
437        SMLAL   v27.4s, v5.4h, v3.h[0]
438        SMLAL2  v31.4s, v5.8h, v3.h[0]
439        CMP     x0, 2                   // one compare serves two exits: LO here, EQ after k = 1
440        B.LO    3b                      // remainder == 1: done
441
442        LDP     d4, d5, [x5], 16
443        SXTL    v4.8h, v4.8b
444        SXTL    v5.8h, v5.8b
445        SMLAL   v16.4s, v4.4h, v0.h[1]
446        SMLAL2  v20.4s, v4.8h, v0.h[1]
447        SMLAL   v24.4s, v5.4h, v0.h[1]
448        SMLAL2  v28.4s, v5.8h, v0.h[1]
449        SMLAL   v17.4s, v4.4h, v1.h[1]
450        SMLAL2  v21.4s, v4.8h, v1.h[1]
451        SMLAL   v25.4s, v5.4h, v1.h[1]
452        SMLAL2  v29.4s, v5.8h, v1.h[1]
453        SMLAL   v18.4s, v4.4h, v2.h[1]
454        SMLAL2  v22.4s, v4.8h, v2.h[1]
455        SMLAL   v26.4s, v5.4h, v2.h[1]
456        SMLAL2  v30.4s, v5.8h, v2.h[1]
457        SMLAL   v19.4s, v4.4h, v3.h[1]
458        SMLAL2  v23.4s, v4.8h, v3.h[1]
459        SMLAL   v27.4s, v5.4h, v3.h[1]
460        SMLAL2  v31.4s, v5.8h, v3.h[1]
461        B.EQ    3b                      // remainder == 2: done (flags from CMP x0, 2)
462
463        LDP     d4, d5, [x5], 16
464        SXTL    v4.8h, v4.8b
465        SXTL    v5.8h, v5.8b
466        SMLAL   v16.4s, v4.4h, v0.h[2]
467        SMLAL2  v20.4s, v4.8h, v0.h[2]
468        SMLAL   v24.4s, v5.4h, v0.h[2]
469        SMLAL2  v28.4s, v5.8h, v0.h[2]
470        SMLAL   v17.4s, v4.4h, v1.h[2]
471        SMLAL2  v21.4s, v4.8h, v1.h[2]
472        SMLAL   v25.4s, v5.4h, v1.h[2]
473        SMLAL2  v29.4s, v5.8h, v1.h[2]
474        SMLAL   v18.4s, v4.4h, v2.h[2]
475        SMLAL2  v22.4s, v4.8h, v2.h[2]
476        SMLAL   v26.4s, v5.4h, v2.h[2]
477        SMLAL2  v30.4s, v5.8h, v2.h[2]
478        SMLAL   v19.4s, v4.4h, v3.h[2]
479        SMLAL2  v23.4s, v4.8h, v3.h[2]
480        SMLAL   v27.4s, v5.4h, v3.h[2]
481        SMLAL2  v31.4s, v5.8h, v3.h[2]
482        CMP     x0, 4
483        B.LO    3b                      // remainder == 3: done
484
485        LDP     d4, d5, [x5], 16
486        SXTL    v4.8h, v4.8b
487        SXTL    v5.8h, v5.8b
488        SMLAL   v16.4s, v4.4h, v0.h[3]
489        SMLAL2  v20.4s, v4.8h, v0.h[3]
490        SMLAL   v24.4s, v5.4h, v0.h[3]
491        SMLAL2  v28.4s, v5.8h, v0.h[3]
492        SMLAL   v17.4s, v4.4h, v1.h[3]
493        SMLAL2  v21.4s, v4.8h, v1.h[3]
494        SMLAL   v25.4s, v5.4h, v1.h[3]
495        SMLAL2  v29.4s, v5.8h, v1.h[3]
496        SMLAL   v18.4s, v4.4h, v2.h[3]
497        SMLAL2  v22.4s, v4.8h, v2.h[3]
498        SMLAL   v26.4s, v5.4h, v2.h[3]
499        SMLAL2  v30.4s, v5.8h, v2.h[3]
500        SMLAL   v19.4s, v4.4h, v3.h[3]
501        SMLAL2  v23.4s, v4.8h, v3.h[3]
502        SMLAL   v27.4s, v5.4h, v3.h[3]
503        SMLAL2  v31.4s, v5.8h, v3.h[3]
504        B.EQ    3b                      // remainder == 4: done (flags from CMP x0, 4)
505
506        LDP     d4, d5, [x5], 16
507        SXTL    v4.8h, v4.8b
508        SXTL    v5.8h, v5.8b
509        SMLAL   v16.4s, v4.4h, v0.h[4]
510        SMLAL2  v20.4s, v4.8h, v0.h[4]
511        SMLAL   v24.4s, v5.4h, v0.h[4]
512        SMLAL2  v28.4s, v5.8h, v0.h[4]
513        SMLAL   v17.4s, v4.4h, v1.h[4]
514        SMLAL2  v21.4s, v4.8h, v1.h[4]
515        SMLAL   v25.4s, v5.4h, v1.h[4]
516        SMLAL2  v29.4s, v5.8h, v1.h[4]
517        SMLAL   v18.4s, v4.4h, v2.h[4]
518        SMLAL2  v22.4s, v4.8h, v2.h[4]
519        SMLAL   v26.4s, v5.4h, v2.h[4]
520        SMLAL2  v30.4s, v5.8h, v2.h[4]
521        SMLAL   v19.4s, v4.4h, v3.h[4]
522        SMLAL2  v23.4s, v4.8h, v3.h[4]
523        SMLAL   v27.4s, v5.4h, v3.h[4]
524        SMLAL2  v31.4s, v5.8h, v3.h[4]
525        CMP     x0, 6
526        B.LO    3b                      // remainder == 5: done
527
528        LDP     d4, d5, [x5], 16
529        SXTL    v4.8h, v4.8b
530        SXTL    v5.8h, v5.8b
531        SMLAL   v16.4s, v4.4h, v0.h[5]
532        SMLAL2  v20.4s, v4.8h, v0.h[5]
533        SMLAL   v24.4s, v5.4h, v0.h[5]
534        SMLAL2  v28.4s, v5.8h, v0.h[5]
535        SMLAL   v17.4s, v4.4h, v1.h[5]
536        SMLAL2  v21.4s, v4.8h, v1.h[5]
537        SMLAL   v25.4s, v5.4h, v1.h[5]
538        SMLAL2  v29.4s, v5.8h, v1.h[5]
539        SMLAL   v18.4s, v4.4h, v2.h[5]
540        SMLAL2  v22.4s, v4.8h, v2.h[5]
541        SMLAL   v26.4s, v5.4h, v2.h[5]
542        SMLAL2  v30.4s, v5.8h, v2.h[5]
543        SMLAL   v19.4s, v4.4h, v3.h[5]
544        SMLAL2  v23.4s, v4.8h, v3.h[5]
545        SMLAL   v27.4s, v5.4h, v3.h[5]
546        SMLAL2  v31.4s, v5.8h, v3.h[5]
547        B.EQ    3b                      // remainder == 6: done (flags from CMP x0, 6)
548
549        LDP     d4, d5, [x5], 16
550        SXTL    v4.8h, v4.8b
551        SXTL    v5.8h, v5.8b
552        SMLAL   v16.4s, v4.4h, v0.h[6]
553        SMLAL2  v20.4s, v4.8h, v0.h[6]
554        SMLAL   v24.4s, v5.4h, v0.h[6]
555        SMLAL2  v28.4s, v5.8h, v0.h[6]
556        SMLAL   v17.4s, v4.4h, v1.h[6]
557        SMLAL2  v21.4s, v4.8h, v1.h[6]
558        SMLAL   v25.4s, v5.4h, v1.h[6]
559        SMLAL2  v29.4s, v5.8h, v1.h[6]
560        SMLAL   v18.4s, v4.4h, v2.h[6]
561        SMLAL2  v22.4s, v4.8h, v2.h[6]
562        SMLAL   v26.4s, v5.4h, v2.h[6]
563        SMLAL2  v30.4s, v5.8h, v2.h[6]
564        SMLAL   v19.4s, v4.4h, v3.h[6]
565        SMLAL2  v23.4s, v4.8h, v3.h[6]
566        SMLAL   v27.4s, v5.4h, v3.h[6]
567        SMLAL2  v31.4s, v5.8h, v3.h[6]
568        B       3b                      // remainder == 7: done
569
570        # Store odd width
        // Reached with x1 = nc - 16 (from SUBS above). Subtracting 16 leaves
        // the low four bits unchanged, so bits 3..0 of x1 still encode the
        // remaining column count (1-15). Each set bit stores 8/4/2/1 bytes
        // per row, then DUP moves the next unstored bytes down to lane 0.
        // This path always returns: it handles the final, partial tile.
571        .p2align 3
5725:
573        TBZ     x1, 3, 6f               // no 8-byte piece: skip
574        STR     d3, [x7], 8
575        STR     d2, [x17], 8
576        DUP     d3, v3.d[1]
577        DUP     d2, v2.d[1]
578        STR     d1, [x16], 8
579        STR     d0, [x6], 8
580        DUP     d1, v1.d[1]
581        DUP     d0, v0.d[1]
5826:
583        TBZ     x1, 2, 7f               // no 4-byte piece: skip
584        STR     s3, [x7], 4
585        STR     s2, [x17], 4
586        DUP     s3, v3.s[1]
587        DUP     s2, v2.s[1]
588        STR     s1, [x16], 4
589        STR     s0, [x6], 4
590        DUP     s1, v1.s[1]
591        DUP     s0, v0.s[1]
5927:
593        TBZ     x1, 1, 8f               // no 2-byte piece: skip
594        STR     h3, [x7], 2
595        STR     h2, [x17], 2
596        DUP     h3, v3.h[1]
597        DUP     h2, v2.h[1]
598        STR     h1, [x16], 2
599        STR     h0, [x6], 2
600        DUP     h1, v1.h[1]
601        DUP     h0, v0.h[1]
6028:
603        TBZ     x1, 0, 9f               // no final single byte: skip
604        STR     b3, [x7]
605        STR     b2, [x17]
606        STR     b1, [x16]
607        STR     b0, [x6]
6089:
609        # Restore x20 from stack
610        LDR     x20, [sp], 16
611        RET
612
613END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
614
615#ifdef __ELF__
616.section ".note.GNU-stack","",%progbits
617#endif
618