xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t** restrict a, x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x20  v3
34# B    x5  v4  v5
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
# params  v4 (scale, later clamp min)  v5 (clamp max)  v6 (bias)
39# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
40
41BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
        // Overview of what the code below does:
        //   - Keeps a 4 row x 16 column tile of int32 accumulators in v16-v31.
        //   - For every block of 16 output columns (label 0) the accumulators are
        //     seeded from 16 int32 values read from w, then for every group of
        //     4 A pointers (label 1) kc bytes of each row are multiplied with
        //     16 bytes of w per k step and accumulated (labels 2 and 4).
        //   - The int32 sums are converted to float, multiplied by a scale,
        //     rounded back to int32, narrowed with saturation to int16, offset by
        //     a 16-bit bias, narrowed with saturation to int8, clamped to
        //     [min, max] and stored (16 bytes per row, or 1-15 bytes at label 5).
        // Local labels: 0 = nc loop, 1 = ks loop, 2 = main k loop (8 bytes),
        //   3 = end of ks iteration / requantize, 4 = k remainder (1-7 bytes),
        //   5-9 = partial-width store.
        // x20 is the only callee-saved register written; it is spilled below and
        // reloaded before each RET.
42
43        # Clamp C pointers
        // Rows with index >= mr reuse the pointer of the previous row, so all four
        // row pointers (x6, x16, x17, x7) are always valid to store through.
        // The stack arguments are loaded before sp is moved by the x20 spill.
44        CMP     x0, 2                   // if mr < 2
45        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
46        ADD     x16, x6, x7             // c1 = c0 + cm_stride
47        CSEL    x16, x6,  x16, LO       //   c1 = c0
48
49        ADD     x17, x16, x7            // c2 = c1 + cm_stride
50        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
51                                        // if mr <= 2
        // Flags are still those of CMP x0, 2 (ADD and LDP do not set flags).
52        CSEL    x17, x16, x17, LS       //   c2 = c1
53
        // After this CMP, x0 is no longer read as mr; it is reused as the k
        // counter inside the ks loop.
54        CMP     x0, 4                   // if mr < 4
55        STR     x20, [sp, -16]!         // Save x20 on stack
56        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
57        CSEL    x7,  x17, x7, LO        //   c3 = c2
58
59
60        .p2align 3
        // nc loop: one iteration per block of 16 output columns.
610:
62        # Load initial bias from w into accumulators
        // 64 bytes (16 x int32) are read from w into row 0 (v16, v20, v24, v28)
        // and copied to rows 1-3 so every row starts from the same values.
63        LDP     q16, q20, [x5], 32
64        MOV     v17.16b, v16.16b
65        MOV     v18.16b, v16.16b
66        LDP     q24, q28, [x5], 32
67        MOV     v19.16b, v16.16b
68        MOV     v21.16b, v20.16b
69        MOV     v22.16b, v20.16b
70        MOV     v23.16b, v20.16b
71        MOV     v25.16b, v24.16b
72        MOV     v26.16b, v24.16b
73        MOV     v27.16b, v24.16b
74        MOV     v29.16b, v28.16b
75        MOV     v30.16b, v28.16b
76        MOV     v31.16b, v28.16b
77        MOV     x9, x3                  // p = ks
78
79        .p2align 3
        // ks loop: each iteration consumes 4 pointers (32 bytes) from a.
801:
81        # Load next 4 A pointers
82        LDP     x13, x14, [x4], 16
83        LDP     x15, x20, [x4], 16
84
        // A pointer equal to the zero pointer (x12) is used unchanged; any other
        // pointer gets a_offset (x8) added. The ADD does not set flags, so the
        // CSEL still sees the result of the CMP on the original pointer.
85        CMP     x13, x12                // if a0 == zero
86        ADD     x13, x13, x8            // a0 += a_offset
87        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
88        CMP     x14, x12                // if a1 == zero
89        ADD     x14, x14, x8            // a1 += a_offset
90        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
91        CMP     x15, x12                // if a2 == zero
92        ADD     x15, x15, x8            // a2 += a_offset
93        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
94        CMP     x20, x12                // if a3 == zero
95        ADD     x20, x20, x8            // a3 += a_offset
96        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
97
98        # Is there at least 8 bytes for main loop?
99        SUBS    x0, x2, 8               // k = kc - 8
100        B.LO    4f
101
102        # Main loop - 8 bytes of A
        // Per iteration: 8 bytes are loaded from each of the 4 A rows and
        // sign-extended to 16 bit (v0-v3). For each of the 8 k lanes, 16 bytes
        // of w are loaded into d4/d5, sign-extended to 16 bit, and multiplied by
        // that lane of each row:
        //   SMLAL  with v4.4h -> columns 0-3   (v16-v19)
        //   SMLAL2 with v4.8h -> columns 4-7   (v20-v23)
        //   SMLAL  with v5.4h -> columns 8-11  (v24-v27)
        //   SMLAL2 with v5.8h -> columns 12-15 (v28-v31)
103        .p2align 3
1042:
105        LD1     {v0.8b}, [x13], 8
106        LDP     d4, d5, [x5], 16
107        LD1     {v1.8b}, [x14], 8
108        LD1     {v2.8b}, [x15], 8
109        LD1     {v3.8b}, [x20], 8
110        SXTL    v0.8h, v0.8b
111        SXTL    v4.8h, v4.8b
112        SXTL    v5.8h, v5.8b
113        SXTL    v1.8h, v1.8b
114        SXTL    v2.8h, v2.8b
115        SXTL    v3.8h, v3.8b
        // k lane 0
116        SMLAL   v16.4s, v4.4h, v0.h[0]
117        SMLAL2  v20.4s, v4.8h, v0.h[0]
118        SMLAL   v24.4s, v5.4h, v0.h[0]
119        SMLAL2  v28.4s, v5.8h, v0.h[0]
120        SMLAL   v17.4s, v4.4h, v1.h[0]
121        SMLAL2  v21.4s, v4.8h, v1.h[0]
122        SMLAL   v25.4s, v5.4h, v1.h[0]
123        SMLAL2  v29.4s, v5.8h, v1.h[0]
124        SMLAL   v18.4s, v4.4h, v2.h[0]
125        SMLAL2  v22.4s, v4.8h, v2.h[0]
126        SMLAL   v26.4s, v5.4h, v2.h[0]
127        SMLAL2  v30.4s, v5.8h, v2.h[0]
128        SMLAL   v19.4s, v4.4h, v3.h[0]
129        SMLAL2  v23.4s, v4.8h, v3.h[0]
130        SMLAL   v27.4s, v5.4h, v3.h[0]
131        SMLAL2  v31.4s, v5.8h, v3.h[0]
132
        // k lane 1
133        LDP     d4, d5, [x5], 16
134        SXTL    v4.8h, v4.8b
135        SXTL    v5.8h, v5.8b
136        SMLAL   v16.4s, v4.4h, v0.h[1]
137        SMLAL2  v20.4s, v4.8h, v0.h[1]
138        SMLAL   v24.4s, v5.4h, v0.h[1]
139        SMLAL2  v28.4s, v5.8h, v0.h[1]
140        SMLAL   v17.4s, v4.4h, v1.h[1]
141        SMLAL2  v21.4s, v4.8h, v1.h[1]
142        SMLAL   v25.4s, v5.4h, v1.h[1]
143        SMLAL2  v29.4s, v5.8h, v1.h[1]
144        SMLAL   v18.4s, v4.4h, v2.h[1]
145        SMLAL2  v22.4s, v4.8h, v2.h[1]
146        SMLAL   v26.4s, v5.4h, v2.h[1]
147        SMLAL2  v30.4s, v5.8h, v2.h[1]
148        SMLAL   v19.4s, v4.4h, v3.h[1]
149        SMLAL2  v23.4s, v4.8h, v3.h[1]
150        SMLAL   v27.4s, v5.4h, v3.h[1]
151        SMLAL2  v31.4s, v5.8h, v3.h[1]
152
        // k lane 2
153        LDP     d4, d5, [x5], 16
154        SXTL    v4.8h, v4.8b
155        SXTL    v5.8h, v5.8b
156        SMLAL   v16.4s, v4.4h, v0.h[2]
157        SMLAL2  v20.4s, v4.8h, v0.h[2]
158        SMLAL   v24.4s, v5.4h, v0.h[2]
159        SMLAL2  v28.4s, v5.8h, v0.h[2]
160        SMLAL   v17.4s, v4.4h, v1.h[2]
161        SMLAL2  v21.4s, v4.8h, v1.h[2]
162        SMLAL   v25.4s, v5.4h, v1.h[2]
163        SMLAL2  v29.4s, v5.8h, v1.h[2]
164        SMLAL   v18.4s, v4.4h, v2.h[2]
165        SMLAL2  v22.4s, v4.8h, v2.h[2]
166        SMLAL   v26.4s, v5.4h, v2.h[2]
167        SMLAL2  v30.4s, v5.8h, v2.h[2]
168        SMLAL   v19.4s, v4.4h, v3.h[2]
169        SMLAL2  v23.4s, v4.8h, v3.h[2]
170        SMLAL   v27.4s, v5.4h, v3.h[2]
171        SMLAL2  v31.4s, v5.8h, v3.h[2]
172
        // k lane 3
173        LDP     d4, d5, [x5], 16
174        SXTL    v4.8h, v4.8b
175        SXTL    v5.8h, v5.8b
176        SMLAL   v16.4s, v4.4h, v0.h[3]
177        SMLAL2  v20.4s, v4.8h, v0.h[3]
178        SMLAL   v24.4s, v5.4h, v0.h[3]
179        SMLAL2  v28.4s, v5.8h, v0.h[3]
180        SMLAL   v17.4s, v4.4h, v1.h[3]
181        SMLAL2  v21.4s, v4.8h, v1.h[3]
182        SMLAL   v25.4s, v5.4h, v1.h[3]
183        SMLAL2  v29.4s, v5.8h, v1.h[3]
184        SMLAL   v18.4s, v4.4h, v2.h[3]
185        SMLAL2  v22.4s, v4.8h, v2.h[3]
186        SMLAL   v26.4s, v5.4h, v2.h[3]
187        SMLAL2  v30.4s, v5.8h, v2.h[3]
188        SMLAL   v19.4s, v4.4h, v3.h[3]
189        SMLAL2  v23.4s, v4.8h, v3.h[3]
190        SMLAL   v27.4s, v5.4h, v3.h[3]
191        SMLAL2  v31.4s, v5.8h, v3.h[3]
192
        // k lane 4
193        LDP     d4, d5, [x5], 16
194        SXTL    v4.8h, v4.8b
195        SXTL    v5.8h, v5.8b
196        SMLAL   v16.4s, v4.4h, v0.h[4]
197        SMLAL2  v20.4s, v4.8h, v0.h[4]
198        SMLAL   v24.4s, v5.4h, v0.h[4]
199        SMLAL2  v28.4s, v5.8h, v0.h[4]
200        SMLAL   v17.4s, v4.4h, v1.h[4]
201        SMLAL2  v21.4s, v4.8h, v1.h[4]
202        SMLAL   v25.4s, v5.4h, v1.h[4]
203        SMLAL2  v29.4s, v5.8h, v1.h[4]
204        SMLAL   v18.4s, v4.4h, v2.h[4]
205        SMLAL2  v22.4s, v4.8h, v2.h[4]
206        SMLAL   v26.4s, v5.4h, v2.h[4]
207        SMLAL2  v30.4s, v5.8h, v2.h[4]
208        SMLAL   v19.4s, v4.4h, v3.h[4]
209        SMLAL2  v23.4s, v4.8h, v3.h[4]
210        SMLAL   v27.4s, v5.4h, v3.h[4]
211        SMLAL2  v31.4s, v5.8h, v3.h[4]
212
        // k lane 5
213        LDP     d4, d5, [x5], 16
214        SXTL    v4.8h, v4.8b
215        SXTL    v5.8h, v5.8b
216        SMLAL   v16.4s, v4.4h, v0.h[5]
217        SMLAL2  v20.4s, v4.8h, v0.h[5]
218        SMLAL   v24.4s, v5.4h, v0.h[5]
219        SMLAL2  v28.4s, v5.8h, v0.h[5]
220        SMLAL   v17.4s, v4.4h, v1.h[5]
221        SMLAL2  v21.4s, v4.8h, v1.h[5]
222        SMLAL   v25.4s, v5.4h, v1.h[5]
223        SMLAL2  v29.4s, v5.8h, v1.h[5]
224        SMLAL   v18.4s, v4.4h, v2.h[5]
225        SMLAL2  v22.4s, v4.8h, v2.h[5]
226        SMLAL   v26.4s, v5.4h, v2.h[5]
227        SMLAL2  v30.4s, v5.8h, v2.h[5]
228        SMLAL   v19.4s, v4.4h, v3.h[5]
229        SMLAL2  v23.4s, v4.8h, v3.h[5]
230        SMLAL   v27.4s, v5.4h, v3.h[5]
231        SMLAL2  v31.4s, v5.8h, v3.h[5]
232
        // k lane 6
233        LDP     d4, d5, [x5], 16
234        SXTL    v4.8h, v4.8b
235        SXTL    v5.8h, v5.8b
236        SMLAL   v16.4s, v4.4h, v0.h[6]
237        SMLAL2  v20.4s, v4.8h, v0.h[6]
238        SMLAL   v24.4s, v5.4h, v0.h[6]
239        SMLAL2  v28.4s, v5.8h, v0.h[6]
240        SMLAL   v17.4s, v4.4h, v1.h[6]
241        SMLAL2  v21.4s, v4.8h, v1.h[6]
242        SMLAL   v25.4s, v5.4h, v1.h[6]
243        SMLAL2  v29.4s, v5.8h, v1.h[6]
244        SMLAL   v18.4s, v4.4h, v2.h[6]
245        SMLAL2  v22.4s, v4.8h, v2.h[6]
246        SMLAL   v26.4s, v5.4h, v2.h[6]
247        SMLAL2  v30.4s, v5.8h, v2.h[6]
248        SMLAL   v19.4s, v4.4h, v3.h[6]
249        SMLAL2  v23.4s, v4.8h, v3.h[6]
250        SMLAL   v27.4s, v5.4h, v3.h[6]
251        SMLAL2  v31.4s, v5.8h, v3.h[6]
252
        // k lane 7
253        LDP     d4, d5, [x5], 16
254        SXTL    v4.8h, v4.8b
255        SXTL    v5.8h, v5.8b
256        SMLAL   v16.4s, v4.4h, v0.h[7]
257        SMLAL2  v20.4s, v4.8h, v0.h[7]
258        SMLAL   v24.4s, v5.4h, v0.h[7]
259        SMLAL2  v28.4s, v5.8h, v0.h[7]
260        SMLAL   v17.4s, v4.4h, v1.h[7]
261        SMLAL2  v21.4s, v4.8h, v1.h[7]
262        SMLAL   v25.4s, v5.4h, v1.h[7]
263        SMLAL2  v29.4s, v5.8h, v1.h[7]
264        SMLAL   v18.4s, v4.4h, v2.h[7]
265        SMLAL2  v22.4s, v4.8h, v2.h[7]
266        SMLAL   v26.4s, v5.4h, v2.h[7]
267        SMLAL2  v30.4s, v5.8h, v2.h[7]
268        SMLAL   v19.4s, v4.4h, v3.h[7]
269        SMLAL2  v23.4s, v4.8h, v3.h[7]
270        SMLAL   v27.4s, v5.4h, v3.h[7]
271        SMLAL2  v31.4s, v5.8h, v3.h[7]
272
        // Loop while at least 8 more bytes of k remain (no borrow from the SUBS).
273        SUBS    x0, x0, 8
274        B.HS    2b
275
        // x0 has wrapped below zero here, so the remainder is recomputed from kc.
276        AND     x0, x2, 7               // kc remainder 0 to 7
277        # Is there a remainder?- 1 to 7 bytes of A
278        CBNZ    x0, 4f
279
        // End of one ks iteration; also the return point of the remainder code.
2803:
281        # ks loop
282        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
283        B.HI    1b
284
        // Requantization. Values are read through the params pointer (x11) in
        // this order: 32-bit float scale, 16-bit bias, 8-bit min, 8-bit max.
        // int32 accumulators -> float
285        SCVTF   v16.4s, v16.4s
286        SCVTF   v17.4s, v17.4s
287        # Apply params - scale, bias and clamp
288        LD1R    {v4.4s}, [x11], 4
289        SCVTF   v18.4s, v18.4s
290        SCVTF   v19.4s, v19.4s
291        SCVTF   v20.4s, v20.4s
292        SCVTF   v21.4s, v21.4s
293        SCVTF   v22.4s, v22.4s
294        SCVTF   v23.4s, v23.4s
295        SCVTF   v24.4s, v24.4s
296        SCVTF   v25.4s, v25.4s
297        SCVTF   v26.4s, v26.4s
298        SCVTF   v27.4s, v27.4s
299        SCVTF   v28.4s, v28.4s
300        SCVTF   v29.4s, v29.4s
301        SCVTF   v30.4s, v30.4s
302        SCVTF   v31.4s, v31.4s
303
        // Multiply every element by the same scale (v4 holds it in all lanes).
304        FMUL    v16.4s, v16.4s, v4.4s
305        FMUL    v17.4s, v17.4s, v4.4s
306        FMUL    v18.4s, v18.4s, v4.4s
307        FMUL    v19.4s, v19.4s, v4.4s
308        FMUL    v20.4s, v20.4s, v4.4s
309        FMUL    v21.4s, v21.4s, v4.4s
310        FMUL    v22.4s, v22.4s, v4.4s
311        FMUL    v23.4s, v23.4s, v4.4s
312        FMUL    v24.4s, v24.4s, v4.4s
313        FMUL    v25.4s, v25.4s, v4.4s
314        FMUL    v26.4s, v26.4s, v4.4s
315        FMUL    v27.4s, v27.4s, v4.4s
316        FMUL    v28.4s, v28.4s, v4.4s
317        FMUL    v29.4s, v29.4s, v4.4s
318        FMUL    v30.4s, v30.4s, v4.4s
319        FMUL    v31.4s, v31.4s, v4.4s
320
        // float -> int32, rounding to nearest with ties to even
321        FCVTNS  v16.4s, v16.4s
322        FCVTNS  v17.4s, v17.4s
323        FCVTNS  v18.4s, v18.4s
324        FCVTNS  v19.4s, v19.4s
325        FCVTNS  v20.4s, v20.4s
326        FCVTNS  v21.4s, v21.4s
327        FCVTNS  v22.4s, v22.4s
328        FCVTNS  v23.4s, v23.4s
329        FCVTNS  v24.4s, v24.4s
330        FCVTNS  v25.4s, v25.4s
331        FCVTNS  v26.4s, v26.4s
332        FCVTNS  v27.4s, v27.4s
333        FCVTNS  v28.4s, v28.4s
334        FCVTNS  v29.4s, v29.4s
335        FCVTNS  v30.4s, v30.4s
336        FCVTNS  v31.4s, v31.4s
337
        // Saturating narrow int32 -> int16. After SQXTN/SQXTN2, v16-v19 hold
        // columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
338        SQXTN   v16.4h, v16.4s
339        SQXTN   v17.4h, v17.4s
340        SQXTN   v18.4h, v18.4s
341        SQXTN   v19.4h, v19.4s
342        SQXTN   v24.4h, v24.4s
343        SQXTN   v25.4h, v25.4s
344        SQXTN   v26.4h, v26.4s
345        SQXTN   v27.4h, v27.4s
346        LD1R    {v6.8h}, [x11], 2        // add bias
347
348        SQXTN2  v16.8h, v20.4s
349        SQXTN2  v17.8h, v21.4s
350        SQXTN2  v18.8h, v22.4s
351        SQXTN2  v19.8h, v23.4s
352        SQXTN2  v24.8h, v28.4s
353        SQXTN2  v25.8h, v29.4s
354        SQXTN2  v26.8h, v30.4s
355        SQXTN2  v27.8h, v31.4s
356
        // Saturating 16-bit add of the bias held in v6.
357        SQADD   v16.8h, v16.8h, v6.8h
358        SQADD   v17.8h, v17.8h, v6.8h
359        SQADD   v18.8h, v18.8h, v6.8h
360        SQADD   v19.8h, v19.8h, v6.8h
361        SQADD   v24.8h, v24.8h, v6.8h
362        SQADD   v25.8h, v25.8h, v6.8h
363        SQADD   v26.8h, v26.8h, v6.8h
364        SQADD   v27.8h, v27.8h, v6.8h
365        LD1R    {v4.16b}, [x11], 1       // clamp min value
366
        // Saturating narrow int16 -> int8; v0-v3 become the 16 output bytes of
        // rows 0-3 (the A registers are no longer needed at this point).
367        SQXTN   v0.8b, v16.8h
368        SQXTN   v1.8b, v17.8h
369        SQXTN   v2.8b, v18.8h
370        SQXTN   v3.8b, v19.8h
371        LD1R    {v5.16b}, [x11]          // clamp max value
372        SQXTN2  v0.16b, v24.8h
373        SQXTN2  v1.16b, v25.8h
374        SQXTN2  v2.16b, v26.8h
375        SQXTN2  v3.16b, v27.8h
        // Undo the 4 + 2 + 1 bytes of post-increment from the three LD1R above so
        // x11 points at the start of params again for the next nc iteration.
376        SUB     x11, x11, 7             // rewind params pointer
377
378        SMAX    v0.16b, v0.16b, v4.16b
379        SMAX    v1.16b, v1.16b, v4.16b
380        SMAX    v2.16b, v2.16b, v4.16b
381        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16. The flags set here feed both B.LO 5f and the later B.HI 0b;
        // SMIN, ST1 and SUB in between do not modify flags.
382        SUBS    x1, x1, 16
383        SMIN    v0.16b, v0.16b, v5.16b
384        SMIN    v1.16b, v1.16b, v5.16b
385        SMIN    v2.16b, v2.16b, v5.16b
386        SMIN    v3.16b, v3.16b, v5.16b
387        B.LO    5f
388
389        # Store full 4 x 16
        // Each row pointer is advanced by cn_stride (x10) after its store.
390        ST1     {v3.16b},  [x7], x10
391        ST1     {v2.16b}, [x17], x10
392        ST1     {v1.16b}, [x16], x10
393        ST1     {v0.16b},  [x6], x10
394
        // Rewind the indirection pointer so the next column block walks the same
        // A pointers again.
395        SUB     x4, x4, x3              // a -= ks
396
397        # nc loop
398        B.HI    0b
399
400        # Restore x20 from stack
401        LDR     x20, [sp], 16
402        RET
403
404        # Remainder- 1 to 7 bytes of A
        // Reached either directly from the ks loop when kc < 8 (x0 then holds
        // kc - 8, hence the AND below) or after the main loop with x0 = kc & 7.
        // Each LD1 still reads 8 bytes per row but advances the pointer by only
        // x0 bytes; only lanes 0 .. x0-1 are multiplied. Each k lane consumes 16
        // bytes of w. The B.EQ branches reuse the flags of the preceding CMP
        // because the SIMD instructions in between do not set flags.
405        .p2align 3
4064:
407        AND     x0, x2, 7               // kc remainder 1 to 7
408
409        LD1     {v0.8b}, [x13], x0
410        LDP     d4, d5, [x5], 16
411        LD1     {v1.8b}, [x14], x0
412        LD1     {v2.8b}, [x15], x0
413        LD1     {v3.8b}, [x20], x0
414        SXTL    v0.8h, v0.8b
415        SXTL    v4.8h, v4.8b
416        SXTL    v5.8h, v5.8b
417        SXTL    v1.8h, v1.8b
418        SXTL    v2.8h, v2.8b
419        SXTL    v3.8h, v3.8b
420        SMLAL   v16.4s, v4.4h, v0.h[0]
421        SMLAL2  v20.4s, v4.8h, v0.h[0]
422        SMLAL   v24.4s, v5.4h, v0.h[0]
423        SMLAL2  v28.4s, v5.8h, v0.h[0]
424        SMLAL   v17.4s, v4.4h, v1.h[0]
425        SMLAL2  v21.4s, v4.8h, v1.h[0]
426        SMLAL   v25.4s, v5.4h, v1.h[0]
427        SMLAL2  v29.4s, v5.8h, v1.h[0]
428        SMLAL   v18.4s, v4.4h, v2.h[0]
429        SMLAL2  v22.4s, v4.8h, v2.h[0]
430        SMLAL   v26.4s, v5.4h, v2.h[0]
431        SMLAL2  v30.4s, v5.8h, v2.h[0]
432        SMLAL   v19.4s, v4.4h, v3.h[0]
433        SMLAL2  v23.4s, v4.8h, v3.h[0]
434        SMLAL   v27.4s, v5.4h, v3.h[0]
435        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // Done if remainder == 1.
436        CMP     x0, 2
437        B.LO    3b
438
439        LDP     d4, d5, [x5], 16
440        SXTL    v4.8h, v4.8b
441        SXTL    v5.8h, v5.8b
442        SMLAL   v16.4s, v4.4h, v0.h[1]
443        SMLAL2  v20.4s, v4.8h, v0.h[1]
444        SMLAL   v24.4s, v5.4h, v0.h[1]
445        SMLAL2  v28.4s, v5.8h, v0.h[1]
446        SMLAL   v17.4s, v4.4h, v1.h[1]
447        SMLAL2  v21.4s, v4.8h, v1.h[1]
448        SMLAL   v25.4s, v5.4h, v1.h[1]
449        SMLAL2  v29.4s, v5.8h, v1.h[1]
450        SMLAL   v18.4s, v4.4h, v2.h[1]
451        SMLAL2  v22.4s, v4.8h, v2.h[1]
452        SMLAL   v26.4s, v5.4h, v2.h[1]
453        SMLAL2  v30.4s, v5.8h, v2.h[1]
454        SMLAL   v19.4s, v4.4h, v3.h[1]
455        SMLAL2  v23.4s, v4.8h, v3.h[1]
456        SMLAL   v27.4s, v5.4h, v3.h[1]
457        SMLAL2  v31.4s, v5.8h, v3.h[1]
        // Done if remainder == 2 (flags from CMP x0, 2).
458        B.EQ    3b
459
460        LDP     d4, d5, [x5], 16
461        SXTL    v4.8h, v4.8b
462        SXTL    v5.8h, v5.8b
463        SMLAL   v16.4s, v4.4h, v0.h[2]
464        SMLAL2  v20.4s, v4.8h, v0.h[2]
465        SMLAL   v24.4s, v5.4h, v0.h[2]
466        SMLAL2  v28.4s, v5.8h, v0.h[2]
467        SMLAL   v17.4s, v4.4h, v1.h[2]
468        SMLAL2  v21.4s, v4.8h, v1.h[2]
469        SMLAL   v25.4s, v5.4h, v1.h[2]
470        SMLAL2  v29.4s, v5.8h, v1.h[2]
471        SMLAL   v18.4s, v4.4h, v2.h[2]
472        SMLAL2  v22.4s, v4.8h, v2.h[2]
473        SMLAL   v26.4s, v5.4h, v2.h[2]
474        SMLAL2  v30.4s, v5.8h, v2.h[2]
475        SMLAL   v19.4s, v4.4h, v3.h[2]
476        SMLAL2  v23.4s, v4.8h, v3.h[2]
477        SMLAL   v27.4s, v5.4h, v3.h[2]
478        SMLAL2  v31.4s, v5.8h, v3.h[2]
        // Done if remainder == 3.
479        CMP     x0, 4
480        B.LO    3b
481
482        LDP     d4, d5, [x5], 16
483        SXTL    v4.8h, v4.8b
484        SXTL    v5.8h, v5.8b
485        SMLAL   v16.4s, v4.4h, v0.h[3]
486        SMLAL2  v20.4s, v4.8h, v0.h[3]
487        SMLAL   v24.4s, v5.4h, v0.h[3]
488        SMLAL2  v28.4s, v5.8h, v0.h[3]
489        SMLAL   v17.4s, v4.4h, v1.h[3]
490        SMLAL2  v21.4s, v4.8h, v1.h[3]
491        SMLAL   v25.4s, v5.4h, v1.h[3]
492        SMLAL2  v29.4s, v5.8h, v1.h[3]
493        SMLAL   v18.4s, v4.4h, v2.h[3]
494        SMLAL2  v22.4s, v4.8h, v2.h[3]
495        SMLAL   v26.4s, v5.4h, v2.h[3]
496        SMLAL2  v30.4s, v5.8h, v2.h[3]
497        SMLAL   v19.4s, v4.4h, v3.h[3]
498        SMLAL2  v23.4s, v4.8h, v3.h[3]
499        SMLAL   v27.4s, v5.4h, v3.h[3]
500        SMLAL2  v31.4s, v5.8h, v3.h[3]
        // Done if remainder == 4 (flags from CMP x0, 4).
501        B.EQ    3b
502
503        LDP     d4, d5, [x5], 16
504        SXTL    v4.8h, v4.8b
505        SXTL    v5.8h, v5.8b
506        SMLAL   v16.4s, v4.4h, v0.h[4]
507        SMLAL2  v20.4s, v4.8h, v0.h[4]
508        SMLAL   v24.4s, v5.4h, v0.h[4]
509        SMLAL2  v28.4s, v5.8h, v0.h[4]
510        SMLAL   v17.4s, v4.4h, v1.h[4]
511        SMLAL2  v21.4s, v4.8h, v1.h[4]
512        SMLAL   v25.4s, v5.4h, v1.h[4]
513        SMLAL2  v29.4s, v5.8h, v1.h[4]
514        SMLAL   v18.4s, v4.4h, v2.h[4]
515        SMLAL2  v22.4s, v4.8h, v2.h[4]
516        SMLAL   v26.4s, v5.4h, v2.h[4]
517        SMLAL2  v30.4s, v5.8h, v2.h[4]
518        SMLAL   v19.4s, v4.4h, v3.h[4]
519        SMLAL2  v23.4s, v4.8h, v3.h[4]
520        SMLAL   v27.4s, v5.4h, v3.h[4]
521        SMLAL2  v31.4s, v5.8h, v3.h[4]
        // Done if remainder == 5.
522        CMP     x0, 6
523        B.LO    3b
524
525        LDP     d4, d5, [x5], 16
526        SXTL    v4.8h, v4.8b
527        SXTL    v5.8h, v5.8b
528        SMLAL   v16.4s, v4.4h, v0.h[5]
529        SMLAL2  v20.4s, v4.8h, v0.h[5]
530        SMLAL   v24.4s, v5.4h, v0.h[5]
531        SMLAL2  v28.4s, v5.8h, v0.h[5]
532        SMLAL   v17.4s, v4.4h, v1.h[5]
533        SMLAL2  v21.4s, v4.8h, v1.h[5]
534        SMLAL   v25.4s, v5.4h, v1.h[5]
535        SMLAL2  v29.4s, v5.8h, v1.h[5]
536        SMLAL   v18.4s, v4.4h, v2.h[5]
537        SMLAL2  v22.4s, v4.8h, v2.h[5]
538        SMLAL   v26.4s, v5.4h, v2.h[5]
539        SMLAL2  v30.4s, v5.8h, v2.h[5]
540        SMLAL   v19.4s, v4.4h, v3.h[5]
541        SMLAL2  v23.4s, v4.8h, v3.h[5]
542        SMLAL   v27.4s, v5.4h, v3.h[5]
543        SMLAL2  v31.4s, v5.8h, v3.h[5]
        // Done if remainder == 6 (flags from CMP x0, 6).
544        B.EQ    3b
545
        // Remainder == 7: last lane, then rejoin the ks loop unconditionally.
546        LDP     d4, d5, [x5], 16
547        SXTL    v4.8h, v4.8b
548        SXTL    v5.8h, v5.8b
549        SMLAL   v16.4s, v4.4h, v0.h[6]
550        SMLAL2  v20.4s, v4.8h, v0.h[6]
551        SMLAL   v24.4s, v5.4h, v0.h[6]
552        SMLAL2  v28.4s, v5.8h, v0.h[6]
553        SMLAL   v17.4s, v4.4h, v1.h[6]
554        SMLAL2  v21.4s, v4.8h, v1.h[6]
555        SMLAL   v25.4s, v5.4h, v1.h[6]
556        SMLAL2  v29.4s, v5.8h, v1.h[6]
557        SMLAL   v18.4s, v4.4h, v2.h[6]
558        SMLAL2  v22.4s, v4.8h, v2.h[6]
559        SMLAL   v26.4s, v5.4h, v2.h[6]
560        SMLAL2  v30.4s, v5.8h, v2.h[6]
561        SMLAL   v19.4s, v4.4h, v3.h[6]
562        SMLAL2  v23.4s, v4.8h, v3.h[6]
563        SMLAL   v27.4s, v5.4h, v3.h[6]
564        SMLAL2  v31.4s, v5.8h, v3.h[6]
565        B       3b
566
567        # Store odd width
        // Reached when fewer than 16 columns remained. x1 holds nc - 16 here, and
        // subtracting 16 leaves bits 0-3 unchanged, so those bits still say how
        // many columns to store. Bits 3, 2, 1, 0 select an 8, 4, 2 and 1 byte
        // store per row; after each store the DUP moves the next unsent bytes
        // down to the bottom of the register for the following smaller store.
568        .p2align 3
5695:
570        TBZ     x1, 3, 6f
571        STR     d3, [x7], 8
572        STR     d2, [x17], 8
573        DUP     d3, v3.d[1]
574        DUP     d2, v2.d[1]
575        STR     d1, [x16], 8
576        STR     d0, [x6], 8
577        DUP     d1, v1.d[1]
578        DUP     d0, v0.d[1]
5796:
580        TBZ     x1, 2, 7f
581        STR     s3, [x7], 4
582        STR     s2, [x17], 4
583        DUP     s3, v3.s[1]
584        DUP     s2, v2.s[1]
585        STR     s1, [x16], 4
586        STR     s0, [x6], 4
587        DUP     s1, v1.s[1]
588        DUP     s0, v0.s[1]
5897:
590        TBZ     x1, 1, 8f
591        STR     h3, [x7], 2
592        STR     h2, [x17], 2
593        DUP     h3, v3.h[1]
594        DUP     h2, v2.h[1]
595        STR     h1, [x16], 2
596        STR     h0, [x6], 2
597        DUP     h1, v1.h[1]
598        DUP     h0, v0.h[1]
5998:
600        TBZ     x1, 0, 9f
601        STR     b3, [x7]
602        STR     b2, [x17]
603        STR     b1, [x16]
604        STR     b0, [x6]
6059:
606        # Restore x20 from stack
607        LDR     x20, [sp], 16
608        RET
609
610END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
611
612#ifdef __ELF__
613.section ".note.GNU-stack","",%progbits
614#endif
615