// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Provenance: /aosp_15_r20/external/XNNPACK/src/f16-igemm/6x16-minmax-aarch64-neonfp16arith-cortex-a55r0.S
// (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# Half-precision indirect GEMM micro-kernel with min/max clamping.
# Each pass of the nc loop (label 0) produces one tile of 6 rows x 16 columns:
#   - 16 bias halffloats are read from w and seed all six row accumulators,
#   - the ks loop (label 1) fetches 6 A row pointers per step from the
#     indirection buffer a and accumulates kc bytes of each row against the
#     packed B data that follows in w,
#   - the accumulators are clamped with FMAX/FMIN against the two halffloats
#     read from params and stored to the six C row pointers.
# kc is counted in bytes (2 bytes per halffloat).  ks is counted in bytes of
# the indirection buffer (6 pointers * 8 bytes = 48 per ks step).
# Rows at or beyond mr are not skipped: their C pointer aliases the previous
# row, and stores run from row 5 down to row 0 so the lowest aliased row is
# written last.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# Odd rows share a vector register with the even row below them and occupy
# its upper 64 bits (lanes h[4] and h[5] are the ones consumed).
# A0 x14  v0     v3
# A1 x15  v0[1]  v3[1]
# A2 x20  v1     v4
# A3 x21  v1[1]  v4[1]
# A4 x22  v2     v5
# A5 x23  v2[1]  v5[1]

# B   x5  v12 v13 v14 v15 second set of B
# B       v16 v17 v18 v19 first set

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x10  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7 v8 v9 v10 v11

# x8 temporary vector shadow register
# (x8 first holds the params pointer; once params are in v6 it carries the
#  64-bit halves that INS moves into the upper part of a vector register.)

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

        # Load zero, params pointer (stack arguments, read before sp is moved)
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers: a row at or beyond mr reuses the previous row pointer
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        # Load params: 32 bits = two halffloats, v6.h[0] and v6.h[1]
        LDR     s6, [x8]

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3
        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride (x7 stops being cm_stride)
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load a_offset
        LDR     x11, [sp, 8]

        # Save x20-x23, d12-d15 on stack (callee-saved registers used below).
        # sp drops by 64, so the first stack argument is at [sp, 64] from here on.
        STP     d12, d13, [sp, -64]!
        STP     d14, d15, [sp, 16]
        STP     x20, x21, [sp, 32]
        STP     x22, x23, [sp, 48]

        # nc loop: one pass per 16 output columns
0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # ks loop: one pass per group of 6 A pointers
1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.  The ADD is done unconditionally and CSEL picks.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Are there at least 4 halffloats (8 bytes) for prologue + epilogue?
        # x0 (mr) is dead by now and becomes the k byte counter.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue - First group loads, no FMA
        # v19 is loaded as two 64-bit halves: the low half directly, the high
        # half through x8 with the INS deferred to BLOCK 0 of the loop/epilogue.
        LDR     s0, [x14], 4              // A0
        LDP     q16, q17, [x5], 32        // B
        LDR     s1, [x20], 4              // A2
        LDR     s2, [x22], 4              // A4
        LD1     {v0.s}[2], [x15], 4       // A1
        LD1     {v1.s}[2], [x21], 4       // A3
        LD1     {v2.s}[2], [x23], 4       // A5
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8               // ins is in BLOCK 0
        SUBS    x0, x0, 8

        # Are there at least 4 halffloats (8 bytes) for main loop?
        B.LO    3f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 12 LD32 A + 8 B vectors (each as LDR d + LDR x + INS)
        # NOTE(review): the split 64-bit loads and the fixed BLOCK layout look
        # like hand scheduling for the core named in the function name; the
        # instruction order below is therefore kept exactly as found.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x14], 4              // A0
        INS     v19.d[1], x8              // B from second group
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     w8, [x15], 4              // A1
        FMLA    v22.8h, v16.8h,  v0.h[4]
        FMLA    v24.8h, v16.8h,  v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8               // A1 ins
        FMLA    v26.8h, v16.8h,  v1.h[4]
        LDR     x8, [x5, 8]               // B
        FMLA    v28.8h, v16.8h,  v2.h[0]
        FMLA    v30.8h, v16.8h,  v2.h[4]

        # BLOCK 2
        LDR     s4, [x20], 4              // A2
        INS     v12.d[1], x8              // B  ins
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     w8, [x21], 4              // A3
        FMLA    v23.8h, v17.8h,  v0.h[4]
        FMLA    v25.8h, v17.8h,  v1.h[0]

        # BLOCK 3
        LDR     s5, [x22], 4              // A4
        INS     v4.d[1], x8               // A3 ins
        FMLA    v27.8h, v17.8h,  v1.h[4]
        LDR     w8, [x23], 4              // A5
        FMLA    v29.8h, v17.8h,  v2.h[0]
        FMLA    v31.8h, v17.8h,  v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8               // A5 ins
        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h,  v0.h[5]
        FMLA    v24.8h, v18.8h,  v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8              // B
        FMLA    v26.8h, v18.8h,  v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h,  v2.h[1]
        FMLA    v30.8h, v18.8h,  v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8              // B
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h,  v0.h[5]
        FMLA    v25.8h, v19.8h,  v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8              // B
        FMLA    v27.8h, v19.8h,  v1.h[5]
        FMLA    v29.8h, v19.8h,  v2.h[1]
        FMLA    v31.8h, v19.8h,  v2.h[5]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     s0, [x14], 4              // A0
        FMLA    v20.8h, v12.8h,  v3.h[0]
        LDR     w8, [x15], 4              // A1
        FMLA    v22.8h, v12.8h,  v3.h[4]
        FMLA    v24.8h, v12.8h,  v4.h[0]

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8               // A1 ins
        FMLA    v26.8h, v12.8h,  v4.h[4]
        LDR     x8, [x5, 72]              // B
        FMLA    v28.8h, v12.8h,  v5.h[0]
        FMLA    v30.8h, v12.8h,  v5.h[4]

        # BLOCK 2
        LDR     s1, [x20], 4              // A2
        INS     v16.d[1], x8              // B
        FMLA    v21.8h, v13.8h,  v3.h[0]
        LDR     w8, [x21], 4              // A3
        FMLA    v23.8h, v13.8h,  v3.h[4]
        FMLA    v25.8h, v13.8h,  v4.h[0]

        # BLOCK 3
        LDR     s2, [x22], 4              // A4
        INS     v1.d[1], x8               // A3 ins
        FMLA    v27.8h, v13.8h,  v4.h[4]
        LDR     w8,  [x23], 4             // A5
        FMLA    v29.8h, v13.8h,  v5.h[0]
        FMLA    v31.8h, v13.8h,  v5.h[4]

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8               // A5 ins
        FMLA    v20.8h, v14.8h,  v3.h[1]
        LDR     x8, [x5, 88]
        FMLA    v22.8h, v14.8h,  v3.h[5]
        FMLA    v24.8h, v14.8h,  v4.h[1]

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8              // B
        FMLA    v26.8h, v14.8h,  v4.h[5]
        LDR     x8, [x5, 104]
        FMLA    v28.8h, v14.8h,  v5.h[1]
        FMLA    v30.8h, v14.8h,  v5.h[5]

        # BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8              // B
        FMLA    v21.8h, v15.8h,  v3.h[1]
        LDR     x8, [x5, 120]             // upper half of v19, ins is in BLOCK 0
        FMLA    v23.8h, v15.8h,  v3.h[5]
        FMLA    v25.8h, v15.8h,  v4.h[1]

        # BLOCK 7
        SUBS    x0, x0, 8                 // LDR lands here
        FMLA    v27.8h, v15.8h,  v4.h[5]
        FMLA    v29.8h, v15.8h,  v5.h[1]
        ADD     x5, x5, 128               // 8 B vectors consumed per iteration
        FMLA    v31.8h, v15.8h,  v5.h[5]
        B.HS    2b

        # Epilogue - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 LD32 A + 4 B vectors (v12-v15); nothing is preloaded for
        # a following iteration.
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x14], 4              // A0
        INS     v19.d[1], x8              // B from second group
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     w8, [x15], 4              // A1
        FMLA    v22.8h, v16.8h,  v0.h[4]
        FMLA    v24.8h, v16.8h,  v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8               // A1 ins
        FMLA    v26.8h, v16.8h,  v1.h[4]
        LDR     x8, [x5, 8]               // B
        FMLA    v28.8h, v16.8h,  v2.h[0]
        FMLA    v30.8h, v16.8h,  v2.h[4]

        # BLOCK 2
        LDR     s4, [x20], 4              // A2
        INS     v12.d[1], x8              // B  ins
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     w8, [x21], 4              // A3
        FMLA    v23.8h, v17.8h,  v0.h[4]
        FMLA    v25.8h, v17.8h,  v1.h[0]

        # BLOCK 3
        LDR     s5, [x22], 4              // A4
        INS     v4.d[1], x8               // A3 ins
        FMLA    v27.8h, v17.8h,  v1.h[4]
        LDR     w8, [x23], 4              // A5
        FMLA    v29.8h, v17.8h,  v2.h[0]
        FMLA    v31.8h, v17.8h,  v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8               // A5 ins
        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h,  v0.h[5]
        FMLA    v24.8h, v18.8h,  v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8              // B
        FMLA    v26.8h, v18.8h,  v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h,  v2.h[1]
        FMLA    v30.8h, v18.8h,  v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8              // B
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h,  v0.h[5]
        FMLA    v25.8h, v19.8h,  v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8              // B
        FMLA    v27.8h, v19.8h,  v1.h[5]
        FMLA    v29.8h, v19.8h,  v2.h[1]
        FMLA    v31.8h, v19.8h,  v2.h[5]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.8h, v12.8h,  v3.h[0]
        FMLA    v22.8h, v12.8h,  v3.h[4]
        FMLA    v24.8h, v12.8h,  v4.h[0]

        # BLOCK 1
        FMLA    v26.8h, v12.8h,  v4.h[4]
        FMLA    v28.8h, v12.8h,  v5.h[0]
        FMLA    v30.8h, v12.8h,  v5.h[4]

        # BLOCK 2
        FMLA    v21.8h, v13.8h,  v3.h[0]
        FMLA    v23.8h, v13.8h,  v3.h[4]
        FMLA    v25.8h, v13.8h,  v4.h[0]

        # BLOCK 3
        FMLA    v27.8h, v13.8h,  v4.h[4]
        FMLA    v29.8h, v13.8h,  v5.h[0]
        FMLA    v31.8h, v13.8h,  v5.h[4]

        # BLOCK 4
        FMLA    v20.8h, v14.8h,  v3.h[1]
        FMLA    v22.8h, v14.8h,  v3.h[5]
        FMLA    v24.8h, v14.8h,  v4.h[1]

        # BLOCK 5
        FMLA    v26.8h, v14.8h,  v4.h[5]
        FMLA    v28.8h, v14.8h,  v5.h[1]
        FMLA    v30.8h, v14.8h,  v5.h[5]
        TST     x0, 7                     // low 3 bits of k = leftover bytes of A

        # BLOCK 6
        FMLA    v21.8h, v15.8h,  v3.h[1]
        FMLA    v23.8h, v15.8h,  v3.h[5]
        FMLA    v25.8h, v15.8h,  v4.h[1]
        ADD     x5, x5, 64                // 4 B vectors consumed; flags untouched

        # BLOCK 7
        FMLA    v27.8h, v15.8h,  v4.h[5]
        FMLA    v29.8h, v15.8h,  v5.h[1]
        FMLA    v31.8h, v15.8h,  v5.h[5]

        # Is there a remainder?- 2 halffloats of A (4 bytes) or less
        B.NE    5f                        // consumes the flags set by TST above

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp: FMAX against v6.h[0], then FMIN against v6.h[1]
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        LDR     x0, [sp, 64]            // cn_stride
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags drive both branches below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16 unless fewer than 16 columns were left
        B.LO    7f

        # Row 5 first, row 0 last; each C pointer advances by cn_stride
        ST1     {v30.16b, v31.16b},  [x7], x0
        ST1     {v28.16b, v29.16b}, [x13], x0
        ST1     {v26.16b, v27.16b}, [x10], x0
        ST1     {v24.16b, v25.16b}, [x17], x0
        ST1     {v22.16b, v23.16b}, [x16], x0
        ST1     {v20.16b, v21.16b},  [x6], x0

        # Rewind the indirection pointer for the next 16 columns.
        # NOTE(review): this undoes the ks loop exactly only when ks is a
        # multiple of 48 - confirm against the caller.
        SUB     x4, x4, x3              // a -= ks

        # nc loop (flags still from SUBS x1 above; SUB and ST1 leave them alone)
        B.HI    0b

        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

        # Remainder of kc.  Reached either directly from the ks loop header
        # when kc < 8 bytes, or after the epilogue when leftover bytes remain.
        # In both cases the low 3 bits of x0 hold the leftover byte count.
5:
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBZ     x0, 2, 6f

        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0, [x14], 4              // A0
        LDP     q16, q17, [x5], 32        // B
        LDR     s1, [x20], 4              // A2
        LDR     s2, [x22], 4              // A4
        LD1     {v0.s}[2], [x15], 4       // A1
        LD1     {v1.s}[2], [x21], 4       // A3
        LD1     {v2.s}[2], [x23], 4       // A5
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v0.h[4]
        FMLA    v24.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v1.h[4]
        FMLA    v28.8h, v16.8h,  v2.h[0]
        FMLA    v30.8h, v16.8h,  v2.h[4]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v0.h[4]
        FMLA    v25.8h, v17.8h,  v1.h[0]
        FMLA    v27.8h, v17.8h,  v1.h[4]
        FMLA    v29.8h, v17.8h,  v2.h[0]
        FMLA    v31.8h, v17.8h,  v2.h[4]
        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v0.h[5]
        FMLA    v24.8h, v18.8h,  v1.h[1]
        FMLA    v26.8h, v18.8h,  v1.h[5]
        FMLA    v28.8h, v18.8h,  v2.h[1]
        FMLA    v30.8h, v18.8h,  v2.h[5]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v23.8h, v19.8h,  v0.h[5]
        FMLA    v25.8h, v19.8h,  v1.h[1]
        FMLA    v27.8h, v19.8h,  v1.h[5]
        FMLA    v29.8h, v19.8h,  v2.h[1]
        FMLA    v31.8h, v19.8h,  v2.h[5]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBZ     x0, 1, 4b
6:
        # Remainder- 1 halffloat of A (2 bytes)
        # NOTE(review): the jump here from label 5 does not test bit 1, so it
        # relies on the leftover byte count being nonzero (kc != 0 and a
        # multiple of 2) - confirm against the caller.
        LDR     h0, [x14], 2              // A0
        LDP     q16, q17, [x5], 32        // B
        LDR     h1, [x20], 2              // A2
        LDR     h2, [x22], 2              // A4
        LD1     {v0.h}[4], [x15], 2       // A1
        LD1     {v1.h}[4], [x21], 2       // A3
        LD1     {v2.h}[4], [x23], 2       // A5
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v0.h[4]
        FMLA    v24.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v1.h[4]
        FMLA    v28.8h, v16.8h,  v2.h[0]
        FMLA    v30.8h, v16.8h,  v2.h[4]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v0.h[4]
        FMLA    v25.8h, v17.8h,  v1.h[0]
        FMLA    v27.8h, v17.8h,  v1.h[4]
        FMLA    v29.8h, v17.8h,  v2.h[0]
        FMLA    v31.8h, v17.8h,  v2.h[4]
        B       4b

        # Store odd width.  x1 = nc - 16 is negative here and its low 4 bits
        # equal the number of columns left, so bits 3..0 select stores of
        # 8, 4, 2 and 1 halffloats.  After each partial store the not yet
        # written lanes are shifted down to lane 0 for the next step.
7:
        TBZ     x1, 3, 8f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 2, 9f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 1, 10f
        STR     s30,  [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x10], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20,  [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

10:
        TBZ     x1, 0, 11f
        STR     h30,  [x7]
        STR     h28, [x13]
        STR     h26, [x10]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20,  [x6]
11:
        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
571