xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> (x0)
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# Register usage
29# A0 x14  v0     v3
30# A1 x15  v0[1]  v3[1]
31# A2 x20  v1     v4
32# A3 x21  v1[1]  v4[1]
33# A4 x22  v2     v5
34# A5 x23  v2[1]  v5[1]
35
36# B   x5  v12 v13 v14 v15 second set of B
37# B       v16 v17 v18 v19 first set
38
39# C0  x6  v20 v21
40# C1 x16  v22 v23
41# C2 x17  v24 v25
42# C3 x10  v26 v27
43# C4 x13  v28 v29
44# C5  x7  v30 v31
45
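46# Clamp v6 (lower bound, applied with FMAX) v7 (upper bound, applied with FMIN)
47# unused A   v8 v9 v10 v11
48# x8 temporary: holds the params pointer at entry, then stages the upper 64 bits of a vector between a 64-bit LDR and the matching INS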
49
50BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
        # Overview: every pass of the nc loop (label 0) produces one tile of up to
        # 6 rows by 8 floats. Accumulators v20-v31 are seeded with 8 bias floats
        # read from w, updated with FMLA for every group of 6 A pointers (ks loop,
        # label 1) over kc bytes of each A row, clamped with v6/v7 and then
        # stored through the six C row pointers.
51
52        # Load a_offset
53        LDR     x11, [sp, 8]            // x11 = a_offset (stack argument)
54
55        # Load zero, params pointer
56        LDP     x12, x8, [sp, 16]       // x12 = zero, x8 = params
57
58        # Clamp C pointers
        # Rows whose index is >= mr reuse the pointer of the previous row.
        # The LS selects reuse the flags of the preceding CMP, because the
        # ADD and CSEL instructions in between do not modify the flags.
59        CMP     x0, 2                   // if mr < 2
60        ADD     x16, x6, x7             // c1 = c0 + cm_stride
61        CSEL    x16, x6, x16, LO        //   c1 = c0
62
63        ADD     x17, x16, x7            // c2 = c1 + cm_stride
64                                        // if mr <= 2
65        CSEL    x17, x16, x17, LS       //   c2 = c1
66
67        CMP     x0, 4                   // if mr < 4
68        ADD     x10, x17, x7            // c3 = c2 + cm_stride
69        CSEL    x10, x17, x10, LO       //   c3 = c2
70
71        ADD     x13, x10, x7            // c4 = c3 + cm_stride
72                                        // if mr <= 4
73        CSEL    x13, x10, x13, LS       //   c4 = c3
74
75        CMP     x0, 6                   // if mr < 6
76        ADD     x7, x13, x7             // c5 = c4 + cm_stride
77        CSEL    x7, x13, x7, LO         //   c5 = c4
78
79        # Load min/max values
        # LD2R replicates the first float at params into all lanes of v6 and the
        # second float into all lanes of v7. x8 is free for reuse afterwards.
80        LD2R    {v6.4s, v7.4s}, [x8]
81
82        # Save x20-x23, d12-d15 on stack
        # The pre-indexed store lowers sp by 64, so the stack arguments move up by
        # 64 bytes (cn_stride is read from [sp, 64] further down).
83        STP     d12, d13, [sp, -64]!
84        STP     d14, d15, [sp, 16]
85        STP     x20, x21, [sp, 32]
86        STP     x22, x23, [sp, 48]
87
880:
89        # Load initial bias from w into accumulators
        # All six rows start from the same 8 floats (v20/v21), copied below.
90        LDP     q20, q21, [x5], 32
91        MOV     v22.16b, v20.16b
92        MOV     v23.16b, v21.16b
93        MOV     v24.16b, v20.16b
94        MOV     v25.16b, v21.16b
95        MOV     v26.16b, v20.16b
96        MOV     v27.16b, v21.16b
97        MOV     v28.16b, v20.16b
98        MOV     v29.16b, v21.16b
99        MOV     v30.16b, v20.16b
100        MOV     v31.16b, v21.16b
101
102        MOV     x9, x3                  // p = ks
103
1041:
105        # Load next 6 A pointers
106        LDP     x14, x15, [x4], 16
107        LDP     x20, x21, [x4], 16
108        LDP     x22, x23, [x4], 16
109
        # A pointer equal to zero is used as is; any other pointer gets a_offset
        # added. The ADD between CMP and CSEL does not modify the flags.
110        CMP     x14, x12                // if a0 == zero
111        ADD     x14, x14, x11           // A0 += a_offset
112        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 += a_offset
113        CMP     x15, x12                // if a1 == zero
114        ADD     x15, x15, x11           // A1 += a_offset
115        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
116        CMP     x20, x12                // if a2 == zero
117        ADD     x20, x20, x11           // A2 += a_offset
118        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 += a_offset
119        CMP     x21, x12                // if a3 == zero
120        ADD     x21, x21, x11           // A3 += a_offset
121        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 += a_offset
122        CMP     x22, x12                // if a4 == zero
123        ADD     x22, x22, x11           // A4 += a_offset
124        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 += a_offset
125        CMP     x23, x12                // if a5 == zero
126        ADD     x23, x23, x11           // A5 += a_offset
127        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 += a_offset
128
129        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
130        SUBS    x0, x2, 16              // k = kc - 16
131        B.LO    5f                      // kc < 16 bytes: remainder code only
132
133        # Prologue - First group loads, no FMA
        # Loads 2 floats of each A row (rows 0/2/4 in the low half, rows 1/3/5 in
        # the high half of v0/v1/v2) and the first set of B (v16-v19). The upper
        # half of v19 is left in x8 and inserted by BLOCK 0 of the code that follows.
134        LDR     d0, [x14], 8              // A0
135        LDP     q16, q17, [x5], 32        // B
136        LDR     d1, [x20], 8              // A2
137        LDR     d2, [x22], 8              // A4
138        LD1     {v0.d}[1], [x15], 8       // A1
139        LD1     {v1.d}[1], [x21], 8       // A3
140        LD1     {v2.d}[1],  [x23], 8      // A5
141        SUBS    x0, x0, 16                // k -= 16; flags are consumed by B.LO 3f below (loads leave them intact)
142        LDR     q18, [x5], 16
143        LDR     d19, [x5], 8
144        LDR     x8, [x5], 8              // upper half of v19; the INS is in BLOCK 0
145
146        # Is there at least 4 floats (16 bytes) for main loop?
147        B.LO    3f
148
149        # Main loop - 4 floats of A (16 bytes)
150        # 48 FMA + 12 LD64 A + 8 LDR B
        # Vectors are assembled from two 64-bit loads: LDR into the d register
        # for the low half, LDR into x8 followed by INS for the high half.
        # NOTE(review): the load / INS / FMLA interleaving is presumably tuned for
        # Cortex-A53 issue rules - the instruction order should not be changed
        # without benchmarking.
1512:
152        # First group of 24 FMA, Second group loads
153        # BLOCK 0
154        LDR     d3, [x14], 8              // A0
155        INS     v19.d[1], x8              // B from second group
156        FMLA    v20.4s, v16.4s,  v0.s[0]
157        LDR     x8, [x15], 8              // A1
158        FMLA    v22.4s, v16.4s,  v0.s[2]
159        FMLA    v24.4s, v16.4s,  v1.s[0]
160
161        # BLOCK 1
162        LDR     d12, [x5]
163        INS     v3.d[1], x8               // A1 ins
164        FMLA    v26.4s, v16.4s,  v1.s[2]
165        LDR     x8, [x5, 8]               // B
166        FMLA    v28.4s, v16.4s,  v2.s[0]
167        FMLA    v30.4s, v16.4s,  v2.s[2]
168
169        # BLOCK 2
170        LDR     d4, [x20], 8              // A2
171        INS     v12.d[1], x8              // B  ins
172        FMLA    v21.4s, v17.4s,  v0.s[0]
173        LDR     x8, [x21], 8              // A3
174        FMLA    v23.4s, v17.4s,  v0.s[2]
175        FMLA    v25.4s, v17.4s,  v1.s[0]
176
177        # BLOCK 3
178        LDR     d5, [x22], 8              // A4
179        INS     v4.d[1], x8               // A3 ins
180        FMLA    v27.4s, v17.4s,  v1.s[2]
181        LDR     x8, [x23], 8              // A5
182        FMLA    v29.4s, v17.4s,  v2.s[0]
183        FMLA    v31.4s, v17.4s,  v2.s[2]
184
185        # BLOCK 4
186        LDR     d13, [x5, 16]
187        INS     v5.d[1], x8               // A5 ins
188        FMLA    v20.4s, v18.4s,  v0.s[1]
189        LDR     x8, [x5, 24]
190        FMLA    v22.4s, v18.4s,  v0.s[3]
191        FMLA    v24.4s, v18.4s,  v1.s[1]
192
193        # BLOCK 5
194        LDR     d14, [x5, 32]
195        INS     v13.d[1], x8              // B
196        FMLA    v26.4s, v18.4s,  v1.s[3]
197        LDR     x8, [x5, 40]
198        FMLA    v28.4s, v18.4s,  v2.s[1]
199        FMLA    v30.4s, v18.4s,  v2.s[3]
200
201        # BLOCK 6
202        LDR     d15, [x5, 48]
203        INS     v14.d[1], x8              // B
204        FMLA    v21.4s, v19.4s,  v0.s[1]
205        LDR     x8, [x5, 56]
206        FMLA    v23.4s, v19.4s,  v0.s[3]
207        FMLA    v25.4s, v19.4s,  v1.s[1]
208
209        # BLOCK 7
210        INS     v15.d[1], x8
211        FMLA    v27.4s, v19.4s,  v1.s[3]
212        FMLA    v29.4s, v19.4s,  v2.s[1]
213        FMLA    v31.4s, v19.4s,  v2.s[3]
214
215        # Second group of 24 FMA, First group of loads
216        # BLOCK 0
217        LDR     d0, [x14], 8              // A0
218        FMLA    v20.4s, v12.4s,  v3.s[0]
219        LDR     x8, [x15], 8              // A1
220        FMLA    v22.4s, v12.4s,  v3.s[2]
221        FMLA    v24.4s, v12.4s,  v4.s[0]
222
223        # BLOCK 1
224        LDR     d16, [x5, 64]
225        INS     v0.d[1], x8               // A1 ins
226        FMLA    v26.4s, v12.4s,  v4.s[2]
227        LDR     x8, [x5, 72]              // B
228        FMLA    v28.4s, v12.4s,  v5.s[0]
229        FMLA    v30.4s, v12.4s,  v5.s[2]
230
231        # BLOCK 2
232        LDR     d1, [x20], 8              // A2
233        INS     v16.d[1], x8              // B
234        FMLA    v21.4s, v13.4s,  v3.s[0]
235        LDR     x8, [x21], 8              // A3
236        FMLA    v23.4s, v13.4s,  v3.s[2]
237        FMLA    v25.4s, v13.4s,  v4.s[0]
238
239        # BLOCK 3
240        LDR     d2, [x22], 8              // A4
241        INS     v1.d[1], x8               // A3 ins
242        FMLA    v27.4s, v13.4s,  v4.s[2]
243        LDR     x8,  [x23], 8             // A5
244        FMLA    v29.4s, v13.4s,  v5.s[0]
245        FMLA    v31.4s, v13.4s,  v5.s[2]
246
247        # BLOCK 4
248        LDR     d17, [x5, 80]
249        INS     v2.d[1], x8               // A5 ins
250        FMLA    v20.4s, v14.4s,  v3.s[1]
251        LDR     x8, [x5, 88]
252        FMLA    v22.4s, v14.4s,  v3.s[3]
253        FMLA    v24.4s, v14.4s,  v4.s[1]
254
255        # BLOCK 5
256        LDR     d18, [x5, 96]
257        INS     v17.d[1], x8              // B
258        FMLA    v26.4s, v14.4s,  v4.s[3]
259        LDR     x8, [x5, 104]
260        FMLA    v28.4s, v14.4s,  v5.s[1]
261        FMLA    v30.4s, v14.4s,  v5.s[3]
262
263        # BLOCK 6
264        LDR     d19, [x5, 112]
265        INS     v18.d[1], x8              // B
266        FMLA    v21.4s, v15.4s,  v3.s[1]
267        LDR     x8, [x5, 120]             // upper half of v19, inserted by the next BLOCK 0
268        FMLA    v23.4s, v15.4s,  v3.s[3]
269        FMLA    v25.4s, v15.4s,  v4.s[1]
270
271        # BLOCK 7
272        SUBS    x0, x0, 16                // LDR lands here
273        FMLA    v27.4s, v15.4s,  v4.s[3]
274        FMLA    v29.4s, v15.4s,  v5.s[1]
275        ADD     x5, x5, 128               // w += 128 bytes loaded in this iteration; ADD keeps the SUBS flags
276        FMLA    v31.4s, v15.4s,  v5.s[3]
277        B.HS    2b                        // loop while k did not go below zero
278
279        # Epilogue - 4 floats of A (16 bytes)
280        # 48 FMA + 6 LD64 A + 8 LD64 B (nothing is loaded for a further iteration)
2813:
282        # First group of 24 FMA, Second group loads
283        # BLOCK 0
284        LDR     d3, [x14], 8              // A0
285        INS     v19.d[1], x8              // B from second group
286        FMLA    v20.4s, v16.4s,  v0.s[0]
287        LDR     x8, [x15], 8              // A1
288        FMLA    v22.4s, v16.4s,  v0.s[2]
289        FMLA    v24.4s, v16.4s,  v1.s[0]
290
291        # BLOCK 1
292        LDR     d12, [x5]
293        INS     v3.d[1], x8               // A1 ins
294        FMLA    v26.4s, v16.4s,  v1.s[2]
295        LDR     x8, [x5, 8]               // B
296        FMLA    v28.4s, v16.4s,  v2.s[0]
297        FMLA    v30.4s, v16.4s,  v2.s[2]
298
299        # BLOCK 2
300        LDR     d4, [x20], 8              // A2
301        INS     v12.d[1], x8              // B  ins
302        FMLA    v21.4s, v17.4s,  v0.s[0]
303        LDR     x8, [x21], 8              // A3
304        FMLA    v23.4s, v17.4s,  v0.s[2]
305        FMLA    v25.4s, v17.4s,  v1.s[0]
306
307        # BLOCK 3
308        LDR     d5, [x22], 8              // A4
309        INS     v4.d[1], x8               // A3 ins
310        FMLA    v27.4s, v17.4s,  v1.s[2]
311        LDR     x8, [x23], 8              // A5
312        FMLA    v29.4s, v17.4s,  v2.s[0]
313        FMLA    v31.4s, v17.4s,  v2.s[2]
314
315        # BLOCK 4
316        LDR     d13, [x5, 16]
317        INS     v5.d[1], x8               // A5 ins
318        FMLA    v20.4s, v18.4s,  v0.s[1]
319        LDR     x8, [x5, 24]
320        FMLA    v22.4s, v18.4s,  v0.s[3]
321        FMLA    v24.4s, v18.4s,  v1.s[1]
322
323        # BLOCK 5
324        LDR     d14, [x5, 32]
325        INS     v13.d[1], x8              // B
326        FMLA    v26.4s, v18.4s,  v1.s[3]
327        LDR     x8, [x5, 40]
328        FMLA    v28.4s, v18.4s,  v2.s[1]
329        FMLA    v30.4s, v18.4s,  v2.s[3]
330
331        # BLOCK 6
332        LDR     d15, [x5, 48]
333        INS     v14.d[1], x8              // B
334        FMLA    v21.4s, v19.4s,  v0.s[1]
335        LDR     x8, [x5, 56]
336        FMLA    v23.4s, v19.4s,  v0.s[3]
337        FMLA    v25.4s, v19.4s,  v1.s[1]
338
339        # BLOCK 7
340        INS     v15.d[1], x8              // B from previous
341        FMLA    v27.4s, v19.4s,  v1.s[3]
342        FMLA    v29.4s, v19.4s,  v2.s[1]
343        FMLA    v31.4s, v19.4s,  v2.s[3]
344
345        # Second group of 24 FMA, no further loads
346        # BLOCK 0
347        FMLA    v20.4s, v12.4s,  v3.s[0]
348        FMLA    v22.4s, v12.4s,  v3.s[2]
349        FMLA    v24.4s, v12.4s,  v4.s[0]
350
351        # BLOCK 1
352        FMLA    v26.4s, v12.4s,  v4.s[2]
353        FMLA    v28.4s, v12.4s,  v5.s[0]
354        FMLA    v30.4s, v12.4s,  v5.s[2]
355
356        # BLOCK 2
357        FMLA    v21.4s, v13.4s,  v3.s[0]
358        FMLA    v23.4s, v13.4s,  v3.s[2]
359        FMLA    v25.4s, v13.4s,  v4.s[0]
360
361        # BLOCK 3
362        FMLA    v27.4s, v13.4s,  v4.s[2]
363        FMLA    v29.4s, v13.4s,  v5.s[0]
364        FMLA    v31.4s, v13.4s,  v5.s[2]
365
366        # BLOCK 4
367        FMLA    v20.4s, v14.4s,  v3.s[1]
368        FMLA    v22.4s, v14.4s,  v3.s[3]
369        FMLA    v24.4s, v14.4s,  v4.s[1]
370
371        # BLOCK 5
372        FMLA    v26.4s, v14.4s,  v4.s[3]
373        FMLA    v28.4s, v14.4s,  v5.s[1]
374        FMLA    v30.4s, v14.4s,  v5.s[3]
375        TST     x0, 15                    // leftover bytes of kc below 16? flags are consumed by B.NE 5f
376
377        # BLOCK 6
378        FMLA    v21.4s, v15.4s,  v3.s[1]
379        FMLA    v23.4s, v15.4s,  v3.s[3]
380        FMLA    v25.4s, v15.4s,  v4.s[1]
381        ADD     x5, x5, 64                // w += 64 bytes (v12-v15 loaded above); ADD keeps the TST flags
382
383        # BLOCK 7
384        FMLA    v27.4s, v15.4s,  v4.s[3]
385        FMLA    v29.4s, v15.4s,  v5.s[1]
386        FMLA    v31.4s, v15.4s,  v5.s[3]
387
388        # Is there a remainder?- 2 floats of A (8 bytes) or less
389        B.NE    5f
390
3914:
392        # ks loop
393        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
394        B.HI    1b
395
396        # Clamp
397        FMAX    v20.4s, v20.4s, v6.4s
398        # Load cn_stride
399        LDR     x0, [sp, 64]            // offset 64 accounts for the 64-byte register save area
400        FMAX    v21.4s, v21.4s, v6.4s
401        FMAX    v22.4s, v22.4s, v6.4s
402        FMAX    v23.4s, v23.4s, v6.4s
403        FMAX    v24.4s, v24.4s, v6.4s
404        FMAX    v25.4s, v25.4s, v6.4s
405        FMAX    v26.4s, v26.4s, v6.4s
406        FMAX    v27.4s, v27.4s, v6.4s
407        FMAX    v28.4s, v28.4s, v6.4s
408        FMAX    v29.4s, v29.4s, v6.4s
409        FMAX    v30.4s, v30.4s, v6.4s
410        FMAX    v31.4s, v31.4s, v6.4s
411        SUBS    x1, x1, 8               // nc -= 8; flags drive both B.LO 7f and the later B.HI 0b
412        FMIN    v20.4s, v20.4s, v7.4s
413        FMIN    v21.4s, v21.4s, v7.4s
414        FMIN    v22.4s, v22.4s, v7.4s
415        FMIN    v23.4s, v23.4s, v7.4s
416        FMIN    v24.4s, v24.4s, v7.4s
417        FMIN    v25.4s, v25.4s, v7.4s
418        FMIN    v26.4s, v26.4s, v7.4s
419        FMIN    v27.4s, v27.4s, v7.4s
420        FMIN    v28.4s, v28.4s, v7.4s
421        FMIN    v29.4s, v29.4s, v7.4s
422        FMIN    v30.4s, v30.4s, v7.4s
423        FMIN    v31.4s, v31.4s, v7.4s
424
425        # Store full 6 x 8
426        B.LO    7f                      // fewer than 8 columns left: partial store
427
        # Rows are stored from c5 down to c0; each pointer then advances by cn_stride.
428        STP     q30, q31,  [x7]
429        ADD     x7, x7, x0
430        STP     q28, q29, [x13]
431        ADD     x13, x13, x0
432        STP     q26, q27, [x10]
433        ADD     x10, x10, x0
434        STP     q24, q25, [x17]
435        ADD     x17, x17, x0
436        STP     q22, q23, [x16]
437        ADD     x16, x16, x0
438        STP     q20, q21,  [x6]
439        ADD     x6,  x6, x0
440
441        SUB     x4, x4, x3              // A -= ks
442
443        # nc loop
444        B.HI    0b                      // still uses the flags of SUBS x1 (STP/ADD/SUB keep them)
445
446        # Restore x20-x23, d12-d15 from stack
447        LDP     x22, x23, [sp, 48]
448        LDP     x20, x21, [sp, 32]
449        LDP     d14, d15, [sp, 16]
450        LDP     d12, d13, [sp], 64
451        RET
452
4535:
454        # Is there a remainder?- 2 floats of A (8 bytes)
        # Only multiples of 16 were subtracted from k, so bits 2 and 3 of x0
        # still equal bits 2 and 3 of kc.
455        TBZ     x0, 3, 6f               // bit 3 clear: no 8-byte piece, go to the 4-byte remainder
456
457        # Remainder- 2 floats of A (8 bytes)
458        LDR     d0, [x14], 8
459        LDR     q16, [x5], 16
460        LD1     {v0.d}[1], [x15], 8
461        LDR     d1, [x20], 8
462        LD1     {v1.d}[1], [x21], 8
463        LDR     d2, [x22], 8
464        LD1     {v2.d}[1], [x23], 8
465        LDR     q17, [x5], 16
466        LDR     q18, [x5], 16
467        LDR     q19, [x5], 16
468        FMLA    v20.4s, v16.4s,  v0.s[0]
469        FMLA    v22.4s, v16.4s,  v0.s[2]
470        FMLA    v24.4s, v16.4s,  v1.s[0]
471        FMLA    v26.4s, v16.4s,  v1.s[2]
472        FMLA    v28.4s, v16.4s,  v2.s[0]
473        FMLA    v30.4s, v16.4s,  v2.s[2]
474        FMLA    v21.4s, v17.4s,  v0.s[0]
475        FMLA    v23.4s, v17.4s,  v0.s[2]
476        FMLA    v25.4s, v17.4s,  v1.s[0]
477        FMLA    v27.4s, v17.4s,  v1.s[2]
478        FMLA    v29.4s, v17.4s,  v2.s[0]
479        FMLA    v31.4s, v17.4s,  v2.s[2]
480
481        FMLA    v20.4s, v18.4s,  v0.s[1]
482        FMLA    v22.4s, v18.4s,  v0.s[3]
483        FMLA    v24.4s, v18.4s,  v1.s[1]
484        FMLA    v26.4s, v18.4s,  v1.s[3]
485        FMLA    v28.4s, v18.4s,  v2.s[1]
486        FMLA    v30.4s, v18.4s,  v2.s[3]
487        FMLA    v21.4s, v19.4s,  v0.s[1]
488        FMLA    v23.4s, v19.4s,  v0.s[3]
489        FMLA    v25.4s, v19.4s,  v1.s[1]
490        FMLA    v27.4s, v19.4s,  v1.s[3]
491        FMLA    v29.4s, v19.4s,  v2.s[1]
492        FMLA    v31.4s, v19.4s,  v2.s[3]
493
494        # Is there a remainder?- 1 float of A (4 bytes)
495        TBZ     x0, 2, 4b               // bit 2 clear: this group of A pointers is finished
4966:
497        # Remainder- 1 float of A (4 bytes)
        # Rows 1/3/5 are loaded into lane 2 so that the s[0]/s[2] lane selectors
        # match the ones used by the wider code paths. Only 32 bytes of w
        # (v16, v17) are consumed here.
498        LDR     s0,  [x14], 4
499        LDR     q16, [x5], 16
500        LD1     {v0.s}[2], [x15], 4
501        LDR     s1, [x20], 4
502        LD1     {v1.s}[2], [x21], 4
503        LDR     s2, [x22], 4
504        LD1     {v2.s}[2], [x23], 4
505        LDR     q17, [x5], 16
506
507        FMLA    v20.4s, v16.4s,  v0.s[0]
508        FMLA    v22.4s, v16.4s,  v0.s[2]
509        FMLA    v24.4s, v16.4s,  v1.s[0]
510        FMLA    v26.4s, v16.4s,  v1.s[2]
511        FMLA    v28.4s, v16.4s,  v2.s[0]
512        FMLA    v30.4s, v16.4s,  v2.s[2]
513        FMLA    v21.4s, v17.4s,  v0.s[0]
514        FMLA    v23.4s, v17.4s,  v0.s[2]
515        FMLA    v25.4s, v17.4s,  v1.s[0]
516        FMLA    v27.4s, v17.4s,  v1.s[2]
517        FMLA    v29.4s, v17.4s,  v2.s[0]
518        FMLA    v31.4s, v17.4s,  v2.s[2]
519        B       4b
520
521        # Store odd width
        # Reached with x1 = nc - 8 (below zero). Subtracting 8 leaves bits 0-2
        # unchanged, so they still give the number of columns left (4 + 2 + 1).
        # After each partial store the not yet written lanes are shifted down.
5227:
523        TBZ     x1, 2, 8f               // bit 2 set: store 4 floats per row
524        STR     q30,  [x7], 16
525        MOV     v30.16b, v31.16b
526        STR     q28, [x13], 16
527        MOV     v28.16b, v29.16b
528        STR     q26, [x10], 16
529        MOV     v26.16b, v27.16b
530        STR     q24, [x17], 16
531        MOV     v24.16b, v25.16b
532        STR     q22, [x16], 16
533        MOV     v22.16b, v23.16b
534        STR     q20,  [x6], 16
535        MOV     v20.16b, v21.16b
5368:
537        TBZ     x1, 1, 9f               // bit 1 set: store 2 floats per row
538        STR     d30,  [x7], 8
539        STR     d28, [x13], 8
540        DUP     d30, v30.d[1]
541        DUP     d28, v28.d[1]
542        STR     d26, [x10], 8
543        STR     d24, [x17], 8
544        DUP     d26, v26.d[1]
545        DUP     d24, v24.d[1]
546        STR     d22, [x16], 8
547        STR     d20,  [x6], 8
548        DUP     d22, v22.d[1]
549        DUP     d20, v20.d[1]
550
5519:
552        TBZ     x1, 0, 10f              // bit 0 set: store 1 float per row
553        STR     s30,  [x7]
554        STR     s28, [x13]
555        STR     s26, [x10]
556        STR     s24, [x17]
557        STR     s22, [x16]
558        STR     s20,  [x6]
55910:
560        # Restore x20-x23, d12-d15 from stack
561        LDP     x22, x23, [sp, 48]
562        LDP     x20, x21, [sp, 32]
563        LDP     d14, d15, [sp, 16]
564        LDP     d12, d13, [sp], 64
565        RET
566
567END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
568
569#ifdef __ELF__
570.section ".note.GNU-stack","",%progbits
571#endif
572