// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Overview
#   Accumulates a 6 row x 8 column tile of floats in v20-v31. Each tile starts
#   from 8 floats read from w, then for every group of 6 A row pointers (ks
#   loop) and every float of kc it adds A[row] * B[0..7] with FMLA. The tile is
#   clamped with FMAX against v6 and FMIN against v7 and stored to the six C
#   row pointers. The nc loop repeats this 8 columns at a time.
#
#   x0 holds mr only while the C row pointers are clamped. It is then reused
#   as the remaining-kc byte counter and, after the ks loop, as cn_stride.
#   kc, ks and cn_stride are byte counts (kc is consumed 16/8/4 bytes at a
#   time, ks is consumed 6 pointers = 48 bytes at a time).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Stack frame (80 bytes, pushed after the stack arguments have been read)
#   [sp +  0] d12 d13
#   [sp + 16] d14 d15
#   [sp + 32] x19 x20
#   [sp + 48] x21 x22
#   [sp + 64] x23        (bytes 72-79 are padding)
#   [sp + 80] cn_stride  (first stack argument of the caller)

# Register usage
# A0 x14  v0     v3
# A1 x15  v0[1]  v3[1]
# A2 x20  v1     v4
# A3 x21  v1[1]  v4[1]
# A4 x22  v2     v5
# A5 x23  v2[1]  v5[1]

# B   x5  v12 v13 v14 v15 second set of B
# B       v16 v17 v18 v19 first set

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x10 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x19 temporary vector shadow register
#   In the main loop and epilogue the upper 64 bits of a vector are loaded
#   into x19 with a scalar LDR and merged later with INS, so every load in the
#   software pipeline is 64 bits wide.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Clamp C pointers
        # Rows at index >= mr alias the previous row, so stores to them are
        # harmless duplicates. ADD does not modify the flags, therefore each
        # CMP feeds two CSEL instructions (LO and LS).
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load a_offset
        LDR     x11, [sp, 8]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        # v6 = first float at params (lower bound, used with FMAX)
        # v7 = second float at params (upper bound, used with FMIN)
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save x19-x23, d12-d15 on stack
        STP     d12, d13, [sp, -80]!
        STP     d14, d15, [sp, 16]
        STP     x19, x20, [sp, 32]
        STP     x21, x22, [sp, 48]
        STR     x23,      [sp, 64]

0:
        # nc loop entry: one 6x8 output tile per iteration
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

1:
        # ks loop entry
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to zero is used as is, any other pointer gets
        # a_offset added. CMP runs before the ADD so it sees the unmodified
        # pointer, and ADD leaves the flags intact for the CSEL.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        # On underflow x0 wraps, but its low 4 bits still equal those of kc,
        # which is what the remainder code at 5 tests.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - First group loads, no FMA
        LDR     d0, [x14], 8              // a0
        LDP     q16, q17, [x5], 32        // b
        LDR     d1, [x20], 8              // a2
        LDR     d2, [x22], 8              // a4
        LD1     {v0.d}[1], [x15], 8       // a1
        LD1     {v1.d}[1], [x21], 8       // a3
        LD1     {v2.d}[1],  [x23], 8      // a5
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x19, [x5], 8            // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        # The loads above do not touch the flags set by SUBS.
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        # (8 B vectors, each fetched as two 64-bit halves)
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     d3, [x14], 8             // a0
        FMLA    v22.4s, v16.4s,  v0.s[2]
        INS     v19.d[1], x19              // b from second group
        FMLA    v24.4s, v16.4s,  v1.s[0]
        LDR     x19, [x15], 8             // a1

        # BLOCK 1
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     d12, [x5]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        INS     v3.d[1], x19               // a1 ins
        FMLA    v30.4s, v16.4s,  v2.s[2]
        LDR     x19, [x5, 8]            // b

        # BLOCK 2
        FMLA    v21.4s, v17.4s,  v0.s[0]
        LDR     d4, [x20], 8             // a2
        FMLA    v23.4s, v17.4s,  v0.s[2]
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x21], 8             // a3

        # BLOCK 3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        LDR     d5, [x22], 8             // a4
        FMLA    v29.4s, v17.4s,  v2.s[0]
        INS     v4.d[1], x19               // a3 ins
        FMLA    v31.4s, v17.4s,  v2.s[2]
        LDR     x19, [x23], 8             // a5

        # BLOCK 4
        FMLA    v20.4s, v18.4s,  v0.s[1]
        LDR     d13, [x5, 16]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        INS     v5.d[1], x19               // a5 ins
        FMLA    v24.4s, v18.4s,  v1.s[1]
        LDR     x19, [x5, 24]

        # BLOCK 5
        FMLA    v26.4s, v18.4s,  v1.s[3]
        LDR     d14, [x5, 32]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        INS     v13.d[1], x19           // b
        FMLA    v30.4s, v18.4s,  v2.s[3]
        LDR     x19, [x5, 40]

        # BLOCK 6
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     d15, [x5, 48]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        INS     v14.d[1], x19           // b
        FMLA    v25.4s, v19.4s,  v1.s[1]
        LDR     x19, [x5, 56]

        # BLOCK 7
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        INS     v15.d[1], x19
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        FMLA    v20.4s, v12.4s,  v3.s[0]
        LDR     d0, [x14], 8             // a0
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        LDR     x19, [x15], 8             // a1

        # BLOCK 1
        FMLA    v26.4s, v12.4s,  v4.s[2]
        LDR     d16, [x5, 64]
        FMLA    v28.4s, v12.4s,  v5.s[0]
        INS     v0.d[1], x19               // a1 ins
        FMLA    v30.4s, v12.4s,  v5.s[2]
        LDR     x19, [x5, 72]           // b

        # BLOCK 2
        FMLA    v21.4s, v13.4s,  v3.s[0]
        LDR     d1, [x20], 8             // a2
        FMLA    v23.4s, v13.4s,  v3.s[2]
        INS     v16.d[1], x19           // b
        FMLA    v25.4s, v13.4s,  v4.s[0]
        LDR     x19, [x21], 8             // a3

        # BLOCK 3
        FMLA    v27.4s, v13.4s,  v4.s[2]
        LDR     d2, [x22], 8             // a4
        FMLA    v29.4s, v13.4s,  v5.s[0]
        INS     v1.d[1], x19               // a3 ins
        FMLA    v31.4s, v13.4s,  v5.s[2]
        LDR     x19,  [x23], 8            // a5

        # BLOCK 4
        FMLA    v20.4s, v14.4s,  v3.s[1]
        LDR     d17, [x5, 80]
        FMLA    v22.4s, v14.4s,  v3.s[3]
        INS     v2.d[1], x19               // a5 ins
        FMLA    v24.4s, v14.4s,  v4.s[1]
        LDR     x19, [x5, 88]

        # BLOCK 5
        FMLA    v26.4s, v14.4s,  v4.s[3]
        LDR     d18, [x5, 96]
        FMLA    v28.4s, v14.4s,  v5.s[1]
        INS     v17.d[1], x19           // b
        FMLA    v30.4s, v14.4s,  v5.s[3]
        LDR     x19, [x5, 104]

        # BLOCK 6
        FMLA    v21.4s, v15.4s,  v3.s[1]
        LDR     d19, [x5, 112]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        INS     v18.d[1], x19           // b
        FMLA    v25.4s, v15.4s,  v4.s[1]
        LDR     x19, [x5, 120]          // upper half of v19, ins is in BLOCK 0

        # BLOCK 7
        FMLA    v27.4s, v15.4s,  v4.s[3]
        SUBS    x0, x0, 16
        FMLA    v29.4s, v15.4s,  v5.s[1]
        ADD     x5, x5, 128
        FMLA    v31.4s, v15.4s,  v5.s[3]
        B.HS    2b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 4 LDR B
        # (only the second set of A and B is still loaded here)
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     d3, [x14], 8             // a0
        FMLA    v22.4s, v16.4s,  v0.s[2]
        INS     v19.d[1], x19              // b from second group
        FMLA    v24.4s, v16.4s,  v1.s[0]
        LDR     x19, [x15], 8             // a1

        # BLOCK 1
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     d12, [x5]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        INS     v3.d[1], x19               // a1 ins
        FMLA    v30.4s, v16.4s,  v2.s[2]
        LDR     x19, [x5, 8]            // b

        # BLOCK 2
        FMLA    v21.4s, v17.4s,  v0.s[0]
        LDR     d4, [x20], 8             // a2
        FMLA    v23.4s, v17.4s,  v0.s[2]
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x21], 8             // a3

        # BLOCK 3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        LDR     d5, [x22], 8             // a4
        FMLA    v29.4s, v17.4s,  v2.s[0]
        INS     v4.d[1], x19               // a3 ins
        FMLA    v31.4s, v17.4s,  v2.s[2]
        LDR     x19, [x23], 8             // a5

        # BLOCK 4
        FMLA    v20.4s, v18.4s,  v0.s[1]
        LDR     d13, [x5, 16]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        INS     v5.d[1], x19               // a5 ins
        FMLA    v24.4s, v18.4s,  v1.s[1]
        LDR     x19, [x5, 24]

        # BLOCK 5
        FMLA    v26.4s, v18.4s,  v1.s[3]
        LDR     d14, [x5, 32]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        INS     v13.d[1], x19           // b
        FMLA    v30.4s, v18.4s,  v2.s[3]
        LDR     x19, [x5, 40]

        # BLOCK 6
        LDR     d15, [x5, 48]
        FMLA    v21.4s, v19.4s,  v0.s[1]
        INS     v14.d[1], x19           // b
        FMLA    v23.4s, v19.4s,  v0.s[3]
        LDR     x19, [x5, 56]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x19           // b from previous
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, no loads - prefetch the C rows instead
        # BLOCK 0
        FMLA    v20.4s, v12.4s,  v3.s[0]
        PRFM    PSTL1KEEP,  [x6]          // Prefetch C0
        FMLA    v22.4s, v12.4s,  v3.s[2]
        PRFM    PSTL1KEEP, [x16]          // Prefetch C1
        FMLA    v24.4s, v12.4s,  v4.s[0]
        PRFM    PSTL1KEEP, [x17]          // Prefetch C2

        # BLOCK 1
        FMLA    v26.4s, v12.4s,  v4.s[2]
        PRFM    PSTL1KEEP, [x10]          // Prefetch C3
        FMLA    v28.4s, v12.4s,  v5.s[0]
        PRFM    PSTL1KEEP, [x13]          // Prefetch C4
        FMLA    v30.4s, v12.4s,  v5.s[2]
        PRFM    PSTL1KEEP, [x7]           // Prefetch C5

        # BLOCK 2
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]
        FMLA    v25.4s, v13.4s,  v4.s[0]

        # BLOCK 3
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v29.4s, v13.4s,  v5.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[2]

        # BLOCK 4
        FMLA    v20.4s, v14.4s,  v3.s[1]
        FMLA    v22.4s, v14.4s,  v3.s[3]
        FMLA    v24.4s, v14.4s,  v4.s[1]

        # BLOCK 5
        FMLA    v26.4s, v14.4s,  v4.s[3]
        FMLA    v28.4s, v14.4s,  v5.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[3]
        TST     x0, 15                  // any kc bytes left (low 4 bits)?

        # BLOCK 6
        FMLA    v21.4s, v15.4s,  v3.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        ADD     x5, x5, 64              // skip the second set of B

        # BLOCK 7
        FMLA    v27.4s, v15.4s,  v4.s[3]
        FMLA    v29.4s, v15.4s,  v5.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        # Uses the flags from the TST above; FMLA and ADD leave them intact.
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # The first stack argument now sits above the 80 byte save area.
        LDR     x0, [sp, 80]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8, flags used by B.LO and B.HI
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Fewer than 8 columns left: take the odd width path
        B.LO    7f

        # Store full 6 x 8
        # Each C row pointer advances by cn_stride for the next tile.
        STP     q30, q31,  [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Still the flags of SUBS x1: STP, ADD and SUB do not modify them.
        B.HI    0b

        # Restore x19-x23, d12-d15 from stack
        LDR     x23,      [sp, 64]
        LDP     x21, x22, [sp, 48]
        LDP     x19, x20, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 80
        RET

5:
        # Is there a remainder? - 2 floats of A (8 bytes)
        # Bit 3 of x0 is bit 3 of the remaining kc byte count.
        TBZ     x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        LDR     d0, [x14], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x15], 8
        LDR     d1, [x20], 8
        LD1     {v1.d}[1], [x21], 8
        LDR     d2, [x22], 8
        LD1     {v2.d}[1], [x23], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[1]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        FMLA    v30.4s, v18.4s,  v2.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[1]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        # Even rows land in lane 0 and odd rows in lane 2, matching the
        # s[0] / s[2] indexing used by the FMLA instructions below.
        LDR     s0,  [x14], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x15], 4
        LDR     s1, [x20], 4
        LD1     {v1.s}[2], [x21], 4
        LDR     s2, [x22], 4
        LD1     {v2.s}[2], [x23], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]
        B       4b

        # Store odd width
        # x1 has wrapped below zero here, but its low 3 bits still equal the
        # number of columns left, so bits 2, 1 and 0 select 4, 2 and 1 floats.
7:
        TBZ     x1, 2, 8f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b        // remaining columns move to the low vector
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 1, 9f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]           // upper pair moves to the low half
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30,  [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
10:
        # Restore x19-x23, d12-d15 from stack
        LDR     x23,      [sp, 64]
        LDP     x21, x22, [sp, 48]
        LDP     x19, x20, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
// Empty .note.GNU-stack section: tells GNU toolchains that this object does
// not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
