xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# Register usage
27# A0  x3 v0     v3
28# A1  x9 v0[1]  v3[1]
29# A2 x10 v1     v4
30# A3 x11 v1[1]  v4[1]
31# A4 x12 v2     v5
32# A5  x4 v2[1]  v5[1]
33
34# B   x5 v12 v13 v14 v15 second set of B
35# B      v16 v17 v18 v19 first set
36
37# C0  x6 v20 v21
38# C1 x16 v22 v23
39# C2 x17 v24 v25
40# C3 x14 v26 v27
41# C4 x13 v28 v29
42# C5  x7 v30 v31
43
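44# Clamp v6 (min, applied with FMAX) v7 (max, applied with FMIN)
45# unused A   v8 v9 v10 v11
46# x8 temporary GPR: carries the upper 64 bits of a vector register (LDR into x8, then INS into v.d[1])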
47
48BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55
49
        # Entry sequence: fetch the params pointer from the stack, build the
        # per-row A pointers (x9, x10, x11, x12, x4) and C pointers
        # (x16, x17, x14, x13, x7) starting at a0/c0 with the two strides,
        # broadcast the clamp bounds into v6/v7 and save d12-d15.
50        # Load params pointer
51        LDR     x8, [sp, 8]
52
53        # Clamp A and C pointers
        # A row whose index is >= mr reuses the pointer of the previous row.
        # Each CMP feeds two CSEL pairs: LO right after it, then LS for the
        # next row (the ADDs between them do not write the flags).
54        CMP     x0, 2                   // if mr < 2
55        ADD     x9, x3, x4              // a1 = a0 + a_stride
56        ADD     x16, x6, x7             // c1 = c0 + cm_stride
57        CSEL    x9, x3, x9, LO          //   a1 = a0
58        CSEL    x16, x6, x16, LO        //   c1 = c0
59
60        ADD     x10, x9, x4             // a2 = a1 + a_stride
61        ADD     x17, x16, x7            // c2 = c1 + cm_stride
62                                        // if mr <= 2
63        CSEL    x10, x9, x10, LS        //   a2 = a1
64        CSEL    x17, x16, x17, LS       //   c2 = c1
65
66        CMP     x0, 4                   // if mr < 4
67        ADD     x11, x10, x4            // a3 = a2 + a_stride
68        ADD     x14, x17, x7            // c3 = c2 + cm_stride
69        CSEL    x11, x10, x11, LO       //   a3 = a2
70        CSEL    x14, x17, x14, LO       //   c3 = c2
71
72        ADD     x12, x11, x4            // a4 = a3 + a_stride
73        ADD     x13, x14, x7            // c4 = c3 + cm_stride
74                                        // if mr <= 4
75        CSEL    x12, x11, x12, LS       //   a4 = a3
76        CSEL    x13, x14, x13, LS       //   c4 = c3
77
        # From here on x4 and x7 no longer hold the strides; they become the
        # a5 and c5 row pointers.
78        CMP     x0, 6                   // if mr < 6
79        ADD     x4, x12, x4             // a5 = a4 + a_stride
80        ADD     x7, x13, x7             // c5 = c4 + cm_stride
81        CSEL    x4, x12, x4, LO         //   a5 = a4
82        CSEL    x7, x13, x7, LO         //   c5 = c4
83
84        # Load min/max values
        # LD2R reads two consecutive 32-bit words at x8 and replicates the
        # first into every lane of v6 and the second into every lane of v7.
        # v6 is later the FMAX operand (lower bound) and v7 the FMIN operand
        # (upper bound).
85        LD2R    {v6.4s, v7.4s}, [x8]
86
87        # Save d12-d15 on stack
        # v12-v15 hold the second set of B, so their low halves (callee-saved
        # d12-d15) are spilled. SP drops by 32 bytes here, which is why
        # cn_stride is later read from [sp, 32] and not [sp].
88        STP     d12, d13, [sp, -32]!
89        STP     d14, d15, [sp, 16]
90
910:
        # Outer loop, one pass per block of 8 output columns (re-entered via
        # B.HI 0b after a full store). The two bias vectors at w seed all six
        # accumulator rows v20/v21 .. v30/v31.
92        # Load initial bias from w into accumulators
93        LDP     q20, q21, [x5], 32
94        SUBS    x0, x2, 16              // k = kc - 16
95        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
96        PRFM    PLDL1KEEP,  [x3, 64]
97        MOV     v22.16b, v20.16b
98        PRFM    PLDL1KEEP,  [x9,  0]
99        PRFM    PLDL1KEEP,  [x9, 64]
100        MOV     v23.16b, v21.16b
101        PRFM    PLDL1KEEP, [x10,  0]
102        PRFM    PLDL1KEEP, [x10, 64]
103        MOV     v24.16b, v20.16b
104        PRFM    PLDL1KEEP, [x11,  0]
105        PRFM    PLDL1KEEP, [x11, 64]
106        MOV     v25.16b, v21.16b
107        PRFM    PLDL1KEEP, [x12,  0]
108        PRFM    PLDL1KEEP, [x12, 64]
109        MOV     v26.16b, v20.16b
110        PRFM    PLDL1KEEP,  [x4,  0]
111        PRFM    PLDL1KEEP,  [x4, 64]
112        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
113        MOV     v27.16b, v21.16b
114        PRFM    PLDL1KEEP, [x5,  64]
115        MOV     v28.16b, v20.16b
116        PRFM    PLDL1KEEP, [x5, 128]
117        MOV     v29.16b, v21.16b
118        PRFM    PLDL1KEEP, [x5, 192]
119        MOV     v30.16b, v20.16b
120        PRFM    PLDL1KEEP, [x5, 256]
121        MOV     v31.16b, v21.16b
122        PRFM    PLDL1KEEP, [x5, 320]
123
124        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # The branch consumes the flags of SUBS x0, x2, 16 above; the PRFM and
        # MOV instructions in between leave the flags untouched.
125        B.LO    4f
126
127        # Prologue - First group loads, no FMA
        # Packing of A: v0 = a0 (low 64 bits) | a1 (high 64 bits), v1 = a2 | a3,
        # v2 = a4 | a5, two floats per row. B: q16, q17, q18 are loaded whole;
        # v19 gets its low half here and its high half travels through x8.
128        LDR     d0, [x3], 8               // a0
129        LDP     q16, q17, [x5], 32        // b
130        LDR     d1, [x10], 8              // a2
131        LDR     d2, [x12], 8              // a4
132        LD1     {v0.d}[1],  [x9], 8       // a1
133        LD1     {v1.d}[1], [x11], 8       // a3
134        LD1     {v2.d}[1],  [x4], 8       // a5
135        SUBS    x0, x0, 16
136        LDR     q18, [x5], 16
137        LDR     d19, [x5], 8
138        LDR     x8, [x5], 8             // ins is in BLOCK 0
139
140        # Is there at least 4 floats (16 bytes) for main loop?
141        B.LO    2f
142
143        # Main loop - 4 floats of A (16 bytes)
144        # 48 FMA + 12 LD64 A + 8 LDR B
        # Lane convention for the by-element FMLAs: s[0]/s[1] of v0-v5 belong
        # to the even row (a0/a2/a4), s[2]/s[3] to the odd row (a1/a3/a5).
        # x5 is not post-incremented inside the loop body: B is read at fixed
        # offsets 0..120 and x5 advances by 128 in the last BLOCK 7.
        # NOTE(review): each 128-bit vector is assembled from a 64-bit LDR d
        # plus LDR x / INS, presumably to fit Cortex-A55 issue rules - confirm
        # against the core optimization guide.
1451:
146        # First group of 24 FMA, Second group loads
147        # BLOCK 0
148        FMLA    v20.4s, v16.4s,  v0.s[0]
149        LDR     d3, [x3], 8              // a0
150        FMLA    v22.4s, v16.4s,  v0.s[2]
151        INS     v19.d[1], x8               // b from second group
152        FMLA    v24.4s, v16.4s,  v1.s[0]
153        LDR     x8, [x9], 8              // a1
154
155        # BLOCK 1
156        FMLA    v26.4s, v16.4s,  v1.s[2]
157        LDR     d12, [x5]
158        FMLA    v28.4s, v16.4s,  v2.s[0]
159        INS     v3.d[1], x8                // a1 ins
160        FMLA    v30.4s, v16.4s,  v2.s[2]
161        LDR     x8, [x5, 8]              // b
162
163        # BLOCK 2
164        FMLA    v21.4s, v17.4s,  v0.s[0]
165        LDR     d4, [x10], 8             // a2
166        FMLA    v23.4s, v17.4s,  v0.s[2]
167        INS     v12.d[1], x8               // b  ins
168        FMLA    v25.4s, v17.4s,  v1.s[0]
169        LDR     x8, [x11], 8             // a3
170
171        # BLOCK 3
172        FMLA    v27.4s, v17.4s,  v1.s[2]
173        LDR     d5, [x12], 8             // a4
174        FMLA    v29.4s, v17.4s,  v2.s[0]
175        INS     v4.d[1], x8                // a3 ins
176        FMLA    v31.4s, v17.4s,  v2.s[2]
177        LDR     x8, [x4], 8              // a5
178
179        # BLOCK 4
180        FMLA    v20.4s, v18.4s,  v0.s[1]
181        LDR     d13, [x5, 16]
182        FMLA    v22.4s, v18.4s,  v0.s[3]
183        INS     v5.d[1], x8                // a5 ins
184        FMLA    v24.4s, v18.4s,  v1.s[1]
185        LDR     x8, [x5, 24]
186
187        # BLOCK 5
188        FMLA    v26.4s, v18.4s,  v1.s[3]
189        LDR     d14, [x5, 32]
190        FMLA    v28.4s, v18.4s,  v2.s[1]
191        INS     v13.d[1], x8               // b
192        FMLA    v30.4s, v18.4s,  v2.s[3]
193        LDR     x8, [x5, 40]
194
195        # BLOCK 6
196        FMLA    v21.4s, v19.4s,  v0.s[1]
197        LDR     d15, [x5, 48]
198        FMLA    v23.4s, v19.4s,  v0.s[3]
199        INS     v14.d[1], x8               // b
200        FMLA    v25.4s, v19.4s,  v1.s[1]
201        LDR     x8, [x5, 56]
202
203        # BLOCK 7
204        FMLA    v27.4s, v19.4s,  v1.s[3]
205        FMLA    v29.4s, v19.4s,  v2.s[1]
206        INS     v15.d[1], x8
207        FMLA    v31.4s, v19.4s,  v2.s[3]
208
209        # Second group of 24 FMA, First group of loads
210        # BLOCK 0
211        FMLA    v20.4s, v12.4s,  v3.s[0]
212        LDR     d0, [x3], 8              // a0
213        FMLA    v22.4s, v12.4s,  v3.s[2]
214        FMLA    v24.4s, v12.4s,  v4.s[0]
215        LDR     x8, [x9], 8              // a1
216
217        # BLOCK 1
218        FMLA    v26.4s, v12.4s,  v4.s[2]
219        LDR     d16, [x5, 64]
220        FMLA    v28.4s, v12.4s,  v5.s[0]
221        INS     v0.d[1], x8                // a1 ins
222        FMLA    v30.4s, v12.4s,  v5.s[2]
223        LDR     x8, [x5, 72]             // b
224
225        # BLOCK 2
226        FMLA    v21.4s, v13.4s,  v3.s[0]
227        LDR     d1, [x10], 8             // a2
228        FMLA    v23.4s, v13.4s,  v3.s[2]
229        INS     v16.d[1], x8               // b
230        FMLA    v25.4s, v13.4s,  v4.s[0]
231        LDR     x8, [x11], 8             // a3
232
233        # BLOCK 3
234        FMLA    v27.4s, v13.4s,  v4.s[2]
235        LDR     d2, [x12], 8             // a4
236        FMLA    v29.4s, v13.4s,  v5.s[0]
237        INS     v1.d[1], x8                // a3 ins
238        FMLA    v31.4s, v13.4s,  v5.s[2]
239        LDR     x8,  [x4], 8             // a5
240
241        # BLOCK 4
242        FMLA    v20.4s, v14.4s,  v3.s[1]
243        LDR     d17, [x5, 80]
244        FMLA    v22.4s, v14.4s,  v3.s[3]
245        INS     v2.d[1], x8                // a5 ins
246        FMLA    v24.4s, v14.4s,  v4.s[1]
247        LDR     x8, [x5, 88]
248
249        # BLOCK 5
250        FMLA    v26.4s, v14.4s,  v4.s[3]
251        LDR     d18, [x5, 96]
252        FMLA    v28.4s, v14.4s,  v5.s[1]
253        INS     v17.d[1], x8               // b
254        FMLA    v30.4s, v14.4s,  v5.s[3]
255        LDR     x8, [x5, 104]
256
257        # BLOCK 6
258        FMLA    v21.4s, v15.4s,  v3.s[1]
259        LDR     d19, [x5, 112]
260        FMLA    v23.4s, v15.4s,  v3.s[3]
261        INS     v18.d[1], x8               // b
262        FMLA    v25.4s, v15.4s,  v4.s[1]
263        LDR     x8, [x5, 120]
264
        # x8 leaves this block holding the upper half of v19; the INS that
        # consumes it is in BLOCK 0 of the next iteration or of the epilogue.
        # The FMLA and ADD after SUBS do not write flags, so B.HS tests
        # the k decrement.
265        # BLOCK 7
266        FMLA    v27.4s, v15.4s,  v4.s[3]
267        SUBS    x0, x0, 16
268        FMLA    v29.4s, v15.4s,  v5.s[1]
269        ADD     x5, x5, 128
270        FMLA    v31.4s, v15.4s,  v5.s[3]
271        B.HS    1b
272
273        # Epilogue - 4 floats of A (16 bytes)
274        # 48 FMA + 12 LD64 A + 8 LDR B
        # NOTE: the load counts on the previous comment line describe the main
        # loop. In this epilogue only the first group loads (second-group A
        # into v3-v5 and B into v12-v15); the second group issues no A or B
        # loads, and x5 advances by 64, not 128.
2752:
276        # First group of 24 FMA, Second group loads
277        # BLOCK 0
278        FMLA    v20.4s, v16.4s,  v0.s[0]
279        LDR     d3, [x3], 8              // a0
280        FMLA    v22.4s, v16.4s,  v0.s[2]
281        INS     v19.d[1], x8               // b from second group
282        FMLA    v24.4s, v16.4s,  v1.s[0]
283        LDR     x8, [x9], 8              // a1
284
285        # BLOCK 1
286        FMLA    v26.4s, v16.4s,  v1.s[2]
287        LDR     d12, [x5]
288        FMLA    v28.4s, v16.4s,  v2.s[0]
289        INS     v3.d[1], x8                // a1 ins
290        FMLA    v30.4s, v16.4s,  v2.s[2]
291        LDR     x8, [x5, 8]              // b
292
293        # BLOCK 2
294        FMLA    v21.4s, v17.4s,  v0.s[0]
295        LDR     d4, [x10], 8             // a2
296        FMLA    v23.4s, v17.4s,  v0.s[2]
297        INS     v12.d[1], x8               // b  ins
298        FMLA    v25.4s, v17.4s,  v1.s[0]
299        LDR     x8, [x11], 8             // a3
300
301        # BLOCK 3
302        FMLA    v27.4s, v17.4s,  v1.s[2]
303        LDR     d5, [x12], 8             // a4
304        FMLA    v29.4s, v17.4s,  v2.s[0]
305        INS     v4.d[1], x8                // a3 ins
306        FMLA    v31.4s, v17.4s,  v2.s[2]
307        LDR     x8, [x4], 8              // a5
308
309        # BLOCK 4
310        FMLA    v20.4s, v18.4s,  v0.s[1]
311        LDR     d13, [x5, 16]
312        FMLA    v22.4s, v18.4s,  v0.s[3]
313        INS     v5.d[1], x8                // a5 ins
314        FMLA    v24.4s, v18.4s,  v1.s[1]
315        LDR     x8, [x5, 24]
316
317        # BLOCK 5
318        FMLA    v26.4s, v18.4s,  v1.s[3]
319        LDR     d14, [x5, 32]
320        FMLA    v28.4s, v18.4s,  v2.s[1]
321        INS     v13.d[1], x8               // b
322        FMLA    v30.4s, v18.4s,  v2.s[3]
323        LDR     x8, [x5, 40]
324
325        # BLOCK 6
326        FMLA    v21.4s, v19.4s,  v0.s[1]
327        LDR     d15, [x5, 48]
328        FMLA    v23.4s, v19.4s,  v0.s[3]
329        INS     v14.d[1], x8               // b
330        FMLA    v25.4s, v19.4s,  v1.s[1]
331        LDR     x8, [x5, 56]
332
333        # BLOCK 7
334        FMLA    v27.4s, v19.4s,  v1.s[3]
335        FMLA    v29.4s, v19.4s,  v2.s[1]
336        INS     v15.d[1], x8               // b
337        FMLA    v31.4s, v19.4s,  v2.s[3]
338
339        # Second group of 24 FMA, First group of loads
        # NOTE: no first-group loads happen in this epilogue copy; the slots
        # between the FMLAs of BLOCK 0 and BLOCK 1 carry store-prefetches of
        # the six C rows instead.
340        # BLOCK 0
341        FMLA    v20.4s, v12.4s,  v3.s[0]
342        PRFM    PSTL1KEEP,  [x6]          // Prefetch C0
343        FMLA    v22.4s, v12.4s,  v3.s[2]
344        PRFM    PSTL1KEEP, [x16]          // Prefetch C1
345        FMLA    v24.4s, v12.4s,  v4.s[0]
346        PRFM    PSTL1KEEP, [x17]          // Prefetch C2
347
348        # BLOCK 1
349        FMLA    v26.4s, v12.4s,  v4.s[2]
350        PRFM    PSTL1KEEP, [x14]          // Prefetch C3
351        FMLA    v28.4s, v12.4s,  v5.s[0]
352        PRFM    PSTL1KEEP, [x13]          // Prefetch C4
353        FMLA    v30.4s, v12.4s,  v5.s[2]
354        PRFM    PSTL1KEEP, [x7]           // Prefetch C5
355
356        # BLOCK 2
357        FMLA    v21.4s, v13.4s,  v3.s[0]
358        FMLA    v23.4s, v13.4s,  v3.s[2]
359        FMLA    v25.4s, v13.4s,  v4.s[0]
360
361        # BLOCK 3
362        FMLA    v27.4s, v13.4s,  v4.s[2]
363        FMLA    v29.4s, v13.4s,  v5.s[0]
364        FMLA    v31.4s, v13.4s,  v5.s[2]
365
366        # BLOCK 4
367        FMLA    v20.4s, v14.4s,  v3.s[1]
368        FMLA    v22.4s, v14.4s,  v3.s[3]
369        FMLA    v24.4s, v14.4s,  v4.s[1]
370
        # TST sets the flags for the B.NE 4f below: the low 4 bits of x0 are
        # the k bytes left over after the 16-byte steps. The FMLA and ADD
        # instructions that follow TST do not write flags.
371        # BLOCK 5
372        FMLA    v26.4s, v14.4s,  v4.s[3]
373        FMLA    v28.4s, v14.4s,  v5.s[1]
374        FMLA    v30.4s, v14.4s,  v5.s[3]
375        TST     x0, 15
376
377        # BLOCK 6
378        FMLA    v21.4s, v15.4s,  v3.s[1]
379        FMLA    v23.4s, v15.4s,  v3.s[3]
380        FMLA    v25.4s, v15.4s,  v4.s[1]
381        ADD     x5, x5, 64
382
383        # BLOCK 7
384        FMLA    v27.4s, v15.4s,  v4.s[3]
385        FMLA    v29.4s, v15.4s,  v5.s[1]
386        FMLA    v31.4s, v15.4s,  v5.s[3]
387
388        # Is there a remainder?- 2 floats of A (8 bytes) or less
389        B.NE    4f
3903:
        # Shared tail: reached by fall-through from the epilogue and by the
        # backward branches out of both remainder paths. Applies FMAX with v6
        # and then FMIN with v7 to all twelve accumulators.
391        # Clamp
392        FMAX    v20.4s, v20.4s, v6.4s
393        # Load cn_stride
        # cn_stride is the stack argument that was at [sp] on entry; after the
        # 32-byte d12-d15 save it sits at [sp, 32]. x0 is free to reuse as the
        # k counter is no longer needed for this column block.
394        LDR     x0, [sp, 32]
395        FMAX    v21.4s, v21.4s, v6.4s
396        FMAX    v22.4s, v22.4s, v6.4s
397        FMAX    v23.4s, v23.4s, v6.4s
398        FMAX    v24.4s, v24.4s, v6.4s
399        FMAX    v25.4s, v25.4s, v6.4s
400        FMAX    v26.4s, v26.4s, v6.4s
401        FMAX    v27.4s, v27.4s, v6.4s
402        FMAX    v28.4s, v28.4s, v6.4s
403        FMAX    v29.4s, v29.4s, v6.4s
404        FMAX    v30.4s, v30.4s, v6.4s
405        FMAX    v31.4s, v31.4s, v6.4s
        # nc -= 8. These flags drive both B.LO 6f (fewer than 8 columns left)
        # and, after the full-width stores, B.HI 0b (more columns remain).
406        SUBS    x1, x1, 8
407        FMIN    v20.4s, v20.4s, v7.4s
408        FMIN    v21.4s, v21.4s, v7.4s
409        FMIN    v22.4s, v22.4s, v7.4s
410        FMIN    v23.4s, v23.4s, v7.4s
411        FMIN    v24.4s, v24.4s, v7.4s
412        FMIN    v25.4s, v25.4s, v7.4s
413        FMIN    v26.4s, v26.4s, v7.4s
414        FMIN    v27.4s, v27.4s, v7.4s
415        FMIN    v28.4s, v28.4s, v7.4s
416        FMIN    v29.4s, v29.4s, v7.4s
417        FMIN    v30.4s, v30.4s, v7.4s
418        FMIN    v31.4s, v31.4s, v7.4s
419
420        # Store full 6 x 8
421        B.LO    6f
422
        # Each ST1 writes one 8-float row and post-increments its C pointer by
        # cn_stride (x0). The interleaved SUBs rewind every A pointer by kc so
        # the next column block rereads the same rows of A.
423        ST1     {v20.16b, v21.16b},  [x6], x0
424        SUB     x3,  x3, x2             // a0 -= kc
425        ST1     {v22.16b, v23.16b}, [x16], x0
426        SUB     x9,  x9, x2             // a1 -= kc
427        ST1     {v24.16b, v25.16b}, [x17], x0
428        SUB     x10, x10, x2            // a2 -= kc
429        ST1     {v26.16b, v27.16b}, [x14], x0
430        SUB     x11, x11, x2            // a3 -= kc
431        ST1     {v28.16b, v29.16b}, [x13], x0
432        SUB     x12, x12, x2            // a4 -= kc
433        ST1     {v30.16b, v31.16b},  [x7], x0
434        SUB     x4,  x4, x2             // a5 -= kc
435
        # Flags are still those of SUBS x1, x1, 8 (ST1 and SUB do not write
        # them): loop back while nc is still above zero.
436        B.HI    0b
437
438        # Restore d12-d15 from stack
439        LDP     d14, d15, [sp, 16]
440        LDP     d12, d13, [sp], 32
441        RET
442
4434:
        # Remainder entry. x0 differs from kc by a multiple of 16 on every path
        # that reaches this label, so bits 3 and 2 of x0 equal those of kc.
444        # Is there a remainder?- 2 floats of A (8 bytes)
445        TBZ     x0, 3, 5f
446
447        # Remainder- 2 floats of A (8 bytes)
        # Same A packing as the prologue (even row in the low half, odd row in
        # the high half); four B vectors cover two k steps of 8 columns each.
448        LDR     d0,  [x3], 8
449        LDR     q16, [x5], 16
450        LD1     {v0.d}[1], [x9], 8
451        LDR     d1, [x10], 8
452        LD1     {v1.d}[1], [x11], 8
453        LDR     d2, [x12], 8
454        LD1     {v2.d}[1], [x4], 8
455        LDR     q17, [x5], 16
456        LDR     q18, [x5], 16
457        LDR     q19, [x5], 16
458
459        FMLA    v20.4s, v16.4s,  v0.s[0]
460        FMLA    v22.4s, v16.4s,  v0.s[2]
461        FMLA    v24.4s, v16.4s,  v1.s[0]
462        FMLA    v26.4s, v16.4s,  v1.s[2]
463        FMLA    v28.4s, v16.4s,  v2.s[0]
464        FMLA    v30.4s, v16.4s,  v2.s[2]
465        FMLA    v21.4s, v17.4s,  v0.s[0]
466        FMLA    v23.4s, v17.4s,  v0.s[2]
467        FMLA    v25.4s, v17.4s,  v1.s[0]
468        FMLA    v27.4s, v17.4s,  v1.s[2]
469        FMLA    v29.4s, v17.4s,  v2.s[0]
470        FMLA    v31.4s, v17.4s,  v2.s[2]
471
472        FMLA    v20.4s, v18.4s,  v0.s[1]
473        FMLA    v22.4s, v18.4s,  v0.s[3]
474        FMLA    v24.4s, v18.4s,  v1.s[1]
475        FMLA    v26.4s, v18.4s,  v1.s[3]
476        FMLA    v28.4s, v18.4s,  v2.s[1]
477        FMLA    v30.4s, v18.4s,  v2.s[3]
478        FMLA    v21.4s, v19.4s,  v0.s[1]
479        FMLA    v23.4s, v19.4s,  v0.s[3]
480        FMLA    v25.4s, v19.4s,  v1.s[1]
481        FMLA    v27.4s, v19.4s,  v1.s[3]
482        FMLA    v29.4s, v19.4s,  v2.s[1]
483        FMLA    v31.4s, v19.4s,  v2.s[3]
484
485        # Is there a remainder?- 1 float of A (4 bytes)
486        TBZ     x0, 2, 3b
4875:
488        # Remainder- 1 float of A (4 bytes)
        # Only lanes s[0] (even row) and s[2] (odd row) of v0-v2 are loaded
        # here, and the FMLAs below read only those two lanes.
489        LDR     s0,  [x3], 4
490        LDR     q16, [x5], 16
491        LD1     {v0.s}[2], [x9], 4
492        LDR     s1, [x10], 4
493        LD1     {v1.s}[2], [x11], 4
494        LDR     s2, [x12], 4
495        LD1     {v2.s}[2], [x4], 4
496        LDR     q17, [x5], 16
497
498        FMLA    v20.4s, v16.4s,  v0.s[0]
499        FMLA    v22.4s, v16.4s,  v0.s[2]
500        FMLA    v24.4s, v16.4s,  v1.s[0]
501        FMLA    v26.4s, v16.4s,  v1.s[2]
502        FMLA    v28.4s, v16.4s,  v2.s[0]
503        FMLA    v30.4s, v16.4s,  v2.s[2]
504        FMLA    v21.4s, v17.4s,  v0.s[0]
505        FMLA    v23.4s, v17.4s,  v0.s[2]
506        FMLA    v25.4s, v17.4s,  v1.s[0]
507        FMLA    v27.4s, v17.4s,  v1.s[2]
508        FMLA    v29.4s, v17.4s,  v2.s[0]
509        FMLA    v31.4s, v17.4s,  v2.s[2]
510        B       3b
511
512        # Store odd width
        # Reached only when fewer than 8 columns remain. x1 was reduced by 8
        # just before, which leaves its low three bits equal to the remaining
        # column count; bits 2, 1 and 0 select a 4-, 2- and 1-float store.
5136:
514        TBZ     x1, 2, 7f
        # Store columns 0-3 of each row, then move columns 4-7 down into the
        # even-numbered accumulator so the later steps always store from it.
515        STR     q20,  [x6], 16
516        MOV     v20.16b, v21.16b
517        STR     q22, [x16], 16
518        MOV     v22.16b, v23.16b
519        STR     q24, [x17], 16
520        MOV     v24.16b, v25.16b
521        STR     q26, [x14], 16
522        MOV     v26.16b, v27.16b
523        STR     q28, [x13], 16
524        MOV     v28.16b, v29.16b
525        STR     q30,  [x7], 16
526        MOV     v30.16b, v31.16b
527
5287:
529        TBZ     x1, 1, 8f
        # Store two floats per row, then bring the upper 64 bits of each
        # accumulator down to the low half for the single-float step.
530        STR     d20,  [x6], 8
531        STR     d22, [x16], 8
532        DUP     d20, v20.d[1]
533        DUP     d22, v22.d[1]
534        STR     d24, [x17], 8
535        STR     d26, [x14], 8
536        DUP     d24, v24.d[1]
537        DUP     d26, v26.d[1]
538        STR     d28, [x13], 8
539        STR     d30,  [x7], 8
540        DUP     d28, v28.d[1]
541        DUP     d30, v30.d[1]
542
5438:
544        TBZ     x1, 0, 9f
        # Last single float of each row; the C pointers are not advanced since
        # the function returns right after.
545        STR     s20,  [x6]
546        STR     s22, [x16]
547        STR     s24, [x17]
548        STR     s26, [x14]
549        STR     s28, [x13]
550        STR     s30,  [x7]
5519:
552        # Restore d12-d15 from stack
553        LDP     d14, d15, [sp, 16]
554        LDP     d12, d13, [sp], 32
555        RET
556
557END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55
558
559#ifdef __ELF__
560.section ".note.GNU-stack","",%progbits
561#endif
562