xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# Register usage
27# A0  x3 v0     v3
28# A1  x9 v0[1]  v3[1]
29# A2 x10 v1     v4
30# A3 x11 v1[1]  v4[1]
31# A4 x12 v2     v5
32# A5  x4 v2[1]  v5[1]
33
34# B   x5 v12 v13 v14 v15 second set of B
35# B      v16 v17 v18 v19 first set
36
37# C   x6 v20 v21
38# C  x16 v22 v23
39# C  x17 v24 v25
40# C  x14 v26 v27
41# C  x13 v28 v29
42# C   x7 v30 v31
43
44# Clamp v6 v7
45# unused A   v8 v9 v10 v11
46# x8 temporary vector shadow register
47
// f32 GEMM micro-kernel with min/max clamping: produces a tile of up to
// 6 rows x 8 columns of C per pass of the outer loop.
//   - Every accumulator row starts as a copy of the first 8 floats read
//     from w (x5); B operands are then streamed from w after them.
//   - A is consumed kc bytes per row: 16 bytes (4 floats) per main-loop /
//     epilogue pass, then optional 8-byte and 4-byte remainders.
//   - Results are clamped with FMAX against v6 and FMIN against v7, the two
//     floats loaded from params, and then stored.
// Local labels:
//   0: per-tile setup (load initial accumulators, prefetch A and B)
//   1: main loop, FMAs of one group overlap loads for the next group
//   2: epilogue, last 16 bytes of A (loads only the second group)
//   3: clamp, then full-width store or branch to partial store
//   4: remainder of 2 floats of A     5: remainder of 1 float of A
//   6: / 7: / 8: partial-width stores of 4, 2 and 1 columns
//   9: restore d12-d15 and return
48BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53
49
50        # Load params pointer
        // Must happen before d12-d15 are pushed, while the stack arguments
        // are still at their entry offsets.
51        LDR     x8, [sp, 8]
52
53        # Clamp A and C pointers
        // Rows beyond mr alias the previous row, so they re-read the same A
        // data and store to the same C addresses.
54        CMP     x0, 2                   // if mr < 2
55        ADD     x9, x3, x4              // a1 = a0 + a_stride
56        ADD     x16, x6, x7             // c1 = c0 + cm_stride
57        CSEL    x9, x3, x9, LO          //   a1 = a0
58        CSEL    x16, x6, x16, LO        //   c1 = c0
59
60        ADD     x10, x9, x4             // a2 = a1 + a_stride
61        ADD     x17, x16, x7            // c2 = c1 + cm_stride
62                                        // if mr <= 2 (flags still from CMP x0, 2)
63        CSEL    x10, x9, x10, LS        //   a2 = a1
64        CSEL    x17, x16, x17, LS       //   c2 = c1
65
66        CMP     x0, 4                   // if mr < 4
67        ADD     x11, x10, x4            // a3 = a2 + a_stride
68        ADD     x14, x17, x7            // c3 = c2 + cm_stride
69        CSEL    x11, x10, x11, LO       //   a3 = a2
70        CSEL    x14, x17, x14, LO       //   c3 = c2
71
72        ADD     x12, x11, x4            // a4 = a3 + a_stride
73        ADD     x13, x14, x7            // c4 = c3 + cm_stride
74                                        // if mr <= 4 (flags still from CMP x0, 4)
75        CSEL    x12, x11, x12, LS       //   a4 = a3
76        CSEL    x13, x14, x13, LS       //   c4 = c3
77
78        CMP     x0, 6                   // if mr < 6
        // x4 and x7 are overwritten here: from now on they hold the a5 / c5
        // pointers instead of a_stride / cm_stride.
79        ADD     x4, x12, x4             // a5 = a4 + a_stride
80        ADD     x7, x13, x7             // c5 = c4 + cm_stride
81        CSEL    x4, x12, x4, LO         //   a5 = a4
82        CSEL    x7, x13, x7, LO         //   c5 = c4
83
84        # Load min/max values
        // LD2R replicates the first float at x8 into every lane of v6 and the
        // second float into every lane of v7.
85        LD2R    {v6.4s, v7.4s}, [x8]
86
87        # Save d12-d15 on stack
        // sp drops by 32 bytes; the caller's stack arguments are therefore
        // 32 bytes further from sp until the registers are restored.
88        STP     d12, d13, [sp, -32]!
89        STP     d14, d15, [sp, 16]
90
        // Outer loop: entered once per group of up to 8 output columns.
910:
92        # Load initial bias from w into accumulators
        // q20/q21 are read from w; the MOVs below copy them into the
        // accumulators of rows 1-5, interleaved with prefetches.
93        LDP     q20, q21, [x5], 32
94        MOV     v22.16b, v20.16b
95        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
96        PRFM    PLDL1KEEP,  [x3, 64]
97        MOV     v23.16b, v21.16b
98        PRFM    PLDL1KEEP,  [x9,  0]
99        PRFM    PLDL1KEEP,  [x9, 64]
100        MOV     v24.16b, v20.16b
101        PRFM    PLDL1KEEP, [x10,  0]
102        PRFM    PLDL1KEEP, [x10, 64]
103        MOV     v25.16b, v21.16b
104        PRFM    PLDL1KEEP, [x11,  0]
105        PRFM    PLDL1KEEP, [x11, 64]
106        MOV     v26.16b, v20.16b
107        PRFM    PLDL1KEEP, [x12,  0]
108        PRFM    PLDL1KEEP, [x12, 64]
109        MOV     v27.16b, v21.16b
110        PRFM    PLDL1KEEP,  [x4,  0]
111        PRFM    PLDL1KEEP,  [x4, 64]
112        MOV     v28.16b, v20.16b
113        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
114        MOV     v29.16b, v21.16b
115        PRFM    PLDL1KEEP, [x5,  64]
116        MOV     v30.16b, v20.16b
117        PRFM    PLDL1KEEP, [x5, 128]
118        MOV     v31.16b, v21.16b
119        PRFM    PLDL1KEEP, [x5, 192]
120
121        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
122        SUBS    x0, x2, 16              // k = kc - 16
        // kc < 16: skip the pipelined path and handle only the remainders.
123        B.LO    4f
124
125        # Prologue - First group loads, no FMA
        // Rows are packed two per vector: even rows in the low 64 bits
        // (lanes 0-1), odd rows in the high 64 bits (lanes 2-3).
126        LDR     d0, [x3], 8               // A0
127        LDP     q16, q17, [x5], 32        // B
128        LDR     d1, [x10], 8              // A2
129        LDR     d2, [x12], 8              // A4
130        LD1     {v0.d}[1],  [x9], 8       // A1
131        LD1     {v1.d}[1], [x11], 8       // A3
132        LD1     {v2.d}[1],  [x4], 8       // A5
        // Flags from this SUBS are consumed by the B.LO 2f below; the loads
        // in between do not modify flags.
133        SUBS    x0, x0, 16
134        LDR     q18, [x5], 16
        // v19 is loaded as two 64-bit halves; the high half stays in x8
        // until the INS at the top of the main loop / epilogue.
135        LDR     d19, [x5], 8
136        LDR     x8, [x5], 8               // ins is in BLOCK 0
137
138        # Is there at least 4 floats (16 bytes) for main loop?
139        B.LO    2f
140
141        # Main loop - 4 floats of A (16 bytes)
142        # 48 FMA + 12 LD64 A + 8 LDR B
        // Each pass runs the FMAs of the first group (v0-v2, v16-v19) while
        // loading the second group (v3-v5, v12-v15), then the reverse.
        // 128-bit operands are assembled from LDR d (low half) plus
        // LDR x8 / INS (high half), with x8 as the staging register.
        // NOTE(review): this split is presumably chosen to suit Cortex-A53
        // issue behavior - confirm before reordering anything here.
1431:
144        # First group of 24 FMA, Second group loads
145        # BLOCK 0
146        LDR     d3, [x3], 8               // A0
147        INS     v19.d[1], x8              // B from second group
148        FMLA    v20.4s, v16.4s,  v0.s[0]
149        LDR     x8, [x9], 8               // A1
150        FMLA    v22.4s, v16.4s,  v0.s[2]
151        FMLA    v24.4s, v16.4s,  v1.s[0]
152
153        # BLOCK 1
        // x5 is not advanced inside the loop body; B is addressed by fixed
        // offsets and x5 moves by 128 once per pass (see BLOCK 7 below).
154        LDR     d12, [x5]
155        INS     v3.d[1], x8               // A1 ins
156        FMLA    v26.4s, v16.4s,  v1.s[2]
157        LDR     x8, [x5, 8]               // B
158        FMLA    v28.4s, v16.4s,  v2.s[0]
159        FMLA    v30.4s, v16.4s,  v2.s[2]
160
161        # BLOCK 2
162        LDR     d4, [x10], 8              // A2
163        INS     v12.d[1], x8              // B  ins
164        FMLA    v21.4s, v17.4s,  v0.s[0]
165        LDR     x8, [x11], 8              // A3
166        FMLA    v23.4s, v17.4s,  v0.s[2]
167        FMLA    v25.4s, v17.4s,  v1.s[0]
168
169        # BLOCK 3
170        LDR     d5, [x12], 8              // A4
171        INS     v4.d[1], x8               // A3 ins
172        FMLA    v27.4s, v17.4s,  v1.s[2]
173        LDR     x8, [x4], 8               // A5
174        FMLA    v29.4s, v17.4s,  v2.s[0]
175        FMLA    v31.4s, v17.4s,  v2.s[2]
176
177        # BLOCK 4
178        LDR     d13, [x5, 16]
179        INS     v5.d[1], x8               // A5 ins
180        FMLA    v20.4s, v18.4s,  v0.s[1]
181        LDR     x8, [x5, 24]
182        FMLA    v22.4s, v18.4s,  v0.s[3]
183        FMLA    v24.4s, v18.4s,  v1.s[1]
184
185        # BLOCK 5
186        LDR     d14, [x5, 32]
187        INS     v13.d[1], x8              // B
188        FMLA    v26.4s, v18.4s,  v1.s[3]
189        LDR     x8, [x5, 40]
190        FMLA    v28.4s, v18.4s,  v2.s[1]
191        FMLA    v30.4s, v18.4s,  v2.s[3]
192
193        # BLOCK 6
194        LDR     d15, [x5, 48]
195        INS     v14.d[1], x8              // B
196        FMLA    v21.4s, v19.4s,  v0.s[1]
197        LDR     x8, [x5, 56]
198        FMLA    v23.4s, v19.4s,  v0.s[3]
199        FMLA    v25.4s, v19.4s,  v1.s[1]
200
201        # BLOCK 7
202        INS     v15.d[1], x8
203        FMLA    v27.4s, v19.4s,  v1.s[3]
204        FMLA    v29.4s, v19.4s,  v2.s[1]
205        FMLA    v31.4s, v19.4s,  v2.s[3]
206
207        # Second group of 24 FMA, First group of loads
208        # BLOCK 0
209        LDR     d0, [x3], 8               // A0
210        FMLA    v20.4s, v12.4s,  v3.s[0]
211        LDR     x8, [x9], 8               // A1
212        FMLA    v22.4s, v12.4s,  v3.s[2]
213        FMLA    v24.4s, v12.4s,  v4.s[0]
214        PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0
215
216        # BLOCK 1
217        LDR     d16, [x5, 64]
218        INS     v0.d[1], x8               // A1 ins
219        FMLA    v26.4s, v12.4s,  v4.s[2]
220        LDR     x8, [x5, 72]              // B
221        FMLA    v28.4s, v12.4s,  v5.s[0]
222        FMLA    v30.4s, v12.4s,  v5.s[2]
223        PRFM    PLDL1KEEP, [x9, 128]      // Prefetch A1
224
225        # BLOCK 2
226        LDR     d1, [x10], 8              // A2
227        INS     v16.d[1], x8              // B
228        FMLA    v21.4s, v13.4s,  v3.s[0]
229        LDR     x8, [x11], 8              // A3
230        FMLA    v23.4s, v13.4s,  v3.s[2]
231        FMLA    v25.4s, v13.4s,  v4.s[0]
232        PRFM    PLDL1KEEP, [x10, 128]     // Prefetch A2
233
234        # BLOCK 3
235        LDR     d2, [x12], 8              // A4
236        INS     v1.d[1], x8               // A3 ins
237        FMLA    v27.4s, v13.4s,  v4.s[2]
238        LDR     x8,  [x4], 8              // A5
239        FMLA    v29.4s, v13.4s,  v5.s[0]
240        FMLA    v31.4s, v13.4s,  v5.s[2]
241        PRFM    PLDL1KEEP, [x11, 128]     // Prefetch A3
242
243        # BLOCK 4
244        LDR     d17, [x5, 80]
245        INS     v2.d[1], x8               // A5 ins
246        FMLA    v20.4s, v14.4s,  v3.s[1]
247        LDR     x8, [x5, 88]
248        FMLA    v22.4s, v14.4s,  v3.s[3]
249        FMLA    v24.4s, v14.4s,  v4.s[1]
250        PRFM    PLDL1KEEP, [x12, 128]     // Prefetch A4
251
252        # BLOCK 5
253        LDR     d18, [x5, 96]
254        INS     v17.d[1], x8              // B
255        FMLA    v26.4s, v14.4s,  v4.s[3]
256        LDR     x8, [x5, 104]
257        FMLA    v28.4s, v14.4s,  v5.s[1]
258        FMLA    v30.4s, v14.4s,  v5.s[3]
259        PRFM    PLDL1KEEP, [x4, 128]      // Prefetch A5
260
261        # BLOCK 6
        // The high half of v19 is left in x8 here; it is inserted by the
        // INS in BLOCK 0 of the next main-loop pass or of the epilogue.
262        LDR     d19, [x5, 112]
263        INS     v18.d[1], x8              // B
264        FMLA    v21.4s, v15.4s,  v3.s[1]
265        LDR     x8, [x5, 120]
266        FMLA    v23.4s, v15.4s,  v3.s[3]
267        PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B
268        FMLA    v25.4s, v15.4s,  v4.s[1]
269        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
270
271        # BLOCK 7
        // Flags from SUBS survive the FMLAs and the ADD (neither sets flags)
        // and are consumed by B.HS.
272        SUBS    x0, x0, 16                // LDR lands here
273        FMLA    v27.4s, v15.4s,  v4.s[3]
274        FMLA    v29.4s, v15.4s,  v5.s[1]
275        ADD     x5, x5, 128               // w += the two B groups addressed above
276        FMLA    v31.4s, v15.4s,  v5.s[3]
277        B.HS    1b                        // loop while the subtract did not borrow
278
279        # Epilogue - 4 floats of A (16 bytes)
280        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same first half as the main loop, with the A prefetches replaced by
        // store prefetches of the six C row pointers.
2812:
282        # First group of 24 FMA, Second group loads
283        # BLOCK 0
284        LDR     d3, [x3], 8               // A0
285        INS     v19.d[1], x8              // B from second group
286        FMLA    v20.4s, v16.4s,  v0.s[0]
287        LDR     x8, [x9], 8               // A1
288        FMLA    v22.4s, v16.4s,  v0.s[2]
289        FMLA    v24.4s, v16.4s,  v1.s[0]
290        PRFM    PSTL1KEEP,  [x6]          // Prefetch C0
291
292        # BLOCK 1
293        LDR     d12, [x5]
294        INS     v3.d[1], x8               // A1 ins
295        FMLA    v26.4s, v16.4s,  v1.s[2]
296        LDR     x8, [x5, 8]               // B
297        FMLA    v28.4s, v16.4s,  v2.s[0]
298        FMLA    v30.4s, v16.4s,  v2.s[2]
299        PRFM    PSTL1KEEP, [x16]          // Prefetch C1
300
301        # BLOCK 2
302        LDR     d4, [x10], 8              // A2
303        INS     v12.d[1], x8              // B  ins
304        FMLA    v21.4s, v17.4s,  v0.s[0]
305        LDR     x8, [x11], 8              // A3
306        FMLA    v23.4s, v17.4s,  v0.s[2]
307        FMLA    v25.4s, v17.4s,  v1.s[0]
308        PRFM    PSTL1KEEP, [x17]          // Prefetch C2
309
310        # BLOCK 3
311        LDR     d5, [x12], 8              // A4
312        INS     v4.d[1], x8               // A3 ins
313        FMLA    v27.4s, v17.4s,  v1.s[2]
314        LDR     x8, [x4], 8               // A5
315        FMLA    v29.4s, v17.4s,  v2.s[0]
316        FMLA    v31.4s, v17.4s,  v2.s[2]
317        PRFM    PSTL1KEEP, [x14]          // Prefetch C3
318
319        # BLOCK 4
320        LDR     d13, [x5, 16]
321        INS     v5.d[1], x8               // A5 ins
322        FMLA    v20.4s, v18.4s,  v0.s[1]
323        LDR     x8, [x5, 24]
324        FMLA    v22.4s, v18.4s,  v0.s[3]
325        FMLA    v24.4s, v18.4s,  v1.s[1]
326        PRFM    PSTL1KEEP, [x13]          // Prefetch C4
327
328        # BLOCK 5
329        LDR     d14, [x5, 32]
330        INS     v13.d[1], x8              // B
331        FMLA    v26.4s, v18.4s,  v1.s[3]
332        LDR     x8, [x5, 40]
333        FMLA    v28.4s, v18.4s,  v2.s[1]
334        FMLA    v30.4s, v18.4s,  v2.s[3]
335        PRFM    PSTL1KEEP, [x7]           // Prefetch C5
336
337        # BLOCK 6
338        LDR     d15, [x5, 48]
339        INS     v14.d[1], x8              // B
340        FMLA    v21.4s, v19.4s,  v0.s[1]
341        LDR     x8, [x5, 56]
342        FMLA    v23.4s, v19.4s,  v0.s[3]
343        FMLA    v25.4s, v19.4s,  v1.s[1]
344
345        # BLOCK 7
346        INS     v15.d[1], x8              // B
347        FMLA    v27.4s, v19.4s,  v1.s[3]
348        FMLA    v29.4s, v19.4s,  v2.s[1]
349        FMLA    v31.4s, v19.4s,  v2.s[3]
350
351        # Second group of 24 FMA, First group of loads
        // In the epilogue this half issues no loads: it only consumes the
        // operands already held in v3-v5 and v12-v15.
352        # BLOCK 0
353        FMLA    v20.4s, v12.4s,  v3.s[0]
354        FMLA    v22.4s, v12.4s,  v3.s[2]
355        FMLA    v24.4s, v12.4s,  v4.s[0]
356
357        # BLOCK 1
358        FMLA    v26.4s, v12.4s,  v4.s[2]
359        FMLA    v28.4s, v12.4s,  v5.s[0]
360        FMLA    v30.4s, v12.4s,  v5.s[2]
361
362        # BLOCK 2
363        FMLA    v21.4s, v13.4s,  v3.s[0]
364        FMLA    v23.4s, v13.4s,  v3.s[2]
365        FMLA    v25.4s, v13.4s,  v4.s[0]
366
367        # BLOCK 3
368        FMLA    v27.4s, v13.4s,  v4.s[2]
369        FMLA    v29.4s, v13.4s,  v5.s[0]
370        FMLA    v31.4s, v13.4s,  v5.s[2]
371
372        # BLOCK 4
373        FMLA    v20.4s, v14.4s,  v3.s[1]
374        FMLA    v22.4s, v14.4s,  v3.s[3]
375        FMLA    v24.4s, v14.4s,  v4.s[1]
376
377        # BLOCK 5
378        FMLA    v26.4s, v14.4s,  v4.s[3]
379        FMLA    v28.4s, v14.4s,  v5.s[1]
380        FMLA    v30.4s, v14.4s,  v5.s[3]
        // Only multiples of 16 were subtracted from kc to form x0, so the low
        // four bits of x0 equal those of kc. The flags set here are consumed
        // by B.NE 4f; the FMLAs and ADD in between leave them intact.
381        TST     x0, 15
382
383        # BLOCK 6
384        FMLA    v21.4s, v15.4s,  v3.s[1]
385        FMLA    v23.4s, v15.4s,  v3.s[3]
386        FMLA    v25.4s, v15.4s,  v4.s[1]
387        ADD     x5, x5, 64                // w += second B group read at [x5, 0..56]
388
389        # BLOCK 7
390        FMLA    v27.4s, v15.4s,  v4.s[3]
391        FMLA    v29.4s, v15.4s,  v5.s[1]
392        FMLA    v31.4s, v15.4s,  v5.s[3]
393
394        # Is there a remainder?- 2 floats of A (8 bytes) or less
395        B.NE    4f
3963:
397        # Clamp
        // FMAX against v6 enforces the lower bound, FMIN against v7 the
        // upper bound.
398        FMAX    v20.4s, v20.4s, v6.4s
399        # Load cn_stride
        // Offset 32 because d12-d15 occupy the 32 bytes pushed at entry.
        // x0 is free for reuse: k is not read again on this path.
400        LDR     x0, [sp, 32]
401        FMAX    v21.4s, v21.4s, v6.4s
402        FMAX    v22.4s, v22.4s, v6.4s
403        FMAX    v23.4s, v23.4s, v6.4s
404        FMAX    v24.4s, v24.4s, v6.4s
405        FMAX    v25.4s, v25.4s, v6.4s
406        FMAX    v26.4s, v26.4s, v6.4s
407        FMAX    v27.4s, v27.4s, v6.4s
408        FMAX    v28.4s, v28.4s, v6.4s
409        FMAX    v29.4s, v29.4s, v6.4s
410        FMAX    v30.4s, v30.4s, v6.4s
411        FMAX    v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags feed both B.LO 6f and B.HI 0b below; none of
        // the FMIN, ST1 or SUB instructions in between writes flags.
412        SUBS    x1, x1, 8
413        FMIN    v20.4s, v20.4s, v7.4s
414        FMIN    v21.4s, v21.4s, v7.4s
415        FMIN    v22.4s, v22.4s, v7.4s
416        FMIN    v23.4s, v23.4s, v7.4s
417        FMIN    v24.4s, v24.4s, v7.4s
418        FMIN    v25.4s, v25.4s, v7.4s
419        FMIN    v26.4s, v26.4s, v7.4s
420        FMIN    v27.4s, v27.4s, v7.4s
421        FMIN    v28.4s, v28.4s, v7.4s
422        FMIN    v29.4s, v29.4s, v7.4s
423        FMIN    v30.4s, v30.4s, v7.4s
424        FMIN    v31.4s, v31.4s, v7.4s
425
426        # Store full 6 x 8
        // Fewer than 8 columns were left before the subtract: partial store.
427        B.LO    6f
428
        // Each C pointer advances by cn_stride (x0) and each A pointer is
        // rewound by kc so the next tile re-reads the same A rows.
429        ST1     {v20.16b, v21.16b},  [x6], x0
430        SUB     x3,  x3, x2             // A0 -= kc
431        ST1     {v22.16b, v23.16b}, [x16], x0
432        SUB     x9,  x9, x2             // A1 -= kc
433        ST1     {v24.16b, v25.16b}, [x17], x0
434        SUB     x10, x10, x2            // A2 -= kc
435        ST1     {v26.16b, v27.16b}, [x14], x0
436        SUB     x11, x11, x2            // A3 -= kc
437        ST1     {v28.16b, v29.16b}, [x13], x0
438        SUB     x12, x12, x2            // A4 -= kc
439        ST1     {v30.16b, v31.16b},  [x7], x0
440        SUB     x4,  x4, x2             // A5 -= kc
441
        // Columns remain (nc was greater than 8): process the next tile.
442        B.HI    0b
443
444        # Restore d12-d15 from stack
445        LDP     d14, d15, [sp, 16]
446        LDP     d12, d13, [sp], 32
447        RET
448
        // Reached either directly from label 0 when kc < 16 or from the
        // epilogue when k & 15 is non-zero; in both cases the low bits of x0
        // match the low bits of kc.
4494:
450        # Is there a remainder?- 2 floats of A (8 bytes)
451        TBZ     x0, 3, 5f
452
453        # Remainder- 2 floats of A (8 bytes)
454        LDR     d0,  [x3], 8
455        LDR     q16, [x5], 16
456        LD1     {v0.d}[1], [x9], 8
457        LDR     d1, [x10], 8
458        LD1     {v1.d}[1], [x11], 8
459        LDR     d2, [x12], 8
460        LD1     {v2.d}[1], [x4], 8
461        LDR     q17, [x5], 16
462        LDR     q18, [x5], 16
463        LDR     q19, [x5], 16
464
        // First of the two remainder floats: lanes 0 / 2 with v16 / v17.
465        FMLA    v20.4s, v16.4s,  v0.s[0]
466        FMLA    v22.4s, v16.4s,  v0.s[2]
467        FMLA    v24.4s, v16.4s,  v1.s[0]
468        FMLA    v26.4s, v16.4s,  v1.s[2]
469        FMLA    v28.4s, v16.4s,  v2.s[0]
470        FMLA    v30.4s, v16.4s,  v2.s[2]
471        FMLA    v21.4s, v17.4s,  v0.s[0]
472        FMLA    v23.4s, v17.4s,  v0.s[2]
473        FMLA    v25.4s, v17.4s,  v1.s[0]
474        FMLA    v27.4s, v17.4s,  v1.s[2]
475        FMLA    v29.4s, v17.4s,  v2.s[0]
476        FMLA    v31.4s, v17.4s,  v2.s[2]
477
        // Second remainder float: lanes 1 / 3 with v18 / v19.
478        FMLA    v20.4s, v18.4s,  v0.s[1]
479        FMLA    v22.4s, v18.4s,  v0.s[3]
480        FMLA    v24.4s, v18.4s,  v1.s[1]
481        FMLA    v26.4s, v18.4s,  v1.s[3]
482        FMLA    v28.4s, v18.4s,  v2.s[1]
483        FMLA    v30.4s, v18.4s,  v2.s[3]
484        FMLA    v21.4s, v19.4s,  v0.s[1]
485        FMLA    v23.4s, v19.4s,  v0.s[3]
486        FMLA    v25.4s, v19.4s,  v1.s[1]
487        FMLA    v27.4s, v19.4s,  v1.s[3]
488        FMLA    v29.4s, v19.4s,  v2.s[1]
489        FMLA    v31.4s, v19.4s,  v2.s[3]
490
491        # Is there a remainder?- 1 float of A (4 bytes)
492        TBZ     x0, 2, 3b
4935:
494        # Remainder- 1 float of A (4 bytes)
        // Even rows load into lane 0 and odd rows into lane 2, so the same
        // s[0] / s[2] lane selection as the wider paths applies.
495        LDR     s0,  [x3], 4
496        LDR     q16, [x5], 16
497        LD1     {v0.s}[2], [x9], 4
498        LDR     s1, [x10], 4
499        LD1     {v1.s}[2], [x11], 4
500        LDR     s2, [x12], 4
501        LD1     {v2.s}[2], [x4], 4
502        LDR     q17, [x5], 16
503
504        FMLA    v20.4s, v16.4s,  v0.s[0]
505        FMLA    v22.4s, v16.4s,  v0.s[2]
506        FMLA    v24.4s, v16.4s,  v1.s[0]
507        FMLA    v26.4s, v16.4s,  v1.s[2]
508        FMLA    v28.4s, v16.4s,  v2.s[0]
509        FMLA    v30.4s, v16.4s,  v2.s[2]
510        FMLA    v21.4s, v17.4s,  v0.s[0]
511        FMLA    v23.4s, v17.4s,  v0.s[2]
512        FMLA    v25.4s, v17.4s,  v1.s[0]
513        FMLA    v27.4s, v17.4s,  v1.s[2]
514        FMLA    v29.4s, v17.4s,  v2.s[0]
515        FMLA    v31.4s, v17.4s,  v2.s[2]
516        B       3b
517
518        # Store odd width
        // x1 now holds nc - 8; subtracting 8 leaves bits 0-2 unchanged, so
        // they still encode the number of columns left (1-7).
5196:
        // Bit 2 set: store 4 columns per row, then move the upper 4 columns
        // into the register used by the narrower stores below.
520        TBZ     x1, 2, 7f
521        STR     q20,  [x6], 16
522        MOV     v20.16b, v21.16b
523        STR     q22, [x16], 16
524        MOV     v22.16b, v23.16b
525        STR     q24, [x17], 16
526        MOV     v24.16b, v25.16b
527        STR     q26, [x14], 16
528        MOV     v26.16b, v27.16b
529        STR     q28, [x13], 16
530        MOV     v28.16b, v29.16b
531        STR     q30,  [x7], 16
532        MOV     v30.16b, v31.16b
533
5347:
        // Bit 1 set: store 2 columns per row, then bring the high 64 bits
        // down to the low half for the final single-column store.
535        TBZ     x1, 1, 8f
536        STR     d20,  [x6], 8
537        STR     d22, [x16], 8
538        DUP     d20, v20.d[1]
539        DUP     d22, v22.d[1]
540        STR     d24, [x17], 8
541        STR     d26, [x14], 8
542        DUP     d24, v24.d[1]
543        DUP     d26, v26.d[1]
544        STR     d28, [x13], 8
545        STR     d30,  [x7], 8
546        DUP     d28, v28.d[1]
547        DUP     d30, v30.d[1]
548
5498:
        // Bit 0 set: store the last single column of each row.
550        TBZ     x1, 0, 9f
551        STR     s20,  [x6]
552        STR     s22, [x16]
553        STR     s24, [x17]
554        STR     s26, [x14]
555        STR     s28, [x13]
556        STR     s30,  [x7]
5579:
558        # Restore d12-d15 from stack
559        LDP     d14, d15, [sp, 16]
560        LDP     d12, d13, [sp], 32
561        RET
562
563END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53
564
565#ifdef __ELF__
566.section ".note.GNU-stack","",%progbits
567#endif
568