// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Summary
#   Produces f32 output tiles of 6 rows by 8 columns. For every group of
#   8 output columns the twelve accumulators v20-v31 are seeded from w,
#   then for each group of 6 A row pointers read from a the kernel
#   accumulates A * B with FMLA, clamps the result to [v6, v7] and stores
#   it through the six C row pointers.
#
# Units
#   kc is a byte count (it is compared against 16 bytes = 4 floats).
#   ks is a byte count of A pointers (each pass consumes 6 pointers and
#   subtracts 48 from the counter).
#   nc is a column count (decremented by 8 per tile).
#
# Rows beyond mr
#   The C row pointers of rows >= mr are aliased onto the previous row.
#   Full tiles are stored from row 5 down to row 0, so with aliased
#   pointers the lowest-numbered row sharing an address is written last.
#
# Local labels
#   0  nc loop      - start of one 8 column tile
#   1  ks loop      - start of one group of 6 A pointers
#   2  main loop    - 4 floats of A per row per iteration
#   3  epilogue     - last 4 floats of A per row, no further loads ahead
#   4  ks loop tail - loop back to 1, or clamp and store
#   5  remainder of 2 floats of A (8 bytes)
#   6  remainder of 1 float of A (4 bytes)
#   7, 8, 9  partial-width stores of 4, 2 and 1 columns
#   10 restore and return after a partial-width store
#
# Stack frame (64 bytes, allocated below the incoming sp)
#   [sp,  0]  d12 d13
#   [sp, 16]  d14 d15
#   [sp, 32]  x20 x21
#   [sp, 48]  x22 x23
#   [sp, 64]  cn_stride (first stack argument of the caller)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x14  v0     v3
# A1 x15  v0[1]  v3[1]
# A2 x20  v1     v4
# A3 x21  v1[1]  v4[1]
# A4 x22  v2     v5
# A5 x23  v2[1]  v5[1]

# B   x5  v12 v13 v14 v15 second set of B
# B       v16 v17 v18 v19 first set

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x10  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6 v7    (v6 = min, used with FMAX; v7 = max, used with FMIN)
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register
#    (holds the params pointer only until the LD2R below; afterwards it
#    carries 64-bit loads that are inserted into the upper half of a
#    vector register with INS)

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53

        # Load a_offset
        # (stack arguments are read before sp is moved, so these offsets
        #  are relative to the incoming sp)
        LDR     x11, [sp, 8]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        # Each CMP feeds two CSELs: LO covers the odd row, LS the even row.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load min/max values
        # LD2R replicates the first float into v6 and the second into v7.
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save x20-x23, d12-d15 on stack
        STP     d12, d13, [sp, -64]!
        STP     d14, d15, [sp, 16]
        STP     x20, x21, [sp, 32]
        STP     x22, x23, [sp, 48]

0:
        # Load initial bias from w into accumulators
        # Row 0 is loaded, rows 1-5 are copies of it.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added. The compare uses the value before the ADD.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // A0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // A1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // A2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // A3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 += a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // A4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 += a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // A5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 += a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - First group loads, no FMA
        LDR     d0, [x14], 8              // A0
        LDP     q16, q17, [x5], 32        // B
        LDR     d1, [x20], 8              // A2
        LDR     d2, [x22], 8              // A4
        LD1     {v0.d}[1], [x15], 8       // A1
        LD1     {v1.d}[1], [x21], 8       // A3
        LD1     {v2.d}[1],  [x23], 8      // A5
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8              // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        # (flags come from the SUBS above; the loads do not change them)
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 64-bit A loads + 16 64-bit B loads (8 B vectors)
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8              // A0
        INS     v19.d[1], x8              // B from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x8, [x15], 8              // A1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8               // A1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x8, [x5, 8]               // B
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]

        # BLOCK 2
        LDR     d4, [x20], 8              // A2
        INS     v12.d[1], x8              // B  ins
        FMLA    v21.4s, v17.4s,  v0.s[0]
        LDR     x8, [x21], 8              // A3
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]

        # BLOCK 3
        LDR     d5, [x22], 8              // A4
        INS     v4.d[1], x8               // A3 ins
        FMLA    v27.4s, v17.4s,  v1.s[2]
        LDR     x8, [x23], 8              // A5
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8               // A5 ins
        FMLA    v20.4s, v18.4s,  v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8              // B
        FMLA    v26.4s, v18.4s,  v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        FMLA    v30.4s, v18.4s,  v2.s[3]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8              // B
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x14], 8              // A0
        FMLA    v20.4s, v12.4s,  v3.s[0]
        LDR     x8, [x15], 8              // A1
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        PRFM    PLDL1KEEP, [x14, 128]     // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8               // A1 ins
        FMLA    v26.4s, v12.4s,  v4.s[2]
        LDR     x8, [x5, 72]              // B
        FMLA    v28.4s, v12.4s,  v5.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[2]
        PRFM    PLDL1KEEP, [x15, 128]     // Prefetch A1

        # BLOCK 2
        LDR     d1, [x20], 8              // A2
        INS     v16.d[1], x8              // B
        FMLA    v21.4s, v13.4s,  v3.s[0]
        LDR     x8, [x21], 8              // A3
        FMLA    v23.4s, v13.4s,  v3.s[2]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        PRFM    PLDL1KEEP, [x20, 128]     // Prefetch A2

        # BLOCK 3
        LDR     d2, [x22], 8              // A4
        INS     v1.d[1], x8               // A3 ins
        FMLA    v27.4s, v13.4s,  v4.s[2]
        LDR     x8,  [x23], 8             // A5
        FMLA    v29.4s, v13.4s,  v5.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[2]
        PRFM    PLDL1KEEP, [x21, 128]     // Prefetch A3

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8               // A5 ins
        FMLA    v20.4s, v14.4s,  v3.s[1]
        LDR     x8, [x5, 88]
        FMLA    v22.4s, v14.4s,  v3.s[3]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        PRFM    PLDL1KEEP, [x22, 128]     // Prefetch A4

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8              // B
        FMLA    v26.4s, v14.4s,  v4.s[3]
        LDR     x8, [x5, 104]
        FMLA    v28.4s, v14.4s,  v5.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[3]
        PRFM    PLDL1KEEP, [x23, 128]     // Prefetch A5

        # BLOCK 6
        # The x8 loaded here is inserted into v19 by BLOCK 0 of the next
        # main loop iteration or of the epilogue.
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8              // B
        FMLA    v21.4s, v15.4s,  v3.s[1]
        LDR     x8, [x5, 120]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA    v25.4s, v15.4s,  v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B

        # BLOCK 7
        SUBS    x0, x0, 16                // LDR lands here
        FMLA    v27.4s, v15.4s,  v4.s[3]
        FMLA    v29.4s, v15.4s,  v5.s[1]
        ADD     x5, x5, 128               // B consumed by this iteration
        FMLA    v31.4s, v15.4s,  v5.s[3]
        B.HS    2b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 64-bit A loads + 8 64-bit B loads (4 B vectors)
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8              // A0
        INS     v19.d[1], x8              // B from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x8, [x15], 8              // A1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        PRFM    PSTL1KEEP,  [x6]          // Prefetch C0

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8               // A1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x8, [x5, 8]               // B
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]
        PRFM    PSTL1KEEP, [x16]          // Prefetch C1

        # BLOCK 2
        LDR     d4, [x20], 8              // A2
        INS     v12.d[1], x8              // B  ins
        FMLA    v21.4s, v17.4s,  v0.s[0]
        LDR     x8, [x21], 8              // A3
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        PRFM    PSTL1KEEP, [x17]          // Prefetch C2

        # BLOCK 3
        LDR     d5, [x22], 8              // A4
        INS     v4.d[1], x8               // A3 ins
        FMLA    v27.4s, v17.4s,  v1.s[2]
        LDR     x8, [x23], 8              // A5
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]
        PRFM    PSTL1KEEP, [x10]          // Prefetch C3

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8               // A5 ins
        FMLA    v20.4s, v18.4s,  v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        PRFM    PSTL1KEEP, [x13]          // Prefetch C4

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8              // B
        FMLA    v26.4s, v18.4s,  v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        FMLA    v30.4s, v18.4s,  v2.s[3]
        PRFM    PSTL1KEEP, [x7]           // Prefetch C5

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8              // B
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8              // B from previous
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.4s, v12.4s,  v3.s[0]
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s,  v4.s[2]
        FMLA    v28.4s, v12.4s,  v5.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[2]

        # BLOCK 2
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]
        FMLA    v25.4s, v13.4s,  v4.s[0]

        # BLOCK 3
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v29.4s, v13.4s,  v5.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[2]

        # BLOCK 4
        FMLA    v20.4s, v14.4s,  v3.s[1]
        FMLA    v22.4s, v14.4s,  v3.s[3]
        FMLA    v24.4s, v14.4s,  v4.s[1]

        # BLOCK 5
        FMLA    v26.4s, v14.4s,  v4.s[3]
        FMLA    v28.4s, v14.4s,  v5.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[3]
        TST     x0, 15                    // low 4 bits of k = leftover bytes of kc

        # BLOCK 6
        FMLA    v21.4s, v15.4s,  v3.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        ADD     x5, x5, 64                // B consumed by the epilogue loads

        # BLOCK 7
        FMLA    v27.4s, v15.4s,  v4.s[3]
        FMLA    v29.4s, v15.4s,  v5.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # (flags are still those of the TST above; FMLA and ADD do not
        #  write them)
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # (offset 64 skips the save area pushed at function entry)
        LDR     x0, [sp, 64]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    7f

        # Rows are written from C5 down to C0; each row pointer then
        # advances by cn_stride for the next tile.
        STP     q30, q31,  [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x0

        SUB     x4, x4, x3              // A -= ks

        # nc loop
        # (flags are still those of SUBS x1 above)
        B.HI    0b

        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # x0 differs from kc by a multiple of 16 here, so bits 3 and 2 of
        # x0 are bits 3 and 2 of kc.
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x14], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x15], 8
        LDR     d1, [x20], 8
        LD1     {v1.d}[1], [x21], 8
        LDR     d2, [x22], 8
        LD1     {v2.d}[1], [x23], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[1]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]
        FMLA    v28.4s, v18.4s,  v2.s[1]
        FMLA    v30.4s, v18.4s,  v2.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[1]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]
        FMLA    v27.4s, v19.4s,  v1.s[3]
        FMLA    v29.4s, v19.4s,  v2.s[1]
        FMLA    v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        # Even rows go to lane 0 and odd rows to lane 2, matching the
        # lane indices used by the FMLAs below.
        LDR     s0,  [x14], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x15], 4
        LDR     s1, [x20], 4
        LD1     {v1.s}[2], [x21], 4
        LDR     s2, [x22], 4
        LD1     {v2.s}[2], [x23], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v28.4s, v16.4s,  v2.s[0]
        FMLA    v30.4s, v16.4s,  v2.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v29.4s, v17.4s,  v2.s[0]
        FMLA    v31.4s, v17.4s,  v2.s[2]
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 columns. After
        # each partial store the not yet stored columns are moved down
        # into the low part of the even-numbered accumulator.
7:
        TBZ     x1, 2, 8f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 1, 9f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30,  [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
10:
        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53
// On ELF targets, emit an empty .note.GNU-stack section; by GNU toolchain
// convention this marks the object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif