// Source: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen-inc/4x12inc-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
#
# Computes a tile of up to 4 rows by 12 columns of C = clamp(acc + A * B).
# The accumulators start from the acc buffer (192 bytes per tile), are
# updated with FMLA over K, clamped with FMAX against v4 and FMIN against v5,
# and stored to the four C row pointers.
#
# Units, as used by the code below:
#   kc  is a byte count of one A row (16 bytes = 4 floats per main-loop pass).
#   nc  is a column count; it is decremented by 12 per stored tile.
#   x0  holds mr only while the row pointers are being clamped; afterwards it
#       is reused as the remaining-K counter (in bytes, biased by -32).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
#    (a 64-bit GPR load into x8 followed by INS fills the upper half of a
#     vector register whose lower half was loaded with LDR d)

# Vector register usage and GPR shadows
# (main loop and epilogue layout: two rows of A share one vector register,
#  low 64 bits = first row, high 64 bits = second row)
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.
#
# The remainder paths (labels 4 and 5) use a different A layout:
# one row per register, a0 v0, a1 v1, a2 v2, a3 v3.

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        # Stack arguments are read here, before sp is moved by the save below.
        LDP     x14, x15, [sp]
        # Load params pointer
        LDR     x8, [sp, 16]

        # Load min/max values
        # LD2R reads two consecutive floats at [x8] and broadcasts the first
        # to all lanes of v4 (FMAX operand) and the second to v5 (FMIN operand).
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        # These are the callee-saved vector registers this kernel writes
        # (v8-v11 and v14-v15 hold B); 48 bytes keeps sp 16-byte aligned.
        STP     d8,  d9, [sp, -48]!
        STP     d10, d11, [sp, 16]
        STP     d14, d15, [sp, 32]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they read valid A
        # data and their stores land on the same C row as the row below them.
        # The CMP x0, 2 flags serve both the LO and the LS selections because
        # ADD and CSEL do not modify flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0
        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        # Outer loop over N: one pass per tile of 12 columns.
0:
        # Load initial accumulators
        # Four rows of 12 floats each; x15 advances by 192 bytes per tile.
        LD1     {v20.16b, v21.16b, v22.16b}, [x15], 48
        LD1     {v23.16b, v24.16b, v25.16b}, [x15], 48
        LD1     {v26.16b, v27.16b, v28.16b}, [x15], 48
        LD1     {v29.16b, v30.16b, v31.16b}, [x15], 48
        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
        PRFM    PLDL1KEEP,  [x3, 64]
        PRFM    PLDL1KEEP, [x11,  0]
        PRFM    PLDL1KEEP, [x11, 64]
        PRFM    PLDL1KEEP, [x12,  0]
        PRFM    PLDL1KEEP, [x12, 64]
        PRFM    PLDL1KEEP,  [x4,  0]
        PRFM    PLDL1KEEP,  [x4, 64]
        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
        PRFM    PLDL1KEEP, [x5,  64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]
        PRFM    PLDL1KEEP, [x5, 256]
        PRFM    PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes)?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Flags from this SUBS are consumed by the B.LO 2f after the prologue
        # loads; none of the loads in between write flags.
        SUBS    x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR     d0,  [x3], 8              // a0
        LDR     d1, [x12], 8              // a2
        LD1     {v0.d}[1], [x11], 8       // a1
        LD1     {v1.d}[1],  [x4], 8       // a3

        # 96 bytes of B: v6-v10 in full, low half of v11 in d11 and its high
        # half parked in x8 until the first INS of the loop or epilogue.
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1     {v9.16b, v10.16b}, [x5], 32
        LDR     d11, [x5], 8
        LDR     x8, [x5], 8

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks (intended as 4 cycles each on
        # Cortex-A53).  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # Each 64-bit value loaded into x8 is merged into its vector by the
        # INS at the top of the following block.

        # BLOCK 0
        LDR     d2, [x3], 8                // a0
        INS     v11.d[1], x8
        FMLA    v20.4s, v6.4s, v0.s[0]
        LDR     x8, [x11], 8               // a1
        FMLA    v23.4s, v6.4s, v0.s[2]
        FMLA    v26.4s, v6.4s, v1.s[0]
        PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0

        # BLOCK 1
        LDR     d3, [x12], 8               // a2
        INS     v2.d[1], x8                // a1 was loaded in block 0
        FMLA    v29.4s, v6.4s, v1.s[2]
        LDR     x8, [x4], 8                // a3
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x11, 128]      // Prefetch A1

        # BLOCK 2
        LDR     d14, [x5]                  // vb0x0123
        INS     v3.d[1], x8                // a3 was loaded in block 1
        FMLA    v27.4s, v7.4s, v1.s[0]
        LDR     x8, [x5, 8]
        FMLA    v30.4s, v7.4s, v1.s[2]
        FMLA    v22.4s, v8.4s, v0.s[0]
        PRFM    PLDL1KEEP, [x12, 128]     // Prefetch A2

        # BLOCK 3
        LDR     d15, [x5, 16]              // vb0x4567
        INS     v14.d[1], x8               // v14 was loaded in block 2
        FMLA    v25.4s, v8.4s, v0.s[2]
        LDR     x8, [x5, 24]
        FMLA    v28.4s, v8.4s, v1.s[0]
        FMLA    v31.4s, v8.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x4, 128]      // Prefetch A3

        # BLOCK 4
        LDR     d16, [x5, 32]              // vb0x89AB
        INS     v15.d[1], x8
        FMLA    v20.4s, v9.4s, v0.s[1]
        LDR     x8, [x5, 40]
        FMLA    v23.4s, v9.4s, v0.s[3]
        FMLA    v26.4s, v9.4s, v1.s[1]
        PRFM    PLDL1KEEP, [x5, 320]      // Prefetch B

        # BLOCK 5
        LDR     d17, [x5, 48]              // vb1x0123
        INS     v16.d[1], x8
        FMLA    v29.4s, v9.4s, v1.s[3]
        LDR     x8, [x5, 56]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v0.s[3]
        PRFM    PLDL1KEEP, [x5, 384]      // Prefetch B

        # BLOCK 6
        LDR     d18, [x5, 64]              // vb1x4567
        INS     v17.d[1], x8
        FMLA    v27.4s, v10.4s, v1.s[1]
        LDR     x8, [x5, 72]
        FMLA    v30.4s, v10.4s, v1.s[3]
        FMLA    v22.4s, v11.4s, v0.s[1]
        PRFM    PLDL1KEEP, [x5, 448]      // Prefetch B

        # BLOCK 7
        LDR     d19, [x5, 80]              // vb1x89AB
        INS     v18.d[1], x8
        FMLA    v25.4s, v11.4s, v0.s[3]
        LDR     x8, [x5, 88]
        FMLA    v28.4s, v11.4s, v1.s[1]
        FMLA    v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks (intended as 4 cycles each on
        # Cortex-A53).  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1
        # B for the next first group is reloaded into v6-v11 from [x5, 96..191].

        # BLOCK 0
        LDR     d0, [x3], 8                // a0
        INS     v19.d[1], x8
        FMLA    v20.4s, v14.4s, v2.s[0]
        LDR     x8, [x11], 8               // a1
        FMLA    v23.4s, v14.4s, v2.s[2]
        FMLA    v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR     d1, [x12], 8               // a2
        INS     v0.d[1], x8                // a1
        FMLA    v29.4s, v14.4s, v3.s[2]
        LDR     x8, [x4], 8                // a3
        FMLA    v21.4s, v15.4s, v2.s[0]
        FMLA    v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR     d6, [x5, 96]               // vb0x0123
        INS     v1.d[1], x8                // a3
        FMLA    v27.4s, v15.4s, v3.s[0]
        LDR     x8, [x5, 104]
        FMLA    v30.4s, v15.4s, v3.s[2]
        FMLA    v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR     d7, [x5, 112]              // vb0x4567
        INS     v6.d[1], x8
        FMLA    v25.4s, v16.4s, v2.s[2]
        LDR     x8, [x5, 120]
        FMLA    v28.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR     d8, [x5, 128]              // vb0x89AB
        INS     v7.d[1], x8
        FMLA    v20.4s, v17.4s, v2.s[1]
        LDR     x8, [x5, 136]
        FMLA    v23.4s, v17.4s, v2.s[3]
        FMLA    v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR     d9, [x5, 144]              // vb1x0123
        INS     v8.d[1], x8
        FMLA    v29.4s, v17.4s, v3.s[3]
        LDR     x8, [x5, 152]
        FMLA    v21.4s, v18.4s, v2.s[1]
        FMLA    v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        # SUBS sets the flags for the B.HS at the end of block 7; the ADD and
        # the loads in between leave flags untouched.
        LDR     d10, [x5, 160]             // vb1x4567
        INS     v9.d[1], x8
        FMLA    v27.4s, v18.4s, v3.s[1]
        LDR     x8, [x5, 168]
        FMLA    v30.4s, v18.4s, v3.s[3]
        SUBS    x0, x0, 16
        FMLA    v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        # The high half of v11 stays in x8 and is inserted by BLOCK 0 of the
        # next loop pass or of the epilogue.  x5 advances by 192 bytes per pass.
        LDR     d11, [x5, 176]             // vb1x89AB
        INS     v10.d[1], x8
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDR     x8, [x5, 184]
        FMLA    v28.4s, v19.4s, v3.s[1]
        ADD     x5, x5, 192
        FMLA    v31.4s, v19.4s, v3.s[3]
        B.HS    1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR     d2, [x3], 8                // a0
        INS     v11.d[1], x8
        FMLA    v20.4s, v6.4s, v0.s[0]
        LDR     x8, [x11], 8               // a1
        FMLA    v23.4s, v6.4s, v0.s[2]
        FMLA    v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR     d3, [x12], 8               // a2
        INS     v2.d[1], x8                // a1 was loaded in block 0
        FMLA    v29.4s, v6.4s, v1.s[2]
        LDR     x8, [x4], 8                // a3
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR     d14, [x5]                  // vb0x0123
        INS     v3.d[1], x8                // a3 was loaded in block 1
        FMLA    v27.4s, v7.4s, v1.s[0]
        LDR     x8, [x5, 8]
        FMLA    v30.4s, v7.4s, v1.s[2]
        FMLA    v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR     d15, [x5, 16]              // vb0x4567
        INS     v14.d[1], x8               // v14 was loaded in block 2
        FMLA    v25.4s, v8.4s, v0.s[2]
        LDR     x8, [x5, 24]
        FMLA    v28.4s, v8.4s, v1.s[0]
        FMLA    v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR     d16, [x5, 32]              // vb0x89AB
        INS     v15.d[1], x8
        FMLA    v20.4s, v9.4s, v0.s[1]
        LDR     x8, [x5, 40]
        FMLA    v23.4s, v9.4s, v0.s[3]
        FMLA    v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR     d17, [x5, 48]             // vb1x0123
        INS     v16.d[1], x8
        FMLA    v29.4s, v9.4s, v1.s[3]
        LDR     x8, [x5, 56]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR     d18, [x5, 64]             // vb1x4567
        INS     v17.d[1], x8
        FMLA    v27.4s, v10.4s, v1.s[1]
        LDR     x8, [x5, 72]
        FMLA    v30.4s, v10.4s, v1.s[3]
        FMLA    v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR     d19, [x5, 80]             // vb1x89AB
        INS     v18.d[1], x8
        FMLA    v25.4s, v11.4s, v0.s[3]
        LDR     x8, [x5, 88]
        FMLA    v28.4s, v11.4s, v1.s[1]
        FMLA    v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads in this group:
        # A is already in v2/v3 and B in v14-v19; only the pending high half
        # of v19 is inserted from x8.

        # BLOCK 0
        INS     v19.d[1], x8
        FMLA    v20.4s, v14.4s, v2.s[0]
        FMLA    v23.4s, v14.4s, v2.s[2]
        FMLA    v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA    v29.4s, v14.4s, v3.s[2]
        FMLA    v21.4s, v15.4s, v2.s[0]
        FMLA    v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA    v27.4s, v15.4s, v3.s[0]
        FMLA    v30.4s, v15.4s, v3.s[2]
        FMLA    v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA    v25.4s, v16.4s, v2.s[2]
        FMLA    v28.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA    v20.4s, v17.4s, v2.s[1]
        FMLA    v23.4s, v17.4s, v2.s[3]
        FMLA    v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA    v29.4s, v17.4s, v3.s[3]
        FMLA    v21.4s, v18.4s, v2.s[1]
        FMLA    v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        # TST sets the flags for the B.NE below: any of the low 4 bits of the
        # K counter set means 1 to 3 floats of A per row are still pending.
        FMLA    v27.4s, v18.4s, v3.s[1]
        FMLA    v30.4s, v18.4s, v3.s[3]
        FMLA    v22.4s, v19.4s, v2.s[1]
        TST     x0, 15

        # BLOCK 7
        # Only the first 96 bytes at x5 were consumed by this epilogue.
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v28.4s, v19.4s, v3.s[1]
        ADD     x5, x5, 96
        FMLA    v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    4f

3:
        # Clamp
        # SUBS is interleaved with the clamp; its flags drive both the
        # B.LO 6f (fewer than 12 columns left) and the later B.HI 0b (more
        # tiles to do).  FMAX, FMIN, ST1 and SUB do not write flags.
        FMAX    v20.4s, v20.4s, v4.4s
        SUBS    x1, x1, 12
        FMAX    v21.4s, v21.4s, v4.4s
        FMAX    v22.4s, v22.4s, v4.4s
        FMAX    v23.4s, v23.4s, v4.4s
        FMAX    v24.4s, v24.4s, v4.4s
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v20.4s, v20.4s, v5.4s
        FMIN    v21.4s, v21.4s, v5.4s
        FMIN    v22.4s, v22.4s, v5.4s
        FMIN    v23.4s, v23.4s, v5.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # (taken only when at least 12 columns remained; otherwise go to the
        # partial store at label 6)
        B.LO    6f

        # Rows are stored from c3 down to c0.  Each C pointer advances by
        # cn_stride and each A pointer is rewound by kc for the next tile.
        ST1     {v29.16b, v30.16b, v31.16b},  [x7], x14
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v26.16b, v27.16b, v28.16b}, [x10], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v23.16b, v24.16b, v25.16b},  [x9], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v20.16b, v21.16b, v22.16b},  [x6], x14
        SUB     x4,  x4, x2             // a3 -= kc

        B.HI    0b

        # Restore d8-d11,d14,d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

        # K remainder.  Reached with kc < 16 bytes (x0 = kc - 16) or after the
        # epilogue; only bits 3 and 2 of x0 are examined.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        # Here each row of A has its own register (v0..v3 = a0..a3).
        LDR     d0,  [x3], 8            // a0
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR     d1, [x11], 8            // a1
        LDR     d2, [x12], 8            // a2
        LDR     d3,  [x4], 8            // a3
        LD1     {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA    v20.4s, v6.4s, v0.s[0]
        FMLA    v23.4s, v6.4s, v1.s[0]
        FMLA    v26.4s, v6.4s, v2.s[0]
        FMLA    v29.4s, v6.4s, v3.s[0]
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v1.s[0]
        FMLA    v27.4s, v7.4s, v2.s[0]
        FMLA    v30.4s, v7.4s, v3.s[0]
        FMLA    v22.4s, v8.4s, v0.s[0]
        FMLA    v25.4s, v8.4s, v1.s[0]
        FMLA    v28.4s, v8.4s, v2.s[0]
        FMLA    v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA    v20.4s, v9.4s, v0.s[1]
        FMLA    v23.4s, v9.4s, v1.s[1]
        FMLA    v26.4s, v9.4s, v2.s[1]
        FMLA    v29.4s, v9.4s, v3.s[1]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v1.s[1]
        FMLA    v27.4s, v10.4s, v2.s[1]
        FMLA    v30.4s, v10.4s, v3.s[1]
        FMLA    v22.4s, v11.4s, v0.s[1]
        FMLA    v25.4s, v11.4s, v1.s[1]
        FMLA    v28.4s, v11.4s, v2.s[1]
        FMLA    v31.4s, v11.4s, v3.s[1]

        # Bit 2 clear means no single float is left: go clamp and store.
        TBZ     x0, 2, 3b
5:
        # Remainder - 1 float of A (4 bytes)
        LDR     s0,  [x3], 4            // a0
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR     s1, [x11], 4            // a1
        LDR     s2, [x12], 4            // a2
        LDR     s3,  [x4], 4            // a3

        FMLA    v20.4s, v6.4s, v0.s[0]
        FMLA    v23.4s, v6.4s, v1.s[0]
        FMLA    v26.4s, v6.4s, v2.s[0]
        FMLA    v29.4s, v6.4s, v3.s[0]
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v1.s[0]
        FMLA    v27.4s, v7.4s, v2.s[0]
        FMLA    v30.4s, v7.4s, v3.s[0]
        FMLA    v22.4s, v8.4s, v0.s[0]
        FMLA    v25.4s, v8.4s, v1.s[0]
        FMLA    v28.4s, v8.4s, v2.s[0]
        FMLA    v31.4s, v8.4s, v3.s[0]
        B       3b

        # Partial tile: fewer than 12 columns remain.  The ADD undoes the
        # SUBS at label 3 so x1 is the remaining column count (1..11), which
        # is then stored as 8, 4, 2 and 1 column pieces according to its bits.
        # After each piece the not-yet-stored data is moved down into the
        # first register of the row (v29, v26, v23, v20).
6:
        ADD     x1, x1, 12
        # Store odd channels
        TBZ     x1, 3, 7f
        STP     q29, q30,  [x7], 32
        MOV     v29.16b, v31.16b
        STP     q26, q27, [x10], 32
        MOV     v26.16b, v28.16b
        STP     q23, q24,  [x9], 32
        MOV     v23.16b, v25.16b
        STP     q20, q21,  [x6], 32
        MOV     v20.16b, v22.16b

7:
        TBZ     x1, 2, 8f
        STR     q29,  [x7], 16
        MOV     v29.16b, v30.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q23,  [x9], 16
        MOV     v23.16b, v24.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b

8:
        TBZ     x1, 1, 9f
        STR     d29,  [x7], 8
        DUP     d29, v29.d[1]
        STR     d26, [x10], 8
        DUP     d26, v26.d[1]
        STR     d23,  [x9], 8
        DUP     d23, v23.d[1]
        STR     d20,  [x6], 8
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s29,  [x7]
        STR     s26, [x10]
        STR     s23,  [x9]
        STR     s20,  [x6]
10:
        # Restore d8-d11,d14,d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section: the GNU toolchain
// convention by which an object declares it does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
