xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x12-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27# x3  a0
28# x11 a1
29# x12 a2
30# x4  a3 / a_stride
31
32# C pointers
33# x6  c0
34# x9  c1
35# x10 c2
36# x7  c3 / cm_stride
37
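38# x8 temporary GPR: holds the params pointer on entry, then stages the upper 64 bits of a vector register (LDR x8 followed by INS vN.d[1], x8)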
39
40# Vector register usage and GPR shadows
41# a0  v0
42# a1  v0[1]
43# a2  v1
44# a3  v1[1]
45# a0  v2
46# a1  v2[1]
47# a2  v3
48# a3  v3[1]
49# B   v6  v7  v8
50# B   v9 v10 v11
51# B  v14 v15 v16
52# B  v17 v18 v19
53# C  v20 v21 v22
54# C  v23 v24 v25
55# C  v26 v27 v28
56# C  v29 v30 v31
57# Clamp v4 v5
58# v12 and v13 are unused.
59
60BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
          # Overview (derived from the code in this function):
          #   - Label 0 is the outer loop over groups of 12 output columns. It is
          #     re-entered from the full-store path while nc is still above zero
          #     after the subtract at label 3.
          #   - Each pass seeds the twelve accumulators v20-v31 from 48 bytes read
          #     at w, accumulates A*B products with FMLA, clamps with FMAX against
          #     v4 and FMIN against v5, and stores one row of 12 floats through
          #     each of x6, x9, x10 and x7.
          #   - kc is consumed 16 bytes per A row at a time by the prologue, the
          #     main loop (label 1) and the epilogue (label 2). Labels 4 and 5
          #     handle an 8-byte and a 4-byte remainder of kc.
          #   - Label 6 onward stores a partial group of fewer than 12 columns.
61
62        # Load cn_stride, params pointer
63        LDP     x14, x8, [sp]           // x14 = cn_stride, x8 = params; read before sp is moved below
64
65        # Load min/max values
66        LD2R    {v4.4s, v5.4s}, [x8]    // v4 = 1st 32-bit word at params in all lanes, v5 = 2nd word
67
68        # Save d8-d11,d14,d15 on stack
69        STP     d8,  d9, [sp, -48]!     // pre-decrement: reserves 48 bytes for the three pairs
70        STP     d10, d11, [sp, 16]
71        STP     d14, d15, [sp, 32]
72
73        # Clamp A and C pointers
          # The flags from CMP x0, 2 feed both the LO selects (mr < 2) and the LS
          # selects (mr <= 2); the ADD instructions in between do not set flags.
          # A clamped row pointer aliases the previous row, so rows at or beyond
          # mr read and write the same memory as the last valid row.
74        CMP     x0, 2                   // if mr < 2
75        ADD     x11, x3, x4             // a1 = a0 + a_stride
76        ADD     x9, x6, x7              // c1 = c0 + cm_stride
77        CSEL    x11, x3, x11, LO        //   a1 = a0
78        CSEL    x9, x6, x9, LO          //   c1 = c0
79        ADD     x12, x11, x4            // a2 = a1 + a_stride
80        ADD     x10, x9, x7             // c2 = c1 + cm_stride
81                                        // if mr <= 2
82        CSEL    x12, x11, x12, LS       //   a2 = a1
83        CSEL    x10, x9, x10, LS        //   c2 = c1
84        CMP     x0, 4                   // if mr < 4
85        ADD     x4, x12, x4             // a3 = a2 + a_stride
86        ADD     x7, x10, x7             // c3 = c2 + cm_stride
87        CSEL    x4, x12, x4, LO         //   a3 = a2
88        CSEL    x7, x10, x7, LO         //   c3 = c2
89
900:
          # Outer loop entry: one pass per group of 12 output columns.
          # x0 (mr) is no longer needed from here on and is reused as the k counter.
91        # Load initial bias from w into accumulators
          # The 12 values loaded into v20-v22 are copied to the other three rows
          # (v23-v25, v26-v28, v29-v31); prefetches are interleaved with the MOVs.
92        LD1     {v20.16b, v21.16b, v22.16b}, [x5], 48
93        MOV     v23.16b, v20.16b
94        PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
95        PRFM    PLDL1KEEP,  [x3, 64]
96        MOV     v24.16b, v21.16b
97        PRFM    PLDL1KEEP,  [x11,  0]
98        PRFM    PLDL1KEEP,  [x11, 64]
99        MOV     v25.16b, v22.16b
100        PRFM    PLDL1KEEP, [x12,  0]
101        PRFM    PLDL1KEEP, [x12, 64]
102        MOV     v26.16b, v20.16b
103        PRFM    PLDL1KEEP, [x4,  0]
104        PRFM    PLDL1KEEP, [x4, 64]
105        MOV     v27.16b, v21.16b
106        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
107        PRFM    PLDL1KEEP, [x5,  64]
108        MOV     v28.16b, v22.16b
109        PRFM    PLDL1KEEP, [x5, 128]
110        PRFM    PLDL1KEEP, [x5, 192]
111        MOV     v29.16b, v20.16b
112        PRFM    PLDL1KEEP, [x5, 256]
113        MOV     v30.16b, v21.16b
114        PRFM    PLDL1KEEP, [x5, 320]
115        MOV     v31.16b, v22.16b
116
117        # Is there at least 4 floats (16 bytes)?
118        SUBS    x0, x2, 16              // k = kc - 16
119        B.LO    4f
120
121        SUBS    x0, x0, 16              // k -= 16; these flags are tested by B.LO 2f after the prologue loads
122
123        # Prologue - loads for first group of 24 FMA
          # None of the loads below modify the condition flags, so the result of
          # the SUBS above is still live at the B.LO 2f that follows them.
124
125        # Read first block of 4 A.
          # v0 = { a0 (low 64 bits), a1 (high 64 bits) }, v1 = { a2, a3 }:
          # 8 bytes (2 floats) from each of the four A row pointers.
126        LDR     d0,  [x3], 8              // a0
127        LDR     d1, [x12], 8              // a2
128        LD1     {v0.d}[1], [x11], 8       // a1
129        LD1     {v1.d}[1],  [x4], 8       // a3
130
          # Preload 96 bytes of B: v6-v10 in full, v11 as low half (d11) plus the
          # high half staged in x8, which the first BLOCK 0 inserts with INS.
131        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
132        LD1     {v9.16b, v10.16b}, [x5], 32
133        LDR     d11, [x5], 8
134        LDR     x8, [x5], 8
135
136        # Is there at least 4 floats (16 bytes) for main loop?
137        B.LO    2f
138
139        # Main loop - 4 floats of A (16 bytes)
          # Per iteration: 16 bytes are read from each A row pointer (two 8-byte
          # loads) and x5 advances by 192 bytes (ADD x5, x5, 192 in the last block).
          # 128-bit operands are assembled from two 64-bit loads: LDR dN fills the
          # low half (and zeroes the high half), then LDR x8 + INS vN.d[1], x8
          # fills the high half, so each INS must follow the LDR dN of the same
          # register.
1401:
141        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
142        # A is loaded for 2nd group into v2/v3
143        # INS is 4 blocks (16 cycles) after load
          # NOTE(review): in the code below each INS sits in the block directly
          # after the LDR x8 that feeds it; the "4 blocks" figure above may be
          # stale - confirm against the template.
144
145        # BLOCK 0
146        LDR     d2, [x3], 8                // a0
147        INS     v11.d[1], x8
148        FMLA    v20.4s, v6.4s, v0.s[0]
149        LDR     x8, [x11], 8               // a1
150        FMLA    v23.4s, v6.4s, v0.s[2]
151        FMLA    v26.4s, v6.4s, v1.s[0]
152        PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0
153
154        # BLOCK 1
155        LDR     d3, [x12], 8               // a2
156        INS     v2.d[1], x8                // a1 was loaded in block 0
157        FMLA    v29.4s, v6.4s, v1.s[2]
158        LDR     x8, [x4], 8                // a3
159        FMLA    v21.4s, v7.4s, v0.s[0]
160        FMLA    v24.4s, v7.4s, v0.s[2]
161        PRFM    PLDL1KEEP, [x11, 128]      // Prefetch A1
162
163        # BLOCK 2
164        LDR     d14, [x5]                  // vb0x0123
165        INS     v3.d[1], x8                // a3 was loaded in block 1
166        FMLA    v27.4s, v7.4s, v1.s[0]
167        LDR     x8, [x5, 8]
168        FMLA    v30.4s, v7.4s, v1.s[2]
169        FMLA    v22.4s, v8.4s, v0.s[0]
170        PRFM    PLDL1KEEP, [x12, 128]     // Prefetch A2
171
172        # BLOCK 3
173        LDR     d15, [x5, 16]              // vb0x4567
174        INS     v14.d[1], x8               // v14 was loaded in block 2
175        FMLA    v25.4s, v8.4s, v0.s[2]
176        LDR     x8, [x5, 24]
177        FMLA    v28.4s, v8.4s, v1.s[0]
178        FMLA    v31.4s, v8.4s, v1.s[2]
179        PRFM    PLDL1KEEP, [x4, 128]      // Prefetch A3
180
181        # BLOCK 4
182        LDR     d16, [x5, 32]              // vb0x89AB
183        INS     v15.d[1], x8
184        FMLA    v20.4s, v9.4s, v0.s[1]
185        LDR     x8, [x5, 40]
186        FMLA    v23.4s, v9.4s, v0.s[3]
187        FMLA    v26.4s, v9.4s, v1.s[1]
188        PRFM    PLDL1KEEP, [x5, 320]      // Prefetch B
189
190        # BLOCK 5
191        LDR     d17, [x5, 48]              // vb1x0123
192        INS     v16.d[1], x8
193        FMLA    v29.4s, v9.4s, v1.s[3]
194        LDR     x8, [x5, 56]
195        FMLA    v21.4s, v10.4s, v0.s[1]
196        FMLA    v24.4s, v10.4s, v0.s[3]
197        PRFM    PLDL1KEEP, [x5, 384]      // Prefetch B
198
199        # BLOCK 6
200        LDR     d18, [x5, 64]              // vb1x4567
201        INS     v17.d[1], x8
202        FMLA    v27.4s, v10.4s, v1.s[1]
203        LDR     x8, [x5, 72]
204        FMLA    v30.4s, v10.4s, v1.s[3]
205        FMLA    v22.4s, v11.4s, v0.s[1]
206        PRFM    PLDL1KEEP, [x5, 448]      // Prefetch B
207
208        # BLOCK 7
209        LDR     d19, [x5, 80]              // vb1x89AB
210        INS     v18.d[1], x8
211        FMLA    v25.4s, v11.4s, v0.s[3]
212        LDR     x8, [x5, 88]
213        FMLA    v28.4s, v11.4s, v1.s[1]
214        FMLA    v31.4s, v11.4s, v1.s[3]
215
216        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
217        # A is loaded for 1st group into v0/v1
          # This group multiplies v2/v3 (A) by v14-v19 (B) while reloading v0/v1
          # and v6-v11 for the next iteration or for the epilogue.
218
219        # BLOCK 0
220        LDR     d0, [x3], 8                // a0
221        INS     v19.d[1], x8
222        FMLA    v20.4s, v14.4s, v2.s[0]
223        LDR     x8, [x11], 8               // a1
224        FMLA    v23.4s, v14.4s, v2.s[2]
225        FMLA    v26.4s, v14.4s, v3.s[0]
226
227        # BLOCK 1
228        LDR     d1, [x12], 8               // a2
229        INS     v0.d[1], x8                // a1
230        FMLA    v29.4s, v14.4s, v3.s[2]
231        LDR     x8, [x4], 8                // a3
232        FMLA    v21.4s, v15.4s, v2.s[0]
233        FMLA    v24.4s, v15.4s, v2.s[2]
234
235        # BLOCK 2
236        LDR     d6, [x5, 96]               // vb0x0123
237        INS     v1.d[1], x8                // a3
238        FMLA    v27.4s, v15.4s, v3.s[0]
239        LDR     x8, [x5, 104]
240        FMLA    v30.4s, v15.4s, v3.s[2]
241        FMLA    v22.4s, v16.4s, v2.s[0]
242
243        # BLOCK 3
244        LDR     d7, [x5, 112]              // vb0x4567
245        INS     v6.d[1], x8
246        FMLA    v25.4s, v16.4s, v2.s[2]
247        LDR     x8, [x5, 120]
248        FMLA    v28.4s, v16.4s, v3.s[0]
249        FMLA    v31.4s, v16.4s, v3.s[2]
250
251        # BLOCK 4
252        LDR     d8, [x5, 128]              // vb0x89AB
253        INS     v7.d[1], x8
254        FMLA    v20.4s, v17.4s, v2.s[1]
255        LDR     x8, [x5, 136]
256        FMLA    v23.4s, v17.4s, v2.s[3]
257        FMLA    v26.4s, v17.4s, v3.s[1]
258
259        # BLOCK 5
260        LDR     d9, [x5, 144]              // vb1x0123
261        INS     v8.d[1], x8
262        FMLA    v29.4s, v17.4s, v3.s[3]
263        LDR     x8, [x5, 152]
264        FMLA    v21.4s, v18.4s, v2.s[1]
265        FMLA    v24.4s, v18.4s, v2.s[3]
266
267        # BLOCK 6
268        LDR     d10, [x5, 160]             // vb1x4567
269        INS     v9.d[1], x8
270        FMLA    v27.4s, v18.4s, v3.s[1]
271        LDR     x8, [x5, 168]
272        FMLA    v30.4s, v18.4s, v3.s[3]
273        SUBS    x0, x0, 16                 // k -= 16; flags are tested by B.HS 1b at the end of the loop
274        FMLA    v22.4s, v19.4s, v2.s[1]
275
276        # BLOCK 7
277        LDR     d11, [x5, 176]             // vb1x89AB
278        INS     v10.d[1], x8
279        FMLA    v25.4s, v19.4s, v2.s[3]
280        LDR     x8, [x5, 184]              // high half of v11; inserted by the next BLOCK 0 (loop or epilogue)
281        FMLA    v28.4s, v19.4s, v3.s[1]
282        ADD     x5, x5, 192                // ADD (not ADDS): leaves the SUBS flags intact
283        FMLA    v31.4s, v19.4s, v3.s[3]
284        B.HS    1b
285
286        # Epilogue
287        # First block same as main loop.  Second block has no loads.
          # Uses v0/v1 and v6-v11 (plus x8) preloaded by the prologue or by the
          # last main-loop iteration; loads 8 more bytes per A row into v2/v3 and
          # 96 more bytes of B into v14-v19.  No prefetches are issued here.
2882:
289        # BLOCK 0
290        LDR     d2, [x3], 8                // a0
291        INS     v11.d[1], x8
292        FMLA    v20.4s, v6.4s, v0.s[0]
293        LDR     x8, [x11], 8               // a1
294        FMLA    v23.4s, v6.4s, v0.s[2]
295        FMLA    v26.4s, v6.4s, v1.s[0]
296
297        # BLOCK 1
298        LDR     d3, [x12], 8               // a2
299        INS     v2.d[1], x8                // a1 was loaded in block 0
300        FMLA    v29.4s, v6.4s, v1.s[2]
301        LDR     x8, [x4], 8                // a3
302        FMLA    v21.4s, v7.4s, v0.s[0]
303        FMLA    v24.4s, v7.4s, v0.s[2]
304
305        # BLOCK 2
306        LDR     d14, [x5]                  // vb0x0123
307        INS     v3.d[1], x8                // a3 was loaded in block 1
308        FMLA    v27.4s, v7.4s, v1.s[0]
309        LDR     x8, [x5, 8]
310        FMLA    v30.4s, v7.4s, v1.s[2]
311        FMLA    v22.4s, v8.4s, v0.s[0]
312
313        # BLOCK 3
314        LDR     d15, [x5, 16]              // vb0x4567
315        INS     v14.d[1], x8               // v14 was loaded in block 2
316        FMLA    v25.4s, v8.4s, v0.s[2]
317        LDR     x8, [x5, 24]
318        FMLA    v28.4s, v8.4s, v1.s[0]
319        FMLA    v31.4s, v8.4s, v1.s[2]
320
321        # BLOCK 4
322        LDR     d16, [x5, 32]              // vb0x89AB
323        INS     v15.d[1], x8
324        FMLA    v20.4s, v9.4s, v0.s[1]
325        LDR     x8, [x5, 40]
326        FMLA    v23.4s, v9.4s, v0.s[3]
327        FMLA    v26.4s, v9.4s, v1.s[1]
328
329        # BLOCK 5
330        LDR     d17, [x5, 48]             // vb1x0123
331        INS     v16.d[1], x8
332        FMLA    v29.4s, v9.4s, v1.s[3]
333        LDR     x8, [x5, 56]
334        FMLA    v21.4s, v10.4s, v0.s[1]
335        FMLA    v24.4s, v10.4s, v0.s[3]
336
337        # BLOCK 6
338        LDR     d18, [x5, 64]             // vb1x4567
339        INS     v17.d[1], x8
340        FMLA    v27.4s, v10.4s, v1.s[1]
341        LDR     x8, [x5, 72]
342        FMLA    v30.4s, v10.4s, v1.s[3]
343        FMLA    v22.4s, v11.4s, v0.s[1]
344
345        # BLOCK 7
346        LDR     d19, [x5, 80]             // vb1x89AB
347        INS     v18.d[1], x8
348        FMLA    v25.4s, v11.4s, v0.s[3]
349        LDR     x8, [x5, 88]
350        FMLA    v28.4s, v11.4s, v1.s[1]
351        FMLA    v31.4s, v11.4s, v1.s[3]
352
353        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
354        # A is loaded for 1st group into v0/v1
          # NOTE(review): the two lines above match the main-loop wording, but in
          # this epilogue the second group issues no loads; only the INS that
          # completes v19 remains, followed by FMLAs on v2/v3 and v14-v19.
355
356        # BLOCK 0
357        INS     v19.d[1], x8
358        FMLA    v20.4s, v14.4s, v2.s[0]
359        FMLA    v23.4s, v14.4s, v2.s[2]
360        FMLA    v26.4s, v14.4s, v3.s[0]
361
362        # BLOCK 1
363        FMLA    v29.4s, v14.4s, v3.s[2]
364        FMLA    v21.4s, v15.4s, v2.s[0]
365        FMLA    v24.4s, v15.4s, v2.s[2]
366
367        # BLOCK 2
368        FMLA    v27.4s, v15.4s, v3.s[0]
369        FMLA    v30.4s, v15.4s, v3.s[2]
370        FMLA    v22.4s, v16.4s, v2.s[0]
371
372        # BLOCK 3
373        FMLA    v25.4s, v16.4s, v2.s[2]
374        FMLA    v28.4s, v16.4s, v3.s[0]
375        FMLA    v31.4s, v16.4s, v3.s[2]
376
377        # BLOCK 4
378        FMLA    v20.4s, v17.4s, v2.s[1]
379        FMLA    v23.4s, v17.4s, v2.s[3]
380        FMLA    v26.4s, v17.4s, v3.s[1]
381
382        # BLOCK 5
383        FMLA    v29.4s, v17.4s, v3.s[3]
384        FMLA    v21.4s, v18.4s, v2.s[1]
385        FMLA    v24.4s, v18.4s, v2.s[3]
386
387        # BLOCK 6
388        FMLA    v27.4s, v18.4s, v3.s[1]
389        FMLA    v30.4s, v18.4s, v3.s[3]
390        FMLA    v22.4s, v19.4s, v2.s[1]
391        TST     x0, 15                     // low 4 bits of k equal those of kc (only multiples of 16 were subtracted)
392
393        # BLOCK 7
394        FMLA    v25.4s, v19.4s, v2.s[3]
395        FMLA    v28.4s, v19.4s, v3.s[1]
396        ADD     x5, x5, 96                 // skip the 96 bytes of B read above; does not disturb the TST flags
397        FMLA    v31.4s, v19.4s, v3.s[3]
398
399        # Is there a remainder?- 2 floats of A (8 bytes) or less
400        B.NE    4f
401
4023:
403        # Clamp
          # FMAX against v4 applies the lower bound, FMIN against v5 the upper.
          # The SUBS below is placed early; FMAX/FMIN, ST1 and SUB do not modify
          # flags, so its result drives both B.LO 6f and B.HI 0b further down.
404        FMAX    v20.4s, v20.4s, v4.4s
405        SUBS    x1, x1, 12              // nc -= 12
406        FMAX    v21.4s, v21.4s, v4.4s
407        FMAX    v22.4s, v22.4s, v4.4s
408        FMAX    v23.4s, v23.4s, v4.4s
409        FMAX    v24.4s, v24.4s, v4.4s
410        FMAX    v25.4s, v25.4s, v4.4s
411        FMAX    v26.4s, v26.4s, v4.4s
412        FMAX    v27.4s, v27.4s, v4.4s
413        FMAX    v28.4s, v28.4s, v4.4s
414        FMAX    v29.4s, v29.4s, v4.4s
415        FMAX    v30.4s, v30.4s, v4.4s
416        FMAX    v31.4s, v31.4s, v4.4s
417        FMIN    v20.4s, v20.4s, v5.4s
418        FMIN    v21.4s, v21.4s, v5.4s
419        FMIN    v22.4s, v22.4s, v5.4s
420        FMIN    v23.4s, v23.4s, v5.4s
421        FMIN    v24.4s, v24.4s, v5.4s
422        FMIN    v25.4s, v25.4s, v5.4s
423        FMIN    v26.4s, v26.4s, v5.4s
424        FMIN    v27.4s, v27.4s, v5.4s
425        FMIN    v28.4s, v28.4s, v5.4s
426        FMIN    v29.4s, v29.4s, v5.4s
427        FMIN    v30.4s, v30.4s, v5.4s
428        FMIN    v31.4s, v31.4s, v5.4s
429
430        # Store full 4 x 12
431        B.LO    6f                      // fewer than 12 columns were left: partial store instead
432
          # Each ST1 writes 48 bytes and then advances its C pointer by cn_stride
          # (x14).  The A pointers are rewound by kc so the next pass of the outer
          # loop rereads the same A rows.
433        ST1     {v20.16b, v21.16b, v22.16b},  [x6], x14
434        SUB     x3,  x3, x2             // a0 -= kc
435        ST1     {v23.16b, v24.16b, v25.16b},  [x9], x14
436        SUB     x11, x11, x2            // a1 -= kc
437        ST1     {v26.16b, v27.16b, v28.16b}, [x10], x14
438        SUB     x12, x12, x2            // a2 -= kc
439        ST1     {v29.16b, v30.16b, v31.16b},  [x7], x14
440        SUB     x4,  x4, x2             // a3 -= kc
441
442        B.HI    0b                      // nc is still above zero after the subtract: next 12 columns
443
444        # Restore d8-d11,d14,d15 from stack
445        LDP     d14, d15, [sp, 32]
446        LDP     d10, d11, [sp, 16]
447        LDP     d8,  d9, [sp], 48       // post-increment releases the 48-byte save area
448        RET
449
4504:
          # kc remainder handling.  Reached with x0 = kc - 16 (kc below 16 bytes)
          # or from the epilogue; in both cases bits 3 and 2 of x0 equal bits 3
          # and 2 of kc, because only multiples of 16 have been subtracted.
451        # Is there a remainder?- 2 floats of A (8 bytes)
452        TBZ     x0, 3, 5f
453
454        # Remainder - 2 floats of A (8 bytes)
455        # Read first block of 4 A.
          # Unlike the main loop, each A row gets its own register here
          # (v0..v3 = a0..a3), so lanes s[0] and s[1] are used for every row.
456        LDR     d0,  [x3], 8            // a0
457        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
458        LDR     d1, [x11], 8            // a1
459        LDR     d2, [x12], 8            // a2
460        LDR     d3,  [x4], 8            // a3
461        LD1     {v9.16b, v10.16b, v11.16b}, [x5], 48
462
463        # First block of 3 B
464        FMLA    v20.4s, v6.4s, v0.s[0]
465        FMLA    v23.4s, v6.4s, v1.s[0]
466        FMLA    v26.4s, v6.4s, v2.s[0]
467        FMLA    v29.4s, v6.4s, v3.s[0]
468        FMLA    v21.4s, v7.4s, v0.s[0]
469        FMLA    v24.4s, v7.4s, v1.s[0]
470        FMLA    v27.4s, v7.4s, v2.s[0]
471        FMLA    v30.4s, v7.4s, v3.s[0]
472        FMLA    v22.4s, v8.4s, v0.s[0]
473        FMLA    v25.4s, v8.4s, v1.s[0]
474        FMLA    v28.4s, v8.4s, v2.s[0]
475        FMLA    v31.4s, v8.4s, v3.s[0]
476
477        # Second block of 3 B
478        FMLA    v20.4s, v9.4s, v0.s[1]
479        FMLA    v23.4s, v9.4s, v1.s[1]
480        FMLA    v26.4s, v9.4s, v2.s[1]
481        FMLA    v29.4s, v9.4s, v3.s[1]
482        FMLA    v21.4s, v10.4s, v0.s[1]
483        FMLA    v24.4s, v10.4s, v1.s[1]
484        FMLA    v27.4s, v10.4s, v2.s[1]
485        FMLA    v30.4s, v10.4s, v3.s[1]
486        FMLA    v22.4s, v11.4s, v0.s[1]
487        FMLA    v25.4s, v11.4s, v1.s[1]
488        FMLA    v28.4s, v11.4s, v2.s[1]
489        FMLA    v31.4s, v11.4s, v3.s[1]
490
491        TBZ     x0, 2, 3b               // no 4-byte remainder: go clamp and store
4925:
493        # Remainder - 1 float of A (4 bytes)
494        LDR     s0,  [x3], 4            // a0
495        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
496        LDR     s1, [x11], 4            // a1
497        LDR     s2, [x12], 4            // a2
498        LDR     s3,  [x4], 4            // a3
499
500        FMLA    v20.4s, v6.4s, v0.s[0]
501        FMLA    v23.4s, v6.4s, v1.s[0]
502        FMLA    v26.4s, v6.4s, v2.s[0]
503        FMLA    v29.4s, v6.4s, v3.s[0]
504        FMLA    v21.4s, v7.4s, v0.s[0]
505        FMLA    v24.4s, v7.4s, v1.s[0]
506        FMLA    v27.4s, v7.4s, v2.s[0]
507        FMLA    v30.4s, v7.4s, v3.s[0]
508        FMLA    v22.4s, v8.4s, v0.s[0]
509        FMLA    v25.4s, v8.4s, v1.s[0]
510        FMLA    v28.4s, v8.4s, v2.s[0]
511        FMLA    v31.4s, v8.4s, v3.s[0]
512        B       3b
513
5146:
          # Partial store: fewer than 12 columns remain.  Adding 12 undoes the
          # SUBS at label 3, so bits 3..0 of x1 select stores of 8, 4, 2 and 1
          # floats per row.  After each store the not-yet-stored data is moved
          # down into v20/v23/v26/v29 (or their low halves) for the next step.
515        ADD     x1, x1, 12
516        # Store odd channels
517        TBZ     x1, 3, 7f
518        STP     q20, q21,  [x6], 32
519        MOV     v20.16b, v22.16b        // columns 8-11 become the next data to store
520        STP     q23, q24,  [x9], 32
521        MOV     v23.16b, v25.16b
522        STP     q26, q27, [x10], 32
523        MOV     v26.16b, v28.16b
524        STP     q29, q30,  [x7], 32
525        MOV     v29.16b, v31.16b
526
5277:
528        TBZ     x1, 2, 8f
529        STR     q20,  [x6], 16
530        MOV     v20.16b, v21.16b        // next four columns move down
531        STR     q23,  [x9], 16
532        MOV     v23.16b, v24.16b
533        STR     q26, [x10], 16
534        MOV     v26.16b, v27.16b
535        STR     q29,  [x7], 16
536        MOV     v29.16b, v30.16b
537
5388:
539        TBZ     x1, 1, 9f
540        STR     d20,  [x6], 8
541        DUP     d20, v20.d[1]           // upper two floats move to the low half
542        STR     d23,  [x9], 8
543        DUP     d23, v23.d[1]
544        STR     d26, [x10], 8
545        DUP     d26, v26.d[1]
546        STR     d29,  [x7], 8
547        DUP     d29, v29.d[1]
548
5499:
550        TBZ     x1, 0, 10f
551        STR     s20,  [x6]              // final single float per row; pointers are not advanced
552        STR     s23,  [x9]
553        STR     s26, [x10]
554        STR     s29,  [x7]
55510:
556        # Restore d8-d11,d14,d15 from stack
557        LDP     d14, d15, [sp, 32]
558        LDP     d10, d11, [sp, 16]
559        LDP     d8,  d9, [sp], 48       // post-increment releases the 48-byte save area
560        RET
561
562END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
563
564#ifdef __ELF__
565.section ".note.GNU-stack","",%progbits
566#endif
567