// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/4x12-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x13 a0
# x14 a1
# x15 a2
# x16 a3

# C pointers
# x6  c0
# x17 c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register

# Vector register usage and GPR shadows
# A0  v0
# A1  v0[1]
# A2  v1
# A3  v1[1]
# A0  v2
# A1  v2[1]
# A2  v3
# A3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 and v13 unused.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        // Overview of the local labels used below:
        //   0:     nc loop - seed the 4x12 accumulator tile (v20-v31) from w
        //   1:     ks loop - fetch the next 4 A row pointers
        //   2:     main loop - 4 floats (16 bytes) of every A row per iteration
        //   3:     epilogue - final 4 floats, nothing loaded for a next iteration
        //   5, 6:  remainders of 2 floats and of 1 float
        //   4:     ks loop tail, then clamp, store and nc loop tail
        //   7-11:  partial store when fewer than 12 columns remain
        //
        // The stack arguments are read before sp is moved, so the offsets used
        // here are the ones seen on entry.

        # Load a_offset
        LDR     x11, [sp, 8]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Save d8-d11,d14,d15 on stack
        // v8-v11, v14 and v15 are written by the B loads below.
        STP     d8,  d9, [sp, -48]!
        STP     d10, d11, [sp, 16]
        STP     d14, d15, [sp, 32]

        # Load min/max values
        // v4 is the operand of every FMAX and v5 of every FMIN in the clamp.
        LD2R    {v4.4s, v5.4s}, [x8]

        # Clamp C pointers
        // A row at or beyond mr reuses the pointer of the row before it.
        CMP     x0, 2                   // if mr < 2
        ADD     x17, x6, x7             // c1 = c0 + cm_stride
        CSEL    x17, x6, x17, LO        //   c1 = c0

        ADD     x10, x17, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)

        CSEL    x10, x17, x10, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x7, x10, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        // The 12 bias floats are loaded once and copied to the other 3 rows.
        LD1     {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV     v23.16b, v20.16b
        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
        MOV     v24.16b, v21.16b
        PRFM    PLDL1KEEP, [x5,  64]
        MOV     v25.16b, v22.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v28.16b, v22.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v29.16b, v20.16b
        MOV     v30.16b, v21.16b
        MOV     v31.16b, v22.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x16, [x4], 16

        // A pointer equal to zero is used as is; any other pointer gets
        // a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x11           // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x11           // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x11           // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x16, x12                // if a3 == zero
        ADD     x16, x16, x11           // a3 += a_offset
        CSEL    x16, x12, x16, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // The flags from this SUBS are consumed by B.LO 5f after the prefetches.
        SUBS    x0, x2, 16              // k = kc - 16

        PRFM    PLDL1KEEP, [x13,  0]    // Prefetch A
        PRFM    PLDL1KEEP, [x13, 64]
        PRFM    PLDL1KEEP, [x14,  0]
        PRFM    PLDL1KEEP, [x14, 64]
        PRFM    PLDL1KEEP, [x15,  0]
        PRFM    PLDL1KEEP, [x15, 64]
        PRFM    PLDL1KEEP, [x16,  0]
        PRFM    PLDL1KEEP, [x16, 64]
        B.LO    5f

        // The flags from this SUBS are consumed by B.LO 3f after the prologue
        // loads, none of which modify the flags.
        SUBS    x0, x0, 16              // 4 floats for main loop

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        // v0 = { a0[0], a0[1], a1[0], a1[1] }, v1 = { a2[0], a2[1], a3[0], a3[1] }
        LDR     d0, [x13], 8              // a0
        LDR     d1, [x15], 8              // a2
        LD1     {v0.d}[1], [x14], 8       // a1
        LD1     {v1.d}[1], [x16], 8       // a3

        // The upper half of v11 is left in x8; BLOCK 0 of the main loop or of
        // the epilogue inserts it.
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1     {v9.16b, v10.16b}, [x5], 32
        LDR     d11, [x5], 8
        LDR     x8, [x5], 8

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        // Each 128-bit operand is fetched as LDR d (low half) plus LDR x8
        // (high half); the INS of x8 is issued in the block after its load.

        # BLOCK 0
        LDR     d2, [x13], 8               // a0
        INS     v11.d[1], x8
        FMLA    v20.4s, v6.4s, v0.s[0]
        LDR     x8, [x14], 8               // a1
        FMLA    v23.4s, v6.4s, v0.s[2]
        FMLA    v26.4s, v6.4s, v1.s[0]
        PRFM    PLDL1KEEP, [x13, 128]      // Prefetch A0

        # BLOCK 1
        LDR     d3, [x15], 8               // a2
        INS     v2.d[1], x8                // a1 was loaded in block 0
        FMLA    v29.4s, v6.4s, v1.s[2]
        LDR     x8, [x16], 8               // a3
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x14, 128]      // Prefetch A1

        # BLOCK 2
        LDR     d14, [x5]                  // vb0x0123
        INS     v3.d[1], x8               // a3 was loaded in block 1
        FMLA    v27.4s, v7.4s, v1.s[0]
        LDR     x8, [x5, 8]
        FMLA    v30.4s, v7.4s, v1.s[2]
        FMLA    v22.4s, v8.4s, v0.s[0]
        PRFM    PLDL1KEEP, [x15, 128]     // Prefetch A2

        # BLOCK 3
        LDR     d15, [x5, 16]              // vb0x4567
        INS     v14.d[1], x8               // v14 was loaded in block 2
        FMLA    v25.4s, v8.4s, v0.s[2]
        LDR     x8, [x5, 24]
        FMLA    v28.4s, v8.4s, v1.s[0]
        FMLA    v31.4s, v8.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x16, 128]      // Prefetch A3

        # BLOCK 4
        LDR     d16, [x5, 32]              // vb0x89AB
        INS     v15.d[1], x8
        FMLA    v20.4s, v9.4s, v0.s[1]
        LDR     x8, [x5, 40]
        FMLA    v23.4s, v9.4s, v0.s[3]
        FMLA    v26.4s, v9.4s, v1.s[1]
        PRFM    PLDL1KEEP, [x5, 320]      // Prefetch B

        # BLOCK 5
        LDR     d17, [x5, 48]              // vb1x0123
        INS     v16.d[1], x8
        FMLA    v29.4s, v9.4s, v1.s[3]
        LDR     x8, [x5, 56]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v0.s[3]
        PRFM    PLDL1KEEP, [x5, 384]      // Prefetch B

        # BLOCK 6
        LDR     d18, [x5, 64]              // vb1x4567
        INS     v17.d[1], x8
        FMLA    v27.4s, v10.4s, v1.s[1]
        LDR     x8, [x5, 72]
        FMLA    v30.4s, v10.4s, v1.s[3]
        FMLA    v22.4s, v11.4s, v0.s[1]
        PRFM    PLDL1KEEP, [x5, 448]      // Prefetch B

        # BLOCK 7
        LDR     d19, [x5, 80]              // vb1x89AB
        INS     v18.d[1], x8
        FMLA    v25.4s, v11.4s, v0.s[3]
        LDR     x8, [x5, 88]
        FMLA    v28.4s, v11.4s, v1.s[1]
        FMLA    v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1
        // B for the next first group is loaded into v6-v11 from [x5, 96..184].

        # BLOCK 0
        LDR     d0, [x13], 8               // a0
        INS     v19.d[1], x8
        FMLA    v20.4s, v14.4s, v2.s[0]
        LDR     x8, [x14], 8               // a1
        FMLA    v23.4s, v14.4s, v2.s[2]
        FMLA    v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR     d1, [x15], 8               // a2
        INS     v0.d[1], x8                // a1
        FMLA    v29.4s, v14.4s, v3.s[2]
        LDR     x8, [x16], 8               // a3
        FMLA    v21.4s, v15.4s, v2.s[0]
        FMLA    v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR     d6, [x5, 96]               // vb0x0123
        INS     v1.d[1], x8               // a3
        FMLA    v27.4s, v15.4s, v3.s[0]
        LDR     x8, [x5, 104]
        FMLA    v30.4s, v15.4s, v3.s[2]
        FMLA    v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR     d7, [x5, 112]              // vb0x4567
        INS     v6.d[1], x8
        FMLA    v25.4s, v16.4s, v2.s[2]
        LDR     x8, [x5, 120]
        FMLA    v28.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR     d8, [x5, 128]              // vb0x89AB
        INS     v7.d[1], x8
        FMLA    v20.4s, v17.4s, v2.s[1]
        LDR     x8, [x5, 136]
        FMLA    v23.4s, v17.4s, v2.s[3]
        FMLA    v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR     d9, [x5, 144]              // vb1x0123
        INS     v8.d[1], x8
        FMLA    v29.4s, v17.4s, v3.s[3]
        LDR     x8, [x5, 152]
        FMLA    v21.4s, v18.4s, v2.s[1]
        FMLA    v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR     d10, [x5, 160]             // vb1x4567
        INS     v9.d[1], x8
        FMLA    v27.4s, v18.4s, v3.s[1]
        LDR     x8, [x5, 168]
        FMLA    v30.4s, v18.4s, v3.s[3]
        SUBS    x0, x0, 16                 // k -= 16; flags used by B.HS 2b
        FMLA    v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR     d11, [x5, 176]             // vb1x89AB
        INS     v10.d[1], x8
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDR     x8, [x5, 184]              // upper half of v11, inserted in next BLOCK 0
        FMLA    v28.4s, v19.4s, v3.s[1]
        ADD     x5, x5, 192                // w += 192 bytes consumed per iteration
        FMLA    v31.4s, v19.4s, v3.s[3]
        B.HS    2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR     d2, [x13], 8               // a0
        INS     v11.d[1], x8
        FMLA    v20.4s, v6.4s, v0.s[0]
        LDR     x8, [x14], 8               // a1
        FMLA    v23.4s, v6.4s, v0.s[2]
        FMLA    v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR     d3, [x15], 8               // a2
        INS     v2.d[1], x8                // a1 was loaded in block 0
        FMLA    v29.4s, v6.4s, v1.s[2]
        LDR     x8, [x16], 8               // a3
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR     d14, [x5]                  // vb0x0123
        INS     v3.d[1], x8               // a3 was loaded in block 1
        FMLA    v27.4s, v7.4s, v1.s[0]
        LDR     x8, [x5, 8]
        FMLA    v30.4s, v7.4s, v1.s[2]
        FMLA    v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR     d15, [x5, 16]              // vb0x4567
        INS     v14.d[1], x8               // v14 was loaded in block 2
        FMLA    v25.4s, v8.4s, v0.s[2]
        LDR     x8, [x5, 24]
        FMLA    v28.4s, v8.4s, v1.s[0]
        FMLA    v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR     d16, [x5, 32]              // vb0x89AB
        INS     v15.d[1], x8
        FMLA    v20.4s, v9.4s, v0.s[1]
        LDR     x8, [x5, 40]
        FMLA    v23.4s, v9.4s, v0.s[3]
        FMLA    v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR     d17, [x5, 48]             // vb1x0123
        INS     v16.d[1], x8
        FMLA    v29.4s, v9.4s, v1.s[3]
        LDR     x8, [x5, 56]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR     d18, [x5, 64]             // vb1x4567
        INS     v17.d[1], x8
        FMLA    v27.4s, v10.4s, v1.s[1]
        LDR     x8, [x5, 72]
        FMLA    v30.4s, v10.4s, v1.s[3]
        FMLA    v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR     d19, [x5, 80]             // vb1x89AB
        INS     v18.d[1], x8
        FMLA    v25.4s, v11.4s, v0.s[3]
        LDR     x8, [x5, 88]
        FMLA    v28.4s, v11.4s, v1.s[1]
        FMLA    v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads.
        # A (v2/v3) and B (v14-v19) were loaded by the first group above.

        # BLOCK 0
        INS     v19.d[1], x8
        FMLA    v20.4s, v14.4s, v2.s[0]
        FMLA    v23.4s, v14.4s, v2.s[2]
        FMLA    v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA    v29.4s, v14.4s, v3.s[2]
        FMLA    v21.4s, v15.4s, v2.s[0]
        FMLA    v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA    v27.4s, v15.4s, v3.s[0]
        FMLA    v30.4s, v15.4s, v3.s[2]
        FMLA    v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA    v25.4s, v16.4s, v2.s[2]
        FMLA    v28.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA    v20.4s, v17.4s, v2.s[1]
        FMLA    v23.4s, v17.4s, v2.s[3]
        FMLA    v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA    v29.4s, v17.4s, v3.s[3]
        FMLA    v21.4s, v18.4s, v2.s[1]
        FMLA    v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA    v27.4s, v18.4s, v3.s[1]
        FMLA    v30.4s, v18.4s, v3.s[3]
        FMLA    v22.4s, v19.4s, v2.s[1]
        TST     x0, 15                  // any of the low 4 bits of k left?

        # BLOCK 7
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v28.4s, v19.4s, v3.s[1]
        ADD     x5, x5, 96              // w += 96 bytes read by the epilogue
        FMLA    v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v20.4s, v20.4s, v4.4s
        # Load cn_stride
        // sp was lowered by 48 on entry, so this is the [sp] slot seen on entry.
        LDR     x0, [sp, 48]
        FMAX    v21.4s, v21.4s, v4.4s
        FMAX    v22.4s, v22.4s, v4.4s
        FMAX    v23.4s, v23.4s, v4.4s
        FMAX    v24.4s, v24.4s, v4.4s
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        // Flags from this SUBS feed both B.LO 7f and B.HI 0b below; nothing in
        // between modifies them.
        SUBS    x1, x1, 12
        FMIN    v20.4s, v20.4s, v5.4s
        FMIN    v21.4s, v21.4s, v5.4s
        FMIN    v22.4s, v22.4s, v5.4s
        FMIN    v23.4s, v23.4s, v5.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Fewer than 12 columns left?  Take the partial store path.
        B.LO    7f

        # Store full 4 x 12
        // Rows are written from c3 down to c0, so when C pointers alias
        // (mr < 4) the lowest aliased row is the one written last.
        ST1     {v29.16b, v30.16b, v31.16b},  [x7], x0
        ST1     {v26.16b, v27.16b, v28.16b}, [x10], x0
        ST1     {v23.16b, v24.16b, v25.16b}, [x17], x0
        ST1     {v20.16b, v21.16b, v22.16b},  [x6], x0
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore d8-d11,d14,d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Subtracting multiples of 16 left the low 4 bits of x0 equal to
        // those of kc, so bits 3 and 2 select the 8 and 4 byte remainders.
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        // Here every row has its own register: v0 = a0, v1 = a1, v2 = a2, v3 = a3.
        LDR     d0,  [x13], 8           // a0
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR     d1, [x14], 8            // a1
        LDR     d2, [x15], 8            // a2
        LDR     d3,  [x16], 8           // a3
        LD1     {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA    v20.4s, v6.4s, v0.s[0]
        FMLA    v23.4s, v6.4s, v1.s[0]
        FMLA    v26.4s, v6.4s, v2.s[0]
        FMLA    v29.4s, v6.4s, v3.s[0]
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v1.s[0]
        FMLA    v27.4s, v7.4s, v2.s[0]
        FMLA    v30.4s, v7.4s, v3.s[0]
        FMLA    v22.4s, v8.4s, v0.s[0]
        FMLA    v25.4s, v8.4s, v1.s[0]
        FMLA    v28.4s, v8.4s, v2.s[0]
        FMLA    v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA    v20.4s, v9.4s, v0.s[1]
        FMLA    v23.4s, v9.4s, v1.s[1]
        FMLA    v26.4s, v9.4s, v2.s[1]
        FMLA    v29.4s, v9.4s, v3.s[1]
        FMLA    v21.4s, v10.4s, v0.s[1]
        FMLA    v24.4s, v10.4s, v1.s[1]
        FMLA    v27.4s, v10.4s, v2.s[1]
        FMLA    v30.4s, v10.4s, v3.s[1]
        FMLA    v22.4s, v11.4s, v0.s[1]
        FMLA    v25.4s, v11.4s, v1.s[1]
        FMLA    v28.4s, v11.4s, v2.s[1]
        FMLA    v31.4s, v11.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        LDR     s0,  [x13], 4           // a0
        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR     s1, [x14], 4            // a1
        LDR     s2, [x15], 4            // a2
        LDR     s3,  [x16], 4           // a3

        FMLA    v20.4s, v6.4s, v0.s[0]
        FMLA    v23.4s, v6.4s, v1.s[0]
        FMLA    v26.4s, v6.4s, v2.s[0]
        FMLA    v29.4s, v6.4s, v3.s[0]
        FMLA    v21.4s, v7.4s, v0.s[0]
        FMLA    v24.4s, v7.4s, v1.s[0]
        FMLA    v27.4s, v7.4s, v2.s[0]
        FMLA    v30.4s, v7.4s, v3.s[0]
        FMLA    v22.4s, v8.4s, v0.s[0]
        FMLA    v25.4s, v8.4s, v1.s[0]
        FMLA    v28.4s, v8.4s, v2.s[0]
        FMLA    v31.4s, v8.4s, v3.s[0]
        B       4b

7:
        // Undo the SUBS so x1 is again the remaining column count (below 12).
        // Bits 3, 2, 1 and 0 of x1 select stores of 8, 4, 2 and 1 columns.
        // After each store the next unwritten data is moved into the first
        // register of the row (v29, v26, v23, v20) for the following step.
        ADD     x1, x1, 12
        # Store odd channels
        TBZ     x1, 3, 8f
        STP     q29, q30,  [x7], 32
        MOV     v29.16b, v31.16b
        STP     q26, q27, [x10], 32
        MOV     v26.16b, v28.16b
        STP     q23, q24, [x17], 32
        MOV     v23.16b, v25.16b
        STP     q20, q21,  [x6], 32
        MOV     v20.16b, v22.16b

8:
        // Bits 3 and 2 cannot both be set (that would be 12 or more columns),
        // so when this store runs the first register still holds columns 0-3
        // and the second register is the correct one to move up.
        TBZ     x1, 2, 9f
        STR     q29,  [x7], 16
        MOV     v29.16b, v30.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q23,  [x17], 16
        MOV     v23.16b, v24.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b

9:
        TBZ     x1, 1, 10f
        STR     d29,  [x7], 8
        DUP     d29, v29.d[1]
        STR     d26, [x10], 8
        DUP     d26, v26.d[1]
        STR     d23, [x17], 8
        DUP     d23, v23.d[1]
        STR     d20,  [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s29,  [x7]
        STR     s26, [x10]
        STR     s23, [x17]
        STR     s20,  [x6]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

// On ELF targets emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain marker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
