xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27# x3  a0
28# x11 a1
29# x12 a2
30# x4  a3 / a_stride
31
32# C pointers
33# x6  c0
34# x9  c1
35# x10 c2
36# x7  c3 / cm_stride
37
38# x8 temporary vector shadow register
39
40# Vector register usage and GPR shadows
41# a0  v0
42# a1  v0[1]
43# a2  v1
44# a3  v1[1]
45# a0  v2
46# a1  v2[1]
47# a2  v3
48# a3  v3[1]
49# B   v6  v7  v8
50# B   v9 v10 v11
51# B  v14 v15 v16
52# B  v17 v18 v19
53# C  v20 v21 v22
54# C  v23 v24 v25
55# C  v26 v27 v28
56# C  v29 v30 v31
57# Clamp v4 v5
58# v12 and v13 are unused.
59
60BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
61
        # Overview: each pass of the outer loop at label 0 produces up to 4 rows
        # by 12 columns of C. Accumulators v20-v31 are seeded, updated with FMLA
        # over kc bytes of A against packed B read through x5, clamped against
        # v4/v5, and stored through the c0-c3 pointers.
62        $if INC:
63          # Load cn_stride, acc
64          LDP     x14, x15, [sp]          // x14 = cn_stride, x15 = acc
65          # Load params pointer
66          LDR     x8, [sp, 16]            // x8 = params
67        $else:
68          # Load cn_stride, params pointer
69          LDP     x14, x8, [sp]           // x14 = cn_stride, x8 = params
70
71        # Load min/max values
        # LD2R reads two consecutive floats at [x8] and replicates the first
        # into every lane of v4 and the second into every lane of v5.
72        LD2R    {v4.4s, v5.4s}, [x8]
73
74        # Save d8-d11,d14,d15 on stack
        # The stack arguments were already read above, so sp can move now.
75        STP     d8,  d9, [sp, -48]!     // reserve 48 bytes, store at the new sp
76        STP     d10, d11, [sp, 16]
77        STP     d14, d15, [sp, 32]
78
79        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so their loads and stores
        # repeat a valid row. ADD and CSEL leave the flags from each CMP intact:
        # LO below means mr < 2 (later mr < 4), LS means mr <= 2.
80        CMP     x0, 2                   // if mr < 2
81        ADD     x11, x3, x4             // a1 = a0 + a_stride
82        ADD     x9, x6, x7              // c1 = c0 + cm_stride
83        CSEL    x11, x3, x11, LO        //   a1 = a0
84        CSEL    x9, x6, x9, LO          //   c1 = c0
85        ADD     x12, x11, x4            // a2 = a1 + a_stride
86        ADD     x10, x9, x7             // c2 = c1 + cm_stride
87                                        // if mr <= 2
88        CSEL    x12, x11, x12, LS       //   a2 = a1
89        CSEL    x10, x9, x10, LS        //   c2 = c1
90        CMP     x0, 4                   // if mr < 4
91        ADD     x4, x12, x4             // a3 = a2 + a_stride
92        ADD     x7, x10, x7             // c3 = c2 + cm_stride
93        CSEL    x4, x12, x4, LO         //   a3 = a2
94        CSEL    x7, x10, x7, LO         //   c3 = c2
95
960:
        # Start of one output tile of 12 columns; re-entered from B.HI 0b.
97        $if INC:
98          # Load initial accumulators
99          LD1     {v20.16b, v21.16b, v22.16b}, [x15], 48    // row 0 (stored via x6)
100          LD1     {v23.16b, v24.16b, v25.16b}, [x15], 48    // row 1 (stored via x9)
101          LD1     {v26.16b, v27.16b, v28.16b}, [x15], 48    // row 2 (stored via x10)
102          LD1     {v29.16b, v30.16b, v31.16b}, [x15], 48    // row 3 (stored via x7)
103          PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
104          PRFM    PLDL1KEEP,  [x3, 64]
105          PRFM    PLDL1KEEP, [x11,  0]
106          PRFM    PLDL1KEEP, [x11, 64]
107          PRFM    PLDL1KEEP, [x12,  0]
108          PRFM    PLDL1KEEP, [x12, 64]
109          PRFM    PLDL1KEEP,  [x4,  0]
110          PRFM    PLDL1KEEP,  [x4, 64]
111          PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
112          PRFM    PLDL1KEEP, [x5,  64]
113          PRFM    PLDL1KEEP, [x5, 128]
114          PRFM    PLDL1KEEP, [x5, 192]
115          PRFM    PLDL1KEEP, [x5, 256]
116          PRFM    PLDL1KEEP, [x5, 320]
117        $else:
118          # Load initial bias from w into accumulators
          # The 12 bias floats are read once into row 0 and copied to rows 1-3
          # by the MOVs interleaved with the prefetches below.
119          LD1     {v20.16b, v21.16b, v22.16b}, [x5], 48
120          MOV     v23.16b, v20.16b            // row 1, columns 0-3
121          PRFM    PLDL1KEEP,  [x3,  0]    // Prefetch A
122          PRFM    PLDL1KEEP,  [x3, 64]
123          MOV     v24.16b, v21.16b            // row 1, columns 4-7
124          PRFM    PLDL1KEEP,  [x11,  0]
125          PRFM    PLDL1KEEP,  [x11, 64]
126          MOV     v25.16b, v22.16b            // row 1, columns 8-11
127          PRFM    PLDL1KEEP, [x12,  0]
128          PRFM    PLDL1KEEP, [x12, 64]
129          MOV     v26.16b, v20.16b            // row 2, columns 0-3
130          PRFM    PLDL1KEEP, [x4,  0]
131          PRFM    PLDL1KEEP, [x4, 64]
132          MOV     v27.16b, v21.16b            // row 2, columns 4-7
133          PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
134          PRFM    PLDL1KEEP, [x5,  64]
135          MOV     v28.16b, v22.16b            // row 2, columns 8-11
136          PRFM    PLDL1KEEP, [x5, 128]
137          PRFM    PLDL1KEEP, [x5, 192]
138          MOV     v29.16b, v20.16b            // row 3, columns 0-3
139          PRFM    PLDL1KEEP, [x5, 256]
140          MOV     v30.16b, v21.16b            // row 3, columns 4-7
141          PRFM    PLDL1KEEP, [x5, 320]
142          MOV     v31.16b, v22.16b            // row 3, columns 8-11
143
144        # Is there at least 4 floats (16 bytes)?
145        SUBS    x0, x2, 16              // k = kc - 16
146        B.LO    4f                      // kc < 16: only the remainder code runs
147
148        SUBS    x0, x0, 16              // k -= 16; these flags feed B.LO 2f below
149
150        # Prologue - loads for first group of 24 FMA
151
152        # Read first block of 4 A.
        # Two floats per row: v0 = {a0, a0, a1, a1}, v1 = {a2, a2, a3, a3}.
153        LDR     d0,  [x3], 8              // a0
154        LDR     d1, [x12], 8              // a2
155        LD1     {v0.d}[1], [x11], 8       // a1
156        LD1     {v1.d}[1],  [x4], 8       // a3
157
        # Read 96 bytes of B. The last 8 bytes go to x8 and are inserted into
        # the high half of v11 by the INS in BLOCK 0 of the code that follows.
158        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
159        LD1     {v9.16b, v10.16b}, [x5], 32
160        LDR     d11, [x5], 8
161        LDR     x8, [x5], 8
162
163        # Is there at least 4 floats (16 bytes) for main loop?
164        B.LO    2f                      // uses the flags of the second SUBS above
165
166        # Main loop - 4 floats of A (16 bytes)
1671:
168        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
169        # A is loaded for 2nd group into v2/v3
170        # INS is 4 blocks (16 cycles) after load
        # NOTE(review): in the code below each INS sits one block after the
        # LDR into x8 that feeds it, not four; confirm the intended spacing.
        # Lane layout on entry: v0 = {a0[k], a0[k+1], a1[k], a1[k+1]} and
        # v1 = {a2[k], a2[k+1], a3[k], a3[k+1]}, so s[0]/s[2] pick step k of a
        # row pair and s[1]/s[3] pick step k+1.
        # Each vector is assembled from a 64-bit LDR into the low half plus a
        # 64-bit load into x8 that a later INS copies into the high half.
171
172        # BLOCK 0
173        LDR     d2, [x3], 8                // a0
174        INS     v11.d[1], x8
175        FMLA    v20.4s, v6.4s, v0.s[0]
176        LDR     x8, [x11], 8               // a1
177        FMLA    v23.4s, v6.4s, v0.s[2]
178        FMLA    v26.4s, v6.4s, v1.s[0]
179        PRFM    PLDL1KEEP, [x3, 128]      // Prefetch A0
180
181        # BLOCK 1
182        LDR     d3, [x12], 8               // a2
183        INS     v2.d[1], x8                // a1 was loaded in block 0
184        FMLA    v29.4s, v6.4s, v1.s[2]
185        LDR     x8, [x4], 8                // a3
186        FMLA    v21.4s, v7.4s, v0.s[0]
187        FMLA    v24.4s, v7.4s, v0.s[2]
188        PRFM    PLDL1KEEP, [x11, 128]      // Prefetch A1
189
190        # BLOCK 2
191        LDR     d14, [x5]                  // vb0x0123
192        INS     v3.d[1], x8                // a3 was loaded in block 1
193        FMLA    v27.4s, v7.4s, v1.s[0]
194        LDR     x8, [x5, 8]
195        FMLA    v30.4s, v7.4s, v1.s[2]
196        FMLA    v22.4s, v8.4s, v0.s[0]
197        PRFM    PLDL1KEEP, [x12, 128]     // Prefetch A2
198
199        # BLOCK 3
200        LDR     d15, [x5, 16]              // vb0x4567
201        INS     v14.d[1], x8               // v14 was loaded in block 2
202        FMLA    v25.4s, v8.4s, v0.s[2]
203        LDR     x8, [x5, 24]
204        FMLA    v28.4s, v8.4s, v1.s[0]
205        FMLA    v31.4s, v8.4s, v1.s[2]
206        PRFM    PLDL1KEEP, [x4, 128]      // Prefetch A3
207
208        # BLOCK 4
209        LDR     d16, [x5, 32]              // vb0x89AB
210        INS     v15.d[1], x8
211        FMLA    v20.4s, v9.4s, v0.s[1]
212        LDR     x8, [x5, 40]
213        FMLA    v23.4s, v9.4s, v0.s[3]
214        FMLA    v26.4s, v9.4s, v1.s[1]
215        PRFM    PLDL1KEEP, [x5, 320]      // Prefetch B
216
217        # BLOCK 5
218        LDR     d17, [x5, 48]              // vb1x0123
219        INS     v16.d[1], x8
220        FMLA    v29.4s, v9.4s, v1.s[3]
221        LDR     x8, [x5, 56]
222        FMLA    v21.4s, v10.4s, v0.s[1]
223        FMLA    v24.4s, v10.4s, v0.s[3]
224        PRFM    PLDL1KEEP, [x5, 384]      // Prefetch B
225
226        # BLOCK 6
227        LDR     d18, [x5, 64]              // vb1x4567
228        INS     v17.d[1], x8
229        FMLA    v27.4s, v10.4s, v1.s[1]
230        LDR     x8, [x5, 72]
231        FMLA    v30.4s, v10.4s, v1.s[3]
232        FMLA    v22.4s, v11.4s, v0.s[1]
233        PRFM    PLDL1KEEP, [x5, 448]      // Prefetch B
234
235        # BLOCK 7
236        LDR     d19, [x5, 80]              // vb1x89AB
237        INS     v18.d[1], x8
238        FMLA    v25.4s, v11.4s, v0.s[3]
239        LDR     x8, [x5, 88]
240        FMLA    v28.4s, v11.4s, v1.s[1]
241        FMLA    v31.4s, v11.4s, v1.s[3]
242
243        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
244        # A is loaded for 1st group into v0/v1
        # B for the next first group is loaded into v6-v11 from [x5, 96..191];
        # the high half of v11 stays in x8 until BLOCK 0 of the next pass.
245
246        # BLOCK 0
247        LDR     d0, [x3], 8                // a0
248        INS     v19.d[1], x8
249        FMLA    v20.4s, v14.4s, v2.s[0]
250        LDR     x8, [x11], 8               // a1
251        FMLA    v23.4s, v14.4s, v2.s[2]
252        FMLA    v26.4s, v14.4s, v3.s[0]
253
254        # BLOCK 1
255        LDR     d1, [x12], 8               // a2
256        INS     v0.d[1], x8                // a1
257        FMLA    v29.4s, v14.4s, v3.s[2]
258        LDR     x8, [x4], 8                // a3
259        FMLA    v21.4s, v15.4s, v2.s[0]
260        FMLA    v24.4s, v15.4s, v2.s[2]
261
262        # BLOCK 2
263        LDR     d6, [x5, 96]               // vb0x0123
264        INS     v1.d[1], x8                // a3
265        FMLA    v27.4s, v15.4s, v3.s[0]
266        LDR     x8, [x5, 104]
267        FMLA    v30.4s, v15.4s, v3.s[2]
268        FMLA    v22.4s, v16.4s, v2.s[0]
269
270        # BLOCK 3
271        LDR     d7, [x5, 112]              // vb0x4567
272        INS     v6.d[1], x8
273        FMLA    v25.4s, v16.4s, v2.s[2]
274        LDR     x8, [x5, 120]
275        FMLA    v28.4s, v16.4s, v3.s[0]
276        FMLA    v31.4s, v16.4s, v3.s[2]
277
278        # BLOCK 4
279        LDR     d8, [x5, 128]              // vb0x89AB
280        INS     v7.d[1], x8
281        FMLA    v20.4s, v17.4s, v2.s[1]
282        LDR     x8, [x5, 136]
283        FMLA    v23.4s, v17.4s, v2.s[3]
284        FMLA    v26.4s, v17.4s, v3.s[1]
285
286        # BLOCK 5
287        LDR     d9, [x5, 144]              // vb1x0123
288        INS     v8.d[1], x8
289        FMLA    v29.4s, v17.4s, v3.s[3]
290        LDR     x8, [x5, 152]
291        FMLA    v21.4s, v18.4s, v2.s[1]
292        FMLA    v24.4s, v18.4s, v2.s[3]
293
294        # BLOCK 6
295        LDR     d10, [x5, 160]             // vb1x4567
296        INS     v9.d[1], x8
297        FMLA    v27.4s, v18.4s, v3.s[1]
298        LDR     x8, [x5, 168]
299        FMLA    v30.4s, v18.4s, v3.s[3]
300        SUBS    x0, x0, 16                 // k -= 16; flags feed B.HS 1b below
301        FMLA    v22.4s, v19.4s, v2.s[1]
302
303        # BLOCK 7
304        LDR     d11, [x5, 176]             // vb1x89AB
305        INS     v10.d[1], x8
306        FMLA    v25.4s, v19.4s, v2.s[3]
307        LDR     x8, [x5, 184]
308        FMLA    v28.4s, v19.4s, v3.s[1]
309        ADD     x5, x5, 192                // w += 192 bytes of B read this pass
310        FMLA    v31.4s, v19.4s, v3.s[3]
311        B.HS    1b                         // loop while the SUBS did not borrow
312
313        # Epilogue
314        # First block same as main loop.  Second block has no loads.
3152:
        # Reached from B.LO 2f in the prologue or by falling out of the main
        # loop. v0/v1 and v6-v11 (high half of v11 still in x8) are already
        # loaded; this group loads only the second A pair (v2/v3) and v14-v19.
316        # BLOCK 0
317        LDR     d2, [x3], 8                // a0
318        INS     v11.d[1], x8
319        FMLA    v20.4s, v6.4s, v0.s[0]
320        LDR     x8, [x11], 8               // a1
321        FMLA    v23.4s, v6.4s, v0.s[2]
322        FMLA    v26.4s, v6.4s, v1.s[0]
323
324        # BLOCK 1
325        LDR     d3, [x12], 8               // a2
326        INS     v2.d[1], x8                // a1 was loaded in block 0
327        FMLA    v29.4s, v6.4s, v1.s[2]
328        LDR     x8, [x4], 8                // a3
329        FMLA    v21.4s, v7.4s, v0.s[0]
330        FMLA    v24.4s, v7.4s, v0.s[2]
331
332        # BLOCK 2
333        LDR     d14, [x5]                  // vb0x0123
334        INS     v3.d[1], x8                // a3 was loaded in block 1
335        FMLA    v27.4s, v7.4s, v1.s[0]
336        LDR     x8, [x5, 8]
337        FMLA    v30.4s, v7.4s, v1.s[2]
338        FMLA    v22.4s, v8.4s, v0.s[0]
339
340        # BLOCK 3
341        LDR     d15, [x5, 16]              // vb0x4567
342        INS     v14.d[1], x8               // v14 was loaded in block 2
343        FMLA    v25.4s, v8.4s, v0.s[2]
344        LDR     x8, [x5, 24]
345        FMLA    v28.4s, v8.4s, v1.s[0]
346        FMLA    v31.4s, v8.4s, v1.s[2]
347
348        # BLOCK 4
349        LDR     d16, [x5, 32]              // vb0x89AB
350        INS     v15.d[1], x8
351        FMLA    v20.4s, v9.4s, v0.s[1]
352        LDR     x8, [x5, 40]
353        FMLA    v23.4s, v9.4s, v0.s[3]
354        FMLA    v26.4s, v9.4s, v1.s[1]
355
356        # BLOCK 5
357        LDR     d17, [x5, 48]             // vb1x0123
358        INS     v16.d[1], x8
359        FMLA    v29.4s, v9.4s, v1.s[3]
360        LDR     x8, [x5, 56]
361        FMLA    v21.4s, v10.4s, v0.s[1]
362        FMLA    v24.4s, v10.4s, v0.s[3]
363
364        # BLOCK 6
365        LDR     d18, [x5, 64]             // vb1x4567
366        INS     v17.d[1], x8
367        FMLA    v27.4s, v10.4s, v1.s[1]
368        LDR     x8, [x5, 72]
369        FMLA    v30.4s, v10.4s, v1.s[3]
370        FMLA    v22.4s, v11.4s, v0.s[1]
371
372        # BLOCK 7
373        LDR     d19, [x5, 80]             // vb1x89AB
374        INS     v18.d[1], x8
375        FMLA    v25.4s, v11.4s, v0.s[3]
376        LDR     x8, [x5, 88]              // high half of v19, inserted by the next INS
377        FMLA    v28.4s, v11.4s, v1.s[1]
378        FMLA    v31.4s, v11.4s, v1.s[3]
379
380        # Second group of 24 fma.  8 blocks of 3 FMA, no loads.
381        # A (v2/v3) and B (v14-v19) were loaded during the first group above.
382
383        # BLOCK 0
384        INS     v19.d[1], x8               // completes v19 from the last x8 load
385        FMLA    v20.4s, v14.4s, v2.s[0]
386        FMLA    v23.4s, v14.4s, v2.s[2]
387        FMLA    v26.4s, v14.4s, v3.s[0]
388
389        # BLOCK 1
390        FMLA    v29.4s, v14.4s, v3.s[2]
391        FMLA    v21.4s, v15.4s, v2.s[0]
392        FMLA    v24.4s, v15.4s, v2.s[2]
393
394        # BLOCK 2
395        FMLA    v27.4s, v15.4s, v3.s[0]
396        FMLA    v30.4s, v15.4s, v3.s[2]
397        FMLA    v22.4s, v16.4s, v2.s[0]
398
399        # BLOCK 3
400        FMLA    v25.4s, v16.4s, v2.s[2]
401        FMLA    v28.4s, v16.4s, v3.s[0]
402        FMLA    v31.4s, v16.4s, v3.s[2]
403
404        # BLOCK 4
405        FMLA    v20.4s, v17.4s, v2.s[1]
406        FMLA    v23.4s, v17.4s, v2.s[3]
407        FMLA    v26.4s, v17.4s, v3.s[1]
408
409        # BLOCK 5
410        FMLA    v29.4s, v17.4s, v3.s[3]
411        FMLA    v21.4s, v18.4s, v2.s[1]
412        FMLA    v24.4s, v18.4s, v2.s[3]
413
414        # BLOCK 6
415        FMLA    v27.4s, v18.4s, v3.s[1]
416        FMLA    v30.4s, v18.4s, v3.s[3]
417        FMLA    v22.4s, v19.4s, v2.s[1]
418        TST     x0, 15                     // low 4 bits of k; flags feed B.NE 4f below
419
420        # BLOCK 7
421        FMLA    v25.4s, v19.4s, v2.s[3]
422        FMLA    v28.4s, v19.4s, v3.s[1]
423        ADD     x5, x5, 96                 // w += 96 bytes of B read at [x5, 0..95]
424        FMLA    v31.4s, v19.4s, v3.s[3]
425
426        # Is there a remainder?- 2 floats of A (8 bytes) or less
427        B.NE    4f                         // nonzero low bits: fewer than 16 bytes of A remain
428
4293:
430        # Clamp
        # FMAX against v4 applies the lower bound, FMIN against v5 the upper.
431        FMAX    v20.4s, v20.4s, v4.4s
432        SUBS    x1, x1, 12              // nc -= 12; flags feed B.LO 6f and B.HI 0b
433        FMAX    v21.4s, v21.4s, v4.4s
434        FMAX    v22.4s, v22.4s, v4.4s
435        FMAX    v23.4s, v23.4s, v4.4s
436        FMAX    v24.4s, v24.4s, v4.4s
437        FMAX    v25.4s, v25.4s, v4.4s
438        FMAX    v26.4s, v26.4s, v4.4s
439        FMAX    v27.4s, v27.4s, v4.4s
440        FMAX    v28.4s, v28.4s, v4.4s
441        FMAX    v29.4s, v29.4s, v4.4s
442        FMAX    v30.4s, v30.4s, v4.4s
443        FMAX    v31.4s, v31.4s, v4.4s
444        FMIN    v20.4s, v20.4s, v5.4s
445        FMIN    v21.4s, v21.4s, v5.4s
446        FMIN    v22.4s, v22.4s, v5.4s
447        FMIN    v23.4s, v23.4s, v5.4s
448        FMIN    v24.4s, v24.4s, v5.4s
449        FMIN    v25.4s, v25.4s, v5.4s
450        FMIN    v26.4s, v26.4s, v5.4s
451        FMIN    v27.4s, v27.4s, v5.4s
452        FMIN    v28.4s, v28.4s, v5.4s
453        FMIN    v29.4s, v29.4s, v5.4s
454        FMIN    v30.4s, v30.4s, v5.4s
455        FMIN    v31.4s, v31.4s, v5.4s
456
457        # Store full 4 x 12
458        B.LO    6f                      // fewer than 12 columns left: partial store
459
        # Each ST1 post-increments its C pointer by cn_stride (x14) and each
        # SUB rewinds an A pointer by kc so the next tile rereads the same rows.
        # x5 is not rewound, so the next tile continues through w.
460        $if INC:
461          ST1     {v29.16b, v30.16b, v31.16b},  [x7], x14
462          SUB     x3,  x3, x2             // a0 -= kc
463          ST1     {v26.16b, v27.16b, v28.16b}, [x10], x14
464          SUB     x11, x11, x2            // a1 -= kc
465          ST1     {v23.16b, v24.16b, v25.16b},  [x9], x14
466          SUB     x12, x12, x2            // a2 -= kc
467          ST1     {v20.16b, v21.16b, v22.16b},  [x6], x14
468          SUB     x4,  x4, x2             // a3 -= kc
469        $else:
470          ST1     {v20.16b, v21.16b, v22.16b},  [x6], x14
471          SUB     x3,  x3, x2             // a0 -= kc
472          ST1     {v23.16b, v24.16b, v25.16b},  [x9], x14
473          SUB     x11, x11, x2            // a1 -= kc
474          ST1     {v26.16b, v27.16b, v28.16b}, [x10], x14
475          SUB     x12, x12, x2            // a2 -= kc
476          ST1     {v29.16b, v30.16b, v31.16b},  [x7], x14
477          SUB     x4,  x4, x2             // a3 -= kc
478
479        B.HI    0b                      // nc still above zero after the SUBS: next tile
480
481        # Restore d8-d11,d14,d15 from stack
482        LDP     d14, d15, [sp, 32]
483        LDP     d10, d11, [sp, 16]
484        LDP     d8,  d9, [sp], 48       // reload, then release the 48 bytes
485        RET
486
4874:
        # Entered with fewer than 16 bytes of A left per row. Bits 3 and 2 of
        # x0 select the 8-byte and 4-byte steps below.
488        # Is there a remainder?- 2 floats of A (8 bytes)
489        TBZ     x0, 3, 5f               // bit 3 clear: skip the 8-byte step
490
491        # Remainder - 2 floats of A (8 bytes)
492        # Read first block of 4 A.
        # Here every row has its own register (v0-v3), so s[0] and s[1] are the
        # two k steps of that row.
493        LDR     d0,  [x3], 8            // a0
494        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
495        LDR     d1, [x11], 8            // a1
496        LDR     d2, [x12], 8            // a2
497        LDR     d3,  [x4], 8            // a3
498        LD1     {v9.16b, v10.16b, v11.16b}, [x5], 48
499
500        # First block of 3 B
501        FMLA    v20.4s, v6.4s, v0.s[0]
502        FMLA    v23.4s, v6.4s, v1.s[0]
503        FMLA    v26.4s, v6.4s, v2.s[0]
504        FMLA    v29.4s, v6.4s, v3.s[0]
505        FMLA    v21.4s, v7.4s, v0.s[0]
506        FMLA    v24.4s, v7.4s, v1.s[0]
507        FMLA    v27.4s, v7.4s, v2.s[0]
508        FMLA    v30.4s, v7.4s, v3.s[0]
509        FMLA    v22.4s, v8.4s, v0.s[0]
510        FMLA    v25.4s, v8.4s, v1.s[0]
511        FMLA    v28.4s, v8.4s, v2.s[0]
512        FMLA    v31.4s, v8.4s, v3.s[0]
513
514        # Second block of 3 B
515        FMLA    v20.4s, v9.4s, v0.s[1]
516        FMLA    v23.4s, v9.4s, v1.s[1]
517        FMLA    v26.4s, v9.4s, v2.s[1]
518        FMLA    v29.4s, v9.4s, v3.s[1]
519        FMLA    v21.4s, v10.4s, v0.s[1]
520        FMLA    v24.4s, v10.4s, v1.s[1]
521        FMLA    v27.4s, v10.4s, v2.s[1]
522        FMLA    v30.4s, v10.4s, v3.s[1]
523        FMLA    v22.4s, v11.4s, v0.s[1]
524        FMLA    v25.4s, v11.4s, v1.s[1]
525        FMLA    v28.4s, v11.4s, v2.s[1]
526        FMLA    v31.4s, v11.4s, v3.s[1]
527
528        TBZ     x0, 2, 3b               // bit 2 clear: no 4-byte step, go clamp
5295:
530        # Remainder - 1 float of A (4 bytes)
531        LDR     s0,  [x3], 4            // a0
532        LD1     {v6.16b, v7.16b, v8.16b}, [x5], 48
533        LDR     s1, [x11], 4            // a1
534        LDR     s2, [x12], 4            // a2
535        LDR     s3,  [x4], 4            // a3
536
537        FMLA    v20.4s, v6.4s, v0.s[0]
538        FMLA    v23.4s, v6.4s, v1.s[0]
539        FMLA    v26.4s, v6.4s, v2.s[0]
540        FMLA    v29.4s, v6.4s, v3.s[0]
541        FMLA    v21.4s, v7.4s, v0.s[0]
542        FMLA    v24.4s, v7.4s, v1.s[0]
543        FMLA    v27.4s, v7.4s, v2.s[0]
544        FMLA    v30.4s, v7.4s, v3.s[0]
545        FMLA    v22.4s, v8.4s, v0.s[0]
546        FMLA    v25.4s, v8.4s, v1.s[0]
547        FMLA    v28.4s, v8.4s, v2.s[0]
548        FMLA    v31.4s, v8.4s, v3.s[0]
549        B       3b                      // all of K consumed: clamp and store
550
5516:
552        ADD     x1, x1, 12              // undo the SUBS at label 3: x1 = columns left
553        # Store odd channels
        # Bits 3, 2, 1, 0 of x1 select stores of 8, 4, 2, 1 columns in turn.
        # After each store the columns not yet written are moved down into
        # v20/v23/v26/v29 so the next step always stores from those registers.
554        TBZ     x1, 3, 7f               // bit 3 clear: skip the 8-column store
555        $if INC:
556          STP     q29, q30,  [x7], 32
557          MOV     v29.16b, v31.16b
558          STP     q26, q27, [x10], 32
559          MOV     v26.16b, v28.16b
560          STP     q23, q24,  [x9], 32
561          MOV     v23.16b, v25.16b
562          STP     q20, q21,  [x6], 32
563          MOV     v20.16b, v22.16b
564        $else:
565          STP     q20, q21,  [x6], 32
566          MOV     v20.16b, v22.16b
567          STP     q23, q24,  [x9], 32
568          MOV     v23.16b, v25.16b
569          STP     q26, q27, [x10], 32
570          MOV     v26.16b, v28.16b
571          STP     q29, q30,  [x7], 32
572          MOV     v29.16b, v31.16b
573
5747:
575        TBZ     x1, 2, 8f               // bit 2 clear: skip the 4-column store
576        $if INC:
577          STR     q29,  [x7], 16
578          MOV     v29.16b, v30.16b
579          STR     q26, [x10], 16
580          MOV     v26.16b, v27.16b
581          STR     q23,  [x9], 16
582          MOV     v23.16b, v24.16b
583          STR     q20,  [x6], 16
584          MOV     v20.16b, v21.16b
585        $else:
586          STR     q20,  [x6], 16
587          MOV     v20.16b, v21.16b
588          STR     q23,  [x9], 16
589          MOV     v23.16b, v24.16b
590          STR     q26, [x10], 16
591          MOV     v26.16b, v27.16b
592          STR     q29,  [x7], 16
593          MOV     v29.16b, v30.16b
594
5958:
596        TBZ     x1, 1, 9f               // bit 1 clear: skip the 2-column store
597        $if INC:
598          STR     d29,  [x7], 8
599          DUP     d29, v29.d[1]           // upper two floats move to the low half
600          STR     d26, [x10], 8
601          DUP     d26, v26.d[1]
602          STR     d23,  [x9], 8
603          DUP     d23, v23.d[1]
604          STR     d20,  [x6], 8
605          DUP     d20, v20.d[1]
606        $else:
607          STR     d20,  [x6], 8
608          DUP     d20, v20.d[1]           // upper two floats move to the low half
609          STR     d23,  [x9], 8
610          DUP     d23, v23.d[1]
611          STR     d26, [x10], 8
612          DUP     d26, v26.d[1]
613          STR     d29,  [x7], 8
614          DUP     d29, v29.d[1]
615
6169:
617        TBZ     x1, 0, 10f              // bit 0 clear: no single column left
618        $if INC:
619          STR     s29,  [x7]
620          STR     s26, [x10]
621          STR     s23,  [x9]
622          STR     s20,  [x6]
623        $else:
624          STR     s20,  [x6]
625          STR     s23,  [x9]
626          STR     s26, [x10]
627          STR     s29,  [x7]
62810:
629        # Restore d8-d11,d14,d15 from stack
630        LDP     d14, d15, [sp, 32]
631        LDP     d10, d11, [sp, 16]
632        LDP     d8,  d9, [sp], 48       // reload, then release the 48 bytes
633        RET
634
635END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
636
637#ifdef __ELF__
638.section ".note.GNU-stack","",%progbits
639#endif
640