xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a73.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# Vector register usage
43# A0   v0  v6
44# A1   v1  v7
45# A2   v2  v8
46# A3   v3  v9
47# A4   v4 v10
48# A5   v5 v11
49# B   v12 v13 v14 v15
50# B   v16 v17 v18 v19
51# C   v20 v21
52# C   v22 v23
53# C   v24 v25
54# C   v26 v27
55# C   v28 v29
56# C   v30 v31
57# Clamp v6 v7
58
59BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73
        // Entry sequence: fetch the params pointer, save d8-d15, and derive
        // the six A row pointers (x3, x9, x10, x11, x12, x4) and six C row
        // pointers (x6, x16, x17, x14, x13, x7).  A row whose index is >= mr
        // is given the same address as the previous row (CSEL), so all six
        // rows are always processed and the surplus rows simply repeat the
        // loads and stores of the last valid row.
60
61        # Load params pointer
        // Must be read before the pre-indexed STP below moves sp.
62        LDR     x8, [sp, 8]
63
64        # Clamp A and C pointers / Save d8-d15 on stack
        // Pre-indexed store: sp -= 64, then d8/d9 are stored at the new sp.
        // The remaining STPs fill the rest of this 64-byte area and are
        // interleaved with the pointer arithmetic.
65        STP     d8,  d9, [sp, -64]!
66        CMP     x0, 2                   // if mr < 2
67        ADD     x9, x3, x4              // a1 = a0 + a_stride
68        ADD     x16, x6, x7             // c1 = c0 + cm_stride
69        CSEL    x9, x3, x9, LO          //   a1 = a0
70        CSEL    x16, x6, x16, LO        //   c1 = c0
71
72        STP     d10, d11, [sp, 16]
73        ADD     x10, x9, x4             // a2 = a1 + a_stride
74        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        // The LS selects below reuse the flags of "CMP x0, 2"; none of the
        // STP/ADD/CSEL instructions in between modify the flags.
75                                        // if mr <= 2
76        CSEL    x10, x9, x10, LS        //   a2 = a1
77        CSEL    x17, x16, x17, LS       //   c2 = c1
78
79        STP     d12, d13, [sp, 32]
80        CMP     x0, 4                   // if mr < 4
81        ADD     x11, x10, x4            // a3 = a2 + a_stride
82        ADD     x14, x17, x7            // c3 = c2 + cm_stride
83        CSEL    x11, x10, x11, LO       //   a3 = a2
84        CSEL    x14, x17, x14, LO       //   c3 = c2
85
86        STP     d14, d15, [sp, 48]
87        ADD     x12, x11, x4            // a4 = a3 + a_stride
88        ADD     x13, x14, x7            // c4 = c3 + cm_stride
        // The LS selects below reuse the flags of "CMP x0, 4".
89                                        // if mr <= 4
90        CSEL    x12, x11, x12, LS       //   a4 = a3
91        CSEL    x13, x14, x13, LS       //   c4 = c3
92
93        CMP     x0, 6                   // if mr < 6
        // x4 (a_stride) and x7 (cm_stride) are overwritten here: from this
        // point on x4 is the a5 pointer and x7 is the c5 pointer.
94        ADD     x4, x12, x4             // a5 = a4 + a_stride
95        ADD     x7, x13, x7             // c5 = c4 + cm_stride
96        CSEL    x4, x12, x4, LO         //   a5 = a4
97        CSEL    x7, x13, x7, LO         //   c5 = c4
98
99        .p2align 3
        // Label 0: start of one 8-column output block.  The clamp/store
        // section branches back here (B.HI 0b) while columns remain.
1000:
101        # Load initial bias from w into accumulators
        // q20/q21 receive 8 floats from w (x5 advances by 32 bytes); the MOVs
        // copy them into the accumulator pairs of the other five rows
        // (v22/v23 ... v30/v31).  The prefetches are interleaved with them.
102        LDP     q20, q21, [x5], 32
103        MOV     v22.16b, v20.16b
104        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
105        MOV     v23.16b, v21.16b
106        PRFM    PLDL1KEEP, [x5, 64]
107        MOV     v24.16b, v20.16b
108        PRFM    PLDL1KEEP, [x5, 128]
109        MOV     v25.16b, v21.16b
110        PRFM    PLDL1KEEP, [x5, 192]
111        MOV     v26.16b, v20.16b
112        PRFM    PLDL1KEEP,  [x3]        // Prefetch A
113        MOV     v27.16b, v21.16b
114        PRFM    PLDL1KEEP,  [x9]
115        MOV     v28.16b, v20.16b
116        PRFM    PLDL1KEEP, [x10]
117        MOV     v29.16b, v21.16b
118        PRFM    PLDL1KEEP, [x11]
119        MOV     v30.16b, v20.16b
120        PRFM    PLDL1KEEP, [x12]
121        MOV     v31.16b, v21.16b
122        PRFM    PLDL1KEEP,  [x4]
123
124        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // x0 (mr on entry) is reused from here on as the K byte counter.
        // If kc < 32 the subtraction borrows and control goes straight to
        // the remainder code at label 4.
125        SUBS    x0, x2, 32              // k = kc - 32
126        B.LO    4f
127
128        # Prologue - loads for main loop of 96 FMA
129        # load A0 to A4 but not A5
        // Each LDP reads 8 floats of one A row: the first 4 go to v0..v4,
        // the next 4 to v6..v10, and the row pointer advances by 32 bytes.
130        LDP     q0,  q6,  [x3], 32
131        LDP     q1,  q7,  [x9], 32
132        LDP     q2,  q8, [x10], 32
133        LDP     q3,  q9, [x11], 32
134        LDP     q4,  q10, [x12], 32
135        # load first set of B
136        LDP     q12, q13, [x5], 32
137        LDP     q14, q15, [x5], 32
138
139        # Is there at least 8 floats (32 bytes) for main loop?
        // If fewer than 64 bytes of K were available in total, skip the main
        // loop and let the epilogue (label 2) consume the data loaded above.
140        SUBS    x0, x0, 32
141        B.LO    2f
142
143        # Main loop - 8 floats of A (32 bytes)
144        # 96 FMA + 6 LDP A + 8 LDP B
        // Software-pipelined: on entry v0-v4 / v6-v10 (A rows 0-4) and
        // v12-v15 (B) already hold data loaded by the prologue or by the
        // previous iteration.  Each iteration loads A5 for the current step,
        // and reloads A0-A4 plus the first two B pairs for the next step.
145        .p2align 3
1461:
147        # First group of 4 A.  48 FMA.  Loads A5
148
        // A5 is loaded here rather than in the prologue: v5 = first 4 floats,
        // v11 = next 4 floats of row 5.
149        LDP     q5, q11, [x4], 32
150        FMLA    v20.4s, v12.4s,  v0.s[0]
151        FMLA    v22.4s, v12.4s,  v1.s[0]
152        LDP     q16,  q17, [x5], 32
153        FMLA    v24.4s, v12.4s,  v2.s[0]
154        FMLA    v26.4s, v12.4s,  v3.s[0]
155        LDP     q18,  q19, [x5], 32
156        FMLA    v28.4s, v12.4s,  v4.s[0]
157        FMLA    v30.4s, v12.4s,  v5.s[0]
158        FMLA    v21.4s, v13.4s,  v0.s[0]
159        FMLA    v23.4s, v13.4s,  v1.s[0]
160        FMLA    v25.4s, v13.4s,  v2.s[0]
161        FMLA    v27.4s, v13.4s,  v3.s[0]
162        FMLA    v29.4s, v13.4s,  v4.s[0]
163        FMLA    v31.4s, v13.4s,  v5.s[0]
164
165        FMLA    v20.4s, v14.4s,  v0.s[1]
166        FMLA    v22.4s, v14.4s,  v1.s[1]
167        FMLA    v24.4s, v14.4s,  v2.s[1]
168        FMLA    v26.4s, v14.4s,  v3.s[1]
169        FMLA    v28.4s, v14.4s,  v4.s[1]
170        FMLA    v30.4s, v14.4s,  v5.s[1]
171        FMLA    v21.4s, v15.4s,  v0.s[1]
172        FMLA    v23.4s, v15.4s,  v1.s[1]
173        FMLA    v25.4s, v15.4s,  v2.s[1]
174        FMLA    v27.4s, v15.4s,  v3.s[1]
175        FMLA    v29.4s, v15.4s,  v4.s[1]
176        FMLA    v31.4s, v15.4s,  v5.s[1]
177
        // v12-v15 are no longer needed for the first group, so they are
        // refilled here with the B data used by the second group.
178        LDP     q12,  q13, [x5], 32
179        FMLA    v20.4s, v16.4s,  v0.s[2]
180        FMLA    v22.4s, v16.4s,  v1.s[2]
181        LDP     q14,  q15, [x5], 32
182        FMLA    v24.4s, v16.4s,  v2.s[2]
183        FMLA    v26.4s, v16.4s,  v3.s[2]
184        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
185        FMLA    v28.4s, v16.4s,  v4.s[2]
186        FMLA    v30.4s, v16.4s,  v5.s[2]
187        PRFM    PLDL1KEEP, [x5, 256]
188        FMLA    v21.4s, v17.4s,  v0.s[2]
189        FMLA    v23.4s, v17.4s,  v1.s[2]
190        FMLA    v25.4s, v17.4s,  v2.s[2]
191        FMLA    v27.4s, v17.4s,  v3.s[2]
192        FMLA    v29.4s, v17.4s,  v4.s[2]
193        FMLA    v31.4s, v17.4s,  v5.s[2]
194
195        FMLA    v20.4s, v18.4s,  v0.s[3]
196        FMLA    v22.4s, v18.4s,  v1.s[3]
197        FMLA    v24.4s, v18.4s,  v2.s[3]
198        FMLA    v26.4s, v18.4s,  v3.s[3]
199        FMLA    v28.4s, v18.4s,  v4.s[3]
200        FMLA    v30.4s, v18.4s,  v5.s[3]
201        FMLA    v21.4s, v19.4s,  v0.s[3]
202        FMLA    v23.4s, v19.4s,  v1.s[3]
203        FMLA    v25.4s, v19.4s,  v2.s[3]
204        FMLA    v27.4s, v19.4s,  v3.s[3]
205        FMLA    v29.4s, v19.4s,  v4.s[3]
206        FMLA    v31.4s, v19.4s,  v5.s[3]
207
208        # Second group of 4 A.  48 FMA.  Loads A0 - A4
209
210        LDP     q16,  q17, [x5], 32
211        FMLA    v20.4s, v12.4s,  v6.s[0]
212        FMLA    v22.4s, v12.4s,  v7.s[0]
213        LDP     q18,  q19, [x5], 32
214        FMLA    v24.4s, v12.4s,  v8.s[0]
215        FMLA    v26.4s, v12.4s,  v9.s[0]
216        FMLA    v28.4s, v12.4s, v10.s[0]
217        FMLA    v30.4s, v12.4s, v11.s[0]
218        FMLA    v21.4s, v13.4s,  v6.s[0]
219        FMLA    v23.4s, v13.4s,  v7.s[0]
220        FMLA    v25.4s, v13.4s,  v8.s[0]
221        FMLA    v27.4s, v13.4s,  v9.s[0]
222        FMLA    v29.4s, v13.4s, v10.s[0]
223        FMLA    v31.4s, v13.4s, v11.s[0]
224
225        FMLA    v20.4s, v14.4s,  v6.s[1]
226        FMLA    v22.4s, v14.4s,  v7.s[1]
227        FMLA    v24.4s, v14.4s,  v8.s[1]
228        FMLA    v26.4s, v14.4s,  v9.s[1]
229        FMLA    v28.4s, v14.4s, v10.s[1]
230        FMLA    v30.4s, v14.4s, v11.s[1]
231        FMLA    v21.4s, v15.4s,  v6.s[1]
232        FMLA    v23.4s, v15.4s,  v7.s[1]
233        FMLA    v25.4s, v15.4s,  v8.s[1]
234        FMLA    v27.4s, v15.4s,  v9.s[1]
235        FMLA    v29.4s, v15.4s, v10.s[1]
236        FMLA    v31.4s, v15.4s, v11.s[1]
237
        // From here the lane-2 and lane-3 FMLAs are grouped per row instead
        // of per lane.  That way each of v6..v10 sees its last use before
        // the LDP that overwrites it with the next iteration's A data.
        // v12-v15 are also reloaded here with the next iteration's first B
        // vectors.
238        LDP     q12,  q13, [x5], 32
239        FMLA    v20.4s, v16.4s,  v6.s[2]
240        FMLA    v20.4s, v18.4s,  v6.s[3]
241        LDP     q14,  q15, [x5], 32
242        FMLA    v21.4s, v17.4s,  v6.s[2]
243        FMLA    v21.4s, v19.4s,  v6.s[3]
244        LDP     q0,  q6, [x3], 32
245        FMLA    v22.4s, v16.4s,  v7.s[2]
246        FMLA    v22.4s, v18.4s,  v7.s[3]
247        FMLA    v23.4s, v17.4s,  v7.s[2]
248        FMLA    v23.4s, v19.4s,  v7.s[3]
249        LDP     q1,  q7, [x9], 32
250        FMLA    v24.4s, v16.4s,  v8.s[2]
251        FMLA    v24.4s, v18.4s,  v8.s[3]
252        FMLA    v25.4s, v17.4s,  v8.s[2]
253        FMLA    v25.4s, v19.4s,  v8.s[3]
254        LDP     q2,  q8, [x10], 32
255        FMLA    v26.4s, v16.4s,  v9.s[2]
256        FMLA    v26.4s, v18.4s,  v9.s[3]
257        FMLA    v27.4s, v17.4s,  v9.s[2]
258        FMLA    v27.4s, v19.4s,  v9.s[3]
259        LDP     q3,  q9, [x11], 32
260        FMLA    v28.4s, v16.4s, v10.s[2]
261        FMLA    v28.4s, v18.4s, v10.s[3]
262        FMLA    v29.4s, v17.4s, v10.s[2]
263        FMLA    v29.4s, v19.4s, v10.s[3]
264        LDP     q4,  q10, [x12], 32
265        FMLA    v30.4s, v16.4s, v11.s[2]
266        FMLA    v30.4s, v18.4s, v11.s[3]
        // The counter update is placed two FMLAs ahead of the branch; FMLA
        // does not modify the condition flags, so B.HS still sees its result.
        // The loop repeats as long as this subtraction does not borrow.
267        SUBS    x0, x0, 32
268        FMLA    v31.4s, v17.4s, v11.s[2]
269        FMLA    v31.4s, v19.4s, v11.s[3]
270        B.HS    1b
271
272        # Epilogue - 8 floats of A (32 bytes)
273        # 96 FMA + 6 LDP A + 8 LDP B
274        # First block same as main loop.  Second block has no preloads.
        // Note: this block itself issues only 1 LDP of A (A5) and 6 LDP of B.
        // A0-A4 (v0-v4, v6-v10) and the first B vectors (v12-v15) were
        // already loaded by the prologue or by the last main-loop iteration.
2752:
276        # First group of 4 A.  48 FMA.  Loads A5
277
278        LDP     q5, q11, [x4], 32
279        FMLA    v20.4s, v12.4s,  v0.s[0]
280        FMLA    v22.4s, v12.4s,  v1.s[0]
281        LDP     q16,  q17, [x5], 32
282        FMLA    v24.4s, v12.4s,  v2.s[0]
283        FMLA    v26.4s, v12.4s,  v3.s[0]
284        LDP     q18,  q19, [x5], 32
285        FMLA    v28.4s, v12.4s,  v4.s[0]
286        FMLA    v30.4s, v12.4s,  v5.s[0]
287        FMLA    v21.4s, v13.4s,  v0.s[0]
288        FMLA    v23.4s, v13.4s,  v1.s[0]
289        FMLA    v25.4s, v13.4s,  v2.s[0]
290        FMLA    v27.4s, v13.4s,  v3.s[0]
291        FMLA    v29.4s, v13.4s,  v4.s[0]
292        FMLA    v31.4s, v13.4s,  v5.s[0]
293
294        FMLA    v20.4s, v14.4s,  v0.s[1]
295        FMLA    v22.4s, v14.4s,  v1.s[1]
296        FMLA    v24.4s, v14.4s,  v2.s[1]
297        FMLA    v26.4s, v14.4s,  v3.s[1]
298        FMLA    v28.4s, v14.4s,  v4.s[1]
299        FMLA    v30.4s, v14.4s,  v5.s[1]
300        FMLA    v21.4s, v15.4s,  v0.s[1]
301        FMLA    v23.4s, v15.4s,  v1.s[1]
302        FMLA    v25.4s, v15.4s,  v2.s[1]
303        FMLA    v27.4s, v15.4s,  v3.s[1]
304        FMLA    v29.4s, v15.4s,  v4.s[1]
305        FMLA    v31.4s, v15.4s,  v5.s[1]
306
307        LDP     q12,  q13, [x5], 32
308        FMLA    v20.4s, v16.4s,  v0.s[2]
309        FMLA    v22.4s, v16.4s,  v1.s[2]
310        LDP     q14,  q15, [x5], 32
311        FMLA    v24.4s, v16.4s,  v2.s[2]
312        FMLA    v26.4s, v16.4s,  v3.s[2]
313        FMLA    v28.4s, v16.4s,  v4.s[2]
314        FMLA    v30.4s, v16.4s,  v5.s[2]
315        FMLA    v21.4s, v17.4s,  v0.s[2]
316        FMLA    v23.4s, v17.4s,  v1.s[2]
317        FMLA    v25.4s, v17.4s,  v2.s[2]
318        FMLA    v27.4s, v17.4s,  v3.s[2]
319        FMLA    v29.4s, v17.4s,  v4.s[2]
320        FMLA    v31.4s, v17.4s,  v5.s[2]
321
322        FMLA    v20.4s, v18.4s,  v0.s[3]
323        FMLA    v22.4s, v18.4s,  v1.s[3]
324        FMLA    v24.4s, v18.4s,  v2.s[3]
325        FMLA    v26.4s, v18.4s,  v3.s[3]
326        FMLA    v28.4s, v18.4s,  v4.s[3]
327        FMLA    v30.4s, v18.4s,  v5.s[3]
328        FMLA    v21.4s, v19.4s,  v0.s[3]
329        FMLA    v23.4s, v19.4s,  v1.s[3]
330        FMLA    v25.4s, v19.4s,  v2.s[3]
331        FMLA    v27.4s, v19.4s,  v3.s[3]
332        FMLA    v29.4s, v19.4s,  v4.s[3]
333        FMLA    v31.4s, v19.4s,  v5.s[3]
334
335        # Second group of 4 A.  48 FMA. No A Loads, No last B load
336
337        LDP     q16,  q17, [x5], 32
338        FMLA    v20.4s, v12.4s,  v6.s[0]
339        FMLA    v22.4s, v12.4s,  v7.s[0]
340        LDP     q18,  q19, [x5], 32
341        FMLA    v24.4s, v12.4s,  v8.s[0]
342        FMLA    v26.4s, v12.4s,  v9.s[0]
343        FMLA    v28.4s, v12.4s, v10.s[0]
344        FMLA    v30.4s, v12.4s, v11.s[0]
345        FMLA    v21.4s, v13.4s,  v6.s[0]
346        FMLA    v23.4s, v13.4s,  v7.s[0]
347        FMLA    v25.4s, v13.4s,  v8.s[0]
348        FMLA    v27.4s, v13.4s,  v9.s[0]
349        FMLA    v29.4s, v13.4s, v10.s[0]
350        FMLA    v31.4s, v13.4s, v11.s[0]
351
352        FMLA    v20.4s, v14.4s,  v6.s[1]
353        FMLA    v22.4s, v14.4s,  v7.s[1]
354        FMLA    v24.4s, v14.4s,  v8.s[1]
355        FMLA    v26.4s, v14.4s,  v9.s[1]
356        FMLA    v28.4s, v14.4s, v10.s[1]
357        FMLA    v30.4s, v14.4s, v11.s[1]
358        FMLA    v21.4s, v15.4s,  v6.s[1]
359        FMLA    v23.4s, v15.4s,  v7.s[1]
360        FMLA    v25.4s, v15.4s,  v8.s[1]
361        FMLA    v27.4s, v15.4s,  v9.s[1]
362        FMLA    v29.4s, v15.4s, v10.s[1]
363        FMLA    v31.4s, v15.4s, v11.s[1]
364
365        # Last part of epilogue has loads removed.
366
367        FMLA    v20.4s, v16.4s,  v6.s[2]
368        FMLA    v22.4s, v16.4s,  v7.s[2]
369        FMLA    v24.4s, v16.4s,  v8.s[2]
370        FMLA    v26.4s, v16.4s,  v9.s[2]
371        FMLA    v28.4s, v16.4s, v10.s[2]
372        FMLA    v30.4s, v16.4s, v11.s[2]
373        FMLA    v21.4s, v17.4s,  v6.s[2]
374        FMLA    v23.4s, v17.4s,  v7.s[2]
375        FMLA    v25.4s, v17.4s,  v8.s[2]
376        FMLA    v27.4s, v17.4s,  v9.s[2]
377        FMLA    v29.4s, v17.4s, v10.s[2]
378        FMLA    v31.4s, v17.4s, v11.s[2]
379
380        FMLA    v20.4s, v18.4s,  v6.s[3]
381        FMLA    v22.4s, v18.4s,  v7.s[3]
382        FMLA    v24.4s, v18.4s,  v8.s[3]
383        FMLA    v26.4s, v18.4s,  v9.s[3]
384        FMLA    v28.4s, v18.4s, v10.s[3]
385        FMLA    v30.4s, v18.4s, v11.s[3]
386        FMLA    v21.4s, v19.4s,  v6.s[3]
387        FMLA    v23.4s, v19.4s,  v7.s[3]
388
389        # Load min/max values
        // The two FMLAs just above were the last readers of v6 and v7 as A
        // data, so those registers can now take the clamp bounds.  LD2R
        // reads two consecutive floats at [x8] and replicates the first
        // across v6 and the second across v7.
390        LD2R    {v6.4s, v7.4s}, [x8]
391
392        FMLA    v25.4s, v19.4s,  v8.s[3]
393        FMLA    v27.4s, v19.4s,  v9.s[3]
394        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // x0 has only ever been changed from kc by subtracting multiples of
        // 32, so its low 5 bits still equal those of kc.  The FMLAs between
        // TST and B.NE do not modify the flags.
395        TST     x0, 31
396        FMLA    v29.4s, v19.4s, v10.s[3]
397        FMLA    v31.4s, v19.4s, v11.s[3]
398        B.NE    4f
399
400        .p2align 3
401
402        # Clamp
        // Label 3 is reached by fall-through from the epilogue and by
        // branches from the remainder code (3b).  v6 and v7 hold the values
        // loaded by LD2R from params: FMAX against v6 applies the lower
        // bound, FMIN against v7 applies the upper bound.
4033:
404        FMAX    v20.4s, v20.4s, v6.4s
405        # Load cn_stride
        // cn_stride was at [sp] on entry; the 64-byte save area pushed at
        // function entry puts it at [sp, 64].  x0 is free to hold it because
        // the K counter is no longer needed for this column block.
406        LDR     x0, [sp, 64]
407        FMAX    v21.4s, v21.4s, v6.4s
408        FMAX    v22.4s, v22.4s, v6.4s
409        FMAX    v23.4s, v23.4s, v6.4s
410        FMAX    v24.4s, v24.4s, v6.4s
411        FMAX    v25.4s, v25.4s, v6.4s
412        FMAX    v26.4s, v26.4s, v6.4s
413        FMAX    v27.4s, v27.4s, v6.4s
414        FMAX    v28.4s, v28.4s, v6.4s
415        FMAX    v29.4s, v29.4s, v6.4s
416        FMAX    v30.4s, v30.4s, v6.4s
417        FMAX    v31.4s, v31.4s, v6.4s
        // nc -= 8.  The flags set here are consumed by both "B.LO 7f" and
        // "B.HI 0b" below; none of the FMIN/STP/ADD/SUB/NOP instructions in
        // between modify them.
418        SUBS    x1, x1, 8
419        FMIN    v20.4s, v20.4s, v7.4s
420        FMIN    v21.4s, v21.4s, v7.4s
421        FMIN    v22.4s, v22.4s, v7.4s
422        FMIN    v23.4s, v23.4s, v7.4s
423        FMIN    v24.4s, v24.4s, v7.4s
424        FMIN    v25.4s, v25.4s, v7.4s
425        FMIN    v26.4s, v26.4s, v7.4s
426        FMIN    v27.4s, v27.4s, v7.4s
427        FMIN    v28.4s, v28.4s, v7.4s
428        FMIN    v29.4s, v29.4s, v7.4s
429        FMIN    v30.4s, v30.4s, v7.4s
430        FMIN    v31.4s, v31.4s, v7.4s
431
432        # Store full 6 x 8
        // Fewer than 8 columns were left: use the partial store at label 7.
433        B.LO    7f
434
        // For each row: store 8 floats, advance the C pointer by cn_stride,
        // and rewind the A pointer by kc so the next column block re-reads
        // the same A rows from their start.
435        STP     q20, q21,  [x6]
436        ADD     x6,  x6, x0
437        SUB     x3,  x3, x2             // a0 -= kc
438        STP     q22, q23, [x16]
439        ADD     x16, x16, x0
440        SUB     x9,  x9, x2             // a1 -= kc
441        STP     q24, q25, [x17]
442        ADD     x17, x17, x0
443        SUB     x10, x10, x2            // a2 -= kc
444        STP     q26, q27, [x14]
445        ADD     x14, x14, x0
446        SUB     x11, x11, x2            // a3 -= kc
447        STP     q28, q29, [x13]
448        ADD     x13, x13, x0
449        SUB     x12, x12, x2            // a4 -= kc
450        STP     q30, q31,  [x7]
451        ADD     x7, x7, x0
452        SUB     x4,  x4, x2             // a5 -= kc
453
454        NOP
        // HI on the SUBS above means nc was greater than 8, i.e. columns
        // remain after this block: process the next 8-column block.
455        B.HI    0b
456
457        # Restore d8-d15 from stack
        // Mirrors the saves at function entry; the final post-indexed LDP
        // also releases the 64-byte save area.
458        LDP     d14, d15, [sp, 48]
459        LDP     d12, d13, [sp, 32]
460        LDP     d10, d11, [sp, 16]
461        LDP     d8,  d9, [sp], 64
462        RET
463
464        .p2align 3
        // Label 4: K remainder handling.  Entered either directly from the
        // start of a column block when kc < 32 bytes, or from the epilogue
        // when the low 5 bits of x0 are non-zero.  Bits 4, 3 and 2 of x0
        // select the 16-byte, 8-byte and 4-byte steps below.
4654:
465        # Load min/max values
        // Needed for the kc < 32 entry path, which has not loaded v6/v7 yet.
        // On the epilogue path this repeats the load already done there.
466        LD2R    {v6.4s, v7.4s}, [x8]
467
468        # Is there a remainder?- 4 floats of A (16 bytes)
469        TBZ     x0, 4, 5f
470
471        # Remainder- 4 floats of A (16 bytes)
472        # Load A
473        LDR     q0,  [x3], 16
474        LDR     q1,  [x9], 16
475        LDR     q2, [x10], 16
476        LDR     q3, [x11], 16
477        LDR     q4, [x12], 16
478        LDR     q5,  [x4], 16
479        # Load B
        // Four pairs of B vectors, one pair per lane of the A vectors.
480        LDP     q12,  q13, [x5], 32
481        LDP     q14,  q15, [x5], 32
482        LDP     q16,  q17, [x5], 32
483        LDP     q18,  q19, [x5], 32
484
485        FMLA    v20.4s, v12.4s,  v0.s[0]
486        FMLA    v22.4s, v12.4s,  v1.s[0]
487        FMLA    v24.4s, v12.4s,  v2.s[0]
488        FMLA    v26.4s, v12.4s,  v3.s[0]
489        FMLA    v28.4s, v12.4s,  v4.s[0]
490        FMLA    v30.4s, v12.4s,  v5.s[0]
491        FMLA    v21.4s, v13.4s,  v0.s[0]
492        FMLA    v23.4s, v13.4s,  v1.s[0]
493        FMLA    v25.4s, v13.4s,  v2.s[0]
494        FMLA    v27.4s, v13.4s,  v3.s[0]
495        FMLA    v29.4s, v13.4s,  v4.s[0]
496        FMLA    v31.4s, v13.4s,  v5.s[0]
497
498        FMLA    v20.4s, v14.4s,  v0.s[1]
499        FMLA    v22.4s, v14.4s,  v1.s[1]
500        FMLA    v24.4s, v14.4s,  v2.s[1]
501        FMLA    v26.4s, v14.4s,  v3.s[1]
502        FMLA    v28.4s, v14.4s,  v4.s[1]
503        FMLA    v30.4s, v14.4s,  v5.s[1]
504        FMLA    v21.4s, v15.4s,  v0.s[1]
505        FMLA    v23.4s, v15.4s,  v1.s[1]
506        FMLA    v25.4s, v15.4s,  v2.s[1]
507        FMLA    v27.4s, v15.4s,  v3.s[1]
508        FMLA    v29.4s, v15.4s,  v4.s[1]
509        FMLA    v31.4s, v15.4s,  v5.s[1]
510
511        FMLA    v20.4s, v16.4s,  v0.s[2]
512        FMLA    v22.4s, v16.4s,  v1.s[2]
513        FMLA    v24.4s, v16.4s,  v2.s[2]
514        FMLA    v26.4s, v16.4s,  v3.s[2]
515        FMLA    v28.4s, v16.4s,  v4.s[2]
516        FMLA    v30.4s, v16.4s,  v5.s[2]
517        FMLA    v21.4s, v17.4s,  v0.s[2]
518        FMLA    v23.4s, v17.4s,  v1.s[2]
519        FMLA    v25.4s, v17.4s,  v2.s[2]
520        FMLA    v27.4s, v17.4s,  v3.s[2]
521        FMLA    v29.4s, v17.4s,  v4.s[2]
522        FMLA    v31.4s, v17.4s,  v5.s[2]
523
524        FMLA    v20.4s, v18.4s,  v0.s[3]
525        FMLA    v22.4s, v18.4s,  v1.s[3]
526        FMLA    v24.4s, v18.4s,  v2.s[3]
527        FMLA    v26.4s, v18.4s,  v3.s[3]
528        FMLA    v28.4s, v18.4s,  v4.s[3]
529        FMLA    v30.4s, v18.4s,  v5.s[3]
530        FMLA    v21.4s, v19.4s,  v0.s[3]
531        FMLA    v23.4s, v19.4s,  v1.s[3]
532        FMLA    v25.4s, v19.4s,  v2.s[3]
533        FMLA    v27.4s, v19.4s,  v3.s[3]
534        FMLA    v29.4s, v19.4s,  v4.s[3]
535        FMLA    v31.4s, v19.4s,  v5.s[3]
536
537        # Is there a remainder?- 2 floats of A (8 bytes)
5395:
539        TBZ     x0, 3, 6f
540
541        # Remainder- 2 floats of A (8 bytes)
542        # Load A
543        LDR     d0,  [x3], 8
544        LDR     d1,  [x9], 8
545        LDR     d2, [x10], 8
546        LDR     d3, [x11], 8
547        LDR     d4, [x12], 8
548        LDR     d5,  [x4], 8
549        # Load B
550        LDP     q12,  q13, [x5], 32
551        LDP     q14,  q15, [x5], 32
552
553        FMLA    v20.4s, v12.4s,  v0.s[0]
554        FMLA    v22.4s, v12.4s,  v1.s[0]
555        FMLA    v24.4s, v12.4s,  v2.s[0]
556        FMLA    v26.4s, v12.4s,  v3.s[0]
557        FMLA    v28.4s, v12.4s,  v4.s[0]
558        FMLA    v30.4s, v12.4s,  v5.s[0]
559        FMLA    v21.4s, v13.4s,  v0.s[0]
560        FMLA    v23.4s, v13.4s,  v1.s[0]
561        FMLA    v25.4s, v13.4s,  v2.s[0]
562        FMLA    v27.4s, v13.4s,  v3.s[0]
563        FMLA    v29.4s, v13.4s,  v4.s[0]
564        FMLA    v31.4s, v13.4s,  v5.s[0]
565
566        FMLA    v20.4s, v14.4s,  v0.s[1]
567        FMLA    v22.4s, v14.4s,  v1.s[1]
568        FMLA    v24.4s, v14.4s,  v2.s[1]
569        FMLA    v26.4s, v14.4s,  v3.s[1]
570        FMLA    v28.4s, v14.4s,  v4.s[1]
571        FMLA    v30.4s, v14.4s,  v5.s[1]
572        FMLA    v21.4s, v15.4s,  v0.s[1]
573        FMLA    v23.4s, v15.4s,  v1.s[1]
574        FMLA    v25.4s, v15.4s,  v2.s[1]
575        FMLA    v27.4s, v15.4s,  v3.s[1]
576        FMLA    v29.4s, v15.4s,  v4.s[1]
577        FMLA    v31.4s, v15.4s,  v5.s[1]
578
579        # Is there a remainder?- 1 float of A (4 bytes)
        // If bit 2 is clear the accumulation is complete: go clamp and store.
5816:
581        TBZ     x0, 2, 3b
582
583        # Remainder- 1 float of A (4 bytes)
584        # Load A
585        LDR     s0,  [x3], 4
586        LDR     s1,  [x9], 4
587        LDR     s2, [x10], 4
588        LDR     s3, [x11], 4
589        LDR     s4, [x12], 4
590        LDR     s5,  [x4], 4
591        # Load B
592        LDP     q12,  q13, [x5], 32
593
594        FMLA    v20.4s, v12.4s,  v0.s[0]
595        FMLA    v22.4s, v12.4s,  v1.s[0]
596        FMLA    v24.4s, v12.4s,  v2.s[0]
597        FMLA    v26.4s, v12.4s,  v3.s[0]
598        FMLA    v28.4s, v12.4s,  v4.s[0]
599        FMLA    v30.4s, v12.4s,  v5.s[0]
600        FMLA    v21.4s, v13.4s,  v0.s[0]
601        FMLA    v23.4s, v13.4s,  v1.s[0]
602        FMLA    v25.4s, v13.4s,  v2.s[0]
603        FMLA    v27.4s, v13.4s,  v3.s[0]
604        FMLA    v29.4s, v13.4s,  v4.s[0]
605        FMLA    v31.4s, v13.4s,  v5.s[0]
        // Accumulation complete: go clamp and store.
606        B       3b
608
609        .p2align 3
610
611        # Store odd width
        // Reached from "B.LO 7f" when fewer than 8 columns remained.  x1 has
        // already had 8 subtracted, which leaves its low 3 bits unchanged, so
        // bits 2, 1 and 0 still select the 4-, 2- and 1-column stores below.
6127:
        // 4 columns: store the low accumulator of each row, advance the C
        // pointer past it, and move the high accumulator into the low one so
        // the narrower stores that follow use the next columns.
612        TBZ     x1, 2, 8f
613        STR     q20,  [x6], 16
614        MOV     v20.16b, v21.16b
615        STR     q22, [x16], 16
616        MOV     v22.16b, v23.16b
617        STR     q24, [x17], 16
618        MOV     v24.16b, v25.16b
619        STR     q26, [x14], 16
620        MOV     v26.16b, v27.16b
621        STR     q28, [x13], 16
622        MOV     v28.16b, v29.16b
623        STR     q30,  [x7], 16
624        MOV     v30.16b, v31.16b
6268:
        // 2 columns: store the low 64 bits of each row, then DUP brings the
        // upper 64 bits down for the single-column store.
626        TBZ     x1, 1, 9f
627        STR     d20,  [x6], 8
628        STR     d22, [x16], 8
629        DUP     d20, v20.d[1]
630        DUP     d22, v22.d[1]
631        STR     d24, [x17], 8
632        STR     d26, [x14], 8
633        DUP     d24, v24.d[1]
634        DUP     d26, v26.d[1]
635        STR     d28, [x13], 8
636        STR     d30,  [x7], 8
637        DUP     d28, v28.d[1]
638        DUP     d30, v30.d[1]
639
6419:
        // 1 column: store the lowest float of each row.  No pointer update is
        // needed because the function returns right after.
641        TBZ     x1, 0, 10f
642        STR     s20,  [x6]
643        STR     s22, [x16]
644        STR     s24, [x17]
645        STR     s26, [x14]
646        STR     s28, [x13]
647        STR     s30,  [x7]
64910:
650        # Restore d8-d15 from stack
        // Same restore sequence as the full-width exit path; the last LDP is
        // post-indexed and releases the 64-byte save area.
651        LDP     d14, d15, [sp, 48]
652        LDP     d12, d13, [sp, 32]
653        LDP     d10, d11, [sp, 16]
654        LDP     d8,  d9, [sp], 64
655        RET
656
657END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73
658
659#ifdef __ELF__
660.section ".note.GNU-stack","",%progbits
661#endif
662