xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     const uint8_t* a,                  x3
17#     size_t a_stride,                   x4
18#     const void* w,                     x5
19#     uint8_t* c,                        x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> (x0)
22#     const xnn_f32_minmax_params params [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# Vector register usage
43# A0   v0  v6
44# A1   v1  v7
45# A2   v2  v8
46# A3   v3  v9
47# A4   v4 v10
48# A5   v5 v11
49# B   v12 v13 v14 v15
50# B   v16 v17 v18 v19
51# C   v20 v21
52# C   v22 v23
53# C   v24 v25
54# C   v26 v27
55# C   v28 v29
56# C   v30 v31
57# Clamp v6 v7
58
59BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75
        // Overview (derived from the code below):
        //  - Each pass of the outer loop (label 0) produces one tile of up to
        //    6 rows x 8 columns of C in accumulators v20-v31 (two 4-lane
        //    vectors per row).
        //  - The accumulators start from the 8 floats at w (the bias); every
        //    FMLA then adds a 4-float B vector scaled by one lane of an A
        //    vector.
        //  - kc is consumed as a byte count: 32 bytes (8 floats) of each A row
        //    per main-loop or epilogue pass, then 16/8/4-byte remainders.
        //  - Results are clamped with FMAX against v6 and FMIN against v7
        //    (both loaded from params) and stored through c0-c5.
        //  - x5 (w) only ever advances; the A pointers are rewound by kc after
        //    each full-width tile.
60
61        # Clamp A and C pointers / Save d8-d15 on stack
        // Rows at index >= mr reuse the previous row pointers (CSEL), so the
        // kernel always runs all 6 rows. The d8-d15 saves are interleaved with
        // the pointer setup; STP and ADD leave the flags of each CMP intact,
        // so one CMP feeds both the LO (<) selects and the later LS (<=) ones.
62        CMP     x0, 2                   // if mr < 2
63        STP     d8,  d9, [sp, -64]!  // reserve 64 bytes for d8-d15
64        ADD     x9, x3, x4              // a1 = a0 + a_stride
65        ADD     x16, x6, x7             // c1 = c0 + cm_stride
66        CSEL    x9, x3, x9, LO          //   a1 = a0
67        CSEL    x16, x6, x16, LO        //   c1 = c0
68
69        STP     d10, d11, [sp, 16]
70        ADD     x10, x9, x4             // a2 = a1 + a_stride
71        ADD     x17, x16, x7            // c2 = c1 + cm_stride
72                                        // if mr <= 2
73        CSEL    x10, x9, x10, LS        //   a2 = a1
74        CSEL    x17, x16, x17, LS       //   c2 = c1
75
76        STP     d12, d13, [sp, 32]
77        CMP     x0, 4                   // if mr < 4
78        ADD     x11, x10, x4            // a3 = a2 + a_stride
79        ADD     x14, x17, x7            // c3 = c2 + cm_stride
80        CSEL    x11, x10, x11, LO       //   a3 = a2
81        CSEL    x14, x17, x14, LO       //   c3 = c2
82
83        STP     d14, d15, [sp, 48]
84        ADD     x12, x11, x4            // a4 = a3 + a_stride
85        ADD     x13, x14, x7            // c4 = c3 + cm_stride
86                                        // if mr <= 4
87        CSEL    x12, x11, x12, LS       //   a4 = a3
88        CSEL    x13, x14, x13, LS       //   c4 = c3
89
90        # Load params pointer
91        LDR     x8, [sp, 72]  // caller [sp + 8], shifted by the 64-byte save area
92
93        CMP     x0, 6                   // if mr < 6
94        ADD     x4, x12, x4             // a5 = a4 + a_stride
95        ADD     x7, x13, x7             // c5 = c4 + cm_stride
96        CSEL    x4, x12, x4, LO         //   a5 = a4
97        CSEL    x7, x13, x7, LO         //   c5 = c4
98
        // Outer loop over nc: one iteration per 8-column tile. From here on
        // x0 is reused as the remaining-k byte counter (mr is no longer needed).
990:
100        # Load initial bias from w into accumulators
101        LDP     q20, q21, [x5], 32
102        SUBS    x0, x2, 32              // k = kc - 32
103        MOV     v22.16b, v20.16b
104        MOV     v23.16b, v21.16b
105        MOV     v24.16b, v20.16b
106        MOV     v25.16b, v21.16b
107        MOV     v26.16b, v20.16b
108        MOV     v27.16b, v21.16b
109        MOV     v28.16b, v20.16b
110        MOV     v29.16b, v21.16b
111        MOV     v30.16b, v20.16b
112        MOV     v31.16b, v21.16b
113        B.LO    4f  // kc < 32 bytes: remainder path only (the MOVs keep the SUBS flags)
114
115        # Prologue - loads for main loop of 96 FMA
116        LDR     q0,  [x3], 16
117        LDP     q12,  q13, [x5], 32     // Fetch 3 B (4th deferred)
118        LDR     q1,  [x9], 16
119        LDR     q2, [x10], 16
120        LDR     q3, [x11], 16
121        LDR     q4, [x12], 16
122        LDR     q5,  [x4], 16
123        LDP     q14,  q15, [x5], 32
124        LDP     q16,  q17, [x5], 32
125
126        # Is there at least 8 floats (32 bytes) for main loop?
127        SUBS    x0, x0, 32  // k -= 32
128        B.LO    2f  // kc < 64 bytes in total: run the epilogue only
129
130        # Main loop - 8 floats of A (32 bytes)
131        # 96 FMA + 6 LDP A + 8 LDP B
132        # 64 float weights = 256 bytes.  4 cache lines.
        // NOTE: as written below, each iteration fetches A with 12 LDR q
        // instructions (v6-v11 for the second group, then v0-v5 for the next
        // iteration) rather than LDP; B is fetched with 8 LDP.
1331:
134        # First group of 4 A.  48 FMA.
135        FMLA    v20.4s, v12.4s,  v0.s[0]
136        LDP     q18,  q19, [x5], 32      // Load last B
137        FMLA    v22.4s, v12.4s,  v1.s[0]
138        FMLA    v24.4s, v12.4s,  v2.s[0]
139        FMLA    v26.4s, v12.4s,  v3.s[0]
140        FMLA    v28.4s, v12.4s,  v4.s[0]
141        FMLA    v30.4s, v12.4s,  v5.s[0]
142        FMLA    v21.4s, v13.4s,  v0.s[0]
143        FMLA    v23.4s, v13.4s,  v1.s[0]
144        FMLA    v25.4s, v13.4s,  v2.s[0]
145        FMLA    v27.4s, v13.4s,  v3.s[0]
146        FMLA    v29.4s, v13.4s,  v4.s[0]
147        FMLA    v31.4s, v13.4s,  v5.s[0]
148        FMLA    v20.4s, v14.4s,  v0.s[1]
149        FMLA    v22.4s, v14.4s,  v1.s[1]
150        FMLA    v24.4s, v14.4s,  v2.s[1]
151        FMLA    v26.4s, v14.4s,  v3.s[1]
152        FMLA    v28.4s, v14.4s,  v4.s[1]
153        FMLA    v30.4s, v14.4s,  v5.s[1]
154        FMLA    v21.4s, v15.4s,  v0.s[1]
155        FMLA    v23.4s, v15.4s,  v1.s[1]
156        FMLA    v25.4s, v15.4s,  v2.s[1]
157        LDR     q6,  [x3], 16            // Load next 6 A
158        FMLA    v27.4s, v15.4s,  v3.s[1]
159        FMLA    v29.4s, v15.4s,  v4.s[1]
160        FMLA    v31.4s, v15.4s,  v5.s[1]
161        LDR     q7,  [x9], 16
162
163        FMLA    v20.4s, v16.4s,  v0.s[2]
164        FMLA    v22.4s, v16.4s,  v1.s[2]
165        FMLA    v24.4s, v16.4s,  v2.s[2]
166        LDR     q8, [x10], 16
167        FMLA    v26.4s, v16.4s,  v3.s[2]
168        FMLA    v28.4s, v16.4s,  v4.s[2]
169        FMLA    v30.4s, v16.4s,  v5.s[2]
170        LDR     q9, [x11], 16
171        FMLA    v21.4s, v17.4s,  v0.s[2]
172        FMLA    v23.4s, v17.4s,  v1.s[2]
173        FMLA    v25.4s, v17.4s,  v2.s[2]
174        LDR     q10, [x12], 16
175        FMLA    v27.4s, v17.4s,  v3.s[2]
176        FMLA    v29.4s, v17.4s,  v4.s[2]
177        FMLA    v31.4s, v17.4s,  v5.s[2]
178        LDR     q11,  [x4], 16
179
180        FMLA    v20.4s, v18.4s,  v0.s[3]
181        FMLA    v22.4s, v18.4s,  v1.s[3]
182        FMLA    v24.4s, v18.4s,  v2.s[3]
183        LDP     q12,  q13, [x5], 32       // Load 4 B
184        FMLA    v26.4s, v18.4s,  v3.s[3]
185        FMLA    v28.4s, v18.4s,  v4.s[3]
186        FMLA    v30.4s, v18.4s,  v5.s[3]
187        LDP     q14,  q15, [x5], 32
188        FMLA    v21.4s, v19.4s,  v0.s[3]
189        FMLA    v23.4s, v19.4s,  v1.s[3]
190        FMLA    v25.4s, v19.4s,  v2.s[3]
191        LDP     q16,  q17, [x5], 32
192        FMLA    v27.4s, v19.4s,  v3.s[3]
193        FMLA    v29.4s, v19.4s,  v4.s[3]
194        FMLA    v31.4s, v19.4s,  v5.s[3]
195        LDP     q18,  q19, [x5], 32
196
197        # Second group of 4 A.  48 FMA.
198        FMLA    v20.4s, v12.4s,  v6.s[0]
199        FMLA    v22.4s, v12.4s,  v7.s[0]
200        FMLA    v24.4s, v12.4s,  v8.s[0]
201        LDR     q0,  [x3], 16           // Load next 6 A
202        FMLA    v26.4s, v12.4s,  v9.s[0]
203        FMLA    v28.4s, v12.4s, v10.s[0]
204        FMLA    v30.4s, v12.4s, v11.s[0]
205        LDR     q1,  [x9], 16
206        FMLA    v21.4s, v13.4s,  v6.s[0]
207        FMLA    v23.4s, v13.4s,  v7.s[0]
208        FMLA    v25.4s, v13.4s,  v8.s[0]
209        LDR     q2, [x10], 16
210        FMLA    v27.4s, v13.4s,  v9.s[0]
211        FMLA    v29.4s, v13.4s, v10.s[0]
212        FMLA    v31.4s, v13.4s, v11.s[0]
213        LDR     q3, [x11], 16
214
215        FMLA    v20.4s, v14.4s,  v6.s[1]
216        FMLA    v22.4s, v14.4s,  v7.s[1]
217        FMLA    v24.4s, v14.4s,  v8.s[1]
218        LDR     q4, [x12], 16
219        FMLA    v26.4s, v14.4s,  v9.s[1]
220        FMLA    v28.4s, v14.4s, v10.s[1]
221        FMLA    v30.4s, v14.4s, v11.s[1]
222        LDR     q5,  [x4], 16
223        FMLA    v21.4s, v15.4s,  v6.s[1]
224        FMLA    v23.4s, v15.4s,  v7.s[1]
225        FMLA    v25.4s, v15.4s,  v8.s[1]
226        LDP     q12,  q13, [x5], 32       // Load next 3 B (not last)
227        FMLA    v27.4s, v15.4s,  v9.s[1]
228        FMLA    v29.4s, v15.4s, v10.s[1]
229        FMLA    v31.4s, v15.4s, v11.s[1]
230        LDP     q14,  q15, [x5], 32
231
232        FMLA    v20.4s, v16.4s,  v6.s[2]
233        FMLA    v22.4s, v16.4s,  v7.s[2]
234        FMLA    v24.4s, v16.4s,  v8.s[2]
235        FMLA    v26.4s, v16.4s,  v9.s[2]
236        FMLA    v28.4s, v16.4s, v10.s[2]
237        FMLA    v30.4s, v16.4s, v11.s[2]
238        FMLA    v21.4s, v17.4s,  v6.s[2]
239        FMLA    v23.4s, v17.4s,  v7.s[2]
240        FMLA    v25.4s, v17.4s,  v8.s[2]
241        FMLA    v27.4s, v17.4s,  v9.s[2]
242        FMLA    v29.4s, v17.4s, v10.s[2]
243        FMLA    v31.4s, v17.4s, v11.s[2]
244
245        FMLA    v20.4s, v18.4s,  v6.s[3]
246        FMLA    v22.4s, v18.4s,  v7.s[3]
247        LDP     q16,  q17, [x5], 32
248        FMLA    v24.4s, v18.4s,  v8.s[3]
249        FMLA    v26.4s, v18.4s,  v9.s[3]
250        FMLA    v28.4s, v18.4s, v10.s[3]
251        FMLA    v30.4s, v18.4s, v11.s[3]
252        SUBS    x0, x0, 32  // k -= 32; the FMLAs that follow do not touch the flags
253        FMLA    v21.4s, v19.4s,  v6.s[3]
254        FMLA    v23.4s, v19.4s,  v7.s[3]
255        FMLA    v25.4s, v19.4s,  v8.s[3]
256        FMLA    v27.4s, v19.4s,  v9.s[3]
257        FMLA    v29.4s, v19.4s, v10.s[3]
258        FMLA    v31.4s, v19.4s, v11.s[3]
259        B.HS    1b  // repeat while the subtraction above did not borrow
260
261        # Epilogue - 8 floats of A (32 bytes)
262        # 96 FMA + 6 LDP A + 8 LDP B
263        # First block same as main loop.  Second block has no preloads.
        // NOTE: as written below, this pass issues 6 LDR q of A (v6-v11) and
        // 5 LDP of B, because nothing is preloaded for a following iteration.
2642:
265        # First group of 4 A.  48 FMA.
266        FMLA    v20.4s, v12.4s,  v0.s[0]
267        LDP     q18,  q19, [x5], 32      // Load last B
268        FMLA    v22.4s, v12.4s,  v1.s[0]
269        FMLA    v24.4s, v12.4s,  v2.s[0]
270        FMLA    v26.4s, v12.4s,  v3.s[0]
271        FMLA    v28.4s, v12.4s,  v4.s[0]
272        FMLA    v30.4s, v12.4s,  v5.s[0]
273        FMLA    v21.4s, v13.4s,  v0.s[0]
274        FMLA    v23.4s, v13.4s,  v1.s[0]
275        FMLA    v25.4s, v13.4s,  v2.s[0]
276        FMLA    v27.4s, v13.4s,  v3.s[0]
277        FMLA    v29.4s, v13.4s,  v4.s[0]
278        FMLA    v31.4s, v13.4s,  v5.s[0]
279        FMLA    v20.4s, v14.4s,  v0.s[1]
280        FMLA    v22.4s, v14.4s,  v1.s[1]
281        FMLA    v24.4s, v14.4s,  v2.s[1]
282        FMLA    v26.4s, v14.4s,  v3.s[1]
283        FMLA    v28.4s, v14.4s,  v4.s[1]
284        FMLA    v30.4s, v14.4s,  v5.s[1]
285        FMLA    v21.4s, v15.4s,  v0.s[1]
286        FMLA    v23.4s, v15.4s,  v1.s[1]
287        FMLA    v25.4s, v15.4s,  v2.s[1]
288        LDR     q6,  [x3], 16            // Load next 6 A
289        FMLA    v27.4s, v15.4s,  v3.s[1]
290        FMLA    v29.4s, v15.4s,  v4.s[1]
291        FMLA    v31.4s, v15.4s,  v5.s[1]
292        LDR     q7,  [x9], 16
293
294        FMLA    v20.4s, v16.4s,  v0.s[2]
295        FMLA    v22.4s, v16.4s,  v1.s[2]
296        FMLA    v24.4s, v16.4s,  v2.s[2]
297        LDR     q8, [x10], 16
298        FMLA    v26.4s, v16.4s,  v3.s[2]
299        FMLA    v28.4s, v16.4s,  v4.s[2]
300        FMLA    v30.4s, v16.4s,  v5.s[2]
301        LDR     q9, [x11], 16
302        FMLA    v21.4s, v17.4s,  v0.s[2]
303        FMLA    v23.4s, v17.4s,  v1.s[2]
304        FMLA    v25.4s, v17.4s,  v2.s[2]
305        LDR     q10, [x12], 16
306        FMLA    v27.4s, v17.4s,  v3.s[2]
307        FMLA    v29.4s, v17.4s,  v4.s[2]
308        FMLA    v31.4s, v17.4s,  v5.s[2]
309        LDR     q11,  [x4], 16
310
311        FMLA    v20.4s, v18.4s,  v0.s[3]
312        FMLA    v22.4s, v18.4s,  v1.s[3]
313        FMLA    v24.4s, v18.4s,  v2.s[3]
314        LDP     q12,  q13, [x5], 32       // Load 4 B
315        FMLA    v26.4s, v18.4s,  v3.s[3]
316        FMLA    v28.4s, v18.4s,  v4.s[3]
317        FMLA    v30.4s, v18.4s,  v5.s[3]
318        LDP     q14,  q15, [x5], 32
319        FMLA    v21.4s, v19.4s,  v0.s[3]
320        FMLA    v23.4s, v19.4s,  v1.s[3]
321        FMLA    v25.4s, v19.4s,  v2.s[3]
322        LDP     q16,  q17, [x5], 32
323        FMLA    v27.4s, v19.4s,  v3.s[3]
324        FMLA    v29.4s, v19.4s,  v4.s[3]
325        FMLA    v31.4s, v19.4s,  v5.s[3]
326        LDP     q18,  q19, [x5], 32
327
328        # Second group of 4 A.  48 FMA.
329        FMLA    v20.4s, v12.4s,  v6.s[0]
330        FMLA    v22.4s, v12.4s,  v7.s[0]
331        FMLA    v24.4s, v12.4s,  v8.s[0]
332        FMLA    v26.4s, v12.4s,  v9.s[0]
333        FMLA    v28.4s, v12.4s, v10.s[0]
334        FMLA    v30.4s, v12.4s, v11.s[0]
335        FMLA    v21.4s, v13.4s,  v6.s[0]
336        FMLA    v23.4s, v13.4s,  v7.s[0]
337        FMLA    v25.4s, v13.4s,  v8.s[0]
338        FMLA    v27.4s, v13.4s,  v9.s[0]
339        FMLA    v29.4s, v13.4s, v10.s[0]
340        FMLA    v31.4s, v13.4s, v11.s[0]
341
342        FMLA    v20.4s, v14.4s,  v6.s[1]
343        FMLA    v22.4s, v14.4s,  v7.s[1]
344        FMLA    v24.4s, v14.4s,  v8.s[1]
345        FMLA    v26.4s, v14.4s,  v9.s[1]
346        FMLA    v28.4s, v14.4s, v10.s[1]
347        FMLA    v30.4s, v14.4s, v11.s[1]
348        FMLA    v21.4s, v15.4s,  v6.s[1]
349        FMLA    v23.4s, v15.4s,  v7.s[1]
350        FMLA    v25.4s, v15.4s,  v8.s[1]
351        FMLA    v27.4s, v15.4s,  v9.s[1]
352        FMLA    v29.4s, v15.4s, v10.s[1]
353        FMLA    v31.4s, v15.4s, v11.s[1]
354
355        FMLA    v20.4s, v16.4s,  v6.s[2]
356        FMLA    v22.4s, v16.4s,  v7.s[2]
357        FMLA    v24.4s, v16.4s,  v8.s[2]
358        FMLA    v26.4s, v16.4s,  v9.s[2]
359        FMLA    v28.4s, v16.4s, v10.s[2]
360        FMLA    v30.4s, v16.4s, v11.s[2]
361        FMLA    v21.4s, v17.4s,  v6.s[2]
362        FMLA    v23.4s, v17.4s,  v7.s[2]
363        FMLA    v25.4s, v17.4s,  v8.s[2]
364        FMLA    v27.4s, v17.4s,  v9.s[2]
365        FMLA    v29.4s, v17.4s, v10.s[2]
366        FMLA    v31.4s, v17.4s, v11.s[2]
367
368        FMLA    v20.4s, v18.4s,  v6.s[3]
369        FMLA    v22.4s, v18.4s,  v7.s[3]
370        FMLA    v24.4s, v18.4s,  v8.s[3]
371        FMLA    v26.4s, v18.4s,  v9.s[3]
372        FMLA    v28.4s, v18.4s, v10.s[3]
373        FMLA    v30.4s, v18.4s, v11.s[3]
374
375        # Is there a remainder?- 4 floats of A (16 bytes) or less
376        TST     x0, 31  // only multiples of 32 were subtracted, so the low 5 bits still equal kc mod 32
377
378        FMLA    v21.4s, v19.4s,  v6.s[3]
379        FMLA    v23.4s, v19.4s,  v7.s[3]
380        FMLA    v25.4s, v19.4s,  v8.s[3]
        // v6 and v7 are not read as A operands after this point, so they can
        // be overwritten with the clamp values while the last FMLAs issue.
381        LD2R    {v6.4s, v7.4s}, [x8]      // Load min/max values
382        FMLA    v27.4s, v19.4s,  v9.s[3]
383        FMLA    v29.4s, v19.4s, v10.s[3]
384        FMLA    v31.4s, v19.4s, v11.s[3]
385        B.NE    4f  // kc mod 32 != 0: handle the 16/8/4-byte remainders (flags from TST)
386
387        # Clamp
3883:
389        FMAX    v20.4s, v20.4s, v6.4s
390        FMAX    v21.4s, v21.4s, v6.4s
391        FMAX    v22.4s, v22.4s, v6.4s
392        FMAX    v23.4s, v23.4s, v6.4s
393        FMAX    v24.4s, v24.4s, v6.4s
394        LDR     x0, [sp, 64]            // Load cn_stride
395        FMAX    v25.4s, v25.4s, v6.4s
396        FMAX    v26.4s, v26.4s, v6.4s
397        FMAX    v27.4s, v27.4s, v6.4s
398        FMAX    v28.4s, v28.4s, v6.4s
399        FMAX    v29.4s, v29.4s, v6.4s
400        FMAX    v30.4s, v30.4s, v6.4s
401        FMAX    v31.4s, v31.4s, v6.4s
402        SUBS    x1, x1, 8  // nc -= 8; these flags feed both B.LO 7f and B.HI 0b below
403        FMIN    v20.4s, v20.4s, v7.4s
404        FMIN    v21.4s, v21.4s, v7.4s
405        FMIN    v22.4s, v22.4s, v7.4s
406        FMIN    v23.4s, v23.4s, v7.4s
407        FMIN    v24.4s, v24.4s, v7.4s
408        FMIN    v25.4s, v25.4s, v7.4s
409        FMIN    v26.4s, v26.4s, v7.4s
410        FMIN    v27.4s, v27.4s, v7.4s
411        FMIN    v28.4s, v28.4s, v7.4s
412        FMIN    v29.4s, v29.4s, v7.4s
413        FMIN    v30.4s, v30.4s, v7.4s
414        FMIN    v31.4s, v31.4s, v7.4s
415
416        # Store full 6 x 8
417        B.LO    7f  // fewer than 8 columns were left: partial-width store
418
        // Each row: store 8 floats, advance the C pointer by cn_stride (x0),
        // and rewind the A pointer by kc so the next tile rereads the same A.
419        STP     q20, q21,  [x6]
420        ADD     x6,  x6, x0
421        SUB     x3,  x3, x2             // a0 -= kc
422        STP     q22, q23, [x16]
423        ADD     x16, x16, x0
424        SUB     x9,  x9, x2             // a1 -= kc
425        STP     q24, q25, [x17]
426        ADD     x17, x17, x0
427        SUB     x10, x10, x2            // a2 -= kc
428        STP     q26, q27, [x14]
429        ADD     x14, x14, x0
430        SUB     x11, x11, x2            // a3 -= kc
431        STP     q28, q29, [x13]
432        ADD     x13, x13, x0
433        SUB     x12, x12, x2            // a4 -= kc
434        STP     q30, q31,  [x7]
435        ADD     x7, x7, x0
436        SUB     x4,  x4, x2             // a5 -= kc
437
438        B.HI    0b  // columns remain (flags still from SUBS x1; STP/ADD/SUB leave them alone)
439
440        # Restore d8-d15 from stack
441        LDP     d14, d15, [sp, 48]
442        LDP     d12, d13, [sp, 32]
443        LDP     d10, d11, [sp, 16]
444        LDP     d8,  d9, [sp], 64  // also releases the 64-byte save area
445        RET
446
        // Remainder entry: reached either with kc < 32 (from the bias setup)
        // or with kc mod 32 != 0 after the epilogue. In both cases bits 4, 3
        // and 2 of x0 match those of kc and select 16-, 8- and 4-byte steps.
4474:
448        # Load min/max values
449        LD2R    {v6.4s, v7.4s}, [x8]  // required on the kc < 32 entry; a repeat load on the epilogue entry
450
451        # Is there a remainder?- 4 floats of A (16 bytes)
452        TBZ     x0, 4, 5f
453
454        # Remainder- 4 floats of A (16 bytes)
455        # Load A
456        LDR     q0,  [x3], 16
457        LDR     q1,  [x9], 16
458        LDR     q2, [x10], 16
459        LDR     q3, [x11], 16
460        LDR     q4, [x12], 16
461        LDR     q5,  [x4], 16
462        # Load B
463        LDP     q12,  q13, [x5], 32
464        LDP     q14,  q15, [x5], 32
465        LDP     q16,  q17, [x5], 32
466        LDP     q18,  q19, [x5], 32
467
468        FMLA    v20.4s, v12.4s,  v0.s[0]
469        FMLA    v22.4s, v12.4s,  v1.s[0]
470        FMLA    v24.4s, v12.4s,  v2.s[0]
471        FMLA    v26.4s, v12.4s,  v3.s[0]
472        FMLA    v28.4s, v12.4s,  v4.s[0]
473        FMLA    v30.4s, v12.4s,  v5.s[0]
474        FMLA    v21.4s, v13.4s,  v0.s[0]
475        FMLA    v23.4s, v13.4s,  v1.s[0]
476        FMLA    v25.4s, v13.4s,  v2.s[0]
477        FMLA    v27.4s, v13.4s,  v3.s[0]
478        FMLA    v29.4s, v13.4s,  v4.s[0]
479        FMLA    v31.4s, v13.4s,  v5.s[0]
480
481        FMLA    v20.4s, v14.4s,  v0.s[1]
482        FMLA    v22.4s, v14.4s,  v1.s[1]
483        FMLA    v24.4s, v14.4s,  v2.s[1]
484        FMLA    v26.4s, v14.4s,  v3.s[1]
485        FMLA    v28.4s, v14.4s,  v4.s[1]
486        FMLA    v30.4s, v14.4s,  v5.s[1]
487        FMLA    v21.4s, v15.4s,  v0.s[1]
488        FMLA    v23.4s, v15.4s,  v1.s[1]
489        FMLA    v25.4s, v15.4s,  v2.s[1]
490        FMLA    v27.4s, v15.4s,  v3.s[1]
491        FMLA    v29.4s, v15.4s,  v4.s[1]
492        FMLA    v31.4s, v15.4s,  v5.s[1]
493
494        FMLA    v20.4s, v16.4s,  v0.s[2]
495        FMLA    v22.4s, v16.4s,  v1.s[2]
496        FMLA    v24.4s, v16.4s,  v2.s[2]
497        FMLA    v26.4s, v16.4s,  v3.s[2]
498        FMLA    v28.4s, v16.4s,  v4.s[2]
499        FMLA    v30.4s, v16.4s,  v5.s[2]
500        FMLA    v21.4s, v17.4s,  v0.s[2]
501        FMLA    v23.4s, v17.4s,  v1.s[2]
502        FMLA    v25.4s, v17.4s,  v2.s[2]
503        FMLA    v27.4s, v17.4s,  v3.s[2]
504        FMLA    v29.4s, v17.4s,  v4.s[2]
505        FMLA    v31.4s, v17.4s,  v5.s[2]
506
507        FMLA    v20.4s, v18.4s,  v0.s[3]
508        FMLA    v22.4s, v18.4s,  v1.s[3]
509        FMLA    v24.4s, v18.4s,  v2.s[3]
510        FMLA    v26.4s, v18.4s,  v3.s[3]
511        FMLA    v28.4s, v18.4s,  v4.s[3]
512        FMLA    v30.4s, v18.4s,  v5.s[3]
513        FMLA    v21.4s, v19.4s,  v0.s[3]
514        FMLA    v23.4s, v19.4s,  v1.s[3]
515        FMLA    v25.4s, v19.4s,  v2.s[3]
516        FMLA    v27.4s, v19.4s,  v3.s[3]
517        FMLA    v29.4s, v19.4s,  v4.s[3]
518        FMLA    v31.4s, v19.4s,  v5.s[3]
519
520        # Is there a remainder?- 2 floats of A (8 bytes)
5215:
522        TBZ     x0, 3, 6f
523
524        # Remainder- 2 floats of A (8 bytes)
525        # Load A
526        LDR     d0,  [x3], 8
527        LDR     d1,  [x9], 8
528        LDR     d2, [x10], 8
529        LDR     d3, [x11], 8
530        LDR     d4, [x12], 8
531        LDR     d5,  [x4], 8
532        # Load B
533        LDP     q12,  q13, [x5], 32
534        LDP     q14,  q15, [x5], 32
535
536        FMLA    v20.4s, v12.4s,  v0.s[0]
537        FMLA    v22.4s, v12.4s,  v1.s[0]
538        FMLA    v24.4s, v12.4s,  v2.s[0]
539        FMLA    v26.4s, v12.4s,  v3.s[0]
540        FMLA    v28.4s, v12.4s,  v4.s[0]
541        FMLA    v30.4s, v12.4s,  v5.s[0]
542        FMLA    v21.4s, v13.4s,  v0.s[0]
543        FMLA    v23.4s, v13.4s,  v1.s[0]
544        FMLA    v25.4s, v13.4s,  v2.s[0]
545        FMLA    v27.4s, v13.4s,  v3.s[0]
546        FMLA    v29.4s, v13.4s,  v4.s[0]
547        FMLA    v31.4s, v13.4s,  v5.s[0]
548
549        FMLA    v20.4s, v14.4s,  v0.s[1]
550        FMLA    v22.4s, v14.4s,  v1.s[1]
551        FMLA    v24.4s, v14.4s,  v2.s[1]
552        FMLA    v26.4s, v14.4s,  v3.s[1]
553        FMLA    v28.4s, v14.4s,  v4.s[1]
554        FMLA    v30.4s, v14.4s,  v5.s[1]
555        FMLA    v21.4s, v15.4s,  v0.s[1]
556        FMLA    v23.4s, v15.4s,  v1.s[1]
557        FMLA    v25.4s, v15.4s,  v2.s[1]
558        FMLA    v27.4s, v15.4s,  v3.s[1]
559        FMLA    v29.4s, v15.4s,  v4.s[1]
560        FMLA    v31.4s, v15.4s,  v5.s[1]
561
562        # Is there a remainder?- 1 float of A (4 bytes)
5636:
564        TBZ     x0, 2, 3b  // no 4-byte step: all of kc is consumed, go clamp
565
566        # Remainder- 1 float of A (4 bytes)
567        # Load A
568        LDR     s0,  [x3], 4
569        LDR     s1,  [x9], 4
570        LDR     s2, [x10], 4
571        LDR     s3, [x11], 4
572        LDR     s4, [x12], 4
573        LDR     s5,  [x4], 4
574        # Load B
575        LDP     q12,  q13, [x5], 32
576
577        FMLA    v20.4s, v12.4s,  v0.s[0]
578        FMLA    v22.4s, v12.4s,  v1.s[0]
579        FMLA    v24.4s, v12.4s,  v2.s[0]
580        FMLA    v26.4s, v12.4s,  v3.s[0]
581        FMLA    v28.4s, v12.4s,  v4.s[0]
582        FMLA    v30.4s, v12.4s,  v5.s[0]
583        FMLA    v21.4s, v13.4s,  v0.s[0]
584        FMLA    v23.4s, v13.4s,  v1.s[0]
585        FMLA    v25.4s, v13.4s,  v2.s[0]
586        FMLA    v27.4s, v13.4s,  v3.s[0]
587        FMLA    v29.4s, v13.4s,  v4.s[0]
588        FMLA    v31.4s, v13.4s,  v5.s[0]
589        B       3b  // clamp and store
590
591        # Store odd width
        // x1 holds nc - 8 here (the SUBS above borrowed); subtracting 8 leaves
        // bits 2..0 unchanged, so they still describe the remaining column
        // count and select the 4-, 2- and 1-column stores below. After each
        // partial store the not-yet-stored columns are moved down to lane 0.
5927:
593        TBZ     x1, 2, 8f
594        STR     q20,  [x6], 16
595        MOV     v20.16b, v21.16b  // columns 4-7 become the next columns to store
596        STR     q22, [x16], 16
597        MOV     v22.16b, v23.16b
598        STR     q24, [x17], 16
599        MOV     v24.16b, v25.16b
600        STR     q26, [x14], 16
601        MOV     v26.16b, v27.16b
602        STR     q28, [x13], 16
603        MOV     v28.16b, v29.16b
604        STR     q30,  [x7], 16
605        MOV     v30.16b, v31.16b
6068:
607        TBZ     x1, 1, 9f
608        STR     d20,  [x6], 8
609        STR     d22, [x16], 8
610        DUP     d20, v20.d[1]  // bring the upper 64 bits (next 2 floats) down to the low half
611        DUP     d22, v22.d[1]
612        STR     d24, [x17], 8
613        STR     d26, [x14], 8
614        DUP     d24, v24.d[1]
615        DUP     d26, v26.d[1]
616        STR     d28, [x13], 8
617        STR     d30,  [x7], 8
618        DUP     d28, v28.d[1]
619        DUP     d30, v30.d[1]
620
6219:
622        TBZ     x1, 0, 10f
623        STR     s20,  [x6]
624        STR     s22, [x16]
625        STR     s24, [x17]
626        STR     s26, [x14]
627        STR     s28, [x13]
628        STR     s30,  [x7]
62910:
630        # Restore d8-d15 from stack
631        LDP     d14, d15, [sp, 48]
632        LDP     d12, d13, [sp, 32]
633        LDP     d10, d11, [sp, 16]
634        LDP     d8,  d9, [sp], 64  // also releases the 64-byte save area
635        RET
636
637END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75
638
639#ifdef __ELF__
// By GNU toolchain convention, an empty-flags .note.GNU-stack section marks
// this object as not requiring an executable stack. Only emitted for ELF.
640.section ".note.GNU-stack","",%progbits
641#endif
642