// xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Summary
#   Each pass over label 0 produces one tile of up to 5 rows by 8 floats:
#   the accumulators are seeded with 8 floats read from w, then updated with
#   A * B products for kc bytes of every A row, clamped against the two
#   values read from params, and stored to C.  The pass repeats while more
#   than 8 columns remained before the store; a final narrower tile is
#   written by the odd width path.
#   kc is a byte count (8 floats = 32 bytes per main loop iteration).
#   x0 carries mr only until the row pointers are set up; after that it is
#   reused as the k counter.
#
# Registers written by this function
#   x0 x1 x3 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x16 x17, NZCV flags
#   v0-v9 v12-v31 (d8 d9 d12 d13 d14 d15 are saved and restored)
#   x2 (kc) and x4 (a_stride) are only read.
#
# Stack
#   48 bytes are pushed for the six saved d registers.  cn_stride and params
#   are read from the caller stack area before sp is moved.

# Register roles listed by the original note as unused in this kernel.  The
# note read "unused compared to 5x8"; presumably the 6 row variant was meant
# (TODO confirm against the template).  In this file:
#  x4  is never a sixth A pointer (a5); it stays a_stride throughout
#  x7  is never c5; it is cm_stride on entry and then becomes c4
# A5  v10 v11 are never referenced, so d10 d11 are not saved
# C   v30 v31 are not accumulators; they hold the clamp bounds

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        # (must happen before sp is moved by the pre-indexed STP below)
        LDP     x14, x8, [sp]

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # Rows at or beyond mr alias the previous row, so they are computed
        # redundantly and stored to the same address as the previous row.
        STP     d8,  d9, [sp, -48]!
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d12, d13, [sp, 16]
        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d14, d15, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x13, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x13, x17, x13, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x7, x13, x7             // c4 = c3 + cm_stride (x7 stops being cm_stride here)
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x7, x13, x7, LS         //   c4 = c3

        # Load clamp values
        # Two consecutive floats at params are broadcast: the first into v30
        # (applied with FMAX, lower bound), the second into v31 (applied
        # with FMIN, upper bound).
        LD2R    {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        # All five rows start from the same 8 floats.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f

        # Prologue - loads for main loop of 80 FMA
        LDR     q0,  [x3], 16
        LDR     q2,  [x9], 16
        LDR     q4, [x10], 16
        LDR     q6, [x11], 16
        LDR     q8, [x12], 16
        LDP     q12,  q13, [x5], 32     // Fetch 3 B (4th deferred)
        LDP     q14,  q15, [x5], 32
        LDP     q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
        # Loads for the next group are interleaved with the FMAs of the
        # current group; each register is reloaded only after its last use.
1:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18,  q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s,  v2.s[0]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        FMLA    v26.4s, v12.4s,  v6.s[0]
        FMLA    v28.4s, v12.4s,  v8.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v2.s[0]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v6.s[0]
        FMLA    v29.4s, v13.4s,  v8.s[0]
        LDR     q1,  [x3], 16            // Load next 5 A

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v2.s[1]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        LDR     q3,  [x9], 16
        FMLA    v26.4s, v14.4s,  v6.s[1]
        FMLA    v28.4s, v14.4s,  v8.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        LDR     q5, [x10], 16
        FMLA    v23.4s, v15.4s,  v2.s[1]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        FMLA    v27.4s, v15.4s,  v6.s[1]
        LDR     q7, [x11], 16
        FMLA    v29.4s, v15.4s,  v8.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v2.s[2]
        LDR     q9, [x12], 16
        FMLA    v24.4s, v16.4s,  v4.s[2]
        FMLA    v26.4s, v16.4s,  v6.s[2]
        FMLA    v28.4s, v16.4s,  v8.s[2]
        LDP     q12,  q13, [x5], 32       // Load 4 B
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v2.s[2]
        FMLA    v25.4s, v17.4s,  v4.s[2]
        LDP     q14,  q15, [x5], 32
        FMLA    v27.4s, v17.4s,  v6.s[2]
        FMLA    v29.4s, v17.4s,  v8.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v22.4s, v18.4s,  v2.s[3]
        FMLA    v24.4s, v18.4s,  v4.s[3]
        FMLA    v26.4s, v18.4s,  v6.s[3]
        FMLA    v28.4s, v18.4s,  v8.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v2.s[3]
        FMLA    v25.4s, v19.4s,  v4.s[3]
        FMLA    v27.4s, v19.4s,  v6.s[3]
        FMLA    v29.4s, v19.4s,  v8.s[3]
        LDP     q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s,  v1.s[0]
        FMLA    v22.4s, v12.4s,  v3.s[0]
        FMLA    v24.4s, v12.4s,  v5.s[0]
        LDR     q0,  [x3], 16           // Load next 5 A
        FMLA    v26.4s, v12.4s,  v7.s[0]
        FMLA    v28.4s, v12.4s,  v9.s[0]
        FMLA    v21.4s, v13.4s,  v1.s[0]
        LDR     q2,  [x9], 16
        FMLA    v23.4s, v13.4s,  v3.s[0]
        FMLA    v25.4s, v13.4s,  v5.s[0]
        FMLA    v27.4s, v13.4s,  v7.s[0]
        LDR     q4, [x10], 16
        FMLA    v29.4s, v13.4s,  v9.s[0]

        FMLA    v20.4s, v14.4s,  v1.s[1]
        FMLA    v22.4s, v14.4s,  v3.s[1]
        LDR     q6, [x11], 16
        FMLA    v24.4s, v14.4s,  v5.s[1]
        FMLA    v26.4s, v14.4s,  v7.s[1]
        FMLA    v28.4s, v14.4s,  v9.s[1]
        LDR     q8, [x12], 16
        FMLA    v21.4s, v15.4s,  v1.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[1]
        FMLA    v25.4s, v15.4s,  v5.s[1]
        LDP     q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s,  v7.s[1]
        FMLA    v29.4s, v15.4s,  v9.s[1]

        FMLA    v20.4s, v16.4s,  v1.s[2]
        LDP     q14,  q15, [x5], 32
        FMLA    v22.4s, v16.4s,  v3.s[2]
        FMLA    v24.4s, v16.4s,  v5.s[2]
        FMLA    v26.4s, v16.4s,  v7.s[2]
        FMLA    v28.4s, v16.4s,  v9.s[2]
        FMLA    v21.4s, v17.4s,  v1.s[2]
        FMLA    v23.4s, v17.4s,  v3.s[2]
        FMLA    v25.4s, v17.4s,  v5.s[2]
        FMLA    v27.4s, v17.4s,  v7.s[2]
        FMLA    v29.4s, v17.4s,  v9.s[2]
        LDP     q16,  q17, [x5], 32

        FMLA    v20.4s, v18.4s,  v1.s[3]
        FMLA    v22.4s, v18.4s,  v3.s[3]
        SUBS    x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA    v24.4s, v18.4s,  v5.s[3]
        FMLA    v26.4s, v18.4s,  v7.s[3]
        FMLA    v28.4s, v18.4s,  v9.s[3]
        FMLA    v21.4s, v19.4s,  v1.s[3]
        FMLA    v23.4s, v19.4s,  v3.s[3]
        FMLA    v25.4s, v19.4s,  v5.s[3]
        FMLA    v27.4s, v19.4s,  v7.s[3]
        FMLA    v29.4s, v19.4s,  v9.s[3]
        B.HS    1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18,  q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s,  v2.s[0]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        FMLA    v26.4s, v12.4s,  v6.s[0]
        FMLA    v28.4s, v12.4s,  v8.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v2.s[0]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v6.s[0]
        FMLA    v29.4s, v13.4s,  v8.s[0]
        LDR     q1,  [x3], 16            // Load next 5 A

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v2.s[1]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        LDR     q3,  [x9], 16
        FMLA    v26.4s, v14.4s,  v6.s[1]
        FMLA    v28.4s, v14.4s,  v8.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        LDR     q5, [x10], 16
        FMLA    v23.4s, v15.4s,  v2.s[1]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        FMLA    v27.4s, v15.4s,  v6.s[1]
        LDR     q7, [x11], 16
        FMLA    v29.4s, v15.4s,  v8.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v2.s[2]
        LDR     q9, [x12], 16
        FMLA    v24.4s, v16.4s,  v4.s[2]
        FMLA    v26.4s, v16.4s,  v6.s[2]
        FMLA    v28.4s, v16.4s,  v8.s[2]
        LDP     q12,  q13, [x5], 32       // Load 4 B
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v2.s[2]
        FMLA    v25.4s, v17.4s,  v4.s[2]
        LDP     q14,  q15, [x5], 32
        FMLA    v27.4s, v17.4s,  v6.s[2]
        FMLA    v29.4s, v17.4s,  v8.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v22.4s, v18.4s,  v2.s[3]
        FMLA    v24.4s, v18.4s,  v4.s[3]
        FMLA    v26.4s, v18.4s,  v6.s[3]
        FMLA    v28.4s, v18.4s,  v8.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v2.s[3]
        FMLA    v25.4s, v19.4s,  v4.s[3]
        FMLA    v27.4s, v19.4s,  v6.s[3]
        FMLA    v29.4s, v19.4s,  v8.s[3]
        LDP     q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s,  v1.s[0]
        FMLA    v22.4s, v12.4s,  v3.s[0]
        FMLA    v24.4s, v12.4s,  v5.s[0]
        FMLA    v26.4s, v12.4s,  v7.s[0]
        FMLA    v28.4s, v12.4s,  v9.s[0]
        FMLA    v21.4s, v13.4s,  v1.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[0]
        FMLA    v25.4s, v13.4s,  v5.s[0]
        FMLA    v27.4s, v13.4s,  v7.s[0]
        FMLA    v29.4s, v13.4s,  v9.s[0]

        FMLA    v20.4s, v14.4s,  v1.s[1]
        FMLA    v22.4s, v14.4s,  v3.s[1]
        FMLA    v24.4s, v14.4s,  v5.s[1]
        FMLA    v26.4s, v14.4s,  v7.s[1]
        FMLA    v28.4s, v14.4s,  v9.s[1]
        FMLA    v21.4s, v15.4s,  v1.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[1]
        FMLA    v25.4s, v15.4s,  v5.s[1]
        FMLA    v27.4s, v15.4s,  v7.s[1]
        FMLA    v29.4s, v15.4s,  v9.s[1]

        FMLA    v20.4s, v16.4s,  v1.s[2]
        FMLA    v22.4s, v16.4s,  v3.s[2]
        FMLA    v24.4s, v16.4s,  v5.s[2]
        FMLA    v26.4s, v16.4s,  v7.s[2]
        FMLA    v28.4s, v16.4s,  v9.s[2]
        FMLA    v21.4s, v17.4s,  v1.s[2]
        FMLA    v23.4s, v17.4s,  v3.s[2]
        FMLA    v25.4s, v17.4s,  v5.s[2]
        FMLA    v27.4s, v17.4s,  v7.s[2]
        FMLA    v29.4s, v17.4s,  v9.s[2]
        TST     x0, 31                  // any k remainder below 8 floats? (FMLA leaves flags intact)

        FMLA    v20.4s, v18.4s,  v1.s[3]
        FMLA    v22.4s, v18.4s,  v3.s[3]
        FMLA    v24.4s, v18.4s,  v5.s[3]
        FMLA    v26.4s, v18.4s,  v7.s[3]
        FMLA    v28.4s, v18.4s,  v9.s[3]
        FMLA    v21.4s, v19.4s,  v1.s[3]
        FMLA    v23.4s, v19.4s,  v3.s[3]
        FMLA    v25.4s, v19.4s,  v5.s[3]
        FMLA    v27.4s, v19.4s,  v7.s[3]
        FMLA    v29.4s, v19.4s,  v9.s[3]
        B.NE    4f

        # Clamp
        # The flags set by SUBS below survive the FMAX/FMIN, STP, ADD and
        # SUB instructions and drive both B.LO 7f and B.HI 0b.
3:
        FMAX    v20.4s, v20.4s, v30.4s
        SUBS    x1, x1, 8               // nc -= 8
        FMAX    v21.4s, v21.4s, v30.4s
        FMAX    v22.4s, v22.4s, v30.4s
        FMAX    v23.4s, v23.4s, v30.4s
        FMAX    v24.4s, v24.4s, v30.4s
        FMAX    v25.4s, v25.4s, v30.4s
        FMAX    v26.4s, v26.4s, v30.4s
        FMAX    v27.4s, v27.4s, v30.4s
        FMAX    v28.4s, v28.4s, v30.4s
        FMAX    v29.4s, v29.4s, v30.4s
        FMIN    v20.4s, v20.4s, v31.4s
        FMIN    v21.4s, v21.4s, v31.4s
        FMIN    v22.4s, v22.4s, v31.4s
        FMIN    v23.4s, v23.4s, v31.4s
        FMIN    v24.4s, v24.4s, v31.4s
        FMIN    v25.4s, v25.4s, v31.4s
        FMIN    v26.4s, v26.4s, v31.4s
        FMIN    v27.4s, v27.4s, v31.4s
        FMIN    v28.4s, v28.4s, v31.4s
        FMIN    v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO    7f                      // fewer than 8 columns were left

        STP     q20, q21,  [x6]
        ADD     x6,  x6, x14            // c0 += cn_stride
        SUB     x3,  x3, x2             // a0 -= kc
        STP     q22, q23, [x16]
        ADD     x16, x16, x14           // c1 += cn_stride
        SUB     x9,  x9, x2             // a1 -= kc
        STP     q24, q25, [x17]
        ADD     x17, x17, x14           // c2 += cn_stride
        SUB     x10, x10, x2            // a2 -= kc
        STP     q26, q27, [x13]
        ADD     x13, x13, x14           // c3 += cn_stride
        SUB     x11, x11, x2            // a3 -= kc
        STP     q28, q29, [x7]
        ADD     x7, x7, x14             // c4 += cn_stride
        SUB     x12, x12, x2            // a4 -= kc

        B.HI    0b                      // more columns remain

        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

        # K remainder: fewer than 8 floats of A are left.
        # x0 has only had multiples of 32 subtracted from kc, so bits 4, 3
        # and 2 of x0 match those of kc and select the 4, 2 and 1 float steps.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0,  [x3], 16
        LDR     q2,  [x9], 16
        LDR     q4, [x10], 16
        LDR     q6, [x11], 16
        LDR     q8, [x12], 16
        # Load B
        LDP     q12,  q13, [x5], 32
        LDP     q14,  q15, [x5], 32
        LDP     q16,  q17, [x5], 32
        LDP     q18,  q19, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v2.s[0]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        FMLA    v26.4s, v12.4s,  v6.s[0]
        FMLA    v28.4s, v12.4s,  v8.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v2.s[0]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v6.s[0]
        FMLA    v29.4s, v13.4s,  v8.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v2.s[1]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v6.s[1]
        FMLA    v28.4s, v14.4s,  v8.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v2.s[1]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        FMLA    v27.4s, v15.4s,  v6.s[1]
        FMLA    v29.4s, v15.4s,  v8.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v2.s[2]
        FMLA    v24.4s, v16.4s,  v4.s[2]
        FMLA    v26.4s, v16.4s,  v6.s[2]
        FMLA    v28.4s, v16.4s,  v8.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v2.s[2]
        FMLA    v25.4s, v17.4s,  v4.s[2]
        FMLA    v27.4s, v17.4s,  v6.s[2]
        FMLA    v29.4s, v17.4s,  v8.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v2.s[3]
        FMLA    v24.4s, v18.4s,  v4.s[3]
        FMLA    v26.4s, v18.4s,  v6.s[3]
        FMLA    v28.4s, v18.4s,  v8.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v2.s[3]
        FMLA    v25.4s, v19.4s,  v4.s[3]
        FMLA    v27.4s, v19.4s,  v6.s[3]
        FMLA    v29.4s, v19.4s,  v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0,  [x3], 8
        LDR     d2,  [x9], 8
        LDR     d4, [x10], 8
        LDR     d6, [x11], 8
        LDR     d8, [x12], 8
        # Load B
        LDP     q12,  q13, [x5], 32
        LDP     q14,  q15, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v2.s[0]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        FMLA    v26.4s, v12.4s,  v6.s[0]
        FMLA    v28.4s, v12.4s,  v8.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v2.s[0]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v6.s[0]
        FMLA    v29.4s, v13.4s,  v8.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v2.s[1]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v6.s[1]
        FMLA    v28.4s, v14.4s,  v8.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v2.s[1]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        FMLA    v27.4s, v15.4s,  v6.s[1]
        FMLA    v29.4s, v15.4s,  v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ     x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0,  [x3], 4
        LDR     s2,  [x9], 4
        LDR     s4, [x10], 4
        LDR     s6, [x11], 4
        LDR     s8, [x12], 4
        # Load B
        LDP     q12,  q13, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v2.s[0]
        FMLA    v24.4s, v12.4s,  v4.s[0]
        FMLA    v26.4s, v12.4s,  v6.s[0]
        FMLA    v28.4s, v12.4s,  v8.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v2.s[0]
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v6.s[0]
        FMLA    v29.4s, v13.4s,  v8.s[0]
        B       3b

        # Store odd width
        # x1 was decremented by 8, which leaves its low three bits equal to
        # those of the remaining column count; bits 2, 1 and 0 select the
        # 4, 2 and 1 float stores.  After each store the not yet written
        # lanes are moved down to lane 0 for the next narrower store.
7:
        TBZ     x1, 2, 8f
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x13], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x7], 16
        MOV     v28.16b, v29.16b
8:
        TBZ     x1, 1, 9f
        STR     d20,  [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x13], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x7], 8
        DUP     d28, v28.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s20,  [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x13]
        STR     s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif

567