xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# 5x8 strips the following out of 6x8
27# x23 a5
28#  x7 c5  x13 unused
29# A5  v10 v11
30# C   v30 v31
31
32# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
33
34# A pointers
35# x14 a0
36# x15 a1
37# x20 a2
38# x21 a3
39#  x8 a4
40
41# C pointers
42#  x6 c0
43# x16 c1
44# x17 c2
45# x13 c3
46#  x7 c4
47
48# Vector register usage
49# A0   v0  v1
50# A1   v2  v3
51# A2   v4  v5
52# A3   v6  v7
53# A4   v8  v9
54# B   v12 v13 v14 v15
55# B   v16 v17 v18 v19
56# C   v20 v21
57# C   v22 v23
58# C   v24 v25
59# C   v26 v27
60# C   v28 v29
61# Clamp v30 v31
62
// Computes output tiles of up to 5 rows x 8 floats. Structure, outermost first:
//   label 0      - nc loop: one pass per 8 output columns (x1 -= 8 per pass)
//   label 1      - ks loop: each pass reads 5 row pointers from a (x9 -= 40 per pass)
//   label 2      - main k loop: 8 floats of every A row per pass (x0 -= 32 per pass)
//   label 3      - final 8-float step, issued without preloads for a following step
//   labels 5/6/7 - leftover 4 / 2 / 1 floats of kc
//   labels 8-11  - partial-width store when fewer than 8 columns remain
// Accumulators v20-v29 start from 8 floats read from w, are clamped with FMAX
// against v30 and FMIN against v31, then stored through x6/x16/x17/x13/x7.
63BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75
64
65        # Clamp C pointers / Save d8-d15 on stack
          // The frame is 64 bytes: d8/d9 at [sp], d12/d13 at [sp, 16],
          // d14/d15 at [sp, 32], x20/x21 at [sp, 48].  d10/d11 are not saved;
          // no instruction in this routine writes v10 or v11.
          // Register saves are interleaved with the C pointer setup below.
          // Rows at index >= mr reuse the previous row's C pointer (CSEL).
66        STP     d8,  d9, [sp, -64]!
67        CMP     x0, 2                   // if mr < 2
68        ADD     x16, x6, x7             // c1 = c0 + cm_stride
69        CSEL    x16, x6, x16, LO        //   c1 = c0
70
71        STP     d12, d13, [sp, 16]
72        ADD     x17, x16, x7            // c2 = c1 + cm_stride
73                                        // if mr <= 2
74        CSEL    x17, x16, x17, LS       //   c2 = c1
75
76        STP     d14, d15, [sp, 32]
77        CMP     x0, 4                   // if mr < 4
78        ADD     x13, x17, x7            // c3 = c2 + cm_stride
79        CSEL    x13, x17, x13, LO       //   c3 = c2
80
81        # Load zero, params pointer
82        LDP     x12, x8, [sp, 80]       // caller's [sp + 16], [sp + 24]: sp is 64 lower now
83        ADD     x7, x13, x7             // c4 = c3 + cm_stride
84                                        // if mr <= 4
85        CSEL    x7, x13, x7, LS         //   c4 = c3
86
87        # Save x20,x21 on stack
88        STP     x20, x21, [sp, 48]
89
90        # Load clamp values
91        LD2R    {v30.4s, v31.4s}, [x8]  // v30 = 1st float at params in all lanes, v31 = 2nd
92
93        # Load cn_stride, a_offset
94        LDP     x10, x11, [sp, 64]      // caller's [sp], [sp + 8]
95
          // nc loop head: one pass per block of 8 output columns.
960:
97        # Load initial bias from w into accumulators
          // The same 8 floats (q20/q21) seed all five row accumulator pairs.
98        LDP     q20, q21, [x5], 32
99        MOV     v22.16b, v20.16b
100        MOV     v23.16b, v21.16b
101        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
102        MOV     v24.16b, v20.16b
103        MOV     v25.16b, v21.16b
104        PRFM    PLDL1KEEP, [x5, 64]
105        MOV     v26.16b, v20.16b
106        MOV     v27.16b, v21.16b
107        PRFM    PLDL1KEEP, [x5, 128]
108        MOV     v28.16b, v20.16b
109        MOV     v29.16b, v21.16b
110        PRFM    PLDL1KEEP, [x5, 192]
111
112        MOV     x9, x3                  // p = ks
113
           // ks loop head: consumes 5 pointers (40 bytes) from a per pass.
1141:
115        # Load next 5 A pointers
116        LDP     x14, x15, [x4], 16
117        LDP     x20, x21, [x4], 16
118        LDR     x8, [x4], 8
119
           // A pointer equal to zero is kept unchanged; any other pointer gets
           // a_offset added.  The ADD runs unconditionally and CSEL picks the result.
120        CMP     x14, x12                // if a0 == zero
121        ADD     x14, x14, x11           // a0 += a_offset
122        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
123        CMP     x15, x12                // if a1 == zero
124        ADD     x15, x15, x11           // a1 += a_offset
125        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
126        CMP     x20, x12                // if a2 == zero
127        ADD     x20, x20, x11           // a2 += a_offset
128        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
129        CMP     x21, x12                // if a3 == zero
130        ADD     x21, x21, x11           // a3 += a_offset
131        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
132        CMP     x8, x12                 // if a4 == zero
133        ADD     x8, x8, x11             // a4 += a_offset
134        CSEL    x8, x12, x8, EQ         //   a4 = zero, else a4 + a_offset
135
136        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
           // Only multiples of 32 are ever subtracted from x0, so bits 4..0 of x0
           // always equal those of kc.  The TST/TBZ remainder tests rely on this
           // even after x0 has gone negative.
137        SUBS    x0, x2, 32              // k = kc - 32
138        B.LO    5f
139
140        # Prologue - loads for main loop of 96 FMA
           // NOTE(review): the loop at label 2 contains 80 FMLA, not 96; the
           // figure above looks inherited from a 6-row variant - confirm.
141        LDR     q0, [x14], 16
142        LDR     q2, [x15], 16
143        LDR     q4, [x20], 16
144        LDR     q6, [x21], 16
145        LDR     q8, [x8], 16
146        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
147        LDP     q14, q15, [x5], 32
148        LDP     q16, q17, [x5], 32
149
150        # Is there at least 8 floats (32 bytes) for main loop?
151        SUBS    x0, x0, 32
152        B.LO    3f
153
154        # Main loop - 8 floats of A (32 bytes)
155        # 80 FMA + 5 LDP A + 8 LDP B
           // NOTE(review): as written the body issues 10 LDR (q) of A and 8 LDP
           // of B.  On entry q0/q2/q4/q6/q8 hold A for the first group and
           // q12-q17 hold three of its four B row pairs.  Each register is
           // reloaded only after its last FMLA use, so the loads cannot be
           // reordered freely.
1562:
157        # First group of 4 A.  40 FMA.
158        FMLA    v20.4s, v12.4s,  v0.s[0]
159        LDP     q18, q19, [x5], 32        // Load last B
160        FMLA    v22.4s, v12.4s,  v2.s[0]
161        FMLA    v24.4s, v12.4s,  v4.s[0]
162        FMLA    v26.4s, v12.4s,  v6.s[0]
163        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
164        FMLA    v28.4s, v12.4s,  v8.s[0]
165        FMLA    v21.4s, v13.4s,  v0.s[0]
166        FMLA    v23.4s, v13.4s,  v2.s[0]
167        PRFM    PLDL1KEEP, [x5, 256]
168        FMLA    v25.4s, v13.4s,  v4.s[0]
169        FMLA    v27.4s, v13.4s,  v6.s[0]
170        FMLA    v29.4s, v13.4s,  v8.s[0]
171        LDR     q1, [x14], 16            // Load next 5 A
172
173        FMLA    v20.4s, v14.4s,  v0.s[1]
174        FMLA    v22.4s, v14.4s,  v2.s[1]
175        FMLA    v24.4s, v14.4s,  v4.s[1]
176        LDR     q3, [x15], 16
177        FMLA    v26.4s, v14.4s,  v6.s[1]
178        FMLA    v28.4s, v14.4s,  v8.s[1]
179        FMLA    v21.4s, v15.4s,  v0.s[1]
180        LDR     q5, [x20], 16
181        FMLA    v23.4s, v15.4s,  v2.s[1]
182        FMLA    v25.4s, v15.4s,  v4.s[1]
183        FMLA    v27.4s, v15.4s,  v6.s[1]
184        LDR     q7, [x21], 16
185        FMLA    v29.4s, v15.4s,  v8.s[1]
186
187        FMLA    v20.4s, v16.4s,  v0.s[2]
188        FMLA    v22.4s, v16.4s,  v2.s[2]
189        LDR     q9, [x8], 16
190        FMLA    v24.4s, v16.4s,  v4.s[2]
191        FMLA    v26.4s, v16.4s,  v6.s[2]
192        FMLA    v28.4s, v16.4s,  v8.s[2]
193        LDP     q12, q13, [x5], 32        // Load 4 B
194        FMLA    v21.4s, v17.4s,  v0.s[2]
195        FMLA    v23.4s, v17.4s,  v2.s[2]
196        FMLA    v25.4s, v17.4s,  v4.s[2]
197        FMLA    v27.4s, v17.4s,  v6.s[2]
198        FMLA    v29.4s, v17.4s,  v8.s[2]
199
200        FMLA    v20.4s, v18.4s,  v0.s[3]
201        FMLA    v22.4s, v18.4s,  v2.s[3]
202        FMLA    v24.4s, v18.4s,  v4.s[3]
203        FMLA    v26.4s, v18.4s,  v6.s[3]
204        LDP     q14, q15, [x5], 32
205        FMLA    v28.4s, v18.4s,  v8.s[3]
206        FMLA    v21.4s, v19.4s,  v0.s[3]
207        FMLA    v23.4s, v19.4s,  v2.s[3]
208        LDP     q16, q17, [x5], 32
209        FMLA    v25.4s, v19.4s,  v4.s[3]
210        FMLA    v27.4s, v19.4s,  v6.s[3]
211        FMLA    v29.4s, v19.4s,  v8.s[3]
212        LDP     q18, q19, [x5], 32
213
214        # Second group of 4 A.  40 FMA.
           // Uses the odd A registers (v1/v3/v5/v7/v9) while the even ones are
           // refilled for the next pass.
215        FMLA    v20.4s, v12.4s,  v1.s[0]
216        FMLA    v22.4s, v12.4s,  v3.s[0]
217        FMLA    v24.4s, v12.4s,  v5.s[0]
218        LDR     q0, [x14], 16            // Load next 5 A
219        FMLA    v26.4s, v12.4s,  v7.s[0]
220        FMLA    v28.4s, v12.4s,  v9.s[0]
221        FMLA    v21.4s, v13.4s,  v1.s[0]
222        LDR     q2, [x15], 16
223        FMLA    v23.4s, v13.4s,  v3.s[0]
224        FMLA    v25.4s, v13.4s,  v5.s[0]
225        FMLA    v27.4s, v13.4s,  v7.s[0]
226        LDR     q4, [x20], 16
227        FMLA    v29.4s, v13.4s,  v9.s[0]
228
229        FMLA    v20.4s, v14.4s,  v1.s[1]
230        FMLA    v22.4s, v14.4s,  v3.s[1]
231        LDR     q6, [x21], 16
232        FMLA    v24.4s, v14.4s,  v5.s[1]
233        FMLA    v26.4s, v14.4s,  v7.s[1]
234        FMLA    v28.4s, v14.4s,  v9.s[1]
235        LDR     q8, [x8], 16
236        FMLA    v21.4s, v15.4s,  v1.s[1]
237        FMLA    v23.4s, v15.4s,  v3.s[1]
238        FMLA    v25.4s, v15.4s,  v5.s[1]
239        LDP     q12, q13, [x5], 32        // Load next 3 B (not last)
240        FMLA    v27.4s, v15.4s,  v7.s[1]
241        FMLA    v29.4s, v15.4s,  v9.s[1]
242
243        FMLA    v20.4s, v16.4s,  v1.s[2]
244        FMLA    v22.4s, v16.4s,  v3.s[2]
245        FMLA    v24.4s, v16.4s,  v5.s[2]
246        FMLA    v26.4s, v16.4s,  v7.s[2]
247        FMLA    v28.4s, v16.4s,  v9.s[2]
248        FMLA    v21.4s, v17.4s,  v1.s[2]
249        FMLA    v23.4s, v17.4s,  v3.s[2]
250        LDP     q14, q15, [x5], 32
251        FMLA    v25.4s, v17.4s,  v5.s[2]
252        FMLA    v27.4s, v17.4s,  v7.s[2]
253        FMLA    v29.4s, v17.4s,  v9.s[2]
254        LDP     q16,  q17, [x5], 32
255
256        FMLA    v20.4s, v18.4s,  v1.s[3]
257        FMLA    v22.4s, v18.4s,  v3.s[3]
258        SUBS    x0, x0, 32                // flags consumed by B.HS below; FMLA leaves them alone
259        FMLA    v24.4s, v18.4s,  v5.s[3]
260        FMLA    v26.4s, v18.4s,  v7.s[3]
261        FMLA    v28.4s, v18.4s,  v9.s[3]
262        FMLA    v21.4s, v19.4s,  v1.s[3]
263        FMLA    v23.4s, v19.4s,  v3.s[3]
264        FMLA    v25.4s, v19.4s,  v5.s[3]
265        FMLA    v27.4s, v19.4s,  v7.s[3]
266        FMLA    v29.4s, v19.4s,  v9.s[3]
267        B.HS    2b
268
269        # Epilogue - 8 floats of A (32 bytes)
270        # 80 FMA + 5 LDP A + 8 LDP B
271        # First block same as main loop.  Second block has no preloads.
           // NOTE(review): as written this block issues 5 LDR (q) of A and 5 LDP
           // of B; the counts above match neither - confirm against the template.
2723:
273        # First group of 4 A.  40 FMA.
274        FMLA    v20.4s, v12.4s,  v0.s[0]
275        LDP     q18, q19, [x5], 32        // Load last B
276        FMLA    v22.4s, v12.4s,  v2.s[0]
277        FMLA    v24.4s, v12.4s,  v4.s[0]
278        FMLA    v26.4s, v12.4s,  v6.s[0]
279        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
280        FMLA    v28.4s, v12.4s,  v8.s[0]
281        FMLA    v21.4s, v13.4s,  v0.s[0]
282        FMLA    v23.4s, v13.4s,  v2.s[0]
283        PRFM    PLDL1KEEP, [x5, 256]
284        FMLA    v25.4s, v13.4s,  v4.s[0]
285        FMLA    v27.4s, v13.4s,  v6.s[0]
286        FMLA    v29.4s, v13.4s,  v8.s[0]
287        LDR     q1, [x14], 16            // Load next 5 A
288
289        FMLA    v20.4s, v14.4s,  v0.s[1]
290        FMLA    v22.4s, v14.4s,  v2.s[1]
291        FMLA    v24.4s, v14.4s,  v4.s[1]
292        LDR     q3, [x15], 16
293        FMLA    v26.4s, v14.4s,  v6.s[1]
294        FMLA    v28.4s, v14.4s,  v8.s[1]
295        FMLA    v21.4s, v15.4s,  v0.s[1]
296        LDR     q5, [x20], 16
297        FMLA    v23.4s, v15.4s,  v2.s[1]
298        FMLA    v25.4s, v15.4s,  v4.s[1]
299        FMLA    v27.4s, v15.4s,  v6.s[1]
300        LDR     q7, [x21], 16
301        FMLA    v29.4s, v15.4s,  v8.s[1]
302
303        FMLA    v20.4s, v16.4s,  v0.s[2]
304        FMLA    v22.4s, v16.4s,  v2.s[2]
305        LDR     q9, [x8], 16
306        FMLA    v24.4s, v16.4s,  v4.s[2]
307        FMLA    v26.4s, v16.4s,  v6.s[2]
308        FMLA    v28.4s, v16.4s,  v8.s[2]
309        LDP     q12, q13, [x5], 32        // Load 4 B
310        FMLA    v21.4s, v17.4s,  v0.s[2]
311        FMLA    v23.4s, v17.4s,  v2.s[2]
312        FMLA    v25.4s, v17.4s,  v4.s[2]
313        FMLA    v27.4s, v17.4s,  v6.s[2]
314        FMLA    v29.4s, v17.4s,  v8.s[2]
315
316        FMLA    v20.4s, v18.4s,  v0.s[3]
317        FMLA    v22.4s, v18.4s,  v2.s[3]
318        FMLA    v24.4s, v18.4s,  v4.s[3]
319        FMLA    v26.4s, v18.4s,  v6.s[3]
320        LDP     q14, q15, [x5], 32
321        FMLA    v28.4s, v18.4s,  v8.s[3]
322        FMLA    v21.4s, v19.4s,  v0.s[3]
323        FMLA    v23.4s, v19.4s,  v2.s[3]
324        LDP     q16, q17, [x5], 32
325        FMLA    v25.4s, v19.4s,  v4.s[3]
326        FMLA    v27.4s, v19.4s,  v6.s[3]
327        FMLA    v29.4s, v19.4s,  v8.s[3]
328        LDP     q18, q19, [x5], 32
329
330        # Second group of 4 A.  40 FMA.
           // Pure compute: every A and B operand was loaded in the first group.
331        FMLA    v20.4s, v12.4s,  v1.s[0]
332        FMLA    v22.4s, v12.4s,  v3.s[0]
333        FMLA    v24.4s, v12.4s,  v5.s[0]
334        FMLA    v26.4s, v12.4s,  v7.s[0]
335        FMLA    v28.4s, v12.4s,  v9.s[0]
336        FMLA    v21.4s, v13.4s,  v1.s[0]
337        FMLA    v23.4s, v13.4s,  v3.s[0]
338        FMLA    v25.4s, v13.4s,  v5.s[0]
339        FMLA    v27.4s, v13.4s,  v7.s[0]
340        FMLA    v29.4s, v13.4s,  v9.s[0]
341
342        FMLA    v20.4s, v14.4s,  v1.s[1]
343        FMLA    v22.4s, v14.4s,  v3.s[1]
344        FMLA    v24.4s, v14.4s,  v5.s[1]
345        FMLA    v26.4s, v14.4s,  v7.s[1]
346        FMLA    v28.4s, v14.4s,  v9.s[1]
347        FMLA    v21.4s, v15.4s,  v1.s[1]
348        FMLA    v23.4s, v15.4s,  v3.s[1]
349        FMLA    v25.4s, v15.4s,  v5.s[1]
350        FMLA    v27.4s, v15.4s,  v7.s[1]
351        FMLA    v29.4s, v15.4s,  v9.s[1]
352
353        FMLA    v20.4s, v16.4s,  v1.s[2]
354        FMLA    v22.4s, v16.4s,  v3.s[2]
355        FMLA    v24.4s, v16.4s,  v5.s[2]
356        FMLA    v26.4s, v16.4s,  v7.s[2]
357        FMLA    v28.4s, v16.4s,  v9.s[2]
358        FMLA    v21.4s, v17.4s,  v1.s[2]
359        FMLA    v23.4s, v17.4s,  v3.s[2]
360        FMLA    v25.4s, v17.4s,  v5.s[2]
361        FMLA    v27.4s, v17.4s,  v7.s[2]
362        FMLA    v29.4s, v17.4s,  v9.s[2]
363
364        FMLA    v20.4s, v18.4s,  v1.s[3]
365        FMLA    v22.4s, v18.4s,  v3.s[3]
366        FMLA    v24.4s, v18.4s,  v5.s[3]
367        FMLA    v26.4s, v18.4s,  v7.s[3]
368        FMLA    v28.4s, v18.4s,  v9.s[3]
369        FMLA    v21.4s, v19.4s,  v1.s[3]
370        FMLA    v23.4s, v19.4s,  v3.s[3]
371        FMLA    v25.4s, v19.4s,  v5.s[3]
372        FMLA    v27.4s, v19.4s,  v7.s[3]
373        FMLA    v29.4s, v19.4s,  v9.s[3]
374        # Is there a remainder?- 4 floats of A (16 bytes) or less
375        TST     x0, 31                  // low 5 bits of x0 == kc mod 32
376        B.NE    5f
377
           // ks loop tail; also the rejoin point for the remainder paths at 7.
3784:
379        # ks loop
380        SUBS    x9, x9, 40              // ks -= MR * sizeof(void*)
381        B.HI    1b
382
383        # Clamp
           // FMAX against v30 applies the lower bound, FMIN against v31 the upper.
384        FMAX    v20.4s, v20.4s, v30.4s
385        FMAX    v21.4s, v21.4s, v30.4s
386        FMAX    v22.4s, v22.4s, v30.4s
387        FMAX    v23.4s, v23.4s, v30.4s
388        FMAX    v24.4s, v24.4s, v30.4s
389        FMAX    v25.4s, v25.4s, v30.4s
390        FMAX    v26.4s, v26.4s, v30.4s
391        FMAX    v27.4s, v27.4s, v30.4s
392        FMAX    v28.4s, v28.4s, v30.4s
393        FMAX    v29.4s, v29.4s, v30.4s
394        FMIN    v20.4s, v20.4s, v31.4s
395        FMIN    v21.4s, v21.4s, v31.4s
396        FMIN    v22.4s, v22.4s, v31.4s
397        FMIN    v23.4s, v23.4s, v31.4s
398        FMIN    v24.4s, v24.4s, v31.4s
399        FMIN    v25.4s, v25.4s, v31.4s
400        FMIN    v26.4s, v26.4s, v31.4s
401        FMIN    v27.4s, v27.4s, v31.4s
402        FMIN    v28.4s, v28.4s, v31.4s
403        FMIN    v29.4s, v29.4s, v31.4s
404
405        # Store full 5 x 8
           // The flags set by this SUBS are still live at "B.HI 0b" below: the
           // STP, ADD and SUB in between do not write flags.
406        SUBS    x1, x1, 8
407        B.LO    8f
408
           // Rows are written from c4 down to c0.  When mr < 5 the clamped-away
           // rows alias a lower row's pointer, so that lower row is stored last.
409        STP     q28, q29, [x7]
410        ADD     x7, x7, x10
411        STP     q26, q27, [x13]
412        ADD     x13, x13, x10
413        STP     q24, q25, [x17]
414        ADD     x17, x17, x10
415        STP     q22, q23, [x16]
416        ADD     x16, x16, x10
417        STP     q20, q21,  [x6]
418        ADD     x6,  x6, x10
419
420        SUB     x4, x4, x3              // a -= ks
421
422        # nc loop
423        B.HI    0b
424
425        # Restore x20,x21 from stack
426        LDP     x20, x21, [sp, 48]
427
428        # Restore d8-d15 from stack
429        LDP     d14, d15, [sp, 32]
430        LDP     d12, d13, [sp, 16]
431        LDP     d8,  d9, [sp], 64
432        RET
433
           // Remainder entry, reached when kc < 32 (from the B.LO after label 1)
           // or when kc mod 32 != 0 (from the B.NE after the epilogue).
           // Bits 4, 3 and 2 of x0 select the 16-, 8- and 4-byte steps.
4345:
435        # Is there a remainder?- 4 floats of A (16 bytes)
436        TBZ     x0, 4, 6f
437
438        # Remainder- 4 floats of A (16 bytes)
439        # Load A
440        LDR     q0, [x14], 16
441        LDR     q2, [x15], 16
442        LDR     q4, [x20], 16
443        LDR     q6, [x21], 16
444        LDR     q8, [x8], 16
445        # Load B
446        LDP     q12, q13, [x5], 32
447        LDP     q14, q15, [x5], 32
448        LDP     q16, q17, [x5], 32
449        LDP     q18, q19, [x5], 32
450
451        FMLA    v20.4s, v12.4s,  v0.s[0]
452        FMLA    v22.4s, v12.4s,  v2.s[0]
453        FMLA    v24.4s, v12.4s,  v4.s[0]
454        FMLA    v26.4s, v12.4s,  v6.s[0]
455        FMLA    v28.4s, v12.4s,  v8.s[0]
456        FMLA    v21.4s, v13.4s,  v0.s[0]
457        FMLA    v23.4s, v13.4s,  v2.s[0]
458        FMLA    v25.4s, v13.4s,  v4.s[0]
459        FMLA    v27.4s, v13.4s,  v6.s[0]
460        FMLA    v29.4s, v13.4s,  v8.s[0]
461
462        FMLA    v20.4s, v14.4s,  v0.s[1]
463        FMLA    v22.4s, v14.4s,  v2.s[1]
464        FMLA    v24.4s, v14.4s,  v4.s[1]
465        FMLA    v26.4s, v14.4s,  v6.s[1]
466        FMLA    v28.4s, v14.4s,  v8.s[1]
467        FMLA    v21.4s, v15.4s,  v0.s[1]
468        FMLA    v23.4s, v15.4s,  v2.s[1]
469        FMLA    v25.4s, v15.4s,  v4.s[1]
470        FMLA    v27.4s, v15.4s,  v6.s[1]
471        FMLA    v29.4s, v15.4s,  v8.s[1]
472
473        FMLA    v20.4s, v16.4s,  v0.s[2]
474        FMLA    v22.4s, v16.4s,  v2.s[2]
475        FMLA    v24.4s, v16.4s,  v4.s[2]
476        FMLA    v26.4s, v16.4s,  v6.s[2]
477        FMLA    v28.4s, v16.4s,  v8.s[2]
478        FMLA    v21.4s, v17.4s,  v0.s[2]
479        FMLA    v23.4s, v17.4s,  v2.s[2]
480        FMLA    v25.4s, v17.4s,  v4.s[2]
481        FMLA    v27.4s, v17.4s,  v6.s[2]
482        FMLA    v29.4s, v17.4s,  v8.s[2]
483
484        FMLA    v20.4s, v18.4s,  v0.s[3]
485        FMLA    v22.4s, v18.4s,  v2.s[3]
486        FMLA    v24.4s, v18.4s,  v4.s[3]
487        FMLA    v26.4s, v18.4s,  v6.s[3]
488        FMLA    v28.4s, v18.4s,  v8.s[3]
489        FMLA    v21.4s, v19.4s,  v0.s[3]
490        FMLA    v23.4s, v19.4s,  v2.s[3]
491        FMLA    v25.4s, v19.4s,  v4.s[3]
492        FMLA    v27.4s, v19.4s,  v6.s[3]
493        FMLA    v29.4s, v19.4s,  v8.s[3]
494
495        # Is there a remainder?- 2 floats of A (8 bytes)
4966:
497        TBZ     x0, 3, 7f
498
499        # Remainder- 2 floats of A (8 bytes)
500        # Load A
501        LDR     d0, [x14], 8
502        LDR     d2, [x15], 8
503        LDR     d4, [x20], 8
504        LDR     d6, [x21], 8
505        LDR     d8, [x8], 8
506        # Load B
507        LDP     q12, q13, [x5], 32
508        LDP     q14, q15, [x5], 32
509
510        FMLA    v20.4s, v12.4s,  v0.s[0]
511        FMLA    v22.4s, v12.4s,  v2.s[0]
512        FMLA    v24.4s, v12.4s,  v4.s[0]
513        FMLA    v26.4s, v12.4s,  v6.s[0]
514        FMLA    v28.4s, v12.4s,  v8.s[0]
515        FMLA    v21.4s, v13.4s,  v0.s[0]
516        FMLA    v23.4s, v13.4s,  v2.s[0]
517        FMLA    v25.4s, v13.4s,  v4.s[0]
518        FMLA    v27.4s, v13.4s,  v6.s[0]
519        FMLA    v29.4s, v13.4s,  v8.s[0]
520
521        FMLA    v20.4s, v14.4s,  v0.s[1]
522        FMLA    v22.4s, v14.4s,  v2.s[1]
523        FMLA    v24.4s, v14.4s,  v4.s[1]
524        FMLA    v26.4s, v14.4s,  v6.s[1]
525        FMLA    v28.4s, v14.4s,  v8.s[1]
526        FMLA    v21.4s, v15.4s,  v0.s[1]
527        FMLA    v23.4s, v15.4s,  v2.s[1]
528        FMLA    v25.4s, v15.4s,  v4.s[1]
529        FMLA    v27.4s, v15.4s,  v6.s[1]
530        FMLA    v29.4s, v15.4s,  v8.s[1]
531
532        # Is there a remainder?- 1 float of A (4 bytes)
5337:
534        TBZ     x0, 2, 4b
535
536        # Remainder- 1 float of A (4 bytes)
537        # Load A
538        LDR     s0, [x14], 4
539        LDR     s2, [x15], 4
540        LDR     s4, [x20], 4
541        LDR     s6, [x21], 4
542        LDR     s8, [x8], 4
543        # Load B
544        LDP     q12, q13, [x5], 32
545
546        FMLA    v20.4s, v12.4s,  v0.s[0]
547        FMLA    v22.4s, v12.4s,  v2.s[0]
548        FMLA    v24.4s, v12.4s,  v4.s[0]
549        FMLA    v26.4s, v12.4s,  v6.s[0]
550        FMLA    v28.4s, v12.4s,  v8.s[0]
551        FMLA    v21.4s, v13.4s,  v0.s[0]
552        FMLA    v23.4s, v13.4s,  v2.s[0]
553        FMLA    v25.4s, v13.4s,  v4.s[0]
554        FMLA    v27.4s, v13.4s,  v6.s[0]
555        FMLA    v29.4s, v13.4s,  v8.s[0]
556        B       4b
557
558        # Store odd width
           // Reached with x1 = nc - 8 (negative); subtracting 8 leaves bits 2..0
           // equal to those of nc.  Bits 2, 1 and 0 trigger stores of 4, 2 and 1
           // floats per row.  After each partial store the not-yet-written upper
           // part is moved down into the register the next step stores from.
5598:
560        TBZ     x1, 2, 9f
561        STR     q28, [x7], 16
562        MOV     v28.16b, v29.16b
563        STR     q26, [x13], 16
564        MOV     v26.16b, v27.16b
565        STR     q24, [x17], 16
566        MOV     v24.16b, v25.16b
567        STR     q22, [x16], 16
568        MOV     v22.16b, v23.16b
569        STR     q20,  [x6], 16
570        MOV     v20.16b, v21.16b
5719:
572        TBZ     x1, 1, 10f
573        STR     d28, [x7], 8
574        STR     d26, [x13], 8
575        DUP     d28, v28.d[1]
576        DUP     d26, v26.d[1]
577        STR     d24, [x17], 8
578        STR     d22, [x16], 8
579        DUP     d24, v24.d[1]
580        DUP     d22, v22.d[1]
581        STR     d20,  [x6], 8
582        DUP     d20, v20.d[1]
583
58410:
585        TBZ     x1, 0, 11f
586        STR     s28, [x7]
587        STR     s26, [x13]
588        STR     s24, [x17]
589        STR     s22, [x16]
590        STR     s20,  [x6]
59111:
           // Same register restore and return sequence as the full-width exit.
592        # Restore x20,x21 from stack
593        LDP     x20, x21, [sp, 48]
594
595        # Restore d8-d15 from stack
596        LDP     d14, d15, [sp, 32]
597        LDP     d12, d13, [sp, 16]
598        LDP     d8,  d9, [sp], 64
599        RET
600
601END_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75
602
// On ELF targets only: emit an empty .note.GNU-stack section with an empty flags
// string (no "x"), which GNU toolchains read as "no executable stack needed".
603#ifdef __ELF__
604.section ".note.GNU-stack","",%progbits
605#endif
606