// xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float** a,                   x4
#     const void* w,                     x5
#     uint8_t* c,                        x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params* params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        // Overview (derived from the code below):
        //   nc loop (label 0): for each block of up to 8 output columns, seed
        //     six row accumulators (v20-v31) with 8 floats read from w.
        //   ks loop (label 1): consume 6 A row pointers at a time from `a`;
        //     every pointer that is not equal to `zero` gets a_offset added.
        //   kc loop (labels 2/3 + remainders 5/6/7): FMA A (broadcast by lane)
        //     against B rows streamed from w; kc is counted in bytes.
        //   Finally clamp with the two values loaded from params and store to
        //     c0..c5, advancing each C pointer by cn_stride.
        //
        // Stack frame: 96 bytes are pushed (d8-d15, x20-x23), so the caller's
        // stack arguments are found at [sp + 96 ...]:
        //   [sp +  96] cn_stride   [sp + 104] a_offset
        //   [sp + 112] zero        [sp + 120] params

        # Clamp C pointers / Save d8-d15 on stack
        // The STP/ADD instructions interleaved below do not modify the flags,
        // so each CMP result is consumed by two CSELs (LO then LS).
        CMP     x0, 2                   // if mr < 2
        STP     d8,  d9, [sp, -96]!
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        STP     d10, d11, [sp, 16]
        CSEL    x16, x6, x16, LO        //   c1 = c0
        STP     d12, d13, [sp, 32]

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d14, d15, [sp, 48]
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        STP     x20, x21, [sp, 64]

        CMP     x0, 4                   // if mr < 4
        STP     x22, x23, [sp, 80]
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        # Load zero, params pointer
        LDP     x12, x8, [sp, 112]

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        LDR     x11, [sp, 104]          // Load a_offset
        CSEL    x7, x13, x7, LO         //   c5 = c4

0:
        # Load initial bias from w into accumulators
        // All six rows start from the same 8 bias values (q20/q21).
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // Rows that point at the shared `zero` buffer must not be offset.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 96 FMA
        LDR     q0, [x14], 16
        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # 64 float weights = 256 bytes.  4 cache lines.
        // Software pipelined: the loads issued here feed the second group of
        // this iteration and the first group of the next one.
2:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18, q19, [x5], 32        // Load last B
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        LDR     q6, [x14], 16             // Load next 6 A
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        LDP     q12, q13, [x5], 32        // Load 4 B
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        FMLA    v24.4s, v12.4s,  v8.s[0]
        LDR     q0, [x14], 16            // Load next 6 A
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        LDR     q1, [x15], 16
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        LDR     q2, [x20], 16
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]
        LDR     q3, [x21], 16

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        LDR     q4, [x22], 16
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        LDR     q5, [x23], 16
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        LDP     q12, q13, [x5], 32        // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s,  v6.s[3]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32                // k -= 32 bytes; flags survive the FMLAs below
        FMLA    v21.4s, v19.4s,  v6.s[3]
        FMLA    v23.4s, v19.4s,  v7.s[3]
        FMLA    v25.4s, v19.4s,  v8.s[3]
        FMLA    v27.4s, v19.4s,  v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18, q19, [x5], 32        // Load last B
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        LDR     q6, [x14], 16             // Load next 6 A
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        LDP     q12, q13, [x5], 32        // Load 4 B
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        FMLA    v24.4s, v12.4s,  v8.s[0]
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]


        FMLA    v20.4s, v18.4s,  v6.s[3]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]

        # Is there a remainder? (kc not a multiple of 32 bytes)
        // Subtracting multiples of 32 never changes the low 5 bits of x0, so
        // they still hold the low 5 bits of kc.
        TST     x0, 31

        FMLA    v21.4s, v19.4s,  v6.s[3]
        FMLA    v23.4s, v19.4s,  v7.s[3]
        FMLA    v25.4s, v19.4s,  v8.s[3]
        // v6/v7 are free as A registers once their last lane read above has
        // issued; they now hold the two clamp values.
        LD2R    {v6.4s, v7.4s}, [x8]      // Load min/max values
        FMLA    v27.4s, v19.4s,  v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        // FMAX against v6 then FMIN against v7 (first / second float at [x8]).
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        LDR     x0, [sp, 96]            // Load cn_stride
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags drive B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    8f                      // fewer than 8 columns left: partial store

        STP     q30, q31,  [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // None of the STP/ADD/SUB above set flags, so this still tests the
        // SUBS on nc.
        B.HI    0b

        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 96
        RET

5:
        # Load min/max values
        // Needed when arriving from the kc < 32 bytes check; when arriving
        // from the epilogue this reloads the same values.
        LD2R    {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        B       4b

        # Store odd width
        // Bits 2/1/0 of the (now negative) nc select a 4-, 2- and 1-column
        // store; after each store the remaining columns are shifted down into
        // the register that the next store reads.
8:
        TBZ     x1, 2, 9f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30,  [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif