// Source: /aosp_15_r20/external/XNNPACK/src/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a73.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Frame layout after the 96 byte push below:
        #   [sp,  0..63]  d8-d15
        #   [sp, 64..95]  x20-x23
        #   [sp, 96]      cn_stride
        #   [sp, 104]     a_offset
        #   [sp, 112]     zero
        #   [sp, 120]     params
        # The register saves are interleaved with the C pointer clamping.
        # x0 holds mr only during this setup; later it is reused as the k
        # counter and then as cn_stride.

        # Clamp C pointers
        STP     d8,  d9, [sp, -96]!
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d10, d11, [sp, 16]
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d12, d13, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        STP     d14, d15, [sp, 48]
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP     x20, x21, [sp, 64]
        STP     x22, x23, [sp, 80]

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load zero, params pointer
        LDP     x12, x8, [sp, 112]

        # Load a_offset
        LDR     x11, [sp, 104]

        # Load min/max values
        # v6 = first float at params (lower bound, used by FMAX),
        # v7 = second float at params (upper bound, used by FMIN).
        LD2R    {v6.4s, v7.4s}, [x8]

        # nc loop: one pass per 8 column output tile.
0:
        # Load initial bias from w into accumulators
        # Rows 1-5 start as copies of the row 0 bias pair v20/v21.
        LD1     {v20.16b, v21.16b}, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

        # ks loop: one pass per group of 6 A pointers (48 bytes of a).
1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        # The q6/q7 loads overwrite the clamp values; the epilogue
        # reloads them from params once v6/v7 are no longer needed as A.
        LDP     q0,  q6, [x14], 32
        LDP     q1,  q7, [x15], 32
        LDP     q2,  q8, [x20], 32
        LDP     q3,  q9, [x21], 32
        LDP     q4,  q10, [x22], 32
        # load first set of B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x23], 32
        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        LDP     q16,  q17, [x5], 32
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        LDP     q18,  q19, [x5], 32
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        LDP     q12,  q13, [x5], 32
        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        LDP     q14,  q15, [x5], 32
        FMLA    v24.4s, v16.4s,  v2.s[2]
        FMLA    v26.4s, v16.4s,  v3.s[2]
        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP     q16,  q17, [x5], 32
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        LDP     q18,  q19, [x5], 32
        FMLA    v24.4s, v12.4s,  v8.s[0]
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        # Each row finishes its lanes 2 and 3 before the LDP that
        # overwrites that row's A registers for the next iteration.
        LDP     q12,  q13, [x5], 32
        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v20.4s, v18.4s,  v6.s[3]
        LDP     q14,  q15, [x5], 32
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v21.4s, v19.4s,  v6.s[3]
        LDP     q0,  q6, [x14], 32
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v23.4s, v19.4s,  v7.s[3]
        LDP     q1,  q7, [x15], 32
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v25.4s, v19.4s,  v8.s[3]
        LDP     q2,  q8, [x20], 32
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v27.4s, v19.4s,  v9.s[3]
        LDP     q3,  q9, [x21], 32
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v29.4s, v19.4s, v10.s[3]
        LDP     q4,  q10, [x22], 32
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32
        FMLA    v31.4s, v17.4s, v11.s[2]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x23], 32
        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        LDP     q16,  q17, [x5], 32
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        LDP     q18,  q19, [x5], 32
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        LDP     q12,  q13, [x5], 32
        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        LDP     q14,  q15, [x5], 32
        FMLA    v24.4s, v16.4s,  v2.s[2]
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA. No A Loads, No last B load

        LDP     q16,  q17, [x5], 32
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        LDP     q18,  q19, [x5], 32
        FMLA    v24.4s, v12.4s,  v8.s[0]
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s,  v6.s[3]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        FMLA    v21.4s, v19.4s,  v6.s[3]
        FMLA    v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # The two FMLA above were the last reads of v6/v7 as A data, so
        # the clamp values can be restored here.
        LD2R    {v6.4s, v7.4s}, [x8]

        FMLA    v25.4s, v19.4s,  v8.s[3]
        FMLA    v27.4s, v19.4s,  v9.s[3]
        # x0 differs from kc by a multiple of 32, so its low 5 bits are
        # kc & 31: nonzero means a remainder of fewer than 8 floats.
        TST     x0, 31
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR     x0, [sp, 96]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        # The flags set here feed both B.LO below and B.HI after the
        # stores; no instruction in between writes the flags.
        SUBS    x1, x1, 8
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    8f

        # Rows are written from c5 down to c0, so when rows were clamped
        # onto the same pointer the lower row is written last.
        STP     q30, q31,  [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 96
        RET

        .p2align 3
5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        B       4b

        # Store odd width
        # Reached when fewer than 8 columns remain. Bits 2, 1 and 0 of x1
        # select a 4, 2 and 1 float store per row; after each partial
        # store the remaining lanes are moved down to the low lanes.
8:
        TBZ     x1, 2, 9f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30,  [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

// Emit an empty .note.GNU-stack section on ELF targets.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
