xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# LINT.IfChange
9# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
10#     size_t mr,                         x0
11#     size_t nc,                         x1
12#     size_t kc,                         x2 / x0
13#     size_t ks,                         x3 / x9
14#     const float** a,                   x4
15#     const void* w,                     x5
16#     uint8_t* c,                        x6
17#     size_t cm_stride,                  x7
18#     size_t cn_stride,                  [sp] -> (x0)
19#     size_t a_offset,                   [sp + 8] -> x11
20#     const float* zero,                 [sp + 16] -> x12
21#     const xnn_f32_minmax_params params [sp + 24] -> x8
22
23# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
24
25# A pointers
26# x14 a0
27# x15 a1
28# x20 a2
29# x21 a3
30# x22 a4
31# x23 a5
32
33# C pointers
34#  x6 c0
35# x16 c1
36# x17 c2
37# x10 c3
38# x13 c4
39#  x7 c5
40
41# Vector register usage
42# A0   v0  v6
43# A1   v1  v7
44# A2   v2  v8
45# A3   v3  v9
46# A4   v4 v10
47# A5   v5 v11
48# B   v12 v13 v14 v15
49# B   v16 v17 v18 v19
50# C   v20 v21
51# C   v22 v23
52# C   v24 v25
53# C   v26 v27
54# C   v28 v29
55# C   v30 v31
56# Clamp v6 v7
57
58BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
59
        # Overview (derived from the code below):
        #  - Label 0 starts one tile of up to 8 output columns: the 8 bias
        #    floats at w are copied into all 6 accumulator rows (v20-v31).
        #  - Label 1 consumes 6 A pointers (48 bytes of the indirection
        #    buffer) per pass and accumulates kc bytes of each row.
        #  - Labels 2/3 handle kc in blocks of 8 floats; labels 5/6/7 handle
        #    the 4/2/1 float tails.
        #  - After the ks loop the accumulators are clamped with the two
        #    params values and stored, either as a full 6x8 tile or through
        #    the partial-store code at label 8.
        # Stack frame: 96 bytes.  d8-d15 live at [sp, 0..63] and x20-x23 at
        # [sp, 64..95], so the caller's stack arguments start at sp + 96.

60        # Clamp C pointers / Save d8-d15 and x20-x23 on stack
61        CMP     x0, 2                   // if mr < 2
62        STP     d8,  d9, [sp, -96]!     // Allocate the 96-byte frame
63        ADD     x16, x6, x7             // c1 = c0 + cm_stride
64        STP     d10, d11, [sp, 16]
65        CSEL    x16, x6, x16, LO        //   c1 = c0
66        STP     d12, d13, [sp, 32]
67
68        ADD     x17, x16, x7            // c2 = c1 + cm_stride
69        STP     d14, d15, [sp, 48]
70                                        // if mr <= 2 (flags still from CMP x0, 2)
71        CSEL    x17, x16, x17, LS       //   c2 = c1
72        STP     x20, x21, [sp, 64]
73
74        CMP     x0, 4                   // if mr < 4
75        STP     x22, x23, [sp, 80]
76        ADD     x10, x17, x7            // c3 = c2 + cm_stride
77        CSEL    x10, x17, x10, LO       //   c3 = c2
78
79        ADD     x13, x10, x7            // c4 = c3 + cm_stride
80                                        // if mr <= 4 (flags still from CMP x0, 4)
81        CSEL    x13, x10, x13, LS       //   c4 = c3
82
83        # Load zero, params pointer (stack args at 96 + 16 and 96 + 24)
84        LDP     x12, x8, [sp, 112]      // x12 = zero, x8 = params
85
86        CMP     x0, 6                   // if mr < 6
87        ADD     x7, x13, x7             // c5 = c4 + cm_stride
88        LDR     x11, [sp, 104]          // Load a_offset
89        CSEL    x7, x13, x7, LO         //   c5 = c4
90
910:
92        # nc loop entry: load initial bias from w into accumulators
93        LDP     q20, q21, [x5], 32      // 8 bias floats, copied to rows 1-5 below
94        MOV     v22.16b, v20.16b
95        MOV     v23.16b, v21.16b
96        $if PREFETCH:
97          PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
98        MOV     v24.16b, v20.16b
99        $if PREFETCH:
100          PRFM    PLDL1KEEP, [x5, 64]
101        MOV     v25.16b, v21.16b
102        $if PREFETCH:
103          PRFM    PLDL1KEEP, [x5, 128]
104        MOV     v26.16b, v20.16b
105        $if PREFETCH:
106          PRFM    PLDL1KEEP, [x5, 192]
107        MOV     v27.16b, v21.16b
108        $if PREFETCH:
109            PRFM    PLDL1KEEP, [x5, 256]
110        MOV     v28.16b, v20.16b
111        $if PREFETCH:
112            PRFM    PLDL1KEEP, [x5, 320]
113        MOV     v29.16b, v21.16b
114        MOV     v30.16b, v20.16b
115        MOV     v31.16b, v21.16b
116
117        MOV     x9, x3                  // p = ks
118
1191:
120        # ks loop entry: load next 6 A pointers (48 bytes of the a array)
121        LDP     x14, x15, [x4], 16
122        LDP     x20, x21, [x4], 16
123        LDP     x22, x23, [x4], 16
124
        # A pointer equal to the zero buffer is used as is; every other
        # pointer gets a_offset added.  ADD does not alter the CMP flags.
125        CMP     x14, x12                // if a0 == zero
126        ADD     x14, x14, x11           // a0 += a_offset
127        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
128        CMP     x15, x12                // if a1 == zero
129        ADD     x15, x15, x11           // a1 += a_offset
130        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
131        CMP     x20, x12                // if a2 == zero
132        ADD     x20, x20, x11           // a2 += a_offset
133        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
134        CMP     x21, x12                // if a3 == zero
135        ADD     x21, x21, x11           // a3 += a_offset
136        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
137        CMP     x22, x12                // if a4 == zero
138        ADD     x22, x22, x11           // a4 += a_offset
139        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
140        CMP     x23, x12                // if a5 == zero
141        ADD     x23, x23, x11           // a5 += a_offset
142        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset
143
144        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
145        SUBS    x0, x2, 32              // k = kc - 32
146        B.LO    5f                      // kc < 32 bytes: remainder code only
147
148        # Prologue - loads for main loop of 96 FMA
149        LDR     q0, [x14], 16
150        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
151        LDR     q1, [x15], 16
152        LDR     q2, [x20], 16
153        LDR     q3, [x21], 16
154        LDR     q4, [x22], 16
155        LDR     q5, [x23], 16
156        LDP     q14, q15, [x5], 32
157        LDP     q16, q17, [x5], 32
158
159        # Is there at least 8 floats (32 bytes) for main loop?
160        SUBS    x0, x0, 32
161        B.LO    3f                      // kc < 64 bytes: skip main loop
162
163        # Main loop - 8 floats of A (32 bytes)
164        # 96 FMA + 12 LDR A + 8 LDP B
165        # 64 float weights = 256 bytes.  4 cache lines.
        # Even B registers (v12/v14/v16/v18) feed the even accumulators
        # (columns 0-3 of each row); odd B registers feed the odd
        # accumulators (columns 4-7).  A lane s[i] selects the i-th of the
        # 4 floats currently held for that row.
1662:
167        # First group of 4 A.  48 FMA.
168        FMLA    v20.4s, v12.4s,  v0.s[0]
169        LDP     q18, q19, [x5], 32        // Load last B
170        FMLA    v22.4s, v12.4s,  v1.s[0]
171        FMLA    v24.4s, v12.4s,  v2.s[0]
172        FMLA    v26.4s, v12.4s,  v3.s[0]
173        FMLA    v28.4s, v12.4s,  v4.s[0]
174        FMLA    v30.4s, v12.4s,  v5.s[0]
175        $if PREFETCH:
176          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
177        FMLA    v21.4s, v13.4s,  v0.s[0]
178        FMLA    v23.4s, v13.4s,  v1.s[0]
179        FMLA    v25.4s, v13.4s,  v2.s[0]
180        $if PREFETCH:
181          PRFM    PLDL1KEEP, [x5, 320]
182        FMLA    v27.4s, v13.4s,  v3.s[0]
183        FMLA    v29.4s, v13.4s,  v4.s[0]
184        FMLA    v31.4s, v13.4s,  v5.s[0]
185        $if PREFETCH:
186          PRFM    PLDL1KEEP, [x5, 384]
187        FMLA    v20.4s, v14.4s,  v0.s[1]
188        FMLA    v22.4s, v14.4s,  v1.s[1]
189        FMLA    v24.4s, v14.4s,  v2.s[1]
190        $if PREFETCH:
191          PRFM    PLDL1KEEP, [x5, 448]
192        FMLA    v26.4s, v14.4s,  v3.s[1]
193        FMLA    v28.4s, v14.4s,  v4.s[1]
194        FMLA    v30.4s, v14.4s,  v5.s[1]
195        FMLA    v21.4s, v15.4s,  v0.s[1]
196        FMLA    v23.4s, v15.4s,  v1.s[1]
197        FMLA    v25.4s, v15.4s,  v2.s[1]
198        LDR     q6, [x14], 16             // Load next 6 A (into v6-v11)
199        FMLA    v27.4s, v15.4s,  v3.s[1]
200        FMLA    v29.4s, v15.4s,  v4.s[1]
201        FMLA    v31.4s, v15.4s,  v5.s[1]
202        LDR     q7, [x15], 16
203
204        FMLA    v20.4s, v16.4s,  v0.s[2]
205        FMLA    v22.4s, v16.4s,  v1.s[2]
206        FMLA    v24.4s, v16.4s,  v2.s[2]
207        LDR     q8, [x20], 16
208        FMLA    v26.4s, v16.4s,  v3.s[2]
209        FMLA    v28.4s, v16.4s,  v4.s[2]
210        FMLA    v30.4s, v16.4s,  v5.s[2]
211        LDR     q9, [x21], 16
212        FMLA    v21.4s, v17.4s,  v0.s[2]
213        FMLA    v23.4s, v17.4s,  v1.s[2]
214        FMLA    v25.4s, v17.4s,  v2.s[2]
215        LDR     q10, [x22], 16
216        FMLA    v27.4s, v17.4s,  v3.s[2]
217        FMLA    v29.4s, v17.4s,  v4.s[2]
218        FMLA    v31.4s, v17.4s,  v5.s[2]
219        LDR     q11, [x23], 16
220
221        FMLA    v20.4s, v18.4s,  v0.s[3]
222        FMLA    v22.4s, v18.4s,  v1.s[3]
223        FMLA    v24.4s, v18.4s,  v2.s[3]
224        LDP     q12, q13, [x5], 32        // Load 4 B (each pair is reloaded after its last use above)
225        FMLA    v26.4s, v18.4s,  v3.s[3]
226        FMLA    v28.4s, v18.4s,  v4.s[3]
227        FMLA    v30.4s, v18.4s,  v5.s[3]
228        LDP     q14, q15, [x5], 32
229        FMLA    v21.4s, v19.4s,  v0.s[3]
230        FMLA    v23.4s, v19.4s,  v1.s[3]
231        FMLA    v25.4s, v19.4s,  v2.s[3]
232        LDP     q16, q17, [x5], 32
233        FMLA    v27.4s, v19.4s,  v3.s[3]
234        FMLA    v29.4s, v19.4s,  v4.s[3]
235        FMLA    v31.4s, v19.4s,  v5.s[3]
236        LDP     q18, q19, [x5], 32
237
238        # Second group of 4 A.  48 FMA.
239        FMLA    v20.4s, v12.4s,  v6.s[0]
240        FMLA    v22.4s, v12.4s,  v7.s[0]
241        FMLA    v24.4s, v12.4s,  v8.s[0]
242        LDR     q0, [x14], 16            // Load next 6 A (into v0-v5, for the next pass)
243        FMLA    v26.4s, v12.4s,  v9.s[0]
244        FMLA    v28.4s, v12.4s, v10.s[0]
245        FMLA    v30.4s, v12.4s, v11.s[0]
246        LDR     q1, [x15], 16
247        FMLA    v21.4s, v13.4s,  v6.s[0]
248        FMLA    v23.4s, v13.4s,  v7.s[0]
249        FMLA    v25.4s, v13.4s,  v8.s[0]
250        LDR     q2, [x20], 16
251        FMLA    v27.4s, v13.4s,  v9.s[0]
252        FMLA    v29.4s, v13.4s, v10.s[0]
253        FMLA    v31.4s, v13.4s, v11.s[0]
254        LDR     q3, [x21], 16
255
256        FMLA    v20.4s, v14.4s,  v6.s[1]
257        FMLA    v22.4s, v14.4s,  v7.s[1]
258        FMLA    v24.4s, v14.4s,  v8.s[1]
259        LDR     q4, [x22], 16
260        FMLA    v26.4s, v14.4s,  v9.s[1]
261        FMLA    v28.4s, v14.4s, v10.s[1]
262        FMLA    v30.4s, v14.4s, v11.s[1]
263        LDR     q5, [x23], 16
264        FMLA    v21.4s, v15.4s,  v6.s[1]
265        FMLA    v23.4s, v15.4s,  v7.s[1]
266        FMLA    v25.4s, v15.4s,  v8.s[1]
267        LDP     q12, q13, [x5], 32        // Load next 3 B (not last)
268        FMLA    v27.4s, v15.4s,  v9.s[1]
269        FMLA    v29.4s, v15.4s, v10.s[1]
270        FMLA    v31.4s, v15.4s, v11.s[1]
271        LDP     q14, q15, [x5], 32
272
273        FMLA    v20.4s, v16.4s,  v6.s[2]
274        FMLA    v22.4s, v16.4s,  v7.s[2]
275        FMLA    v24.4s, v16.4s,  v8.s[2]
276        FMLA    v26.4s, v16.4s,  v9.s[2]
277        FMLA    v28.4s, v16.4s, v10.s[2]
278        FMLA    v30.4s, v16.4s, v11.s[2]
279        FMLA    v21.4s, v17.4s,  v6.s[2]
280        FMLA    v23.4s, v17.4s,  v7.s[2]
281        FMLA    v25.4s, v17.4s,  v8.s[2]
282        FMLA    v27.4s, v17.4s,  v9.s[2]
283        FMLA    v29.4s, v17.4s, v10.s[2]
284        FMLA    v31.4s, v17.4s, v11.s[2]
285
286        FMLA    v20.4s, v18.4s,  v6.s[3]
287        FMLA    v22.4s, v18.4s,  v7.s[3]
288        LDP     q16,  q17, [x5], 32
289        FMLA    v24.4s, v18.4s,  v8.s[3]
290        FMLA    v26.4s, v18.4s,  v9.s[3]
291        FMLA    v28.4s, v18.4s, v10.s[3]
292        FMLA    v30.4s, v18.4s, v11.s[3]
293        SUBS    x0, x0, 32                // k -= 32; flags feed B.HS below
294        FMLA    v21.4s, v19.4s,  v6.s[3]
295        FMLA    v23.4s, v19.4s,  v7.s[3]
296        FMLA    v25.4s, v19.4s,  v8.s[3]
297        FMLA    v27.4s, v19.4s,  v9.s[3]
298        FMLA    v29.4s, v19.4s, v10.s[3]
299        FMLA    v31.4s, v19.4s, v11.s[3]
300        B.HS    2b                        // another 32 bytes of A available
301
302        # Epilogue - 8 floats of A (32 bytes)
303        # 96 FMA + 6 LDR A + 5 LDP B
304        # First block same as main loop.  Second block has no preloads.
3053:
306        # First group of 4 A.  48 FMA.
307        FMLA    v20.4s, v12.4s,  v0.s[0]
308        LDP     q18, q19, [x5], 32        // Load last B
309        FMLA    v22.4s, v12.4s,  v1.s[0]
310        FMLA    v24.4s, v12.4s,  v2.s[0]
311        FMLA    v26.4s, v12.4s,  v3.s[0]
312        FMLA    v28.4s, v12.4s,  v4.s[0]
313        FMLA    v30.4s, v12.4s,  v5.s[0]
314        $if PREFETCH:
315          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
316        FMLA    v21.4s, v13.4s,  v0.s[0]
317        FMLA    v23.4s, v13.4s,  v1.s[0]
318        FMLA    v25.4s, v13.4s,  v2.s[0]
319        $if PREFETCH:
320          PRFM    PLDL1KEEP, [x5, 320]
321        FMLA    v27.4s, v13.4s,  v3.s[0]
322        FMLA    v29.4s, v13.4s,  v4.s[0]
323        FMLA    v31.4s, v13.4s,  v5.s[0]
324        $if PREFETCH:
325          PRFM    PLDL1KEEP, [x5, 384]
326        FMLA    v20.4s, v14.4s,  v0.s[1]
327        FMLA    v22.4s, v14.4s,  v1.s[1]
328        FMLA    v24.4s, v14.4s,  v2.s[1]
329        $if PREFETCH:
330          PRFM    PLDL1KEEP, [x5, 448]
331        FMLA    v26.4s, v14.4s,  v3.s[1]
332        FMLA    v28.4s, v14.4s,  v4.s[1]
333        FMLA    v30.4s, v14.4s,  v5.s[1]
334        FMLA    v21.4s, v15.4s,  v0.s[1]
335        FMLA    v23.4s, v15.4s,  v1.s[1]
336        FMLA    v25.4s, v15.4s,  v2.s[1]
337        LDR     q6, [x14], 16             // Load next 6 A (into v6-v11)
338        FMLA    v27.4s, v15.4s,  v3.s[1]
339        FMLA    v29.4s, v15.4s,  v4.s[1]
340        FMLA    v31.4s, v15.4s,  v5.s[1]
341        LDR     q7, [x15], 16
342
343        FMLA    v20.4s, v16.4s,  v0.s[2]
344        FMLA    v22.4s, v16.4s,  v1.s[2]
345        FMLA    v24.4s, v16.4s,  v2.s[2]
346        LDR     q8, [x20], 16
347        FMLA    v26.4s, v16.4s,  v3.s[2]
348        FMLA    v28.4s, v16.4s,  v4.s[2]
349        FMLA    v30.4s, v16.4s,  v5.s[2]
350        LDR     q9, [x21], 16
351        FMLA    v21.4s, v17.4s,  v0.s[2]
352        FMLA    v23.4s, v17.4s,  v1.s[2]
353        FMLA    v25.4s, v17.4s,  v2.s[2]
354        LDR     q10, [x22], 16
355        FMLA    v27.4s, v17.4s,  v3.s[2]
356        FMLA    v29.4s, v17.4s,  v4.s[2]
357        FMLA    v31.4s, v17.4s,  v5.s[2]
358        LDR     q11, [x23], 16
359
360        FMLA    v20.4s, v18.4s,  v0.s[3]
361        FMLA    v22.4s, v18.4s,  v1.s[3]
362        FMLA    v24.4s, v18.4s,  v2.s[3]
363        LDP     q12, q13, [x5], 32        // Load 4 B
364        FMLA    v26.4s, v18.4s,  v3.s[3]
365        FMLA    v28.4s, v18.4s,  v4.s[3]
366        FMLA    v30.4s, v18.4s,  v5.s[3]
367        LDP     q14, q15, [x5], 32
368        FMLA    v21.4s, v19.4s,  v0.s[3]
369        FMLA    v23.4s, v19.4s,  v1.s[3]
370        FMLA    v25.4s, v19.4s,  v2.s[3]
371        LDP     q16, q17, [x5], 32
372        FMLA    v27.4s, v19.4s,  v3.s[3]
373        FMLA    v29.4s, v19.4s,  v4.s[3]
374        FMLA    v31.4s, v19.4s,  v5.s[3]
375        LDP     q18, q19, [x5], 32
376
377        # Second group of 4 A.  48 FMA.
378        FMLA    v20.4s, v12.4s,  v6.s[0]
379        FMLA    v22.4s, v12.4s,  v7.s[0]
380        FMLA    v24.4s, v12.4s,  v8.s[0]
381        FMLA    v26.4s, v12.4s,  v9.s[0]
382        FMLA    v28.4s, v12.4s, v10.s[0]
383        FMLA    v30.4s, v12.4s, v11.s[0]
384        FMLA    v21.4s, v13.4s,  v6.s[0]
385        FMLA    v23.4s, v13.4s,  v7.s[0]
386        FMLA    v25.4s, v13.4s,  v8.s[0]
387        FMLA    v27.4s, v13.4s,  v9.s[0]
388        FMLA    v29.4s, v13.4s, v10.s[0]
389        FMLA    v31.4s, v13.4s, v11.s[0]
390
391        FMLA    v20.4s, v14.4s,  v6.s[1]
392        FMLA    v22.4s, v14.4s,  v7.s[1]
393        FMLA    v24.4s, v14.4s,  v8.s[1]
394        FMLA    v26.4s, v14.4s,  v9.s[1]
395        FMLA    v28.4s, v14.4s, v10.s[1]
396        FMLA    v30.4s, v14.4s, v11.s[1]
397        FMLA    v21.4s, v15.4s,  v6.s[1]
398        FMLA    v23.4s, v15.4s,  v7.s[1]
399        FMLA    v25.4s, v15.4s,  v8.s[1]
400        FMLA    v27.4s, v15.4s,  v9.s[1]
401        FMLA    v29.4s, v15.4s, v10.s[1]
402        FMLA    v31.4s, v15.4s, v11.s[1]
403
404        FMLA    v20.4s, v16.4s,  v6.s[2]
405        FMLA    v22.4s, v16.4s,  v7.s[2]
406        FMLA    v24.4s, v16.4s,  v8.s[2]
407        FMLA    v26.4s, v16.4s,  v9.s[2]
408        FMLA    v28.4s, v16.4s, v10.s[2]
409        FMLA    v30.4s, v16.4s, v11.s[2]
410        FMLA    v21.4s, v17.4s,  v6.s[2]
411        FMLA    v23.4s, v17.4s,  v7.s[2]
412        FMLA    v25.4s, v17.4s,  v8.s[2]
413        FMLA    v27.4s, v17.4s,  v9.s[2]
414        FMLA    v29.4s, v17.4s, v10.s[2]
415        FMLA    v31.4s, v17.4s, v11.s[2]
416
417
418        FMLA    v20.4s, v18.4s,  v6.s[3]
419        FMLA    v22.4s, v18.4s,  v7.s[3]
420        FMLA    v24.4s, v18.4s,  v8.s[3]
421        FMLA    v26.4s, v18.4s,  v9.s[3]
422        FMLA    v28.4s, v18.4s, v10.s[3]
423        FMLA    v30.4s, v18.4s, v11.s[3]
424
425        # Is there a remainder?- 4 floats of A (16 bytes) or less
426        TST     x0, 31                    // flags feed B.NE below
427
428        FMLA    v21.4s, v19.4s,  v6.s[3]
429        FMLA    v23.4s, v19.4s,  v7.s[3]
430        FMLA    v25.4s, v19.4s,  v8.s[3]
431        LD2R    {v6.4s, v7.4s}, [x8]      // Load min/max values (v6/v7 are no longer read as A after this point)
432        FMLA    v27.4s, v19.4s,  v9.s[3]
433        FMLA    v29.4s, v19.4s, v10.s[3]
434        FMLA    v31.4s, v19.4s, v11.s[3]
435        B.NE    5f                        // kc not a multiple of 32 bytes
436
4374:
438        # ks loop
439        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
440        B.HI    1b                      // more A pointers to process
441
442        # Clamp: FMAX against v6 (min), then FMIN against v7 (max)
443        FMAX    v20.4s, v20.4s, v6.4s
444        FMAX    v21.4s, v21.4s, v6.4s
445        FMAX    v22.4s, v22.4s, v6.4s
446        FMAX    v23.4s, v23.4s, v6.4s
447        LDR     x0, [sp, 96]            // Load cn_stride
448        FMAX    v24.4s, v24.4s, v6.4s
449        FMAX    v25.4s, v25.4s, v6.4s
450        FMAX    v26.4s, v26.4s, v6.4s
451        FMAX    v27.4s, v27.4s, v6.4s
452        FMAX    v28.4s, v28.4s, v6.4s
453        FMAX    v29.4s, v29.4s, v6.4s
454        FMAX    v30.4s, v30.4s, v6.4s
455        FMAX    v31.4s, v31.4s, v6.4s
456        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
457        FMIN    v20.4s, v20.4s, v7.4s
458        FMIN    v21.4s, v21.4s, v7.4s
459        FMIN    v22.4s, v22.4s, v7.4s
460        FMIN    v23.4s, v23.4s, v7.4s
461        FMIN    v24.4s, v24.4s, v7.4s
462        FMIN    v25.4s, v25.4s, v7.4s
463        FMIN    v26.4s, v26.4s, v7.4s
464        FMIN    v27.4s, v27.4s, v7.4s
465        FMIN    v28.4s, v28.4s, v7.4s
466        FMIN    v29.4s, v29.4s, v7.4s
467        FMIN    v30.4s, v30.4s, v7.4s
468        FMIN    v31.4s, v31.4s, v7.4s
469
470        # Store full 6 x 8
471        B.LO    8f                      // fewer than 8 columns left: partial store
472
473        STP     q30, q31,  [x7]
474        ADD     x7, x7, x0              // c5 += cn_stride
475        STP     q28, q29, [x13]
476        ADD     x13, x13, x0            // c4 += cn_stride
477        STP     q26, q27, [x10]
478        ADD     x10, x10, x0            // c3 += cn_stride
479        STP     q24, q25, [x17]
480        ADD     x17, x17, x0            // c2 += cn_stride
481        STP     q22, q23, [x16]
482        ADD     x16, x16, x0            // c1 += cn_stride
483        STP     q20, q21,  [x6]
484        ADD     x6,  x6, x0             // c0 += cn_stride
485
486        SUB     x4, x4, x3              // a -= ks
487
488        # nc loop
489        B.HI    0b                      // columns remain (flags from SUBS x1 above)
490
491        # Restore x20,x21,x22,x23 from stack
492        LDP     x22, x23, [sp, 80]
493        LDP     x20, x21, [sp, 64]
494
495        # Restore d8-d15 from stack
496        LDP     d14, d15, [sp, 48]
497        LDP     d12, d13, [sp, 32]
498        LDP     d10, d11, [sp, 16]
499        LDP     d8,  d9, [sp], 96
500        RET
501
5025:
503        # Load min/max values
504        LD2R    {v6.4s, v7.4s}, [x8]
505
        # x0 is kc minus a multiple of 32 here, so bits 2-4 of x0 still
        # equal those of kc even when x0 has gone negative.
506        # Is there a remainder?- 4 floats of A (16 bytes)
507        TBZ     x0, 4, 6f               // bit 4 clear: no 16-byte block
508
509        # Remainder- 4 floats of A (16 bytes)
510        # Load A
511        LDR     q0, [x14], 16
512        LDR     q1, [x15], 16
513        LDR     q2, [x20], 16
514        LDR     q3, [x21], 16
515        LDR     q4, [x22], 16
516        LDR     q5, [x23], 16
517        # Load B
518        LDP     q12, q13, [x5], 32
519        LDP     q14, q15, [x5], 32
520        LDP     q16, q17, [x5], 32
521        LDP     q18, q19, [x5], 32
522
523        FMLA    v20.4s, v12.4s,  v0.s[0]
524        FMLA    v22.4s, v12.4s,  v1.s[0]
525        FMLA    v24.4s, v12.4s,  v2.s[0]
526        FMLA    v26.4s, v12.4s,  v3.s[0]
527        FMLA    v28.4s, v12.4s,  v4.s[0]
528        FMLA    v30.4s, v12.4s,  v5.s[0]
529        FMLA    v21.4s, v13.4s,  v0.s[0]
530        FMLA    v23.4s, v13.4s,  v1.s[0]
531        FMLA    v25.4s, v13.4s,  v2.s[0]
532        FMLA    v27.4s, v13.4s,  v3.s[0]
533        FMLA    v29.4s, v13.4s,  v4.s[0]
534        FMLA    v31.4s, v13.4s,  v5.s[0]
535
536        FMLA    v20.4s, v14.4s,  v0.s[1]
537        FMLA    v22.4s, v14.4s,  v1.s[1]
538        FMLA    v24.4s, v14.4s,  v2.s[1]
539        FMLA    v26.4s, v14.4s,  v3.s[1]
540        FMLA    v28.4s, v14.4s,  v4.s[1]
541        FMLA    v30.4s, v14.4s,  v5.s[1]
542        FMLA    v21.4s, v15.4s,  v0.s[1]
543        FMLA    v23.4s, v15.4s,  v1.s[1]
544        FMLA    v25.4s, v15.4s,  v2.s[1]
545        FMLA    v27.4s, v15.4s,  v3.s[1]
546        FMLA    v29.4s, v15.4s,  v4.s[1]
547        FMLA    v31.4s, v15.4s,  v5.s[1]
548
549        FMLA    v20.4s, v16.4s,  v0.s[2]
550        FMLA    v22.4s, v16.4s,  v1.s[2]
551        FMLA    v24.4s, v16.4s,  v2.s[2]
552        FMLA    v26.4s, v16.4s,  v3.s[2]
553        FMLA    v28.4s, v16.4s,  v4.s[2]
554        FMLA    v30.4s, v16.4s,  v5.s[2]
555        FMLA    v21.4s, v17.4s,  v0.s[2]
556        FMLA    v23.4s, v17.4s,  v1.s[2]
557        FMLA    v25.4s, v17.4s,  v2.s[2]
558        FMLA    v27.4s, v17.4s,  v3.s[2]
559        FMLA    v29.4s, v17.4s,  v4.s[2]
560        FMLA    v31.4s, v17.4s,  v5.s[2]
561
562        FMLA    v20.4s, v18.4s,  v0.s[3]
563        FMLA    v22.4s, v18.4s,  v1.s[3]
564        FMLA    v24.4s, v18.4s,  v2.s[3]
565        FMLA    v26.4s, v18.4s,  v3.s[3]
566        FMLA    v28.4s, v18.4s,  v4.s[3]
567        FMLA    v30.4s, v18.4s,  v5.s[3]
568        FMLA    v21.4s, v19.4s,  v0.s[3]
569        FMLA    v23.4s, v19.4s,  v1.s[3]
570        FMLA    v25.4s, v19.4s,  v2.s[3]
571        FMLA    v27.4s, v19.4s,  v3.s[3]
572        FMLA    v29.4s, v19.4s,  v4.s[3]
573        FMLA    v31.4s, v19.4s,  v5.s[3]
574
575        # Is there a remainder?- 2 floats of A (8 bytes)
5766:
577        TBZ     x0, 3, 7f               // bit 3 clear: no 8-byte block
578
579        # Remainder- 2 floats of A (8 bytes)
580        # Load A
581        LDR     d0, [x14], 8
582        LDR     d1, [x15], 8
583        LDR     d2, [x20], 8
584        LDR     d3, [x21], 8
585        LDR     d4, [x22], 8
586        LDR     d5, [x23], 8
587        # Load B
588        LDP     q12, q13, [x5], 32
589        LDP     q14, q15, [x5], 32
590
591        FMLA    v20.4s, v12.4s,  v0.s[0]
592        FMLA    v22.4s, v12.4s,  v1.s[0]
593        FMLA    v24.4s, v12.4s,  v2.s[0]
594        FMLA    v26.4s, v12.4s,  v3.s[0]
595        FMLA    v28.4s, v12.4s,  v4.s[0]
596        FMLA    v30.4s, v12.4s,  v5.s[0]
597        FMLA    v21.4s, v13.4s,  v0.s[0]
598        FMLA    v23.4s, v13.4s,  v1.s[0]
599        FMLA    v25.4s, v13.4s,  v2.s[0]
600        FMLA    v27.4s, v13.4s,  v3.s[0]
601        FMLA    v29.4s, v13.4s,  v4.s[0]
602        FMLA    v31.4s, v13.4s,  v5.s[0]
603
604        FMLA    v20.4s, v14.4s,  v0.s[1]
605        FMLA    v22.4s, v14.4s,  v1.s[1]
606        FMLA    v24.4s, v14.4s,  v2.s[1]
607        FMLA    v26.4s, v14.4s,  v3.s[1]
608        FMLA    v28.4s, v14.4s,  v4.s[1]
609        FMLA    v30.4s, v14.4s,  v5.s[1]
610        FMLA    v21.4s, v15.4s,  v0.s[1]
611        FMLA    v23.4s, v15.4s,  v1.s[1]
612        FMLA    v25.4s, v15.4s,  v2.s[1]
613        FMLA    v27.4s, v15.4s,  v3.s[1]
614        FMLA    v29.4s, v15.4s,  v4.s[1]
615        FMLA    v31.4s, v15.4s,  v5.s[1]
616
617        # Is there a remainder?- 1 float of A (4 bytes)
6187:
619        TBZ     x0, 2, 4b               // bit 2 clear: kc done, back to the ks loop
620
621        # Remainder- 1 float of A (4 bytes)
622        # Load A
623        LDR     s0, [x14], 4
624        LDR     s1, [x15], 4
625        LDR     s2, [x20], 4
626        LDR     s3, [x21], 4
627        LDR     s4, [x22], 4
628        LDR     s5, [x23], 4
629        # Load B
630        LDP     q12, q13, [x5], 32
631
632        FMLA    v20.4s, v12.4s,  v0.s[0]
633        FMLA    v22.4s, v12.4s,  v1.s[0]
634        FMLA    v24.4s, v12.4s,  v2.s[0]
635        FMLA    v26.4s, v12.4s,  v3.s[0]
636        FMLA    v28.4s, v12.4s,  v4.s[0]
637        FMLA    v30.4s, v12.4s,  v5.s[0]
638        FMLA    v21.4s, v13.4s,  v0.s[0]
639        FMLA    v23.4s, v13.4s,  v1.s[0]
640        FMLA    v25.4s, v13.4s,  v2.s[0]
641        FMLA    v27.4s, v13.4s,  v3.s[0]
642        FMLA    v29.4s, v13.4s,  v4.s[0]
643        FMLA    v31.4s, v13.4s,  v5.s[0]
644        B       4b
645
646        # Store odd width
        # x1 differs from the caller's nc by a multiple of 8 here, so its
        # low 3 bits still give the number of columns left to store.
        # After each partial store the not-yet-stored columns are moved
        # down to the low end of the even accumulator registers.
6478:
648        TBZ     x1, 2, 9f               // bit 2 clear: no 4-column store
649        STR     q30,  [x7], 16
650        MOV     v30.16b, v31.16b
651        STR     q28, [x13], 16
652        MOV     v28.16b, v29.16b
653        STR     q26, [x10], 16
654        MOV     v26.16b, v27.16b
655        STR     q24, [x17], 16
656        MOV     v24.16b, v25.16b
657        STR     q22, [x16], 16
658        MOV     v22.16b, v23.16b
659        STR     q20,  [x6], 16
660        MOV     v20.16b, v21.16b
6619:
662        TBZ     x1, 1, 10f              // bit 1 clear: no 2-column store
663        STR     d30,  [x7], 8
664        STR     d28, [x13], 8
665        DUP     d30, v30.d[1]
666        DUP     d28, v28.d[1]
667        STR     d26, [x10], 8
668        STR     d24, [x17], 8
669        DUP     d26, v26.d[1]
670        DUP     d24, v24.d[1]
671        STR     d22, [x16], 8
672        STR     d20,  [x6], 8
673        DUP     d22, v22.d[1]
674        DUP     d20, v20.d[1]
675
67610:
677        TBZ     x1, 0, 11f              // bit 0 clear: no 1-column store
678        STR     s30,  [x7]
679        STR     s28, [x13]
680        STR     s26, [x10]
681        STR     s24, [x17]
682        STR     s22, [x16]
683        STR     s20,  [x6]
68411:
685        # Restore x20,x21,x22,x23 from stack
686        LDP     x22, x23, [sp, 80]
687        LDP     x20, x21, [sp, 64]
688
689        # Restore d8-d15 from stack
690        LDP     d14, d15, [sp, 48]
691        LDP     d12, d13, [sp, 32]
692        LDP     d10, d11, [sp, 16]
693        LDP     d8,  d9, [sp], 96
694        RET
695
696END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
697# LINT.ThenChange(6x8-aarch64-neonfma-cortex-a75.cc,upto6x8-aarch64-neonfma-cortex-a75.cc)
698
699#ifdef __ELF__
700.section ".note.GNU-stack","",%progbits
701#endif
702