xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float** a,                   x4
18#     const void* w,                     x5
19#     uint8_t* c,                        x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> (x0)
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> x8
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x14 a0
30# x15 a1
31# x20 a2
32# x21 a3
33# x22 a4
34# x23 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x10 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0  v6
46# A1   v1  v7
47# A2   v2  v8
48# A3   v3  v9
49# A4   v4 v10
50# A5   v5 v11
51# B   v12 v13 v14 v15
52# B   v16 v17 v18 v19
53# C   v20 v21
54# C   v22 v23
55# C   v24 v25
56# C   v26 v27
57# C   v28 v29
58# C   v30 v31
59# Clamp v6 v7
60
61BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75
62
        # Computes up to 6 rows x 8 columns of f32 output per nc iteration:
        # accumulators v20-v31 are seeded with 8 bias floats read from w, then
        # FMLA-accumulate A * B over kc bytes for every group of 6 A pointers
        # (one group per 48 bytes of ks), are clamped against the two values
        # loaded from params, and are stored through c0-c5.
63        # Clamp C pointers (rows >= mr alias the previous row) / Save d8-d15, x20-x23 on stack
64        CMP     x0, 2                   // if mr < 2
65        STP     d8,  d9, [sp, -96]!     // push 96-byte frame; stack args now start at [sp, 96]
66        ADD     x16, x6, x7             // c1 = c0 + cm_stride
67        STP     d10, d11, [sp, 16]
68        CSEL    x16, x6, x16, LO        //   c1 = c0
69        STP     d12, d13, [sp, 32]
70
71        ADD     x17, x16, x7            // c2 = c1 + cm_stride
72        STP     d14, d15, [sp, 48]
73                                        // if mr <= 2 (flags still from CMP x0, 2)
74        CSEL    x17, x16, x17, LS       //   c2 = c1
75        STP     x20, x21, [sp, 64]
76
77        CMP     x0, 4                   // if mr < 4
78        STP     x22, x23, [sp, 80]
79        ADD     x10, x17, x7            // c3 = c2 + cm_stride
80        CSEL    x10, x17, x10, LO       //   c3 = c2
81
82        ADD     x13, x10, x7            // c4 = c3 + cm_stride
83                                        // if mr <= 4 (flags still from CMP x0, 4)
84        CSEL    x13, x10, x13, LS       //   c4 = c3
85
86        # Load zero, params pointer
87        LDP     x12, x8, [sp, 112]      // x12 = zero, x8 = params
88
89        CMP     x0, 6                   // if mr < 6
90        ADD     x7, x13, x7             // c5 = c4 + cm_stride
91        LDR     x11, [sp, 104]          // Load a_offset
92        CSEL    x7, x13, x7, LO         //   c5 = c4
93
940:
95        # nc loop entry: load initial bias (8 floats) from w into the accumulators of all 6 rows
96        LDP     q20, q21, [x5], 32
97        MOV     v22.16b, v20.16b
98        MOV     v23.16b, v21.16b
99        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
100        MOV     v24.16b, v20.16b
101        PRFM    PLDL1KEEP, [x5, 64]
102        MOV     v25.16b, v21.16b
103        PRFM    PLDL1KEEP, [x5, 128]
104        MOV     v26.16b, v20.16b
105        PRFM    PLDL1KEEP, [x5, 192]
106        MOV     v27.16b, v21.16b
107        PRFM    PLDL1KEEP, [x5, 256]
108        MOV     v28.16b, v20.16b
109        PRFM    PLDL1KEEP, [x5, 320]
110        MOV     v29.16b, v21.16b
111        MOV     v30.16b, v20.16b
112        MOV     v31.16b, v21.16b
113
114        MOV     x9, x3                  // p = ks
115
1161:
117        # ks loop entry: load next 6 A pointers (a advances by 48 bytes)
118        LDP     x14, x15, [x4], 16
119        LDP     x20, x21, [x4], 16
120        LDP     x22, x23, [x4], 16
121
        # a_offset is applied to every A pointer except those equal to zero.
122        CMP     x14, x12                // if a0 == zero
123        ADD     x14, x14, x11           // a0 += a_offset
124        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
125        CMP     x15, x12                // if a1 == zero
126        ADD     x15, x15, x11           // a1 += a_offset
127        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
128        CMP     x20, x12                // if a2 == zero
129        ADD     x20, x20, x11           // a2 += a_offset
130        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
131        CMP     x21, x12                // if a3 == zero
132        ADD     x21, x21, x11           // a3 += a_offset
133        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
134        CMP     x22, x12                // if a4 == zero
135        ADD     x22, x22, x11           // a4 += a_offset
136        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
137        CMP     x23, x12                // if a5 == zero
138        ADD     x23, x23, x11           // a5 += a_offset
139        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset
140
141        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
142        SUBS    x0, x2, 32              // k = kc - 32
143        B.LO    5f                      // kc < 32 bytes: remainder handling only
144
145        # Prologue - loads for main loop of 96 FMA
146        LDR     q0, [x14], 16
147        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
148        LDR     q1, [x15], 16
149        LDR     q2, [x20], 16
150        LDR     q3, [x21], 16
151        LDR     q4, [x22], 16
152        LDR     q5, [x23], 16
153        LDP     q14, q15, [x5], 32
154        LDP     q16, q17, [x5], 32
155
156        # Is there at least 8 floats (32 bytes) for main loop?
157        SUBS    x0, x0, 32
158        B.LO    3f                      // kc < 64 bytes: skip main loop, go straight to epilogue
159
160        # Main loop - 8 floats of A (32 bytes)
161        # 96 FMA + 12 LDR A + 8 LDP B
162        # 64 float weights = 256 bytes.  4 cache lines.
1632:
164        # First group of 4 A.  48 FMA.
165        FMLA    v20.4s, v12.4s,  v0.s[0]
166        LDP     q18, q19, [x5], 32        // Load last B
167        FMLA    v22.4s, v12.4s,  v1.s[0]
168        FMLA    v24.4s, v12.4s,  v2.s[0]
169        FMLA    v26.4s, v12.4s,  v3.s[0]
170        FMLA    v28.4s, v12.4s,  v4.s[0]
171        FMLA    v30.4s, v12.4s,  v5.s[0]
172        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
173        FMLA    v21.4s, v13.4s,  v0.s[0]
174        FMLA    v23.4s, v13.4s,  v1.s[0]
175        FMLA    v25.4s, v13.4s,  v2.s[0]
176        PRFM    PLDL1KEEP, [x5, 320]
177        FMLA    v27.4s, v13.4s,  v3.s[0]
178        FMLA    v29.4s, v13.4s,  v4.s[0]
179        FMLA    v31.4s, v13.4s,  v5.s[0]
180        PRFM    PLDL1KEEP, [x5, 384]
181        FMLA    v20.4s, v14.4s,  v0.s[1]
182        FMLA    v22.4s, v14.4s,  v1.s[1]
183        FMLA    v24.4s, v14.4s,  v2.s[1]
184        PRFM    PLDL1KEEP, [x5, 448]
185        FMLA    v26.4s, v14.4s,  v3.s[1]
186        FMLA    v28.4s, v14.4s,  v4.s[1]
187        FMLA    v30.4s, v14.4s,  v5.s[1]
188        FMLA    v21.4s, v15.4s,  v0.s[1]
189        FMLA    v23.4s, v15.4s,  v1.s[1]
190        FMLA    v25.4s, v15.4s,  v2.s[1]
191        LDR     q6, [x14], 16             // Load next 6 A
192        FMLA    v27.4s, v15.4s,  v3.s[1]
193        FMLA    v29.4s, v15.4s,  v4.s[1]
194        FMLA    v31.4s, v15.4s,  v5.s[1]
195        LDR     q7, [x15], 16
196
197        FMLA    v20.4s, v16.4s,  v0.s[2]
198        FMLA    v22.4s, v16.4s,  v1.s[2]
199        FMLA    v24.4s, v16.4s,  v2.s[2]
200        LDR     q8, [x20], 16
201        FMLA    v26.4s, v16.4s,  v3.s[2]
202        FMLA    v28.4s, v16.4s,  v4.s[2]
203        FMLA    v30.4s, v16.4s,  v5.s[2]
204        LDR     q9, [x21], 16
205        FMLA    v21.4s, v17.4s,  v0.s[2]
206        FMLA    v23.4s, v17.4s,  v1.s[2]
207        FMLA    v25.4s, v17.4s,  v2.s[2]
208        LDR     q10, [x22], 16
209        FMLA    v27.4s, v17.4s,  v3.s[2]
210        FMLA    v29.4s, v17.4s,  v4.s[2]
211        FMLA    v31.4s, v17.4s,  v5.s[2]
212        LDR     q11, [x23], 16
213
214        FMLA    v20.4s, v18.4s,  v0.s[3]
215        FMLA    v22.4s, v18.4s,  v1.s[3]
216        FMLA    v24.4s, v18.4s,  v2.s[3]
217        LDP     q12, q13, [x5], 32        // Load 4 B
218        FMLA    v26.4s, v18.4s,  v3.s[3]
219        FMLA    v28.4s, v18.4s,  v4.s[3]
220        FMLA    v30.4s, v18.4s,  v5.s[3]
221        LDP     q14, q15, [x5], 32
222        FMLA    v21.4s, v19.4s,  v0.s[3]
223        FMLA    v23.4s, v19.4s,  v1.s[3]
224        FMLA    v25.4s, v19.4s,  v2.s[3]
225        LDP     q16, q17, [x5], 32
226        FMLA    v27.4s, v19.4s,  v3.s[3]
227        FMLA    v29.4s, v19.4s,  v4.s[3]
228        FMLA    v31.4s, v19.4s,  v5.s[3]
229        LDP     q18, q19, [x5], 32
230
231        # Second group of 4 A.  48 FMA.
232        FMLA    v20.4s, v12.4s,  v6.s[0]
233        FMLA    v22.4s, v12.4s,  v7.s[0]
234        FMLA    v24.4s, v12.4s,  v8.s[0]
235        LDR     q0, [x14], 16            // Load next 6 A
236        FMLA    v26.4s, v12.4s,  v9.s[0]
237        FMLA    v28.4s, v12.4s, v10.s[0]
238        FMLA    v30.4s, v12.4s, v11.s[0]
239        LDR     q1, [x15], 16
240        FMLA    v21.4s, v13.4s,  v6.s[0]
241        FMLA    v23.4s, v13.4s,  v7.s[0]
242        FMLA    v25.4s, v13.4s,  v8.s[0]
243        LDR     q2, [x20], 16
244        FMLA    v27.4s, v13.4s,  v9.s[0]
245        FMLA    v29.4s, v13.4s, v10.s[0]
246        FMLA    v31.4s, v13.4s, v11.s[0]
247        LDR     q3, [x21], 16
248
249        FMLA    v20.4s, v14.4s,  v6.s[1]
250        FMLA    v22.4s, v14.4s,  v7.s[1]
251        FMLA    v24.4s, v14.4s,  v8.s[1]
252        LDR     q4, [x22], 16
253        FMLA    v26.4s, v14.4s,  v9.s[1]
254        FMLA    v28.4s, v14.4s, v10.s[1]
255        FMLA    v30.4s, v14.4s, v11.s[1]
256        LDR     q5, [x23], 16
257        FMLA    v21.4s, v15.4s,  v6.s[1]
258        FMLA    v23.4s, v15.4s,  v7.s[1]
259        FMLA    v25.4s, v15.4s,  v8.s[1]
260        LDP     q12, q13, [x5], 32        // Load next 3 B (not last)
261        FMLA    v27.4s, v15.4s,  v9.s[1]
262        FMLA    v29.4s, v15.4s, v10.s[1]
263        FMLA    v31.4s, v15.4s, v11.s[1]
264        LDP     q14, q15, [x5], 32
265
266        FMLA    v20.4s, v16.4s,  v6.s[2]
267        FMLA    v22.4s, v16.4s,  v7.s[2]
268        FMLA    v24.4s, v16.4s,  v8.s[2]
269        FMLA    v26.4s, v16.4s,  v9.s[2]
270        FMLA    v28.4s, v16.4s, v10.s[2]
271        FMLA    v30.4s, v16.4s, v11.s[2]
272        FMLA    v21.4s, v17.4s,  v6.s[2]
273        FMLA    v23.4s, v17.4s,  v7.s[2]
274        FMLA    v25.4s, v17.4s,  v8.s[2]
275        FMLA    v27.4s, v17.4s,  v9.s[2]
276        FMLA    v29.4s, v17.4s, v10.s[2]
277        FMLA    v31.4s, v17.4s, v11.s[2]
278
279        FMLA    v20.4s, v18.4s,  v6.s[3]
280        FMLA    v22.4s, v18.4s,  v7.s[3]
281        LDP     q16,  q17, [x5], 32
282        FMLA    v24.4s, v18.4s,  v8.s[3]
283        FMLA    v26.4s, v18.4s,  v9.s[3]
284        FMLA    v28.4s, v18.4s, v10.s[3]
285        FMLA    v30.4s, v18.4s, v11.s[3]
286        SUBS    x0, x0, 32                // k -= 32; flags are consumed by B.HS below
287        FMLA    v21.4s, v19.4s,  v6.s[3]
288        FMLA    v23.4s, v19.4s,  v7.s[3]
289        FMLA    v25.4s, v19.4s,  v8.s[3]
290        FMLA    v27.4s, v19.4s,  v9.s[3]
291        FMLA    v29.4s, v19.4s, v10.s[3]
292        FMLA    v31.4s, v19.4s, v11.s[3]
293        B.HS    2b                        // loop while the subtraction did not borrow
294
295        # Epilogue - 8 floats of A (32 bytes)
296        # 96 FMA + 6 LDR A + 5 LDP B
297        # First block same as main loop.  Second block has no preloads.
2983:
299        # First group of 4 A.  48 FMA.
300        FMLA    v20.4s, v12.4s,  v0.s[0]
301        LDP     q18, q19, [x5], 32        // Load last B
302        FMLA    v22.4s, v12.4s,  v1.s[0]
303        FMLA    v24.4s, v12.4s,  v2.s[0]
304        FMLA    v26.4s, v12.4s,  v3.s[0]
305        FMLA    v28.4s, v12.4s,  v4.s[0]
306        FMLA    v30.4s, v12.4s,  v5.s[0]
307        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
308        FMLA    v21.4s, v13.4s,  v0.s[0]
309        FMLA    v23.4s, v13.4s,  v1.s[0]
310        FMLA    v25.4s, v13.4s,  v2.s[0]
311        PRFM    PLDL1KEEP, [x5, 320]
312        FMLA    v27.4s, v13.4s,  v3.s[0]
313        FMLA    v29.4s, v13.4s,  v4.s[0]
314        FMLA    v31.4s, v13.4s,  v5.s[0]
315        PRFM    PLDL1KEEP, [x5, 384]
316        FMLA    v20.4s, v14.4s,  v0.s[1]
317        FMLA    v22.4s, v14.4s,  v1.s[1]
318        FMLA    v24.4s, v14.4s,  v2.s[1]
319        PRFM    PLDL1KEEP, [x5, 448]
320        FMLA    v26.4s, v14.4s,  v3.s[1]
321        FMLA    v28.4s, v14.4s,  v4.s[1]
322        FMLA    v30.4s, v14.4s,  v5.s[1]
323        FMLA    v21.4s, v15.4s,  v0.s[1]
324        FMLA    v23.4s, v15.4s,  v1.s[1]
325        FMLA    v25.4s, v15.4s,  v2.s[1]
326        LDR     q6, [x14], 16             // Load next 6 A
327        FMLA    v27.4s, v15.4s,  v3.s[1]
328        FMLA    v29.4s, v15.4s,  v4.s[1]
329        FMLA    v31.4s, v15.4s,  v5.s[1]
330        LDR     q7, [x15], 16
331
332        FMLA    v20.4s, v16.4s,  v0.s[2]
333        FMLA    v22.4s, v16.4s,  v1.s[2]
334        FMLA    v24.4s, v16.4s,  v2.s[2]
335        LDR     q8, [x20], 16
336        FMLA    v26.4s, v16.4s,  v3.s[2]
337        FMLA    v28.4s, v16.4s,  v4.s[2]
338        FMLA    v30.4s, v16.4s,  v5.s[2]
339        LDR     q9, [x21], 16
340        FMLA    v21.4s, v17.4s,  v0.s[2]
341        FMLA    v23.4s, v17.4s,  v1.s[2]
342        FMLA    v25.4s, v17.4s,  v2.s[2]
343        LDR     q10, [x22], 16
344        FMLA    v27.4s, v17.4s,  v3.s[2]
345        FMLA    v29.4s, v17.4s,  v4.s[2]
346        FMLA    v31.4s, v17.4s,  v5.s[2]
347        LDR     q11, [x23], 16
348
349        FMLA    v20.4s, v18.4s,  v0.s[3]
350        FMLA    v22.4s, v18.4s,  v1.s[3]
351        FMLA    v24.4s, v18.4s,  v2.s[3]
352        LDP     q12, q13, [x5], 32        // Load 4 B
353        FMLA    v26.4s, v18.4s,  v3.s[3]
354        FMLA    v28.4s, v18.4s,  v4.s[3]
355        FMLA    v30.4s, v18.4s,  v5.s[3]
356        LDP     q14, q15, [x5], 32
357        FMLA    v21.4s, v19.4s,  v0.s[3]
358        FMLA    v23.4s, v19.4s,  v1.s[3]
359        FMLA    v25.4s, v19.4s,  v2.s[3]
360        LDP     q16, q17, [x5], 32
361        FMLA    v27.4s, v19.4s,  v3.s[3]
362        FMLA    v29.4s, v19.4s,  v4.s[3]
363        FMLA    v31.4s, v19.4s,  v5.s[3]
364        LDP     q18, q19, [x5], 32
365
366        # Second group of 4 A.  48 FMA.
367        FMLA    v20.4s, v12.4s,  v6.s[0]
368        FMLA    v22.4s, v12.4s,  v7.s[0]
369        FMLA    v24.4s, v12.4s,  v8.s[0]
370        FMLA    v26.4s, v12.4s,  v9.s[0]
371        FMLA    v28.4s, v12.4s, v10.s[0]
372        FMLA    v30.4s, v12.4s, v11.s[0]
373        FMLA    v21.4s, v13.4s,  v6.s[0]
374        FMLA    v23.4s, v13.4s,  v7.s[0]
375        FMLA    v25.4s, v13.4s,  v8.s[0]
376        FMLA    v27.4s, v13.4s,  v9.s[0]
377        FMLA    v29.4s, v13.4s, v10.s[0]
378        FMLA    v31.4s, v13.4s, v11.s[0]
379
380        FMLA    v20.4s, v14.4s,  v6.s[1]
381        FMLA    v22.4s, v14.4s,  v7.s[1]
382        FMLA    v24.4s, v14.4s,  v8.s[1]
383        FMLA    v26.4s, v14.4s,  v9.s[1]
384        FMLA    v28.4s, v14.4s, v10.s[1]
385        FMLA    v30.4s, v14.4s, v11.s[1]
386        FMLA    v21.4s, v15.4s,  v6.s[1]
387        FMLA    v23.4s, v15.4s,  v7.s[1]
388        FMLA    v25.4s, v15.4s,  v8.s[1]
389        FMLA    v27.4s, v15.4s,  v9.s[1]
390        FMLA    v29.4s, v15.4s, v10.s[1]
391        FMLA    v31.4s, v15.4s, v11.s[1]
392
393        FMLA    v20.4s, v16.4s,  v6.s[2]
394        FMLA    v22.4s, v16.4s,  v7.s[2]
395        FMLA    v24.4s, v16.4s,  v8.s[2]
396        FMLA    v26.4s, v16.4s,  v9.s[2]
397        FMLA    v28.4s, v16.4s, v10.s[2]
398        FMLA    v30.4s, v16.4s, v11.s[2]
399        FMLA    v21.4s, v17.4s,  v6.s[2]
400        FMLA    v23.4s, v17.4s,  v7.s[2]
401        FMLA    v25.4s, v17.4s,  v8.s[2]
402        FMLA    v27.4s, v17.4s,  v9.s[2]
403        FMLA    v29.4s, v17.4s, v10.s[2]
404        FMLA    v31.4s, v17.4s, v11.s[2]
405
406
407        FMLA    v20.4s, v18.4s,  v6.s[3]
408        FMLA    v22.4s, v18.4s,  v7.s[3]
409        FMLA    v24.4s, v18.4s,  v8.s[3]
410        FMLA    v26.4s, v18.4s,  v9.s[3]
411        FMLA    v28.4s, v18.4s, v10.s[3]
412        FMLA    v30.4s, v18.4s, v11.s[3]
413
414        # Is there a remainder?- 4 floats of A (16 bytes) or less
415        TST     x0, 31                    // low 5 bits of k equal kc mod 32 (only multiples of 32 were subtracted)
416
417        FMLA    v21.4s, v19.4s,  v6.s[3]
418        FMLA    v23.4s, v19.4s,  v7.s[3]
419        FMLA    v25.4s, v19.4s,  v8.s[3]
420        LD2R    {v6.4s, v7.4s}, [x8]      // Load min/max values (v6 = FMAX bound, v7 = FMIN bound); A copies in v6/v7 are dead here
421        FMLA    v27.4s, v19.4s,  v9.s[3]
422        FMLA    v29.4s, v19.4s, v10.s[3]
423        FMLA    v31.4s, v19.4s, v11.s[3]
424        B.NE    5f                        // flags from TST above: leftover bytes of kc to process
425
4264:
427        # ks loop
428        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
429        B.HI    1b
430
431        # Clamp: FMAX against v6 (lower bound), then FMIN against v7 (upper bound)
432        FMAX    v20.4s, v20.4s, v6.4s
433        FMAX    v21.4s, v21.4s, v6.4s
434        FMAX    v22.4s, v22.4s, v6.4s
435        FMAX    v23.4s, v23.4s, v6.4s
436        LDR     x0, [sp, 96]            // Load cn_stride
437        FMAX    v24.4s, v24.4s, v6.4s
438        FMAX    v25.4s, v25.4s, v6.4s
439        FMAX    v26.4s, v26.4s, v6.4s
440        FMAX    v27.4s, v27.4s, v6.4s
441        FMAX    v28.4s, v28.4s, v6.4s
442        FMAX    v29.4s, v29.4s, v6.4s
443        FMAX    v30.4s, v30.4s, v6.4s
444        FMAX    v31.4s, v31.4s, v6.4s
445        SUBS    x1, x1, 8               // nc -= 8; flags feed both B.LO 8f and B.HI 0b below
446        FMIN    v20.4s, v20.4s, v7.4s
447        FMIN    v21.4s, v21.4s, v7.4s
448        FMIN    v22.4s, v22.4s, v7.4s
449        FMIN    v23.4s, v23.4s, v7.4s
450        FMIN    v24.4s, v24.4s, v7.4s
451        FMIN    v25.4s, v25.4s, v7.4s
452        FMIN    v26.4s, v26.4s, v7.4s
453        FMIN    v27.4s, v27.4s, v7.4s
454        FMIN    v28.4s, v28.4s, v7.4s
455        FMIN    v29.4s, v29.4s, v7.4s
456        FMIN    v30.4s, v30.4s, v7.4s
457        FMIN    v31.4s, v31.4s, v7.4s
458
459        # Store full 6 x 8 (unless fewer than 8 columns remained, handled at 8:)
460        B.LO    8f
461
        # Rows are stored from c5 down to c0; each C pointer then advances by cn_stride.
462        STP     q30, q31,  [x7]
463        ADD     x7, x7, x0
464        STP     q28, q29, [x13]
465        ADD     x13, x13, x0
466        STP     q26, q27, [x10]
467        ADD     x10, x10, x0
468        STP     q24, q25, [x17]
469        ADD     x17, x17, x0
470        STP     q22, q23, [x16]
471        ADD     x16, x16, x0
472        STP     q20, q21,  [x6]
473        ADD     x6,  x6, x0
474
475        SUB     x4, x4, x3              // a -= ks
476
477        # nc loop (ADD/SUB above do not set flags, so this still tests SUBS x1, x1, 8)
478        B.HI    0b
479
480        # Restore x20,x21,x22,x23 from stack
481        LDP     x22, x23, [sp, 80]
482        LDP     x20, x21, [sp, 64]
483
484        # Restore d8-d15 from stack
485        LDP     d14, d15, [sp, 48]
486        LDP     d12, d13, [sp, 32]
487        LDP     d10, d11, [sp, 16]
488        LDP     d8,  d9, [sp], 96       // also pops the 96-byte frame
489        RET
490
4915:
492        # Load min/max values (needed here because the kc < 32 path arrives before any LD2R has run)
493        LD2R    {v6.4s, v7.4s}, [x8]
494
495        # Is there a remainder?- 4 floats of A (16 bytes)
496        TBZ     x0, 4, 6f               // bit 4 of k clear: no 16-byte chunk
497
498        # Remainder- 4 floats of A (16 bytes)
499        # Load A
500        LDR     q0, [x14], 16
501        LDR     q1, [x15], 16
502        LDR     q2, [x20], 16
503        LDR     q3, [x21], 16
504        LDR     q4, [x22], 16
505        LDR     q5, [x23], 16
506        # Load B
507        LDP     q12, q13, [x5], 32
508        LDP     q14, q15, [x5], 32
509        LDP     q16, q17, [x5], 32
510        LDP     q18, q19, [x5], 32
511
512        FMLA    v20.4s, v12.4s,  v0.s[0]
513        FMLA    v22.4s, v12.4s,  v1.s[0]
514        FMLA    v24.4s, v12.4s,  v2.s[0]
515        FMLA    v26.4s, v12.4s,  v3.s[0]
516        FMLA    v28.4s, v12.4s,  v4.s[0]
517        FMLA    v30.4s, v12.4s,  v5.s[0]
518        FMLA    v21.4s, v13.4s,  v0.s[0]
519        FMLA    v23.4s, v13.4s,  v1.s[0]
520        FMLA    v25.4s, v13.4s,  v2.s[0]
521        FMLA    v27.4s, v13.4s,  v3.s[0]
522        FMLA    v29.4s, v13.4s,  v4.s[0]
523        FMLA    v31.4s, v13.4s,  v5.s[0]
524
525        FMLA    v20.4s, v14.4s,  v0.s[1]
526        FMLA    v22.4s, v14.4s,  v1.s[1]
527        FMLA    v24.4s, v14.4s,  v2.s[1]
528        FMLA    v26.4s, v14.4s,  v3.s[1]
529        FMLA    v28.4s, v14.4s,  v4.s[1]
530        FMLA    v30.4s, v14.4s,  v5.s[1]
531        FMLA    v21.4s, v15.4s,  v0.s[1]
532        FMLA    v23.4s, v15.4s,  v1.s[1]
533        FMLA    v25.4s, v15.4s,  v2.s[1]
534        FMLA    v27.4s, v15.4s,  v3.s[1]
535        FMLA    v29.4s, v15.4s,  v4.s[1]
536        FMLA    v31.4s, v15.4s,  v5.s[1]
537
538        FMLA    v20.4s, v16.4s,  v0.s[2]
539        FMLA    v22.4s, v16.4s,  v1.s[2]
540        FMLA    v24.4s, v16.4s,  v2.s[2]
541        FMLA    v26.4s, v16.4s,  v3.s[2]
542        FMLA    v28.4s, v16.4s,  v4.s[2]
543        FMLA    v30.4s, v16.4s,  v5.s[2]
544        FMLA    v21.4s, v17.4s,  v0.s[2]
545        FMLA    v23.4s, v17.4s,  v1.s[2]
546        FMLA    v25.4s, v17.4s,  v2.s[2]
547        FMLA    v27.4s, v17.4s,  v3.s[2]
548        FMLA    v29.4s, v17.4s,  v4.s[2]
549        FMLA    v31.4s, v17.4s,  v5.s[2]
550
551        FMLA    v20.4s, v18.4s,  v0.s[3]
552        FMLA    v22.4s, v18.4s,  v1.s[3]
553        FMLA    v24.4s, v18.4s,  v2.s[3]
554        FMLA    v26.4s, v18.4s,  v3.s[3]
555        FMLA    v28.4s, v18.4s,  v4.s[3]
556        FMLA    v30.4s, v18.4s,  v5.s[3]
557        FMLA    v21.4s, v19.4s,  v0.s[3]
558        FMLA    v23.4s, v19.4s,  v1.s[3]
559        FMLA    v25.4s, v19.4s,  v2.s[3]
560        FMLA    v27.4s, v19.4s,  v3.s[3]
561        FMLA    v29.4s, v19.4s,  v4.s[3]
562        FMLA    v31.4s, v19.4s,  v5.s[3]
563
564        # Is there a remainder?- 2 floats of A (8 bytes)
5656:
566        TBZ     x0, 3, 7f               // bit 3 of k clear: no 8-byte chunk
567
568        # Remainder- 2 floats of A (8 bytes)
569        # Load A
570        LDR     d0, [x14], 8
571        LDR     d1, [x15], 8
572        LDR     d2, [x20], 8
573        LDR     d3, [x21], 8
574        LDR     d4, [x22], 8
575        LDR     d5, [x23], 8
576        # Load B
577        LDP     q12, q13, [x5], 32
578        LDP     q14, q15, [x5], 32
579
580        FMLA    v20.4s, v12.4s,  v0.s[0]
581        FMLA    v22.4s, v12.4s,  v1.s[0]
582        FMLA    v24.4s, v12.4s,  v2.s[0]
583        FMLA    v26.4s, v12.4s,  v3.s[0]
584        FMLA    v28.4s, v12.4s,  v4.s[0]
585        FMLA    v30.4s, v12.4s,  v5.s[0]
586        FMLA    v21.4s, v13.4s,  v0.s[0]
587        FMLA    v23.4s, v13.4s,  v1.s[0]
588        FMLA    v25.4s, v13.4s,  v2.s[0]
589        FMLA    v27.4s, v13.4s,  v3.s[0]
590        FMLA    v29.4s, v13.4s,  v4.s[0]
591        FMLA    v31.4s, v13.4s,  v5.s[0]
592
593        FMLA    v20.4s, v14.4s,  v0.s[1]
594        FMLA    v22.4s, v14.4s,  v1.s[1]
595        FMLA    v24.4s, v14.4s,  v2.s[1]
596        FMLA    v26.4s, v14.4s,  v3.s[1]
597        FMLA    v28.4s, v14.4s,  v4.s[1]
598        FMLA    v30.4s, v14.4s,  v5.s[1]
599        FMLA    v21.4s, v15.4s,  v0.s[1]
600        FMLA    v23.4s, v15.4s,  v1.s[1]
601        FMLA    v25.4s, v15.4s,  v2.s[1]
602        FMLA    v27.4s, v15.4s,  v3.s[1]
603        FMLA    v29.4s, v15.4s,  v4.s[1]
604        FMLA    v31.4s, v15.4s,  v5.s[1]
605
606        # Is there a remainder?- 1 float of A (4 bytes)
6077:
608        TBZ     x0, 2, 4b               // bit 2 of k clear: kc fully consumed, back to ks loop
609
610        # Remainder- 1 float of A (4 bytes)
611        # Load A
612        LDR     s0, [x14], 4
613        LDR     s1, [x15], 4
614        LDR     s2, [x20], 4
615        LDR     s3, [x21], 4
616        LDR     s4, [x22], 4
617        LDR     s5, [x23], 4
618        # Load B
619        LDP     q12, q13, [x5], 32
620
621        FMLA    v20.4s, v12.4s,  v0.s[0]
622        FMLA    v22.4s, v12.4s,  v1.s[0]
623        FMLA    v24.4s, v12.4s,  v2.s[0]
624        FMLA    v26.4s, v12.4s,  v3.s[0]
625        FMLA    v28.4s, v12.4s,  v4.s[0]
626        FMLA    v30.4s, v12.4s,  v5.s[0]
627        FMLA    v21.4s, v13.4s,  v0.s[0]
628        FMLA    v23.4s, v13.4s,  v1.s[0]
629        FMLA    v25.4s, v13.4s,  v2.s[0]
630        FMLA    v27.4s, v13.4s,  v3.s[0]
631        FMLA    v29.4s, v13.4s,  v4.s[0]
632        FMLA    v31.4s, v13.4s,  v5.s[0]
633        B       4b
634
635        # Store odd width: x1 = nc - 8 here, so bits 2, 1, 0 select 4, 2, 1 remaining columns
6368:
637        TBZ     x1, 2, 9f
638        STR     q30,  [x7], 16
639        MOV     v30.16b, v31.16b        // upper 4 columns become the next ones to store
640        STR     q28, [x13], 16
641        MOV     v28.16b, v29.16b
642        STR     q26, [x10], 16
643        MOV     v26.16b, v27.16b
644        STR     q24, [x17], 16
645        MOV     v24.16b, v25.16b
646        STR     q22, [x16], 16
647        MOV     v22.16b, v23.16b
648        STR     q20,  [x6], 16
649        MOV     v20.16b, v21.16b
6509:
651        TBZ     x1, 1, 10f
652        STR     d30,  [x7], 8
653        STR     d28, [x13], 8
654        DUP     d30, v30.d[1]           // move the upper 2 floats into the low half
655        DUP     d28, v28.d[1]
656        STR     d26, [x10], 8
657        STR     d24, [x17], 8
658        DUP     d26, v26.d[1]
659        DUP     d24, v24.d[1]
660        STR     d22, [x16], 8
661        STR     d20,  [x6], 8
662        DUP     d22, v22.d[1]
663        DUP     d20, v20.d[1]
664
66510:
666        TBZ     x1, 0, 11f
667        STR     s30,  [x7]
668        STR     s28, [x13]
669        STR     s26, [x10]
670        STR     s24, [x17]
671        STR     s22, [x16]
672        STR     s20,  [x6]
67311:
674        # Restore x20,x21,x22,x23 from stack
675        LDP     x22, x23, [sp, 80]
676        LDP     x20, x21, [sp, 64]
677
678        # Restore d8-d15 from stack
679        LDP     d14, d15, [sp, 48]
680        LDP     d12, d13, [sp, 32]
681        LDP     d10, d11, [sp, 16]
682        LDP     d8,  d9, [sp], 96       // also pops the 96-byte frame
683        RET
684
685END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75
686
687#ifdef __ELF__
688.section ".note.GNU-stack","",%progbits
689#endif
690