xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# LINT.IfChange
9# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
10#     size_t mr,                x0
11#     size_t nc,                x1
12#     size_t kc,                x2 / x0 (x0 is reused as the K byte counter once mr has been read)
13#     const uint8_t*restrict a, x3
14#     size_t a_stride,          x4
15#     const void*restrict w,    x5
16#     uint8_t*restrict c,       x6
17#     size_t cm_stride,         x7
18#     size_t cn_stride,         [sp] -> x14
19$if INC:
20  #     const float*restrict acc,  [sp + 8] -> x15
21  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
22$else:
23  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# A pointers
28# x3  a0
29# x11 a1
30# x12 a2
31# x4  a3 / a_stride
32
33# C pointers
34# x6  c0
35# x9  c1
36# x10 c2
37# x7  c3 / cm_stride
38
39# Vector register usage
40# A0  v0  v4
41# A1  v1  v5
42# A2  v2  v6
43# A3  v3  v7
44# B   v8  v9 v10 v11
45# B  v12 v13 v14 v15
46# B  v16 v17 v18 v19
47# B  v20 v21 v22 v23
48# C  v24 v25
49# C  v26 v27
50# C  v28 v29
51# C  v30 v31
52# Clamp v4 (min) v5 (max)
53
# Function entry: reads the stack-passed arguments, broadcasts the clamp
# bounds, saves d8-d15, and derives the a1-a3 / c1-c3 row pointers from mr.
54BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
55
        # The stack arguments are read relative to the entry sp, i.e. before
        # the register save below moves sp down by 64 bytes.
56        $if INC:
57          # Load cn_stride, acc
58          LDP     x14, x15, [sp]
59          # Load params pointer
60          LDR     x8, [sp, 16]
61        $else:
62          # Load cn_stride, params pointer
63          LDP     x14, x8, [sp]
64
65        # Load min/max values
        # LD2R broadcasts the first 32-bit element at [x8] to every lane of v4
        # and the second to every lane of v5. v4 is the FMAX operand and v5 the
        # FMIN operand in the clamp at label 6.
66        LD2R    {v4.4s, v5.4s}, [x8]
67
68        # Save d8-d15 on stack
        # v8-v15 carry B data in the main loop and epilogue, so their low
        # halves are spilled here and restored before each RET.
69        STP     d8,  d9, [sp, -64]!
70        STP     d10, d11, [sp, 16]
71        STP     d12, d13, [sp, 32]
72        STP     d14, d15, [sp, 48]
73
74        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row. The flags from
        # "CMP x0, 2" serve both the LO (mr < 2) and the LS (mr <= 2) selects,
        # because the ADDs in between do not write the flags.
75        CMP     x0, 2                   // if mr < 2
76        ADD     x11, x3, x4             // a1 = a0 + a_stride
77        ADD     x9, x6, x7              // c1 = c0 + cm_stride
78        CSEL    x11, x3, x11, LO        //   a1 = a0
79        CSEL    x9, x6, x9, LO          //   c1 = c0
80
81        ADD     x12, x11, x4            // a2 = a1 + a_stride
82        ADD     x10, x9, x7             // c2 = c1 + cm_stride
83                                        // if mr <= 2
84        CSEL    x12, x11, x12, LS       //   a2 = a1
85        CSEL    x10, x9, x10, LS        //   c2 = c1
86
        # x4 and x7 stop being a_stride / cm_stride here and become a3 / c3.
87        CMP     x0, 4                   // if mr < 4
88        ADD     x4, x12, x4             // a3 = a2 + a_stride
89        ADD     x7, x10, x7             // c3 = c2 + cm_stride
90        CSEL    x4, x12, x4, LO         //   a3 = a2
91        CSEL    x7, x10, x7, LO         //   c3 = c2
92
930:
        # Top of the loop over blocks of 8 output columns. The full-store path
        # branches back here (B.HI 0b) while columns remain.
94        $if INC:
95          # Load initial accumulators
96          LDP     q24, q25, [x15], 32
97          LDP     q26, q27, [x15], 32
98          LDP     q28, q29, [x15], 32
99          LDP     q30, q31, [x15], 32
100        $else:
101          # Load initial bias from w into accumulators
102          LDP     q24, q25, [x5], 32
103          MOV     v26.16b, v24.16b
104          MOV     v27.16b, v25.16b
105          MOV     v28.16b, v24.16b
106          MOV     v29.16b, v25.16b
107          MOV     v30.16b, v24.16b
108          MOV     v31.16b, v25.16b
109
110        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 held mr on entry; from here on it is the K byte counter.
        # Subtracting 32 leaves bits 2-4 equal to those of kc, which is what
        # the remainder code at labels 3-5 tests with TBZ.
111        SUBS    x0, x2, 32              // k = kc - 32
112        B.LO    3f
113
114        # Prologue - first 4 floats of A (16 bytes) per row
115        # Read first block of 4 A and B.
        # A goes to v0-v3 (one row each); the matching 4 x 8 floats of B go to
        # v16-v23. All loads post-increment their pointer.
116        LDR     q0,  [x3], 16
117        LDP     q16, q17, [x5], 32
118        LDR     q1, [x11], 16
119        LDR     q2, [x12], 16
120        LDR     q3,  [x4], 16
121        LDP     q18, q19, [x5], 32
122        LDP     q20, q21, [x5], 32
123        LDP     q22, q23, [x5], 32
124
125        # Are at least 32 more bytes of K left?  If so run the main loop,
        # otherwise go straight to the epilogue at label 2.
126        SUBS    x0, x0, 32
127        B.LO    2f
128
129        # Main loop - 8 floats of A (32 bytes)
        # Software-pipelined in two halves: each half issues the FMLAs for one
        # set of A/B registers while loading the other set. The first half
        # overwrites v4/v5 with A data, so the clamp bounds are reloaded in the
        # epilogue at label 2.
1301:
131        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        # Consumes A in v0-v3 and B in v16-v23; loads A into v4-v7 and B into
        # v8-v15.
132        FMLA    v24.4s, v16.4s, v0.s[0]
133        LDP     q8, q9, [x5], 32
134        FMLA    v25.4s, v17.4s, v0.s[0]
135        FMLA    v26.4s, v16.4s, v1.s[0]
136        LDP     q10, q11, [x5], 32
137        FMLA    v27.4s, v17.4s, v1.s[0]
138        FMLA    v28.4s, v16.4s, v2.s[0]
139        LDP     q12, q13, [x5], 32
140        FMLA    v29.4s, v17.4s, v2.s[0]
141        FMLA    v30.4s, v16.4s, v3.s[0]
142        LDP     q14, q15, [x5], 32
143        FMLA    v31.4s, v17.4s, v3.s[0]
144        FMLA    v24.4s, v18.4s, v0.s[1]
145        LDR     q4, [x3], 16
146        FMLA    v25.4s, v19.4s, v0.s[1]
147        FMLA    v26.4s, v18.4s, v1.s[1]
148        LDR     q5, [x11], 16
149        FMLA    v27.4s, v19.4s, v1.s[1]
150        FMLA    v28.4s, v18.4s, v2.s[1]
151        LDR     q6, [x12], 16
152        FMLA    v29.4s, v19.4s, v2.s[1]
153        FMLA    v30.4s, v18.4s, v3.s[1]
154        LDR     q7, [x4], 16
155        FMLA    v31.4s, v19.4s, v3.s[1]
156        FMLA    v24.4s, v20.4s, v0.s[2]
        # Optional (PREFETCH variant only): prefetch hints for data ahead of
        # the w pointer x5, at offsets 128, 192, 256 and 320 bytes.
157        $if PREFETCH:
158          PRFM    PLDL1KEEP, [x5, 128]
159        FMLA    v25.4s, v21.4s, v0.s[2]
160        FMLA    v26.4s, v20.4s, v1.s[2]
161        $if PREFETCH:
162          PRFM    PLDL1KEEP, [x5, 192]
163        FMLA    v27.4s, v21.4s, v1.s[2]
164        FMLA    v28.4s, v20.4s, v2.s[2]
165        $if PREFETCH:
166          PRFM    PLDL1KEEP, [x5, 256]
167        FMLA    v29.4s, v21.4s, v2.s[2]
168        FMLA    v30.4s, v20.4s, v3.s[2]
169        $if PREFETCH:
170          PRFM    PLDL1KEEP, [x5, 320]
171        FMLA    v31.4s, v21.4s, v3.s[2]
172        FMLA    v24.4s, v22.4s, v0.s[3]
173        FMLA    v25.4s, v23.4s, v0.s[3]
174        FMLA    v26.4s, v22.4s, v1.s[3]
175        FMLA    v27.4s, v23.4s, v1.s[3]
176        FMLA    v28.4s, v22.4s, v2.s[3]
177        FMLA    v29.4s, v23.4s, v2.s[3]
178        FMLA    v30.4s, v22.4s, v3.s[3]
179        FMLA    v31.4s, v23.4s, v3.s[3]
180
181        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        # Consumes A in v4-v7 and B in v8-v15; reloads A into v0-v3 and B into
        # v16-v23 for the next iteration or the epilogue.
182        FMLA    v24.4s, v8.4s, v4.s[0]
183        LDP     q16, q17, [x5], 32
184        FMLA    v25.4s, v9.4s, v4.s[0]
185        FMLA    v26.4s, v8.4s, v5.s[0]
186        LDP     q18, q19, [x5], 32
187        FMLA    v27.4s, v9.4s, v5.s[0]
188        FMLA    v28.4s, v8.4s, v6.s[0]
189        LDP     q20, q21, [x5], 32
190        FMLA    v29.4s, v9.4s, v6.s[0]
191        FMLA    v30.4s, v8.4s, v7.s[0]
192        LDP     q22, q23, [x5], 32
193        FMLA    v31.4s, v9.4s, v7.s[0]
194        FMLA    v24.4s, v10.4s, v4.s[1]
195        LDR     q0, [x3], 16
196        FMLA    v25.4s, v11.4s, v4.s[1]
197        FMLA    v26.4s, v10.4s, v5.s[1]
198        LDR     q1, [x11], 16
199        FMLA    v27.4s, v11.4s, v5.s[1]
200        FMLA    v28.4s, v10.4s, v6.s[1]
201        LDR     q2, [x12], 16
202        FMLA    v29.4s, v11.4s, v6.s[1]
203        FMLA    v30.4s, v10.4s, v7.s[1]
204        LDR     q3, [x4], 16
205        FMLA    v31.4s, v11.4s, v7.s[1]
206        FMLA    v24.4s, v12.4s, v4.s[2]
207        FMLA    v25.4s, v13.4s, v4.s[2]
208        FMLA    v26.4s, v12.4s, v5.s[2]
209        FMLA    v27.4s, v13.4s, v5.s[2]
210        FMLA    v28.4s, v12.4s, v6.s[2]
211        FMLA    v29.4s, v13.4s, v6.s[2]
212        FMLA    v30.4s, v12.4s, v7.s[2]
213        FMLA    v31.4s, v13.4s, v7.s[2]
214        FMLA    v24.4s, v14.4s, v4.s[3]
215        FMLA    v25.4s, v15.4s, v4.s[3]
216        FMLA    v26.4s, v14.4s, v5.s[3]
217        FMLA    v27.4s, v15.4s, v5.s[3]
218        FMLA    v28.4s, v14.4s, v6.s[3]
219        FMLA    v29.4s, v15.4s, v6.s[3]
        # The counter is decremented two instructions early; the FMLAs that
        # follow do not write the flags, so B.HS still sees this SUBS result.
220        SUBS    x0, x0, 32
221        FMLA    v30.4s, v14.4s, v7.s[3]
222        FMLA    v31.4s, v15.4s, v7.s[3]
223        B.HS    1b
224
2252:
225        # Epilogue
        # Drains the pipeline: same FMLA sequence as one main-loop iteration,
        # but the second half issues no loads for a further iteration.
226        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
227        FMLA    v24.4s, v16.4s, v0.s[0]
228        LDP     q8, q9, [x5], 32
229        FMLA    v25.4s, v17.4s, v0.s[0]
230        FMLA    v26.4s, v16.4s, v1.s[0]
231        LDP     q10, q11, [x5], 32
232        FMLA    v27.4s, v17.4s, v1.s[0]
233        FMLA    v28.4s, v16.4s, v2.s[0]
234        LDP     q12, q13, [x5], 32
235        FMLA    v29.4s, v17.4s, v2.s[0]
236        FMLA    v30.4s, v16.4s, v3.s[0]
237        LDP     q14, q15, [x5], 32
238        FMLA    v31.4s, v17.4s, v3.s[0]
239        FMLA    v24.4s, v18.4s, v0.s[1]
240        LDR     q4, [x3], 16
241        FMLA    v25.4s, v19.4s, v0.s[1]
242        FMLA    v26.4s, v18.4s, v1.s[1]
243        LDR     q5, [x11], 16
244        FMLA    v27.4s, v19.4s, v1.s[1]
245        FMLA    v28.4s, v18.4s, v2.s[1]
246        LDR     q6, [x12], 16
247        FMLA    v29.4s, v19.4s, v2.s[1]
248        FMLA    v30.4s, v18.4s, v3.s[1]
249        LDR     q7, [x4], 16
250        FMLA    v31.4s, v19.4s, v3.s[1]
251        FMLA    v24.4s, v20.4s, v0.s[2]
252        FMLA    v25.4s, v21.4s, v0.s[2]
253        FMLA    v26.4s, v20.4s, v1.s[2]
254        FMLA    v27.4s, v21.4s, v1.s[2]
255        FMLA    v28.4s, v20.4s, v2.s[2]
256        FMLA    v29.4s, v21.4s, v2.s[2]
257        FMLA    v30.4s, v20.4s, v3.s[2]
258        FMLA    v31.4s, v21.4s, v3.s[2]
259        FMLA    v24.4s, v22.4s, v0.s[3]
260        FMLA    v25.4s, v23.4s, v0.s[3]
261        FMLA    v26.4s, v22.4s, v1.s[3]
262        FMLA    v27.4s, v23.4s, v1.s[3]
263        FMLA    v28.4s, v22.4s, v2.s[3]
264        FMLA    v29.4s, v23.4s, v2.s[3]
265        FMLA    v30.4s, v22.4s, v3.s[3]
266        FMLA    v31.4s, v23.4s, v3.s[3]
267
268        # Second block of 4.  FMA for second 4, noloads
269        FMLA    v24.4s, v8.4s, v4.s[0]
270        FMLA    v25.4s, v9.4s, v4.s[0]
271        FMLA    v26.4s, v8.4s, v5.s[0]
272        FMLA    v27.4s, v9.4s, v5.s[0]
273        FMLA    v28.4s, v8.4s, v6.s[0]
274        FMLA    v29.4s, v9.4s, v6.s[0]
275        FMLA    v30.4s, v8.4s, v7.s[0]
276        FMLA    v31.4s, v9.4s, v7.s[0]
277
278        FMLA    v24.4s, v10.4s, v4.s[1]
279        FMLA    v25.4s, v11.4s, v4.s[1]
280        FMLA    v26.4s, v10.4s, v5.s[1]
281        FMLA    v27.4s, v11.4s, v5.s[1]
282        FMLA    v28.4s, v10.4s, v6.s[1]
283        FMLA    v29.4s, v11.4s, v6.s[1]
284        FMLA    v30.4s, v10.4s, v7.s[1]
285        FMLA    v31.4s, v11.4s, v7.s[1]
286
287        FMLA    v24.4s, v12.4s, v4.s[2]
288        FMLA    v25.4s, v13.4s, v4.s[2]
289        FMLA    v26.4s, v12.4s, v5.s[2]
290        FMLA    v27.4s, v13.4s, v5.s[2]
291        FMLA    v28.4s, v12.4s, v6.s[2]
292        FMLA    v29.4s, v13.4s, v6.s[2]
293        FMLA    v30.4s, v12.4s, v7.s[2]
294        FMLA    v31.4s, v13.4s, v7.s[2]
295
296        FMLA    v24.4s, v14.4s, v4.s[3]
297        FMLA    v25.4s, v15.4s, v4.s[3]
298        FMLA    v26.4s, v14.4s, v5.s[3]
299        FMLA    v27.4s, v15.4s, v5.s[3]
300
301        # Load min/max values
        # The four FMLAs above were the last readers of v4/v5 as A data, so the
        # clamp bounds are restored here; the remaining FMLAs read only v6/v7.
302        LD2R    {v4.4s, v5.4s}, [x8]
303
304        FMLA    v28.4s, v14.4s, v6.s[3]
305        FMLA    v29.4s, v15.4s, v6.s[3]
306        FMLA    v30.4s, v14.4s, v7.s[3]
307        FMLA    v31.4s, v15.4s, v7.s[3]
309
3103:
        # K remainder handling. Only multiples of 32 have been subtracted from
        # x0, so its bits 4, 3 and 2 still match kc and select a 16-, 8- and
        # 4-byte tail in turn. Only v0-v3 and v16-v23 are used as operands,
        # leaving the clamp bounds in v4/v5 intact.
310        # Remainder- 4 floats of A (16 bytes)
311        TBZ     x0, 4, 4f
312
313        LDR     q0,  [x3], 16
314        LDP     q16, q17, [x5], 32
315        LDR     q1, [x11], 16
316        LDR     q2, [x12], 16
317        LDR     q3,  [x4], 16
318        FMLA    v24.4s, v16.4s, v0.s[0]
319        FMLA    v25.4s, v17.4s, v0.s[0]
320        LDP     q18, q19, [x5], 32
321        FMLA    v26.4s, v16.4s, v1.s[0]
322        FMLA    v27.4s, v17.4s, v1.s[0]
323        LDP     q20, q21, [x5], 32
324        FMLA    v28.4s, v16.4s, v2.s[0]
325        FMLA    v29.4s, v17.4s, v2.s[0]
326        LDP     q22, q23, [x5], 32
327        FMLA    v30.4s, v16.4s, v3.s[0]
328        FMLA    v31.4s, v17.4s, v3.s[0]
329        FMLA    v24.4s, v18.4s, v0.s[1]
330        FMLA    v25.4s, v19.4s, v0.s[1]
331        FMLA    v26.4s, v18.4s, v1.s[1]
332        FMLA    v27.4s, v19.4s, v1.s[1]
333        FMLA    v28.4s, v18.4s, v2.s[1]
334        FMLA    v29.4s, v19.4s, v2.s[1]
335        FMLA    v30.4s, v18.4s, v3.s[1]
336        FMLA    v31.4s, v19.4s, v3.s[1]
337        FMLA    v24.4s, v20.4s, v0.s[2]
338        FMLA    v25.4s, v21.4s, v0.s[2]
339        FMLA    v26.4s, v20.4s, v1.s[2]
340        FMLA    v27.4s, v21.4s, v1.s[2]
341        FMLA    v28.4s, v20.4s, v2.s[2]
342        FMLA    v29.4s, v21.4s, v2.s[2]
343        FMLA    v30.4s, v20.4s, v3.s[2]
344        FMLA    v31.4s, v21.4s, v3.s[2]
345        FMLA    v24.4s, v22.4s, v0.s[3]
346        FMLA    v25.4s, v23.4s, v0.s[3]
347        FMLA    v26.4s, v22.4s, v1.s[3]
348        FMLA    v27.4s, v23.4s, v1.s[3]
349        FMLA    v28.4s, v22.4s, v2.s[3]
350        FMLA    v29.4s, v23.4s, v2.s[3]
351        FMLA    v30.4s, v22.4s, v3.s[3]
352        FMLA    v31.4s, v23.4s, v3.s[3]
353
3554:
356        # Remainder- 2 floats of A (8 bytes)
357        TBZ     x0, 3, 5f
358
        # 64-bit loads of A: only lanes 0 and 1 of v0-v3 are consumed below.
359        LDR     d0,  [x3], 8
360        LDP     q16, q17, [x5], 32
361        LDR     d1, [x11], 8
362        LDR     d2, [x12], 8
363        LDR     d3,  [x4], 8
364        FMLA    v24.4s, v16.4s, v0.s[0]
365        FMLA    v25.4s, v17.4s, v0.s[0]
366        LDP     q18, q19, [x5], 32
367        FMLA    v26.4s, v16.4s, v1.s[0]
368        FMLA    v27.4s, v17.4s, v1.s[0]
369        FMLA    v28.4s, v16.4s, v2.s[0]
370        FMLA    v29.4s, v17.4s, v2.s[0]
371        FMLA    v30.4s, v16.4s, v3.s[0]
372        FMLA    v31.4s, v17.4s, v3.s[0]
373        FMLA    v24.4s, v18.4s, v0.s[1]
374        FMLA    v25.4s, v19.4s, v0.s[1]
375        FMLA    v26.4s, v18.4s, v1.s[1]
376        FMLA    v27.4s, v19.4s, v1.s[1]
377        FMLA    v28.4s, v18.4s, v2.s[1]
378        FMLA    v29.4s, v19.4s, v2.s[1]
379        FMLA    v30.4s, v18.4s, v3.s[1]
380        FMLA    v31.4s, v19.4s, v3.s[1]
381
3825:
383        # Remainder- 1 float of A (4 bytes)
384        TBZ     x0, 2, 6f
385
        # 32-bit loads of A: only lane 0 of v0-v3 is consumed below.
386        LDR     s0,  [x3], 4
387        LDP     q16, q17, [x5], 32
388        LDR     s1, [x11], 4
389        LDR     s2, [x12], 4
390        LDR     s3,  [x4], 4
391        FMLA    v24.4s, v16.4s, v0.s[0]
392        FMLA    v25.4s, v17.4s, v0.s[0]
393        FMLA    v26.4s, v16.4s, v1.s[0]
394        FMLA    v27.4s, v17.4s, v1.s[0]
395        FMLA    v28.4s, v16.4s, v2.s[0]
396        FMLA    v29.4s, v17.4s, v2.s[0]
397        FMLA    v30.4s, v16.4s, v3.s[0]
398        FMLA    v31.4s, v17.4s, v3.s[0]
399
4006:
401        # Clamp
        # Every accumulator is bounded below by v4 (FMAX) and above by v5
        # (FMIN). The SUBS on x1 (nc) is slotted in early; FMAX/FMIN do not
        # write the flags, so the B.LO below still tests nc - 8.
402        FMAX    v24.4s, v24.4s, v4.4s
403        SUBS    x1, x1, 8
404        FMAX    v25.4s, v25.4s, v4.4s
405        FMAX    v26.4s, v26.4s, v4.4s
406        FMAX    v27.4s, v27.4s, v4.4s
407        FMAX    v28.4s, v28.4s, v4.4s
408        FMAX    v29.4s, v29.4s, v4.4s
409        FMAX    v30.4s, v30.4s, v4.4s
410        FMAX    v31.4s, v31.4s, v4.4s
411        FMIN    v24.4s, v24.4s, v5.4s
412        FMIN    v25.4s, v25.4s, v5.4s
413        FMIN    v26.4s, v26.4s, v5.4s
414        FMIN    v27.4s, v27.4s, v5.4s
415        FMIN    v28.4s, v28.4s, v5.4s
416        FMIN    v29.4s, v29.4s, v5.4s
417        FMIN    v30.4s, v30.4s, v5.4s
418        FMIN    v31.4s, v31.4s, v5.4s
419
420        # Store full 4 x 8
        # Fewer than 8 columns left: take the partial-width path at label 7.
421        B.LO    7f
422
        # Write 8 floats per row, step each C pointer by cn_stride (x14) and
        # rewind each A pointer by kc for the next column block. The INC
        # variant issues the row stores in the opposite order (c3 first).
        # SUB/ADD do not write the flags, so B.HI below still tests nc - 8.
423        $if INC:
424          STP     q30, q31,  [x7]
425          SUB     x3,  x3, x2             // a0 -= kc
426          ADD     x7,  x7, x14
427          STP     q28, q29, [x10]
428          SUB     x11, x11, x2            // a1 -= kc
429          ADD     x10, x10, x14
430          STP     q26, q27,  [x9]
431          SUB     x12, x12, x2            // a2 -= kc
432          ADD     x9,  x9, x14
433          STP     q24, q25,  [x6]
434          SUB     x4,  x4, x2             // a3 -= kc
435          ADD     x6,  x6, x14
436        $else:
437          STP     q24, q25,  [x6]
438          SUB     x3,  x3, x2             // a0 -= kc
439          ADD     x6,  x6, x14
440          STP     q26, q27,  [x9]
441          SUB     x11, x11, x2            // a1 -= kc
442          ADD     x9,  x9, x14
443          STP     q28, q29, [x10]
444          SUB     x12, x12, x2            // a2 -= kc
445          ADD     x10, x10, x14
446          STP     q30, q31,  [x7]
447          SUB     x4,  x4, x2             // a3 -= kc
448          ADD     x7,  x7, x14
449
        # Columns remain (nc - 8 > 0): process the next block of 8.
450        B.HI    0b
451
452        # Restore d8-d15 from stack
        # Mirrors the save at entry; the last LDP post-increments sp by 64.
453        LDP     d14, d15, [sp, 48]
454        LDP     d12, d13, [sp, 32]
455        LDP     d10, d11, [sp, 16]
456        LDP     d8,  d9, [sp], 64
457        RET
458
459        # Store odd width
        # Reached when fewer than 8 columns remain. x1 now holds nc - 8;
        # subtracting 8 leaves bits 0-2 unchanged, so bits 2, 1 and 0 select a
        # 4-, 2- and 1-column store in turn. After each partial store the
        # unwritten columns are moved down to the low part of v24/v26/v28/v30,
        # so every step stores from the same registers.
4607:
461        TBZ     x1, 2, 8f
462        $if INC:
463          STR     q30, [x7], 16
464          MOV     v30.16b, v31.16b
465          STR     q28, [x10], 16
466          MOV     v28.16b, v29.16b
467          STR     q26, [x9], 16
468          MOV     v26.16b, v27.16b
469          STR     q24, [x6], 16
470          MOV     v24.16b, v25.16b
471        $else:
472          STR     q24, [x6], 16
473          MOV     v24.16b, v25.16b
474          STR     q26, [x9], 16
475          MOV     v26.16b, v27.16b
476          STR     q28, [x10], 16
477          MOV     v28.16b, v29.16b
478          STR     q30, [x7], 16
479          MOV     v30.16b, v31.16b
480
4818:
482        TBZ     x1, 1, 9f
        # Store the low 2 floats of each row, then copy the upper 64 bits of
        # each vector into its low 64 bits (DUP dN, vN.d[1]).
483        $if INC:
484          STR     d30, [x7], 8
485          STR     d28, [x10], 8
486          DUP     d30, v30.d[1]
487          DUP     d28, v28.d[1]
488          STR     d26, [x9], 8
489          STR     d24, [x6], 8
490          DUP     d26, v26.d[1]
491          DUP     d24, v24.d[1]
492        $else:
493          STR     d24, [x6], 8
494          STR     d26, [x9], 8
495          DUP     d24, v24.d[1]
496          DUP     d26, v26.d[1]
497          STR     d28, [x10], 8
498          STR     d30, [x7], 8
499          DUP     d28, v28.d[1]
500          DUP     d30, v30.d[1]
501
5029:
503        TBZ     x1, 0, 10f
        # Final single float per row; no pointer update is needed afterwards.
504        $if INC:
505          STR     s30,  [x7]
506          STR     s28, [x10]
507          STR     s26,  [x9]
508          STR     s24,  [x6]
509        $else:
510          STR     s24,  [x6]
511          STR     s26,  [x9]
512          STR     s28, [x10]
513          STR     s30,  [x7]
51410:
515        # Restore d8-d15 from stack
        # Mirrors the save at entry; the last LDP post-increments sp by 64.
516        LDP     d14, d15, [sp, 48]
517        LDP     d12, d13, [sp, 32]
518        LDP     d10, d11, [sp, 16]
519        LDP     d8,  d9, [sp], 64
520        RET
521
522
523END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
524# LINT.ThenChange(4x8-aarch64-neonfma-cortex-a75.cc)
525
526#ifdef __ELF__
527.section ".note.GNU-stack","",%progbits
528#endif
529