xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a55r0.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55r0.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const void*restrict a,    x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     void*restrict c,          x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
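26# Register usage (first group / second group of each 4-halffloat step)
27# A0  x3 v0.h[0-1]  v3.h[0-1]
28# A1  x9 v0.h[4-5]  v3.h[4-5]
29# A2 x10 v1.h[0-1]  v4.h[0-1]
30# A3 x11 v1.h[4-5]  v4.h[4-5]
31# A4 x12 v2.h[0-1]  v5.h[0-1]
32# A5  x4 v2.h[4-5]  v5.h[4-5]
33
34# B   x5 v16 v17 v18 v19 (first group)  v12 v13 v14 v15 (second group)
35
36# C0  x6  v20 v21
37# C1 x16  v22 v23
38# C2 x17  v24 v25
39# C3 x14  v26 v27
40# C4 x13  v28 v29
41# C5  x7  v30 v31
42
43# Clamp v6 (h[0] = min, h[1] = max), broadcast into (v4), (v5) after the GEMM
44# unused     v7
45# unused     v8 v9 v10 v11
46# d12-d15 are callee-saved and used for B, so they are saved on the stack
47
48# x8 params pointer on entry, then temporary vector shadow register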
49
50BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0
        # Summary of the code below: for every 16-column tile of the output,
        # the 12 accumulators v20-v31 (6 rows x 2 vectors of 8 halffloats) are
        # seeded from w, updated with FMLA over kc bytes of each A row, clamped
        # with FMAX/FMIN against the two halffloats loaded from params, and
        # stored through the 6 C row pointers. nc is consumed 16 columns per pass.
51
52        # Load params pointer
53        LDR     x8, [sp, 8]
54
55        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row pointer (CSEL), so a1..a5
        # and c1..c5 are always valid to dereference. ADD, CSEL and LDR leave the
        # flags alone, so each CMP feeds both its LO selects and the LS selects
        # that follow it.
56        CMP     x0, 2                   // if mr < 2
57        ADD     x9, x3, x4              // a1 = a0 + a_stride
58        ADD     x16, x6, x7             // c1 = c0 + cm_stride
59        CSEL    x9, x3, x9, LO          //   a1 = a0
60        CSEL    x16, x6, x16, LO        //   c1 = c0
61
62        # Load params
        # 4 bytes = two halffloats: v6.h[0] is later used with FMAX (lower
        # bound) and v6.h[1] with FMIN (upper bound).
63        LDR     s6, [x8]
64
65        ADD     x10, x9, x4             // a2 = a1 + a_stride
66        ADD     x17, x16, x7            // c2 = c1 + cm_stride
67                                        // if mr <= 2
68        CSEL    x10, x9, x10, LS        //   a2 = a1
69        CSEL    x17, x16, x17, LS       //   c2 = c1
70
71        CMP     x0, 4                   // if mr < 4
72        ADD     x11, x10, x4            // a3 = a2 + a_stride
73        ADD     x14, x17, x7            // c3 = c2 + cm_stride
74        CSEL    x11, x10, x11, LO       //   a3 = a2
75        CSEL    x14, x17, x14, LO       //   c3 = c2
76
77        ADD     x12, x11, x4            // a4 = a3 + a_stride
78        ADD     x13, x14, x7            // c4 = c3 + cm_stride
79                                        // if mr <= 4
80        CSEL    x12, x11, x12, LS       //   a4 = a3
81        CSEL    x13, x14, x13, LS       //   c4 = c3
82
83        CMP     x0, 6                   // if mr < 6
84        ADD     x4, x12, x4             // a5 = a4 + a_stride
85        ADD     x7, x13, x7             // c5 = c4 + cm_stride
86        CSEL    x4, x12, x4, LO         //   a5 = a4
87        CSEL    x7, x13, x7, LO         //   c5 = c4
88
89        # Save d12-d15 on stack
        # v12-v15 hold the second group of B below. This push moves sp down by
        # 32 bytes, which is why cn_stride is later read from [sp, 32].
90        STP     d12, d13, [sp, -32]!
91        STP     d14, d15, [sp, 16]
920:
        # Outer loop over nc: one pass per 16-column tile.
93        # Load initial bias from w into accumulators
94        LDP     q20, q21, [x5], 32
95        MOV     v22.16b, v20.16b
96        MOV     v23.16b, v21.16b
97        MOV     v24.16b, v20.16b
98        MOV     v25.16b, v21.16b
99        MOV     v26.16b, v20.16b
100        MOV     v27.16b, v21.16b
101        MOV     v28.16b, v20.16b
102        MOV     v29.16b, v21.16b
103        MOV     v30.16b, v20.16b
104        MOV     v31.16b, v21.16b
105
106
107        # Is there at least 4 halffloats (8 bytes) for prologue + epilogue?
        # When kc < 8 bytes the pipelined path is skipped and everything is
        # handled by the remainder code at label 4.
108        SUBS    x0, x2, 8               // k = kc - 8
109        B.LO    4f
110
111        # Prologue - First group loads, no FMA
        # Two A rows share one register: LDR sN fills 32-bit lane 0 (h[0], h[1])
        # and LD1 {vN.s}[2] fills 32-bit lane 2 (h[4], h[5]). The last B vector
        # is fetched as d19 (low half) plus x8 (high half, merged by the INS in
        # BLOCK 0 of the loop or epilogue).
112        LDR     s0, [x3], 4               // A0
113        LDP     q16, q17, [x5], 32        // B
114        LDR     s1, [x10], 4              // A2
115        LDR     s2, [x12], 4              // A4
116        LD1     {v0.s}[2],  [x9], 4       // A1
117        LD1     {v1.s}[2], [x11], 4       // A3
118        LD1     {v2.s}[2],  [x4], 4       // A5
119        LDR     q18, [x5], 16
120        LDR     d19, [x5], 8
121        LDR     x8, [x5], 8               // ins is in BLOCK 0
122        SUBS    x0, x0, 8
123
124        # Is there at least 4 halffloats (8 bytes) for main loop?
125        B.LO    2f
126
127        # Main loop - 4 halffloats of A (8 bytes)
128        # 48 FMA + 12 LD32 A + 8 LDR B
        # Each B vector is fetched as LDR d (low half) + LDR x8 (high half) and
        # merged with INS, so the 8 B vectors cost 16 load instructions. The odd
        # A rows are loaded into w8 and inserted into the upper 64 bits.
        # NOTE(review): presumably split this way to suit Cortex-A55 issue
        # rules - confirm against the Cortex-A55 optimization guide.
        # x5 is only advanced once per iteration (ADD x5, x5, 128 in the last
        # BLOCK 7); all B loads in between use fixed offsets 0..120.
1291:
130        # First group of 24 FMA, Second group loads
131        # BLOCK 0
132        LDR     s3, [x3], 4               // A0
133        INS     v19.d[1], x8              // B from second group
134        FMLA    v20.8h, v16.8h,  v0.h[0]
135        LDR     w8, [x9], 4               // A1
136        FMLA    v22.8h, v16.8h,  v0.h[4]
137        FMLA    v24.8h, v16.8h,  v1.h[0]
138
139        # BLOCK 1
140        LDR     d12, [x5]
141        INS     v3.d[1], x8               // A1 ins
142        FMLA    v26.8h, v16.8h,  v1.h[4]
143        LDR     x8, [x5, 8]               // B
144        FMLA    v28.8h, v16.8h,  v2.h[0]
145        FMLA    v30.8h, v16.8h,  v2.h[4]
146
147        # BLOCK 2
148        LDR     s4, [x10], 4              // A2
149        INS     v12.d[1], x8              // B  ins
150        FMLA    v21.8h, v17.8h,  v0.h[0]
151        LDR     w8, [x11], 4              // A3
152        FMLA    v23.8h, v17.8h,  v0.h[4]
153        FMLA    v25.8h, v17.8h,  v1.h[0]
154
155        # BLOCK 3
156        LDR     s5, [x12], 4              // A4
157        INS     v4.d[1], x8               // A3 ins
158        FMLA    v27.8h, v17.8h,  v1.h[4]
159        LDR     w8, [x4], 4               // A5
160        FMLA    v29.8h, v17.8h,  v2.h[0]
161        FMLA    v31.8h, v17.8h,  v2.h[4]
162
163        # BLOCK 4
164        LDR     d13, [x5, 16]
165        INS     v5.d[1], x8               // A5 ins
166        FMLA    v20.8h, v18.8h,  v0.h[1]
167        LDR     x8, [x5, 24]
168        FMLA    v22.8h, v18.8h,  v0.h[5]
169        FMLA    v24.8h, v18.8h,  v1.h[1]
170
171        # BLOCK 5
172        LDR     d14, [x5, 32]
173        INS     v13.d[1], x8              // B
174        FMLA    v26.8h, v18.8h,  v1.h[5]
175        LDR     x8, [x5, 40]
176        FMLA    v28.8h, v18.8h,  v2.h[1]
177        FMLA    v30.8h, v18.8h,  v2.h[5]
178
179        # BLOCK 6
180        LDR     d15, [x5, 48]
181        INS     v14.d[1], x8              // B
182        FMLA    v21.8h, v19.8h,  v0.h[1]
183        LDR     x8, [x5, 56]
184        FMLA    v23.8h, v19.8h,  v0.h[5]
185        FMLA    v25.8h, v19.8h,  v1.h[1]
186
187        # BLOCK 7
188        INS     v15.d[1], x8
189        FMLA    v27.8h, v19.8h,  v1.h[5]
190        FMLA    v29.8h, v19.8h,  v2.h[1]
191        FMLA    v31.8h, v19.8h,  v2.h[5]
192
193        # Second group of 24 FMA, First group of loads
194        # BLOCK 0
195        LDR     s0, [x3], 4               // A0
196        FMLA    v20.8h, v12.8h,  v3.h[0]
197        LDR     w8, [x9], 4               // A1
198        FMLA    v22.8h, v12.8h,  v3.h[4]
199        FMLA    v24.8h, v12.8h,  v4.h[0]
200
201        # BLOCK 1
202        LDR     d16, [x5, 64]
203        INS     v0.d[1], x8               // A1 ins
204        FMLA    v26.8h, v12.8h,  v4.h[4]
205        LDR     x8, [x5, 72]              // B
206        FMLA    v28.8h, v12.8h,  v5.h[0]
207        FMLA    v30.8h, v12.8h,  v5.h[4]
208
209        # BLOCK 2
210        LDR     s1, [x10], 4              // A2
211        INS     v16.d[1], x8              // B
212        FMLA    v21.8h, v13.8h,  v3.h[0]
213        LDR     w8, [x11], 4              // A3
214        FMLA    v23.8h, v13.8h,  v3.h[4]
215        FMLA    v25.8h, v13.8h,  v4.h[0]
216
217        # BLOCK 3
218        LDR     s2, [x12], 4              // A4
219        INS     v1.d[1], x8               // A3 ins
220        FMLA    v27.8h, v13.8h,  v4.h[4]
221        LDR     w8,  [x4], 4              // A5
222        FMLA    v29.8h, v13.8h,  v5.h[0]
223        FMLA    v31.8h, v13.8h,  v5.h[4]
224
225        # BLOCK 4
226        LDR     d17, [x5, 80]
227        INS     v2.d[1], x8               // A5 ins
228        FMLA    v20.8h, v14.8h,  v3.h[1]
229        LDR     x8, [x5, 88]
230        FMLA    v22.8h, v14.8h,  v3.h[5]
231        FMLA    v24.8h, v14.8h,  v4.h[1]
232
233        # BLOCK 5
234        LDR     d18, [x5, 96]
235        INS     v17.d[1], x8              // B
236        FMLA    v26.8h, v14.8h,  v4.h[5]
237        LDR     x8, [x5, 104]
238        FMLA    v28.8h, v14.8h,  v5.h[1]
239        FMLA    v30.8h, v14.8h,  v5.h[5]
240
241        # BLOCK 6
242        LDR     d19, [x5, 112]
243        INS     v18.d[1], x8              // B
244        FMLA    v21.8h, v15.8h,  v3.h[1]
245        LDR     x8, [x5, 120]
246        FMLA    v23.8h, v15.8h,  v3.h[5]
247        FMLA    v25.8h, v15.8h,  v4.h[1]
248
249        # BLOCK 7
        # x8 (high half of v19) stays live across the branch; it is merged by
        # the INS in BLOCK 0 of the next iteration or of the epilogue.
250        SUBS    x0, x0, 8                 // LDR lands here
251        FMLA    v27.8h, v15.8h,  v4.h[5]
252        FMLA    v29.8h, v15.8h,  v5.h[1]
253        ADD     x5, x5, 128
254        FMLA    v31.8h, v15.8h,  v5.h[5]
255        B.HS    1b
256
257        # Epilogue - 4 halffloats of A (8 bytes)
258        # 48 FMA + 12 LD32 A + 8 LDR B
        # Note: unlike the main loop, only the operands of the second group are
        # loaded here (6 LD32 of A, 4 B vectors as 4 LDR d + 4 LDR x); the second
        # group itself performs no loads, and x5 advances by 64 instead of 128.
2592:
260        # First group of 24 FMA, Second group loads
261        # BLOCK 0
262        LDR     s3, [x3], 4               // A0
263        INS     v19.d[1], x8              // B from second group
264        FMLA    v20.8h, v16.8h,  v0.h[0]
265        LDR     w8, [x9], 4               // A1
266        FMLA    v22.8h, v16.8h,  v0.h[4]
267        FMLA    v24.8h, v16.8h,  v1.h[0]
268
269        # BLOCK 1
270        LDR     d12, [x5]
271        INS     v3.d[1], x8               // A1 ins
272        FMLA    v26.8h, v16.8h,  v1.h[4]
273        LDR     x8, [x5, 8]               // B
274        FMLA    v28.8h, v16.8h,  v2.h[0]
275        FMLA    v30.8h, v16.8h,  v2.h[4]
276
277        # BLOCK 2
278        LDR     s4, [x10], 4              // A2
279        INS     v12.d[1], x8              // B  ins
280        FMLA    v21.8h, v17.8h,  v0.h[0]
281        LDR     w8, [x11], 4              // A3
282        FMLA    v23.8h, v17.8h,  v0.h[4]
283        FMLA    v25.8h, v17.8h,  v1.h[0]
284
285        # BLOCK 3
286        LDR     s5, [x12], 4              // A4
287        INS     v4.d[1], x8               // A3 ins
288        FMLA    v27.8h, v17.8h,  v1.h[4]
289        LDR     w8, [x4], 4               // A5
290        FMLA    v29.8h, v17.8h,  v2.h[0]
291        FMLA    v31.8h, v17.8h,  v2.h[4]
292
293        # BLOCK 4
294        LDR     d13, [x5, 16]
295        INS     v5.d[1], x8               // A5 ins
296        FMLA    v20.8h, v18.8h,  v0.h[1]
297        LDR     x8, [x5, 24]
298        FMLA    v22.8h, v18.8h,  v0.h[5]
299        FMLA    v24.8h, v18.8h,  v1.h[1]
300
301        # BLOCK 5
302        LDR     d14, [x5, 32]
303        INS     v13.d[1], x8              // B
304        FMLA    v26.8h, v18.8h,  v1.h[5]
305        LDR     x8, [x5, 40]
306        FMLA    v28.8h, v18.8h,  v2.h[1]
307        FMLA    v30.8h, v18.8h,  v2.h[5]
308
309        # BLOCK 6
310        LDR     d15, [x5, 48]
311        INS     v14.d[1], x8              // B
312        FMLA    v21.8h, v19.8h,  v0.h[1]
313        LDR     x8, [x5, 56]
314        FMLA    v23.8h, v19.8h,  v0.h[5]
315        FMLA    v25.8h, v19.8h,  v1.h[1]
316
317        # BLOCK 7
318        INS     v15.d[1], x8              // B
319        FMLA    v27.8h, v19.8h,  v1.h[5]
320        FMLA    v29.8h, v19.8h,  v2.h[1]
321        FMLA    v31.8h, v19.8h,  v2.h[5]
322
323        # Second group of 24 FMA, First group of loads
324        # BLOCK 0
325        FMLA    v20.8h, v12.8h,  v3.h[0]
326        FMLA    v22.8h, v12.8h,  v3.h[4]
327        FMLA    v24.8h, v12.8h,  v4.h[0]
328
329        # BLOCK 1
330        FMLA    v26.8h, v12.8h,  v4.h[4]
331        FMLA    v28.8h, v12.8h,  v5.h[0]
332        FMLA    v30.8h, v12.8h,  v5.h[4]
333
334        # BLOCK 2
335        FMLA    v21.8h, v13.8h,  v3.h[0]
336        FMLA    v23.8h, v13.8h,  v3.h[4]
337        FMLA    v25.8h, v13.8h,  v4.h[0]
338
339        # BLOCK 3
340        FMLA    v27.8h, v13.8h,  v4.h[4]
341        FMLA    v29.8h, v13.8h,  v5.h[0]
342        FMLA    v31.8h, v13.8h,  v5.h[4]
343
344        # BLOCK 4
345        FMLA    v20.8h, v14.8h,  v3.h[1]
346        FMLA    v22.8h, v14.8h,  v3.h[5]
347        FMLA    v24.8h, v14.8h,  v4.h[1]
348
349        # BLOCK 5
350        FMLA    v26.8h, v14.8h,  v4.h[5]
351        FMLA    v28.8h, v14.8h,  v5.h[1]
352        FMLA    v30.8h, v14.8h,  v5.h[5]
        # x0 has only had multiples of 8 subtracted from kc, so its low 3 bits
        # still equal kc & 7. The flags set here are consumed by B.NE 4f below;
        # the FMLA and ADD in between leave them untouched.
353        TST     x0, 7
354
355        # BLOCK 6
356        FMLA    v21.8h, v15.8h,  v3.h[1]
357        FMLA    v23.8h, v15.8h,  v3.h[5]
358        FMLA    v25.8h, v15.8h,  v4.h[1]
359        ADD     x5, x5, 64
360
361        # BLOCK 7
362        FMLA    v27.8h, v15.8h,  v4.h[5]
363        FMLA    v29.8h, v15.8h,  v5.h[1]
364        FMLA    v31.8h, v15.8h,  v5.h[5]
365
366        # Is there a remainder?- 2 halffloats of A (4 bytes) or less
367        B.NE    4f
368
3693:
370        # Clamp
        # v4 and v5 are free again here (the A data they held is consumed), so
        # they receive the broadcast bounds: v4 = v6.h[0], v5 = v6.h[1].
        # x0 is reused for cn_stride; the k counter is recomputed at label 0.
371        DUP     v4.8h, v6.h[0]
372        DUP     v5.8h, v6.h[1]
373        FMAX    v20.8h, v20.8h, v4.8h
374        LDR     x0, [sp, 32]            // cn_stride
375        FMAX    v21.8h, v21.8h, v4.8h
376        FMAX    v22.8h, v22.8h, v4.8h
377        FMAX    v23.8h, v23.8h, v4.8h
378        FMAX    v24.8h, v24.8h, v4.8h
379        FMAX    v25.8h, v25.8h, v4.8h
380        FMAX    v26.8h, v26.8h, v4.8h
381        FMAX    v27.8h, v27.8h, v4.8h
382        FMAX    v28.8h, v28.8h, v4.8h
383        FMAX    v29.8h, v29.8h, v4.8h
384        FMAX    v30.8h, v30.8h, v4.8h
385        FMAX    v31.8h, v31.8h, v4.8h
        # nc -= 16. These flags drive both B.LO 6f (partial tile) and the later
        # B.HI 0b (more tiles remain); FMIN, ST1 and SUB do not modify them.
386        SUBS    x1, x1, 16
387        FMIN    v20.8h, v20.8h, v5.8h
388        FMIN    v21.8h, v21.8h, v5.8h
389        FMIN    v22.8h, v22.8h, v5.8h
390        FMIN    v23.8h, v23.8h, v5.8h
391        FMIN    v24.8h, v24.8h, v5.8h
392        FMIN    v25.8h, v25.8h, v5.8h
393        FMIN    v26.8h, v26.8h, v5.8h
394        FMIN    v27.8h, v27.8h, v5.8h
395        FMIN    v28.8h, v28.8h, v5.8h
396        FMIN    v29.8h, v29.8h, v5.8h
397        FMIN    v30.8h, v30.8h, v5.8h
398        FMIN    v31.8h, v31.8h, v5.8h
399
400        # Store full 6 x 16
401        B.LO    6f
402
        # Each C pointer post-increments by cn_stride (x0); each A pointer is
        # rewound by kc so the next tile re-reads the same rows of A.
403        ST1     {v20.16b, v21.16b},  [x6], x0
404        SUB     x3,  x3, x2             // a0 -= kc
405        ST1     {v22.16b, v23.16b}, [x16], x0
406        SUB     x9,  x9, x2             // a1 -= kc
407        ST1     {v24.16b, v25.16b}, [x17], x0
408        SUB     x10, x10, x2            // a2 -= kc
409        ST1     {v26.16b, v27.16b}, [x14], x0
410        SUB     x11, x11, x2            // a3 -= kc
411        ST1     {v28.16b, v29.16b}, [x13], x0
412        SUB     x12, x12, x2            // a4 -= kc
413        ST1     {v30.16b, v31.16b},  [x7], x0
414        SUB     x4,  x4, x2             // a5 -= kc
415
416        B.HI    0b
417
418        # Restore d12-d15 from stack
419        LDP     d14, d15, [sp, 16]
420        LDP     d12, d13, [sp], 32
421        RET
422
4234:
        # Reached with x0 = kc minus a multiple of 8, so bits 2 and 1 of x0 tell
        # whether 4 bytes and/or 2 bytes of each A row are still unprocessed.
424        # Is there a remainder?- 2 halffloats of A (4 bytes)
425        TBZ     x0, 2, 5f
426
427        # Remainder- 2 halffloats of A (4 bytes)
428        LDR     s0, [x3], 4               // A0
429        LDP     q16, q17, [x5], 32        // B
430        LDR     s1, [x10], 4              // A2
431        LDR     s2, [x12], 4              // A4
432        LD1     {v0.s}[2],  [x9], 4       // A1
433        LD1     {v1.s}[2], [x11], 4       // A3
434        LD1     {v2.s}[2],  [x4], 4       // A5
435        LDR     q18, [x5], 16
436        LDR     q19, [x5], 16
437        FMLA    v20.8h, v16.8h,  v0.h[0]
438        FMLA    v22.8h, v16.8h,  v0.h[4]
439        FMLA    v24.8h, v16.8h,  v1.h[0]
440        FMLA    v26.8h, v16.8h,  v1.h[4]
441        FMLA    v28.8h, v16.8h,  v2.h[0]
442        FMLA    v30.8h, v16.8h,  v2.h[4]
443        FMLA    v21.8h, v17.8h,  v0.h[0]
444        FMLA    v23.8h, v17.8h,  v0.h[4]
445        FMLA    v25.8h, v17.8h,  v1.h[0]
446        FMLA    v27.8h, v17.8h,  v1.h[4]
447        FMLA    v29.8h, v17.8h,  v2.h[0]
448        FMLA    v31.8h, v17.8h,  v2.h[4]
449        FMLA    v20.8h, v18.8h,  v0.h[1]
450        FMLA    v22.8h, v18.8h,  v0.h[5]
451        FMLA    v24.8h, v18.8h,  v1.h[1]
452        FMLA    v26.8h, v18.8h,  v1.h[5]
453        FMLA    v28.8h, v18.8h,  v2.h[1]
454        FMLA    v30.8h, v18.8h,  v2.h[5]
455        FMLA    v21.8h, v19.8h,  v0.h[1]
456        FMLA    v23.8h, v19.8h,  v0.h[5]
457        FMLA    v25.8h, v19.8h,  v1.h[1]
458        FMLA    v27.8h, v19.8h,  v1.h[5]
459        FMLA    v29.8h, v19.8h,  v2.h[1]
460        FMLA    v31.8h, v19.8h,  v2.h[5]
461
462        # Is there a remainder?- 1 halffloat of A (2 bytes)
463        TBZ     x0, 1, 3b
4645:
465
466        # Remainder- 1 halffloat of A (2 bytes)
        # Same row packing as above, but with a single halffloat per row:
        # even rows in h[0], odd rows in h[4].
467        LDR     h0, [x3], 2               // A0
468        LDP     q16, q17, [x5], 32        // B
469        LDR     h1, [x10], 2              // A2
470        LDR     h2, [x12], 2              // A4
471        LD1     {v0.h}[4],  [x9], 2       // A1
472        LD1     {v1.h}[4], [x11], 2       // A3
473        LD1     {v2.h}[4],  [x4], 2       // A5
474        FMLA    v20.8h, v16.8h,  v0.h[0]
475        FMLA    v22.8h, v16.8h,  v0.h[4]
476        FMLA    v24.8h, v16.8h,  v1.h[0]
477        FMLA    v26.8h, v16.8h,  v1.h[4]
478        FMLA    v28.8h, v16.8h,  v2.h[0]
479        FMLA    v30.8h, v16.8h,  v2.h[4]
480        FMLA    v21.8h, v17.8h,  v0.h[0]
481        FMLA    v23.8h, v17.8h,  v0.h[4]
482        FMLA    v25.8h, v17.8h,  v1.h[0]
483        FMLA    v27.8h, v17.8h,  v1.h[4]
484        FMLA    v29.8h, v17.8h,  v2.h[0]
485        FMLA    v31.8h, v17.8h,  v2.h[4]
486        B       3b
487
488        # Store odd width
        # Here x1 = nc - 16 with nc < 16, so its low 4 bits equal the number of
        # columns left. Bits 3..0 select stores of 8, 4, 2 and 1 halffloats per
        # row; after each partial store the not-yet-stored lanes are moved down
        # to the bottom of the even-numbered accumulator (MOV / DUP).
4896:
490        TBZ     x1, 3, 7f
491        STR     q20,  [x6], 16
492        MOV     v20.16b, v21.16b
493        STR     q22, [x16], 16
494        MOV     v22.16b, v23.16b
495        STR     q24, [x17], 16
496        MOV     v24.16b, v25.16b
497        STR     q26, [x14], 16
498        MOV     v26.16b, v27.16b
499        STR     q28, [x13], 16
500        MOV     v28.16b, v29.16b
501        STR     q30,  [x7], 16
502        MOV     v30.16b, v31.16b
503
5047:
505        TBZ     x1, 2, 8f
506        STR     d20,  [x6], 8
507        STR     d22, [x16], 8
508        DUP     d20, v20.d[1]
509        DUP     d22, v22.d[1]
510        STR     d24, [x17], 8
511        STR     d26, [x14], 8
512        DUP     d24, v24.d[1]
513        DUP     d26, v26.d[1]
514        STR     d28, [x13], 8
515        STR     d30,  [x7], 8
516        DUP     d28, v28.d[1]
517        DUP     d30, v30.d[1]
518
5198:
520        TBZ     x1, 1, 9f
521        STR     s20,  [x6], 4
522        STR     s22, [x16], 4
523        DUP     s20, v20.s[1]
524        DUP     s22, v22.s[1]
525        STR     s24, [x17], 4
526        STR     s26, [x14], 4
527        DUP     s24, v24.s[1]
528        DUP     s26, v26.s[1]
529        STR     s28, [x13], 4
530        STR     s30,  [x7], 4
531        DUP     s28, v28.s[1]
532        DUP     s30, v30.s[1]
533
5349:
535        TBZ     x1, 0, 10f
536        STR     h20,  [x6]
537        STR     h22, [x16]
538        STR     h24, [x17]
539        STR     h26, [x14]
540        STR     h28, [x13]
541        STR     h30,  [x7]
54210:
543        # Restore d12-d15 from stack
544        LDP     d14, d15, [sp, 16]
545        LDP     d12, d13, [sp], 32
546        RET
547
548END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0
549
550#ifdef __ELF__
        # ELF targets only: emit an empty .note.GNU-stack section with no flags
        # (no "x"), the GNU toolchain convention for an object that does not
        # request an executable stack.
551.section ".note.GNU-stack","",%progbits
552#endif
553