// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Overview (derived from the code below):
#   - Accumulates a tile of up to 6 rows x 8 columns of C in v20-v31. The
#     accumulators start from the first 8 floats read through w (bias), then
#     receive A * B products, are clamped with v6 (FMAX) and v7 (FMIN) and
#     are stored through the six C row pointers.
#   - Rows at or beyond mr reuse the previous row's A and C pointers, so they
#     recompute and store the same values to the same addresses.
#   - kc is treated as a byte count: one main-loop iteration consumes 16 bytes
#     (4 floats) of every A row; the remainder code handles 8 and 4 bytes.
#   - x0 holds mr on entry, is then reused as the remaining-k counter, and
#     holds cn_stride during the clamp/store stage.
#   - x8 holds the params pointer on entry and is afterwards a scratch
#     register carrying the upper 64 bits of vectors that are loaded in two
#     halves.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0   x3  v0     v3
# A1   x9  v0[1]  v3[1]
# A2  x10  v1     v4
# A3  x11  v1[1]  v4[1]
# A4  x12  v2     v5
# A5   x4  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15  second set of B
# B        v16 v17 v18 v19  first set

# C    x6  v20 v21
# C   x16  v22 v23
# C   x17  v24 v25
# C   x14  v26 v27
# C   x13  v28 v29
# C    x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        # Each CMP feeds two pairs of CSELs: LO covers "mr < n" and LS covers
        # "mr <= n". The ADDs in between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // A1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // A2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // A3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // A4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        # From here on x4 and x7 no longer hold the strides; they become the
        # row-5 A and C pointers.
        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // A5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load min/max values
        # LD2R broadcasts the first float at [x8] to every lane of v6 and the
        # second float to every lane of v7.
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        # This moves sp down by 32 bytes, so the stack argument cn_stride is
        # found at [sp, 32] until the matching restore.
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

0:
        # Start of one 8-column tile (re-entered from the full-width store
        # while nc remains).
        # Load initial bias from w into accumulators
        # Rows 1-5 start as copies of row 0 (v20/v21). The prefetches are
        # interleaved with the copies.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
        PRFM    PLDL1KEEP, [x3, 64]
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x9, 0]
        PRFM    PLDL1KEEP, [x9, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x10, 0]
        PRFM    PLDL1KEEP, [x10, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x11, 0]
        PRFM    PLDL1KEEP, [x11, 64]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x12, 0]
        PRFM    PLDL1KEEP, [x12, 64]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x4, 0]
        PRFM    PLDL1KEEP, [x4, 64]
        MOV     v28.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v30.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v31.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # x0 stops being mr here and becomes the k counter. Only multiples of
        # 16 are ever subtracted from it, so its low 4 bits stay equal to
        # those of kc; the remainder code at 4: relies on that.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Prologue - First group loads, no FMA
        # Two rows share one A register: the even row in the low 64 bits and
        # the odd row in the high 64 bits (2 floats each).
        LDR     d0, [x3], 8             // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     d1, [x10], 8            // A2
        LDR     d2, [x12], 8            // A4
        LD1     {v0.d}[1], [x9], 8      // A1
        LD1     {v1.d}[1], [x11], 8     // A3
        LD1     {v2.d}[1], [x4], 8      // A5
        SUBS    x0, x0, 16              // flags consumed by B.LO 2f below; the loads in between leave them intact
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 16 LD64 B (8 B vectors, each loaded as two 64-bit halves)
        # Software pipelined: each group of FMAs uses registers filled by the
        # previous group while loading the registers for the next one. The
        # upper half of a vector is loaded into x8 and inserted one block
        # later.
        # NOTE(review): the split 64-bit load + INS pattern is presumably
        # chosen to suit Cortex-A53 issue rules - confirm against the template.
1:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x3], 8             // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x9], 8             // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR     d4, [x10], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x11], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR     d5, [x12], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x4], 8             // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x3], 8             // A0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x8, [x9], 8             // A1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x3, 128]    // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // A1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x8, [x5, 72]            // B
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x9, 128]    // Prefetch A1

        # BLOCK 2
        LDR     d1, [x10], 8            // A2
        INS     v16.d[1], x8            // B
        FMLA    v21.4s, v13.4s, v3.s[0]
        LDR     x8, [x11], 8            // A3
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x10, 128]   // Prefetch A2

        # BLOCK 3
        LDR     d2, [x12], 8            // A4
        INS     v1.d[1], x8             // A3 ins
        FMLA    v27.4s, v13.4s, v4.s[2]
        LDR     x8, [x4], 8             // A5
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x11, 128]   // Prefetch A3

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // A5 ins
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     x8, [x5, 88]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x12, 128]   // Prefetch A4

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // B
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     x8, [x5, 104]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        PRFM    PLDL1KEEP, [x4, 128]    // Prefetch A5

        # BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // B
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x8, [x5, 120]           // upper half of v19; inserted in BLOCK 0 of the next iteration or of the epilogue
        FMLA    v23.4s, v15.4s, v3.s[3]
        PRFM    PLDL1KEEP, [x5, 192]    // Prefetch B
        FMLA    v25.4s, v15.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B

        # BLOCK 7
        SUBS    x0, x0, 16              // LDR lands here
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        ADD     x5, x5, 128             // step over the 8 B vectors read at offsets 0..120 (ADD leaves the SUBS flags intact)
        FMLA    v31.4s, v15.4s, v5.s[3]
        B.HS    1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 8 LD64 B (second-group data only)
        # Same as one main-loop iteration, except that the second group does
        # not load data for a following iteration and the C rows are
        # prefetched for store instead.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x3], 8             // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x9], 8             // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x6]         // Prefetch C0

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x16]        // Prefetch C1

        # BLOCK 2
        LDR     d4, [x10], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x11], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x17]        // Prefetch C2

        # BLOCK 3
        LDR     d5, [x12], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x4], 8             // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x14]        // Prefetch C3

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        PRFM    PSTL1KEEP, [x13]        // Prefetch C4

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        PRFM    PSTL1KEEP, [x7]         // Prefetch C5

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        TST     x0, 15                  // any k bytes left below a multiple of 16? consumed by B.NE 4f below

        # BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // step over the 4 B vectors read at offsets 0..56 (ADD leaves the TST flags intact)

        # BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        FMLA    v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    4f
3:
        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # It is at [sp, 32] because 32 bytes were pushed for d12-d15. x0 is
        # free to be overwritten: the k counter is not needed again for this tile.
        LDR     x0, [sp, 32]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed both B.LO 6f and B.HI 0b below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Taken only when at least 8 columns remained; otherwise go to the
        # partial-width store at 6:.
        B.LO    6f

        # Each store advances its C pointer by cn_stride, and each A pointer
        # is rewound by kc so the next tile re-reads the same A rows.
        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x3, x3, x2              // A0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x9, x9, x2              // A1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x10, x10, x2            // A2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x11, x11, x2            // A3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x0
        SUB     x12, x12, x2            // A4 -= kc
        ST1     {v30.16b, v31.16b}, [x7], x0
        SUB     x4, x4, x2              // A5 -= kc

        B.HI    0b                      // more columns remain (nc was > 8): process the next tile

        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Reached with x0 = kc minus a multiple of 16, so bits 3 and 2 of x0
        # are bits 3 and 2 of kc.
        TBZ     x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8
        LDR     d1, [x10], 8
        LD1     {v1.d}[1], [x11], 8
        LDR     d2, [x12], 8
        LD1     {v2.d}[1], [x4], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        # The even row's float goes to lane 0 and the odd row's float to
        # lane 2, so the same s[0] / s[2] lane selectors apply as above.
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4
        LDR     s1, [x10], 4
        LD1     {v1.s}[2], [x11], 4
        LDR     s2, [x12], 4
        LD1     {v2.s}[2], [x4], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        B       3b

        # Store odd width
        # Reached when fewer than 8 columns remained. x1 is now nc - 8, whose
        # low three bits equal those of nc: bit 2 stores 4 floats, bit 1
        # stores 2, bit 0 stores 1. After each partial store the unstored
        # lanes are moved down into the low lanes of the same register.
6:
        TBZ     x1, 2, 7f
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

7:
        TBZ     x1, 1, 8f
        STR     d20, [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

8:
        TBZ     x1, 0, 9f
        STR     s20, [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x14]
        STR     s28, [x13]
        STR     s30, [x7]
9:
        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (conventionally used
// to mark the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif