// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Produces a tile of up to 4 rows x 12 columns of float output per pass of the
# nc loop. The A rows are reached through an array of pointers (a), 4 pointers
# per ks step. A pointer equal to zero is used as is; every other pointer gets
# a_offset added. The accumulators start from 12 floats read from w, and the
# result is clamped with the two floats read from params before it is stored.
# kc and ks are handled as byte counts (16 bytes = 4 floats, 32 bytes = 4
# pointers).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel writes d8-d11, d14 and d15, and saves/restores exactly those.

# A pointers
# x13 a0
# x14 a1
# x15 a2
# x16 a3

# C pointers
# x6  c0
# x17 c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
# (holds the upper 64 bits of a vector until INS moves them into place)

# Vector register usage and GPR shadows
# A0  v0
# A1  v0[1]
# A2  v1
# A3  v1[1]
# A0  v2
# A1  v2[1]
# A2  v3
# A3  v3[1]
# B   v6  v7  v8
# B   v9  v10 v11
# B   v14 v15 v16
# B   v17 v18 v19
# C   v20 v21 v22
# C   v23 v24 v25
# C   v26 v27 v28
# C   v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load a_offset
        # (stack arguments are read before sp is moved by the save below)
        LDR x11, [sp, 8]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Save d8-d11,d14,d15 on stack
        # Pre-decrements sp by 48, so the incoming stack arguments are now
        # 48 bytes further away from sp.
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Load min/max values
        # LD2R broadcasts the first float at [x8] to all lanes of v4 and the
        # second float to all lanes of v5. v4 is the FMAX operand (lower
        # bound) and v5 is the FMIN operand (upper bound) in the clamp below.
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp C pointers
        # A row index that is not below mr reuses the pointer of the previous
        # row. The LO and LS selects both consume the flags of CMP x0, 2
        # because ADD does not modify flags.
        CMP x0, 2                       // if mr < 2
        ADD x17, x6, x7                 // c1 = c0 + cm_stride
        CSEL x17, x6, x17, LO           //   c1 = c0

        ADD x10, x17, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x10, x17, x10, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x7, x10, x7                 // c3 = c2 + cm_stride
        CSEL x7, x10, x7, LO            //   c3 = c2

        # nc loop entry: one pass per group of 12 output columns
0:
        # Load initial bias from w into accumulators
        # v20-v22 receive 12 floats (48 bytes); the MOVs copy them to the
        # accumulators of the other three rows. Prefetches of B are
        # interleaved with the copies.
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV v23.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        MOV v24.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v25.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v28.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v29.16b, v20.16b
        MOV v30.16b, v21.16b
        MOV v31.16b, v22.16b

        MOV x9, x3                      // p = ks

        # ks loop entry: one pass per set of 4 A pointers
1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x16, [x4], 16

        # Each compare uses the pointer value before the ADD; the ADD does
        # not modify flags, so CSEL still sees the result of the CMP.
        CMP x13, x12                    // if a0 == zero
        ADD x13, x13, x11               // a0 += a_offset
        CSEL x13, x12, x13, EQ          //   a0 = zero, else a0 + a_offset
        CMP x14, x12                    // if a1 == zero
        ADD x14, x14, x11               // a1 += a_offset
        CSEL x14, x12, x14, EQ          //   a1 = zero, else a1 + a_offset
        CMP x15, x12                    // if a2 == zero
        ADD x15, x15, x11               // a2 += a_offset
        CSEL x15, x12, x15, EQ          //   a2 = zero, else a2 + a_offset
        CMP x16, x12                    // if a3 == zero
        ADD x16, x16, x11               // a3 += a_offset
        CSEL x16, x12, x16, EQ          //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # The flags set here are consumed by B.LO 5f after the prefetches.
        SUBS x0, x2, 16                 // k = kc - 16

        PRFM PLDL1KEEP, [x13, 0]        // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        PRFM PLDL1KEEP, [x16, 0]
        PRFM PLDL1KEEP, [x16, 64]
        B.LO 5f

        # The flags set here are consumed by B.LO 3f after the prologue
        # loads, none of which modify flags.
        SUBS x0, x0, 16                 // 4 floats for main loop

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        # Two floats per row. v0 = {a0[0], a0[1], a1[0], a1[1]} and
        # v1 = {a2[0], a2[1], a3[0], a3[1]}, so lanes 0 and 2 hold the first
        # k value of a row pair and lanes 1 and 3 hold the second k value.
        LDR d0, [x13], 8                // a0
        LDR d1, [x15], 8                // a2
        LD1 {v0.d}[1], [x14], 8         // a1
        LD1 {v1.d}[1], [x16], 8         // a3

        # v6-v8 hold the 12 B values used with lanes 0/2, v9-v11 the 12 B
        # values used with lanes 1/3. Only the low half of v11 is loaded
        # directly; its high half waits in x8 until INS in BLOCK 0.
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Each pass executes 48 FMLA, reads 16 bytes from every A row and
        # advances the B pointer by 192 bytes.
2:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x13], 8                // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8                // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x13, 128]      // Prefetch A0

        # BLOCK 1
        LDR d3, [x15], 8                // a2
        INS v2.d[1], x8                 // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8                // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x14, 128]      // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]                   // vb0x0123
        INS v3.d[1], x8                 // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x15, 128]      // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]               // vb0x4567
        INS v14.d[1], x8                // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x16, 128]      // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]               // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]       // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]               // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]       // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]               // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]       // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]               // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x13], 8                // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x14], 8                // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x15], 8                // a2
        INS v0.d[1], x8                 // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x16], 8                // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]                // vb0x0123
        INS v1.d[1], x8                 // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]               // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]               // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]               // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        # SUBS sets the flags for B.HS 2b at the end of BLOCK 7; the
        # instructions in between do not modify flags.
        LDR d10, [x5, 160]              // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        # The high half of v11 is left in x8 for BLOCK 0 of the next pass
        # (or of the epilogue).
        LDR d11, [x5, 176]              // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d2, [x13], 8                // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8                // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x15], 8                // a2
        INS v2.d[1], x8                 // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8                // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]                   // vb0x0123
        INS v3.d[1], x8                 // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]               // vb0x4567
        INS v14.d[1], x8                // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]               // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]               // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]               // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]               // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads from A or B here; only the
        # pending upper half of v19 is inserted from x8.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        # TST checks the low 4 bits of k (bytes left over after whole groups
        # of 4 floats). Its flags are consumed by B.NE 5f below; FMLA and ADD
        # do not modify flags.
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96                  // step over the B read at [x5]..[x5, 88]
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 32                 // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v4.4s
        # Load cn_stride
        # It was at [sp] on entry; sp has since been lowered by 48 bytes.
        LDR x0, [sp, 48]
        FMAX v21.4s, v21.4s, v4.4s
        FMAX v22.4s, v22.4s, v4.4s
        FMAX v23.4s, v23.4s, v4.4s
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        # nc -= 12. The flags are used twice: by B.LO 7f (fewer than 12
        # columns left) and by B.HI 0b (columns still left after this tile).
        # Nothing in between modifies flags.
        SUBS x1, x1, 12
        FMIN v20.4s, v20.4s, v5.4s
        FMIN v21.4s, v21.4s, v5.4s
        FMIN v22.4s, v22.4s, v5.4s
        FMIN v23.4s, v23.4s, v5.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # (taken only when at least 12 columns were left; otherwise go to
        # the partial store at 7)
        B.LO 7f

        # Each store post-increments its C pointer by cn_stride (x0).
        ST1 {v29.16b, v30.16b, v31.16b}, [x7], x0
        ST1 {v26.16b, v27.16b, v28.16b}, [x10], x0
        ST1 {v23.16b, v24.16b, v25.16b}, [x17], x0
        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x0
        SUB x4, x4, x3                  // a -= ks

        # nc loop
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k: reached with kc below 16 bytes or with the low
        # 4 bits of k non-zero. Here every A row has its own register
        # (v0 = a0, v1 = a1, v2 = a2, v3 = a3), unlike the paired layout of
        # the main loop.
5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x13], 8                // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x14], 8                // a1
        LDR d2, [x15], 8                // a2
        LDR d3, [x16], 8                // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x13], 4                // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x14], 4                // a1
        LDR s2, [x15], 4                // a2
        LDR s3, [x16], 4                // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

        # Partial store: fewer than 12 columns were left. x1 is restored to
        # the remaining column count and its bits 3, 2, 1, 0 select stores of
        # 8, 4, 2 and 1 floats per row. After each store the not yet stored
        # values are moved down into the first register of the row.
7:
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 8f
        STP q29, q30, [x7], 32
        MOV v29.16b, v31.16b
        STP q26, q27, [x10], 32
        MOV v26.16b, v28.16b
        STP q23, q24, [x17], 32
        MOV v23.16b, v25.16b
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q29, [x7], 16
        MOV v29.16b, v30.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q23, [x17], 16
        MOV v23.16b, v24.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d29, [x7], 8
        DUP d29, v29.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d23, [x17], 8
        DUP d23, v23.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s29, [x7]
        STR s26, [x10]
        STR s23, [x17]
        STR s20, [x6]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif