// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x14  v0     v3
# A1  x15  v0[1]  v3[1]
# A2  x20  v1     v4
# A3  x21  v1[1]  v4[1]
# A4  x22  v2     v5
# A5  x23  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15 second set of B
# B        v16 v17 v18 v19 first set

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x10  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register

// Overview (derived from the instructions below):
//   Computes a 6-row by 8-column tile of f32 outputs at a time. Each tile
//   starts from 8 bias floats read from w, accumulates A * B with FMLA over
//   kc bytes of K for every group of 6 row pointers taken from a (ks bytes of
//   pointers per tile), clamps the result to [v6, v7] and stores it to c.
//   kc, ks, cm_stride, cn_stride and a_offset are all used as byte counts.
//
//   x0 (mr) is only live until the C pointers have been clamped; afterwards
//   x0 is reused as the remaining-K counter and then as cn_stride.
//   x8 (params) is only live until LD2R; afterwards it is the 64-bit staging
//   register for the upper half of vectors that are assembled with INS.
//
//   The kernel always dereferences all six A pointers of a step, regardless
//   of mr. NOTE(review): presumably the caller guarantees six readable
//   pointers per step even when mr < 6 - confirm against the caller.
//
// Local labels:
//   0:     start of one 8-column output tile (nc loop)
//   1:     start of one ks step (loads 6 new A row pointers)
//   2:     software-pipelined main loop, 4 floats of K per pass
//   3:     epilogue, finishes the last 4 floats of K held by the pipeline
//   4:     end of a ks step; falls through to clamp and store when ks is done
//   5:     K remainder of 2 floats
//   6:     K remainder of 1 float
//   7/8/9: partial-width store of 4, 2 and 1 columns
//   10:    restore callee-saved registers and return

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53

        // The stack arguments are read before sp is moved, so the offsets
        // here are relative to the incoming sp.
        # Load a_offset
        LDR     x11, [sp, 8]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        // Rows at index >= mr alias the previous row pointer, so their
        // stores land on an address that a lower row also writes.
        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        // Two consecutive floats at [x8] are broadcast: the first to v6 (used
        // below with FMAX, i.e. the lower bound) and the second to v7 (used
        // with FMIN, i.e. the upper bound). x8 is free for reuse after this.
        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]

        // 64-byte frame: d12,d13 at sp+0, d14,d15 at sp+16, x20,x21 at sp+32,
        // x22,x23 at sp+48. The incoming stack arguments are now at sp+64 and up.
        # Save x20-x23, d12-d15 on stack
        STP     d12, d13, [sp, -64]!
        STP     d14, d15, [sp, 16]
        STP     x20, x21, [sp, 32]
        STP     x22, x23, [sp, 48]

0:
        // Row 0 accumulators receive the 8 bias floats from w; rows 1-5 are
        // initialised as copies of them.
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // A pointer equal to the zero pointer is used as is; every other
        // pointer is advanced by a_offset.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // A0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // A1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // A2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // A3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // A4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // A5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = (a5 == zero) ? zero : a5 + a_offset

        // x0 becomes the K counter. It is only ever reduced by multiples of
        // 16, so its low 4 bits always equal those of kc; the remainder tests
        // (TST x0, 15 and TBZ x0, 3 / TBZ x0, 2) rely on that.
        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        // Two floats per row are loaded; rows are packed two per vector
        // (even row in the low 64 bits, odd row in the high 64 bits).
        // 64 bytes of B are consumed; the upper half of v19 is parked in x8.
        # Prologue - First group loads, no FMA
        LDR     d0, [x14], 8            // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     d1, [x20], 8            // A2
        LDR     d2, [x22], 8            // A4
        LD1     {v0.d}[1], [x15], 8     // A1
        LD1     {v1.d}[1], [x21], 8     // A3
        LD1     {v2.d}[1], [x23], 8     // A5
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO    3f

        // Each 128-bit operand fetched inside the loop is built from a 64-bit
        // LDR into the d register, a 64-bit LDR into x8 and an INS of x8 into
        // the upper half. NOTE(review): presumably chosen to suit Cortex-A53
        // issue rules - the rationale is not stated in this file.
        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR     d4, [x20], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR     d5, [x22], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x14], 8            // A0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x14, 128]   // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // A1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x8, [x5, 72]            // B
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x15, 128]   // Prefetch A1

        # BLOCK 2
        LDR     d1, [x20], 8            // A2
        INS     v16.d[1], x8            // B
        FMLA    v21.4s, v13.4s, v3.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x20, 128]   // Prefetch A2

        # BLOCK 3
        LDR     d2, [x22], 8            // A4
        INS     v1.d[1], x8             // A3 ins
        FMLA    v27.4s, v13.4s, v4.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x21, 128]   // Prefetch A3

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // A5 ins
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     x8, [x5, 88]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x22, 128]   // Prefetch A4

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // B
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     x8, [x5, 104]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        PRFM    PLDL1KEEP, [x23, 128]   // Prefetch A5

        # BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // B
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x8, [x5, 120]           // upper half of v19; inserted in BLOCK 0 of the next pass or of the epilogue
        FMLA    v23.4s, v15.4s, v3.s[3]
        PRFM    PLDL1KEEP, [x5, 192]    // Prefetch B
        FMLA    v25.4s, v15.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B

        # BLOCK 7
        SUBS    x0, x0, 16              // LDR lands here
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        ADD     x5, x5, 128             // 128 bytes of B consumed per pass; does not alter the flags from SUBS
        FMLA    v31.4s, v15.4s, v5.s[3]
        B.HS    2b

        // Same first group as the main loop, with the output rows prefetched
        // for store in place of further A and B prefetches.
        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x6]         // Prefetch C0

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x16]        // Prefetch C1

        # BLOCK 2
        LDR     d4, [x20], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x17]        // Prefetch C2

        # BLOCK 3
        LDR     d5, [x22], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x10]        // Prefetch C3

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        PRFM    PSTL1KEEP, [x13]        // Prefetch C4

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        PRFM    PSTL1KEEP, [x7]         // Prefetch C5

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B from previous
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        // Unlike the main loop, the second group here issues no loads: the
        // pipeline is only drained.
        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        TST     x0, 15                  // any of the low 4 bits of kc set? consumed by B.NE below

        # BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // step over the second-group B read above; flags unchanged

        # BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        FMLA    v31.4s, v15.4s, v5.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        B.NE    5f

4:
        // x9 counts the bytes of A pointers left for this tile; 48 = 6 pointers
        // of 8 bytes each.
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        // FMAX against v6 applies the lower bound, FMIN against v7 the upper.
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR     x0, [sp, 64]            // incoming [sp], now 64 bytes above sp
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; these flags feed both B.LO 7f and B.HI 0b
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        // Rows are written from row 5 down to row 0, so where clamped C
        // pointers alias, the lowest-numbered row is the last one written.
        // None of the instructions up to B.HI modify the condition flags.
        # Store full 6 x 8
        B.LO    7f

        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21, [x6]
        ADD     x6, x6, x0

        SUB     x4, x4, x3              // A -= ks (rewind the pointer array for the next tile)

        # nc loop
        B.HI    0b

        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

5:
        // Reached either straight from label 1 (kc < 16 bytes) or from the
        // epilogue; in both cases the low 4 bits of x0 equal those of kc.
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBZ     x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        LDR     d0, [x14], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x15], 8
        LDR     d1, [x20], 8
        LD1     {v1.d}[1], [x21], 8
        LDR     d2, [x22], 8
        LD1     {v2.d}[1], [x23], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        // One float per row: even rows go to lane 0 and odd rows to lane 2,
        // matching the lane indices used by the FMLAs below.
        # Remainder - 1 float of A (4 bytes)
        LDR     s0, [x14], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x15], 4
        LDR     s1, [x20], 4
        LD1     {v1.s}[2], [x21], 4
        LDR     s2, [x22], 4
        LD1     {v2.s}[2], [x23], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        B       4b

        // Fewer than 8 columns remain. x1 holds nc - 8 here, whose low 3 bits
        // equal those of the remaining column count. Bits 2, 1 and 0 select
        // stores of 4, 2 and 1 floats per row; after each partial store the
        // not yet written lanes are moved down to the bottom of the register.
        // Rows are again written from row 5 down to row 0.
        # Store odd width
7:
        TBZ     x1, 2, 8f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 1, 9f
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
10:
        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif