// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                            x0
#     size_t nc,                            x1
#     size_t kc,                            x2 / x0
#     size_t ks,                            x3 / x9
#     const float**restrict a,              x4
#     const void*restrict w,                x5
#     uint8_t*restrict c,                   x6
#     size_t cm_stride,                     x7
#     size_t cn_stride,                     [sp] -> x10
#     size_t a_offset,                      [sp + 8] -> x11
#     const float* zero,                    [sp + 16] -> x12
#     const xnn_f32_minmax_params params    [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0  v0 v3
# A1  v0[1] v3[1]
# A2  v1 v4
# A3  v1[1] v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4  v2 v5
# A5  v2[1] v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53

        # Clamp C pointers
        CMP x0, 2                 // if mr < 2
        ADD x16, x6, x7           // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO     // c1 = c0

        ADD x17, x16, x7          // c2 = c1 + cm_stride
                                  // if mr <= 2
        CSEL x17, x16, x17, LS    // c2 = c1

        CMP x0, 4                 // if mr < 4
        ADD x7, x17, x7           // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO      // c3 = c2

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Save x19, d12-d15 on stack
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STR x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13, 0]    // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8, 0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3    // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        CMP x13, x12              // if a0 == zero
        ADD x13, x13, x11         // a0 += a_offset
        CSEL x13, x12, x13, EQ    // a0 = zero, else += a0 + a_offset
        CMP x14, x12              // if a1 == zero
        ADD x14, x14, x11         // a1 += a_offset
        CSEL x14, x12, x14, EQ    // a1 = zero, else += a1 + a_offset
        CMP x15, x12              // if a2 == zero
        ADD x15, x15, x11         // a2 += a_offset
        CSEL x15, x12, x15, EQ    // a2 = zero, else += a2 + a_offset
        CMP x8, x12               // if a3 == zero
        ADD x8, x8, x11           // a3 += a_offset
        CSEL x8, x12, x8, EQ      // a3 = zero, else += a3 + a_offset
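
        # For each row, the CMP/ADD/CSEL sequence above amounts to the
        # following sketch (assuming the usual XNNPACK IGEMM indirection
        # convention, where a[] holds one pointer per row and padding rows
        # point at the shared zero buffer):
        #
        #   a0 = (a[0] == zero) ? zero : (const float*) ((uintptr_t) a[0] + a_offset);
        #
        # i.e. real input rows are shifted by a_offset, while zero-padding
        # rows are read from the zero buffer unmodified.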

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16    // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x13], 8           // a0
        LDP q16, q17, [x5], 32     // b
        LDR d1, [x15], 8           // a2
        LD1 {v0.d}[1], [x14], 8    // a1
        LD1 {v1.d}[1], [x8], 8     // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8           // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x13], 8     // a0
        INS v19.d[1], x19    // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8    // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19     // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]     // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x15], 8     // a2
        INS v12.d[1], x19    // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8     // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19     // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19    // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19    // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x13], 8     // a0
        INS v15.d[1], x19    // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x14], 8    // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x13, 128]    // Prefetch A0

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19     // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]    // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        PRFM PLDL1KEEP, [x14, 128]    // Prefetch A1

        # BLOCK 2
        LDR d1, [x15], 8     // a2
        INS v16.d[1], x19    // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x19, [x8], 8     // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        PRFM PLDL1KEEP, [x15, 128]    // Prefetch A2

        # BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x19     // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x19, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        PRFM PLDL1KEEP, [x8, 128]     // Prefetch A3

        # BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x19    // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]     // Prefetch B

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x19, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]     // Prefetch B
        ADD x5, x5, 128
        B.HS 2b
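
        # Each main-loop iteration above covers 4 k-steps: 32 FMLA.4s
        # instructions = 4 (k) * 4 (MR) * 8 (NR) = 128 multiply-adds.
        # The 128-bit A and B operands are assembled from a 64-bit vector LDR
        # plus a 64-bit GPR LDR and an INS, which is understood to be a
        # Cortex-A53 scheduling idiom: the narrow loads can dual-issue
        # alongside the FMLAs, where full 128-bit vector loads would not.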

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
3:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x13], 8     // a0
        INS v19.d[1], x19    // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8    // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19     // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]     // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x15], 8     // a2
        INS v12.d[1], x19    // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8     // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19     // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19    // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP    // fma
        NOP
        NOP    // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x19    // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 7f

5:
        # ks loop
        SUBS x9, x9, 32    // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        STP q26, q27, [x7]
        ADD x7, x7, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3    // a -= ks

        # nc loop
        B.HI 0b

        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET
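
        # The paths below handle a kc that is not a multiple of 16 bytes.
        # At this point x0 holds kc minus a multiple of 16, so its low bits
        # match those of kc: bit 3 selects the 2-float (8-byte) tail at 6:,
        # and bit 2 the 1-float (4-byte) tail at 7:, before control rejoins
        # the ks loop at 5:.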

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif