// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Purpose: for each block of up to 8 output columns, seed a 4x8 tile of
# accumulators from the start of w, accumulate A * B with FMLA over kc bytes
# for every group of 4 A pointers (ks loop), clamp the tile to [min, max] and
# store it to the c0..c3 rows, then step to the next block of columns (nc loop).

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel saves and restores d12-d15 and x19.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3  (x8 first holds the params pointer; it is reused once min/max are loaded)

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3  (x7 holds cm_stride on entry and is overwritten with c3)

# Other scalar registers
#  x0 k, byte counter derived from kc (x0 holds mr only during pointer clamping)
#  x9 p, byte counter derived from ks
# x10 cn_stride
# x11 a_offset
# x12 zero

# x19 temporary vector shadow register
#     (64-bit loads land in x19 and are then moved into the upper half of a
#      vector register with INS)

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Registers not referenced by any instruction in this kernel:
#   v2 v5 v8 v9 v10 v11 v28 v29 v30 v31
# NOTE(review): the previous header also listed a4/a5 (x12, x4), c4/c5 (x13, x7),
# A4/A5 (v2, v5) and C rows in v28-v31. Those roles do not match this 4-row
# kernel (x12 is zero, x4 is a, x13 is a0, x7 is c3) - presumably left over
# from a 6-row variant; confirm before relying on them.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

        # Clamp C pointers
        # Rows at index >= mr alias the previous row pointer.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        # Load cn_stride, a_offset
        # Stack arguments are read before sp is moved by the register save below.
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        # LD2R broadcasts the first float at [x8] to v6 and the second to v7.
        # v6 is used as the lower bound (FMAX) and v7 as the upper bound (FMIN).
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save x19, d12-d15 on stack
        STP     d12, d13, [sp, -48]!
        STP     d14, d15, [sp, 16]
        STR     x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        # q20/q21 hold the 8 bias values; they are copied to the other 3 rows.
        # NOTE(review): on the first pass x13-x15 have not been written by this
        # function yet and x8 still holds the params pointer, so the A prefetches
        # below use whatever those registers contain. PRFM is a hint only -
        # presumably harmless, confirm.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 0]     // Prefetch A
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 64]
        MOV     v23.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x14, 0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x14, 64]
        MOV     v24.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15, 0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15, 64]
        MOV     v25.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8, 0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8, 64]
        MOV     v26.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 0]     // Prefetch B
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 64]
        MOV     v27.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]

        MOV     x9, x3                  // p = ks

1:
        # Load next 4 A pointers
        # x4 advances by 32 bytes (4 pointers) per pass through this label.
        LDP     x13, x14, [x4], 16
        LDP     x15, x8, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets a_offset
        # added. CMP sets the flags before the ADD; ADD does not change them.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x11           // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x11           // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x11           // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x8, x12                 // if a3 == zero
        ADD     x8, x8, x11             // a3 += a_offset
        CSEL    x8, x12, x8, EQ         //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Prologue - First group loads, no FMA
        # Loads 2 floats from each of the 4 A rows and 64 bytes of B (v16-v19).
        # The upper half of v19 is still in x19 when this sequence ends.
        LDR     d0, [x13], 8            // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x15], 8            // a2
        LD1     {v0.d}[1], [x14], 8     // a1
        LD1     {v1.d}[1], [x8], 8      // a3
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x19, [x5], 8            // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        # (flags are from the SUBS above; the loads in between do not set flags)
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Each half computes with one register set while loading the other:
        # v0/v1 + v16-v19 (first set) alternate with v3/v4 + v12-v15 (second set).
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x13], 8            // a0
        INS     v19.d[1], x19           // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19            // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR     d4, [x15], 8            // a2
        INS     v12.d[1], x19           // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19            // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19           // b from previous
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x13], 8            // a0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x13, 128]    // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x19            // a1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x19, [x5, 72]           // b
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x14, 128]    // Prefetch A1

        # BLOCK 2
        LDR     d1, [x15], 8            // a2
        INS     v16.d[1], x19           // b
        FMLA    v25.4s, v13.4s, v4.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x15, 128]    // Prefetch A2

        # BLOCK 3
        LDR     d17, [x5, 80]
        INS     v1.d[1], x19            // a3 ins
        FMLA    v22.4s, v14.4s, v3.s[3]
        LDR     x19, [x5, 88]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x8, 128]     // Prefetch A3

        # BLOCK 4
        LDR     d18, [x5, 96]
        INS     v17.d[1], x19           // b
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x19, [x5, 104]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]     // Prefetch B

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        # The upper half of v19 stays in x19; it is inserted by BLOCK 0 of the
        # next iteration or of the epilogue.
        LDR     d19, [x5, 112]
        INS     v18.d[1], x19
        FMLA    v27.4s, v15.4s, v4.s[3]
        LDR     x19, [x5, 120]
        SUBS    x0, x0, 16
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]     // Prefetch B
        ADD     x5, x5, 128             // 128 bytes of B consumed per iteration
        B.HS    2b                      // flags from SUBS; ADD does not set flags

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Same as one main loop iteration, except that the second half issues
        # no further loads.
3:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x13], 8            // a0
        INS     v19.d[1], x19           // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19            // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR     d4, [x15], 8            // a2
        INS     v12.d[1], x19           // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19            // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x19, [x5, 56]
        NOP                             // fma
        NOP
        NOP                             // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // skip the 64 bytes of B read at [x5]..[x5, 56] above

        # BLOCK 5
        FMLA    v27.4s, v15.4s, v4.s[3]

4:
        # x0 has only been changed from kc by subtracting multiples of 16, so
        # bits 3 and 2 of x0 equal bits 3 and 2 of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ    x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ    x0, 2, 7f
5:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        # Lower bound (v6) first, then upper bound (v7).
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS    x1, x1, 8               // nc -= 8; fewer than 8 columns left goes to the partial store
        B.LO    8f

        # Rows are written from c3 down to c0; each C pointer then advances by
        # cn_stride.
        STP     q26, q27, [x7]
        ADD     x7, x7, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1, x1, 8: the STP/ADD/SUB instructions
        # in between do not set flags.
        B.HI    0b

        # Restore x19, d12-d15 from stack
        LDR     x19, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR     d0, [x13], 8
        LDP     q16, q17, [x5], 32
        LD1     {v0.d}[1], [x14], 8
        LDR     d1, [x15], 8
        LD1     {v1.d}[1], [x8], 8
        LDP     q18, q19, [x5], 32
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        # If not, go back to the ks loop; otherwise fall through to label 7.
        TBZ     x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        # a0/a2 go to lane 0 and a1/a3 to lane 2, matching the lane layout used
        # by the FMLA sequence below.
        LDR     s0, [x13], 4
        LDP     q16, q17, [x5], 32
        LD1     {v0.s}[2], [x14], 4
        LDR     s1, [x15], 4
        LD1     {v1.s}[2], [x8], 4

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        B       5b

        # Store odd width
        # x1 is nc - 8 here; subtracting 8 leaves bits 2..0 equal to those of the
        # remaining column count, so each bit selects a 4, 2 or 1 column store.
        # After each partial store the not yet written columns are moved down
        # into the low part of the row register.
8:
        TBZ     x1, 2, 9f
        STR     q26, [x7], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d26, [x7], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s26, [x7]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x19, d12-d15 from stack
        LDR     x19, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif