// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# f32 GEMM microkernel producing a 4-row x 8-column tile of C, software-pipelined
# for the in-order, dual-issue Cortex-A53. Template variables: INC selects the
# accumulate-into-existing-partials ("gemminc") variant; PREFETCH adds PRFM hints.
#
# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# A pointers (one per row of the 4-row tile; clamped to the previous row when mr < 4)
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers (clamped the same way as the A pointers)
# x6  c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register
#   NOTE(review): x4 holds a_stride only until the pointer setup above is done;
#   afterwards it is reused as a GP staging register for INS into the high half
#   of vector registers (a GP load + INS avoids a second vector-load slot per block).

# Vector register usage
# A0   v0  v3
# A1  v0[1] v3[1]
# A2   v1  v4
# A3  v1[1] v4[1]

# B   v12 v13 v14 v15  second set of B
# B   v16 v17 v18 v19  first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7   (v6 = min, v7 = max, loaded pairwise by LD2R from params)

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# x7  c5
# A4   v2  v5
# A5  v2[1] v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers: rows beyond mr alias the last valid row, so the
        # kernel always computes 4 rows but writes/reads no out-of-bounds memory.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        # Load min/max values (LD2R de-interleaves: v6 = all-lanes min, v7 = all-lanes max)
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack (callee-saved low halves; v12-v15 hold the second B set).
        # After this push the incoming stack args shift by 32: cn_stride is at [sp, 32].
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          # (the same 8 bias values seed all 4 rows; prefetches are interleaved
          # between the MOVs to pair with them on the dual-issue pipeline)
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3, 64]
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11, 0]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8          // a0
        LDP q16, q17, [x5], 32   // b
        LDR d1, [x10], 8         // a2
        LD1 {v0.d}[1], [x9], 8   // a1
        LD1 {v1.d}[1], [x11], 8  // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8          // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Each BLOCK is scheduled as one 64-bit vector LDR + one INS + one GP LDR
        # + FMAs so that loads and FMAs dual-issue on the A53 and the 4-cycle
        # load latency is hidden; do not reorder within a BLOCK.
1:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8          // a0
        INS v19.d[1], x4         // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8          // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4          // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]          // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8         // a2
        INS v12.d[1], x4         // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8         // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4          // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4         // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4         // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v15.d[1], x4         // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x4, [x9], 8          // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x3, 128]   // Prefetch A0

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4          // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x4, [x5, 72]         // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x9, 128]   // Prefetch A1

        # BLOCK 2
        LDR d1, [x10], 8         // a2
        INS v16.d[1], x4         // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x4, [x11], 8         // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x10, 128]  // Prefetch A2

        # BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4          // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x11, 128]  // Prefetch A3

        # BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4         // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]   // Prefetch B

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]   // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Same as one main-loop iteration, but the second FMA group issues no
        # loads because there is no next iteration to feed.
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8          // a0
        INS v19.d[1], x4         // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8          // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4          // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]          // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8         // a2
        INS v12.d[1], x4         // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8         // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4          // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4         // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP                      // fma
        NOP
        NOP                      // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x4         // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15               // any remainder k (1-3 floats) left?

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f

3:
        # Clamp accumulators to [min, max]; interleave the scalar loads/SUBS
        # with the FMAX chain to hide their latency.
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride ([sp] arg, shifted by the 32-byte d12-d15 save area)
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; sets flags for partial-store dispatch
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 6f

        $if INC:
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x11, x11, x2       // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2       // a3 -= kc

        B.HI 0b                  // more columns (nc > 0): next 8-wide tile

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder- 1 float of A (4 bytes)
        # Each A value lands in an even lane (a0/a2 in lane 0, a1/a3 in lane 2)
        # to match the s[0]/s[2] lane selectors used throughout.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width (nc < 8): write 4, then 2, then 1 column(s) per set bit
        # of the remaining nc, shifting surviving lanes down after each store.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b

7:
        TBZ x1, 1, 8f
        $if INC:
          STR d26, [x14], 8
          STR d24, [x17], 8
          DUP d26, v26.d[1]
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          STR d20, [x6], 8
          DUP d22, v22.d[1]
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          STR d22, [x16], 8
          DUP d20, v20.d[1]
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          STR d26, [x14], 8
          DUP d24, v24.d[1]
          DUP d26, v26.d[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif