// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

// Overview (derived from the code below):
//   Accumulates a 4-row by 8-column tile of floats. The accumulators are
//   seeded from the first 32 bytes at w, then for every group of 4 row
//   pointers read from a (ks bytes of pointers per tile) and for every
//   float of kc, a row element is multiplied by 8 floats read from w.
//   The result is clamped between the two floats read from params and
//   stored to c0..c3. kc and ks are byte counts (k steps by 16 bytes per
//   4 floats, ks steps by 32 bytes per 4 pointers).
//
// Local labels:
//    0  nc loop: seed accumulators, prefetch
//    1  ks loop: fetch 4 row pointers
//    2  main loop, 4 floats of A per pass (software pipelined)
//    3  epilogue for the last 4 floats of A
//    4  remainder dispatch (2 floats / 1 float)
//    5  end of ks pass, clamp, full-width store
//    6  remainder of 2 floats of A
//    7  remainder of 1 float of A
//    8, 9, 10  partial-width store of 4, 2, 1 columns
//   11  exit after partial-width store

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0   v0           v3
# A1   v0[1]        v3[1]
# A2   v1           v4
# A3   v1[1]        v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Not referenced by this kernel: v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

        # Clamp C pointers
        // Rows beyond mr alias the previous row so their stores are harmless.
        CMP x0, 2                    // if mr < 2
        ADD x16, x6, x7              // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO        //   c1 = c0

        ADD x17, x16, x7             // c2 = c1 + cm_stride
                                     // if mr <= 2
        CSEL x17, x16, x17, LS       //   c2 = c1

        CMP x0, 4                    // if mr < 4
        ADD x7, x17, x7              // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO         //   c3 = c2

        // The stack arguments are read before sp is moved by the save below.
        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // First float is replicated into v6 (used with FMAX, lower bound),
        // second float into v7 (used with FMIN, upper bound).
        LD2R {v6.4s, v7.4s}, [x8]

        # Save x19, d12-d15 on stack
        // 48-byte frame: d12,d13 at [sp], d14,d15 at [sp, 16], x19 at [sp, 32].
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STR x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        // All four rows start from the same 8 bias floats (v20, v21).
        // NOTE(review): x13, x14, x15 and x8 are first loaded with row
        // pointers at label 1, so the A prefetches here use whatever those
        // registers held before (on the first pass x8 is the params
        // pointer). PRFM is only a hint, so this is presumably harmless -
        // confirm.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13, 0]     // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8, 0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]      // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3                   // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        // A pointer equal to zero is used as is; any other pointer gets
        // a_offset added.
        CMP x13, x12                 // if a0 == zero
        ADD x13, x13, x11            // a0 += a_offset
        CSEL x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP x14, x12                 // if a1 == zero
        ADD x14, x14, x11            // a1 += a_offset
        CSEL x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP x15, x12                 // if a2 == zero
        ADD x15, x15, x11            // a2 += a_offset
        CSEL x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP x8, x12                  // if a3 == zero
        ADD x8, x8, x11              // a3 += a_offset
        CSEL x8, x12, x8, EQ         //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16              // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // v0 = {a0[0], a0[1], a1[0], a1[1]}, v1 = {a2[0], a2[1], a3[0], a3[1]}.
        // The upper half of v19 is left in x19 and inserted in BLOCK 0.
        LDR d0, [x13], 8             // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x15], 8             // a2
        LD1 {v0.d}[1], [x14], 8      // a1
        LD1 {v1.d}[1], [x8], 8       // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8             // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Each 128-bit vector for the next group is built from a 64-bit
        // LDR d (low half) plus a 64-bit LDR into x19 followed by INS into
        // d[1] (high half), interleaved with the FMLAs of the current group.
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x13], 8             // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x19            // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x19, [x14], 8            // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x19             // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x19, [x5, 8]             // b

        # BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x15], 8             // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x19            // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x8], 8             // a3

        # BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x19             // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 24]

        # BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x19            // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x19, [x5, 40]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP
        INS v14.d[1], x19            // b from previous
        SUBS x0, x0, 16              // flags consumed by B.HS at loop end
        LDR x19, [x5, 56]

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x13], 8             // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x19            // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x19, [x14], 8            // a1

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v21.4s, v13.4s, v3.s[0]
        INS v0.d[1], x19             // a1 ins
        FMLA v23.4s, v13.4s, v3.s[2]
        LDR x19, [x5, 72]            // b

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR d1, [x15], 8             // a2
        FMLA v27.4s, v13.4s, v4.s[2]
        INS v16.d[1], x19            // b
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x19, [x8], 8             // a3

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR d17, [x5, 80]
        FMLA v24.4s, v14.4s, v4.s[1]
        INS v1.d[1], x19             // a3 ins
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x19, [x5, 88]

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d18, [x5, 96]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v17.d[1], x19            // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x19, [x5, 104]

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        LDR x19, [x5, 120]
        ADD x5, x5, 128              // 8 B vectors consumed per pass
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Same arithmetic as one main loop pass, but the second group does
        // not load anything for a following pass.
3:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x13], 8             // a0
        INS v19.d[1], x19            // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8            // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19             // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]             // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x15], 8             // a2
        INS v12.d[1], x19            // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19             // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19            // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP                          // fma
        NOP
        NOP                          // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x19            // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64               // skip the 4 B vectors read via offsets

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        // x0 only ever had multiples of 16 subtracted from kc, so bits 3
        // and 2 still equal the corresponding bits of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        SUBS x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8               // nc -= 8, flags reused by B.HI below
        B.LO 8f

        // Rows are written from c3 down to c0, each pointer then advances
        // by cn_stride.
        STP q26, q27, [x7]
        ADD x7, x7, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3               // a -= ks

        # nc loop
        // None of the instructions since SUBS x1 modify the flags.
        B.HI 0b

        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        // Single floats land in lanes 0 and 2 so the lane indices used by
        // the FMLAs match the packed layout of the wider paths.
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
        // x1 had 8 subtracted, which leaves bits 2..0 unchanged: bit 2
        // selects 4 columns, bit 1 selects 2 columns, bit 0 selects 1 column.
        // After each partial store the remaining lanes are shifted down.
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif