// Auto-generated file.  Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// Scalar register reuse (all visible in the code below):
//   x0  holds mr on entry; after the C pointers are clamped it is reused as
//       the remaining-k byte counter (k).
//   x8  holds the params pointer only until min/max are loaded; inside the
//       ks loop it is the a3 pointer.
//   x9  is the ks loop counter (p), reloaded from x3 for every column tile.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused  v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp C pointers
        // Rows at index >= mr alias the previous row pointer.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

        # Load cn_stride, a_offset
        // Stack arguments are read before sp is moved by the register save.
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // v6 = first float at params (used by FMAX), v7 = second (used by FMIN)
        LD2R {v6.4s, v7.4s}, [x8]

        # Save x19, d12-d15 on stack
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STR x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b

        MOV x9, x3               // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        // A pointer equal to zero is used as is; any other gets a_offset added.
        CMP x13, x12             // if a0 == zero
        ADD x13, x13, x11        // a0 += a_offset
        CSEL x13, x12, x13, EQ   //   a0 = zero, else a0 + a_offset
        CMP x14, x12             // if a1 == zero
        ADD x14, x14, x11        // a1 += a_offset
        CSEL x14, x12, x14, EQ   //   a1 = zero, else a1 + a_offset
        CMP x15, x12             // if a2 == zero
        ADD x15, x15, x11        // a2 += a_offset
        CSEL x15, x12, x15, EQ   //   a2 = zero, else a2 + a_offset
        CMP x8, x12              // if a3 == zero
        ADD x8, x8, x11          // a3 += a_offset
        CSEL x8, x12, x8, EQ     //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x13], 8         // a0
        LDP q16, q17, [x5], 32   // b
        LDR d1, [x15], 8         // a2
        LD1 {v0.d}[1], [x14], 8  // a1
        LD1 {v1.d}[1], [x8], 8   // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8         // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Flags come from the SUBS above; the loads in between do not set flags.
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Upper vector halves are loaded into x19 and merged with INS one
        // block later; x5 is advanced once per iteration (ADD x5, x5, 128).
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x13], 8         // a0
        INS v19.d[1], x19        // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8        // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19         // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]         // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x15], 8         // a2
        INS v12.d[1], x19        // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8         // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19         // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19        // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19        // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x13], 8         // a0
        INS v15.d[1], x19        // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x14], 8        // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19         // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]        // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        LDR d1, [x15], 8         // a2
        INS v16.d[1], x19        // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x19, [x8], 8         // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x19         // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x19, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x19        // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x19, [x5, 120]       // upper half of v19, inserted in next BLOCK 0
        SUBS x0, x0, 16
        ADD x5, x5, 128
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
3:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x13], 8         // a0
        INS v19.d[1], x19        // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8        // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19         // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]         // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x15], 8         // a2
        INS v12.d[1], x19        // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8         // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19         // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19        // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP                      // fma
        NOP
        NOP                      // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x19        // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64           // skip the 64 bytes of B read at [x5] .. [x5, 56]

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        // x0 was only ever reduced by multiples of 16, so bits 3 and 2 of x0
        // still equal bits 3 and 2 of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        // 32 bytes = the 4 A pointers consumed by the two LDPs at label 1.
        SUBS x9, x9, 32          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        STP q26, q27, [x7]
        ADD x7, x7, x10          // c3 += cn_stride
        STP q24, q25, [x17]
        ADD x17, x17, x10        // c2 += cn_stride
        STP q22, q23, [x16]
        ADD x16, x16, x10        // c1 += cn_stride
        STP q20, q21, [x6]
        ADD x6, x6, x10          // c0 += cn_stride

        SUB x4, x4, x3           // a -= ks

        # nc loop
        // Flags are still those of SUBS x1, x1, 8: STP, ADD and SUB leave
        // them untouched.
        B.HI 0b

        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        // a1 and a3 go into lane 2 so the lane indices match the wider paths.
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
        // Bits 2, 1 and 0 of x1 select storing 4, 2 and 1 floats per row; after
        // each partial store the not-yet-stored lanes are moved down to lane 0.
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif