// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x14  v0     v3
# A1  x15  v0[1]  v3[1]
# A2  x20  v1     v4
# A3  x21  v1[1]  v4[1]
# A4  x22  v2     v5
# A5  x23  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15 second set of B
# B        v16 v17 v18 v19 first set

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x10  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register

// Overview
//   Accumulates a 6 row x 8 column tile of floats in v20-v31. The accumulators
//   are seeded from the first 32 bytes read through w (x5), updated with FMLA
//   (B vector times one A scalar lane), clamped with FMAX against v6 and FMIN
//   against v7, and stored through the six row pointers C0-C5.
//
// Local labels
//   0:  start of one 8-column tile (nc loop); reloads the bias and resets p = ks
//   1:  ks loop; fetches the next 6 A pointers through x4
//   2:  main loop; consumes 16 bytes of every A row per iteration
//   3:  epilogue; last 16-byte step, loads only the second group
//   4:  end of one ks step; falls through to clamp and store when p is exhausted
//   5:  8-byte remainder of kc
//   6:  4-byte remainder of kc
//   7, 8, 9:  partial-width stores of 4, 2 and 1 columns
//   10: restore callee-saved registers and return
//
// Notes
//   * kc is counted in bytes: the code steps it by 16 and tests bits 3 and 2
//     for the 8-byte and 4-byte remainders.
//   * Rows at or beyond mr reuse the C pointer of the previous row, so all six
//     rows are always computed and stored.
//   * An A pointer equal to zero (x12) is used as is; any other A pointer has
//     a_offset (x11) added.
//   * The frame is 64 bytes (d12-d15, x20-x23), so the cn_stride stack argument
//     is read at [sp, 64] once the frame is in place.
//   * The instruction order inside labels 2 and 3 is hand scheduled; keep it
//     intact when editing comments.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Load a_offset
        # (stack arguments are read before the frame is pushed, so the offsets
        #  here are relative to the sp value at entry)
        LDR     x11, [sp, 8]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load min/max values
        # (first float at x8 is replicated into v6, second float into v7)
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save x20-x23, d12-d15 on stack
        STP     d12, d13, [sp, -64]!
        STP     d14, d15, [sp, 16]
        STP     x20, x21, [sp, 32]
        STP     x22, x23, [sp, 48]

0:
        # Load initial bias from w into accumulators
        # (all six rows start from the same 8 bias values)
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // A0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // A1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // A2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // A3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // A4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // A5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - First group loads, no FMA
        LDR     d0, [x14], 8            // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     d1, [x20], 8            // A2
        LDR     d2, [x22], 8            // A4
        LD1     {v0.d}[1], [x15], 8     // A1
        LD1     {v1.d}[1], [x21], 8     // A3
        LD1     {v2.d}[1], [x23], 8     // A5
        SUBS    x0, x0, 16              // flags are consumed by B.LO 3f below
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO    3f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR     d4, [x20], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR     d5, [x22], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x14], 8            // A0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // A1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x8, [x5, 72]            // B
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        LDR     d1, [x20], 8            // A2
        INS     v16.d[1], x8            // B
        FMLA    v21.4s, v13.4s, v3.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        LDR     d2, [x22], 8            // A4
        INS     v1.d[1], x8             // A3 ins
        FMLA    v27.4s, v13.4s, v4.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // A5 ins
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     x8, [x5, 88]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // B
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     x8, [x5, 104]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]

        # BLOCK 6
        # (the upper half of v19 stays in x8 and is inserted by BLOCK 0 of the
        #  next main-loop iteration or of the epilogue)
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // B
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x8, [x5, 120]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]

        # BLOCK 7
        SUBS    x0, x0, 16              // LDR lands here
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        ADD     x5, x5, 128
        FMLA    v31.4s, v15.4s, v5.s[3]
        B.HS    2b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 4 LDR B (only the second group is loaded)
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x14], 8            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x15], 8            // A1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR     d4, [x20], 8            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x21], 8            // A3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR     d5, [x22], 8            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x23], 8            // A5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B from previous
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        TST     x0, 15                  // any leftover bytes of kc? (used by B.NE below)

        # BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // skip the B data loaded at offsets 0-56 above

        # BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        FMLA    v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // p -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR     x0, [sp, 64]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO 7f and B.HI 0b
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    7f

        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21, [x6]
        ADD     x6, x6, x0

        SUB     x4, x4, x3              // A -= ks

        # nc loop
        B.HI    0b

        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x14], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x15], 8
        LDR     d1, [x20], 8
        LD1     {v1.d}[1], [x21], 8
        LDR     d2, [x22], 8
        LD1     {v2.d}[1], [x23], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        # (odd rows are loaded into lane 2 so the lane indices match the
        #  other code paths)
        LDR     s0, [x14], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x15], 4
        LDR     s1, [x20], 4
        LD1     {v1.s}[2], [x21], 4
        LDR     s2, [x22], 4
        LD1     {v2.s}[2], [x23], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        B       4b

        # Store odd width
        # (bits 2, 1 and 0 of x1 select 4, 2 and 1 remaining columns)
7:
        TBZ     x1, 2, 8f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 1, 9f
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
10:
        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif