// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Summary (derived from the instructions below):
#   For every tile of up to 8 output columns the 4x8 accumulators v20-v27 are
#   seeded from the first 8 floats at w (the same 8 values for all 4 rows),
#   then updated with FMLA, consuming kc bytes from each of the 4 A rows and
#   8 floats of w per float of A.  The accumulators are clamped with FMAX
#   against v6 and FMIN against v7 (the two consecutive floats that LD2R reads
#   from params, so the first acts as lower bound and the second as upper
#   bound) and stored to the 4 C rows.
#   After a full 8 column store the C pointers advance by cn_stride, the A
#   pointers are rewound by kc, and the tile loop repeats while columns
#   remain.  A final tile narrower than 8 columns is stored as 4, 2 and 1
#   column pieces selected by the low three bits of the remaining nc.
#   Rows at or beyond mr reuse the A and C pointers of the previous row.
#   NOTE(review): the remainder path reaches the 1 float case without testing
#   bit 2 of k whenever bit 3 is clear, so kc is presumably required to be a
#   nonzero multiple of 4 bytes - confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only d12-d15 are used from that set here; they are saved in a 32 byte frame.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register
# (x4 holds a_stride only until the row pointers are computed; afterwards it
#  carries the upper 64 bits of a vector between an LDR and the matching INS.)

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B    v12 v13 v14 v15 second set of B
# B    v16 v17 v18 v19 first set
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
#  x7 c5
# A4   v2     v5
# A5   v2[1]  v5[1]
# C    v28 v29
# C    v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        # Load min/max values
        # v6 = first float at params replicated, v7 = second float replicated
        LD2R    {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        # sp drops by 32 bytes, so the stack argument cn_stride moves to [sp, 32]
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

        # Start of one tile of up to 8 output columns
0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x3,  0]     // Prefetch A
        PRFM    PLDL1KEEP, [x3, 64]
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x9,  0]
        PRFM    PLDL1KEEP, [x9, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x10,  0]
        PRFM    PLDL1KEEP, [x10, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x11,  0]
        PRFM    PLDL1KEEP, [x11, 64]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5,  64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Prologue - First group loads, no FMA
        LDR     d0, [x3], 8             // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x10], 8            // a2
        LD1     {v0.d}[1], [x9], 8      // a1
        LD1     {v1.d}[1], [x11], 8     // a3
        SUBS    x0, x0, 16              // flags consumed by B.LO 2f below
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x4, [x5], 8             // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # (each of the 8 B vectors arrives as two 64 bit loads, LDR d plus
        #  LDR x4, joined by INS; w offsets 0..120 are read and x5 advances
        #  by 128 once per iteration)
1:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x4            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x4, [x5, 8]             // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x4            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x4, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x4            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x4, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x4            // b from previous
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR     d0, [x3], 8             // a0
        INS     v15.d[1], x4            // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x3, 128]    // Prefetch A0

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x4             // a1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x4, [x5, 72]            // b
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x9, 128]    // Prefetch A1

        # BLOCK 2
        LDR     d1, [x10], 8            // a2
        INS     v16.d[1], x4            // b
        FMLA    v25.4s, v13.4s, v4.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]
        PRFM    PLDL1KEEP, [x10, 128]   // Prefetch A2

        # BLOCK 3
        LDR     d17, [x5, 80]
        INS     v1.d[1], x4             // a3 ins
        FMLA    v22.4s, v14.4s, v3.s[3]
        LDR     x4, [x5, 88]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        PRFM    PLDL1KEEP, [x11, 128]   // Prefetch A3

        # BLOCK 4
        LDR     d18, [x5, 96]
        INS     v17.d[1], x4            // b
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x4, [x5, 104]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 192]    // Prefetch B

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR     d19, [x5, 112]
        INS     v18.d[1], x4
        FMLA    v27.4s, v15.4s, v4.s[3]
        LDR     x4, [x5, 120]
        SUBS    x0, x0, 16              // k -= 16; PRFM and ADD keep the flags
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD     x5, x5, 128
        B.HS    1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 4 LD64 A + 8 LD64 B
        # (only the second group is loaded here; the first group is already
        #  in v0, v1 and v16-v19 from the prologue or the last loop iteration)
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x4            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x4, [x5, 8]             // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x4            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x4, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x4            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x4, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x4
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x4, [x5, 56]
        NOP                             // fma
        NOP
        NOP                             // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS     v15.d[1], x4            // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        TST     x0, 15                  // low 4 bits of k = leftover bytes of A; flags used by B.NE below

        # BLOCK 4
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // step over the 64 bytes of B read at offsets 0..56 above

        # BLOCK 5
        FMLA    v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    4f

3:
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # x0 was used as the k counter, so cn_stride is fetched again for
        # every tile; it sits 32 bytes above sp because of the d12-d15 frame.
        LDR     x0, [sp, 32]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags drive both B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        # (fewer than 8 columns left: take the partial store path instead)
        B.LO    6f

        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x11, x11, x2            // a3 -= kc

        B.HI    0b                      // more columns remain: next tile

        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

        # Remainder handling.  Entered with kc below 16 bytes or with leftover
        # bytes after the epilogue; only bits 3 and 2 of x0 are examined.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8
        LDR     d1, [x10], 8
        LD1     {v1.d}[1], [x11], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 3b

5:
        # Remainder- 1 float of A (4 bytes)
        # a0/a2 land in lane 0 and a1/a3 in lane 2, matching the FMLA lanes
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4
        LDR     s1, [x10], 4
        LD1     {v1.s}[2], [x11], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        B       3b

        # Store odd width
        # x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 untouched, so
        # they still select the 4, 2 and 1 column pieces of the last tile.
6:
        TBZ     x1, 2, 7f
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b        // shift the remaining columns down
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b

7:
        TBZ     x1, 1, 8f
        STR     d20, [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]           // move the upper pair into the low half
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]

8:
        TBZ     x1, 0, 9f
        STR     s20, [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x14]
9:
        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif