// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Summary (derived from the code below)
#   Produces C tiles of up to 4 rows by 8 float columns. For every tile the
#   accumulators start from 8 floats read from w, then each step along K adds
#   B[k][0..7] * A[row][k], with B read from w as 8 consecutive floats per k.
#   The result is clamped with FMAX against the first float at params and
#   FMIN against the second float at params, then stored to c0..c3.
#   kc is a byte count (4 bytes per float of A). nc is a column count and is
#   reduced by 8 per tile. Rows at index mr and above alias the previous row
#   pointers, so they recompute and rewrite the same data.
#
# Register reuse
#   x0  mr during pointer setup, then the K byte counter, then cn_stride.
#   x4  a_stride during pointer setup, then a staging register: the upper
#       64 bits of a vector are loaded with LDR x4 and merged with INS.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register

# Vector register usage
# Rows 0 and 1 share one vector (row 1 in the upper 64 bits), as do rows 2 and 3.
# A0  v0           v3
# A1  v0[1]        v3[1]
# A2  v1           v4
# A3  v1[1]        v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Not referenced by the instructions of this kernel
# (x7 is only read as cm_stride during pointer setup)
# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# x7 c5
# A4  v2           v5
# A5  v2[1]        v5[1]
# C v28 v29
# C v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        # Load min/max values
        # v6 = first float at params, v7 = second float, each replicated to 4 lanes
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        # After this push the stack arguments sit 32 bytes higher (cn_stride at sp + 32)
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        # Outer loop - one pass per tile of 8 columns
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x3, 0]      // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16              // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        # Flags come from the SUBS above. The loads in between do not change them.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4              // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]              // b

        # BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4             // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8             // a3

        # BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4              // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        # BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP
        INS v14.d[1], x4             // b from previous
        SUBS x0, x0, 16              // k -= 16, flags consumed by B.HS at loop end
        LDR x4, [x5, 56]

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4             // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x4, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v21.4s, v13.4s, v3.s[0]
        INS v0.d[1], x4              // a1 ins
        FMLA v23.4s, v13.4s, v3.s[2]
        LDR x4, [x5, 72]             // b

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR d1, [x10], 8             // a2
        FMLA v27.4s, v13.4s, v4.s[2]
        INS v16.d[1], x4             // b
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x4, [x11], 8             // a3

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR d17, [x5, 80]
        FMLA v24.4s, v14.4s, v4.s[1]
        INS v1.d[1], x4              // a3 ins
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x4, [x5, 88]

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d18, [x5, 96]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v17.d[1], x4             // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x4, [x5, 104]

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        LDR x4, [x5, 120]            // upper half of v19, inserted in next BLOCK 0
        ADD x5, x5, 128              // w += 128 bytes (ADD leaves the flags intact)
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4              // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]              // b

        # BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4             // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8             // a3

        # BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4              // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        # BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP                          // fma
        INS v14.d[1], x4
        NOP
        LDR x4, [x5, 56]

        # Second group of 16 FMA, no loads
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4             // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15                   // any K bytes left below a multiple of 16?

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64               // w += 64 bytes consumed by the epilogue

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # Flags come from the TST in BLOCK 3.
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8, flags used by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        # Fewer than 8 columns were left: take the partial store path
        B.LO 6f

        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x3, x3, x2               // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x9, x9, x2               // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2             // a3 -= kc

        # More columns remain (nc > 0 after the subtraction): next tile
        B.HI 0b

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bits 3 and 2 of x0 equal those of kc because only multiples of 16 were subtracted
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder- 1 float of A (4 bytes)
        # Rows 1 and 3 go to lane 2 so the same v0.s[2] / v1.s[2] operands apply
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here, so its low 3 bits match the columns still to store
6:
        TBZ x1, 2, 7f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b

7:
        TBZ x1, 1, 8f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]

8:
        TBZ x1, 0, 9f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif