// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

// Overview (derived from the instructions below):
//   - Each pass of the outer loop (label 0) produces a tile of 4 rows x 8
//     columns of single-precision values in accumulators v20-v27.
//   - A pass starts from 8 floats read at w (the bias, copied to all 4 rows),
//     then for every float of A reads two 4-float B vectors from w and issues
//     FMLA (by element) into the row accumulators.
//   - The tile is clamped with FMAX against v6 and FMIN against v7, then
//     stored through the four C row pointers, which advance by cn_stride.
//   - kc is consumed as a byte count: the main loop handles 16 bytes
//     (4 floats) of every A row per iteration; the tails handle 8 and 4 bytes.
//   - For mr < 4 the surplus A/C row pointers alias the previous row, so the
//     aliased rows load the same A data and store to the same C address.
//   NOTE(review): only bits 3 and 2 of the remaining byte count are tested in
//   the tails, and label 5 runs unconditionally when bit 3 is clear, so kc is
//   presumably a non-zero multiple of 4 - confirm against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register
// (a_stride in x4 is only read while the row pointers are set up; after that
//  x4 carries the upper 64 bits of a vector between an LDR x4 and the
//  following INS vN.d[1], x4.)

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

// The roles listed below are not used by this 4-row kernel (x7 appears in the
// code only as cm_stride).
// NOTE(review): the list looks like it was kept from a taller variant of the
// same template - confirm before relying on these registers being free.
# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# x7 c5
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Load params pointer
        // Read before sp is moved by the STP below, so the offset is the
        // entry-time [sp + 8].
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // ADD does not write the flags, so each CSEL below still sees the
        // result of the preceding CMP.
        CMP x0, 2                    // if mr < 2
        ADD x9, x3, x4               // a1 = a0 + a_stride
        ADD x16, x6, x7              // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO          //   a1 = a0
        CSEL x16, x6, x16, LO        //   c1 = c0

        ADD x10, x9, x4              // a2 = a1 + a_stride
        ADD x17, x16, x7             // c2 = c1 + cm_stride
                                     // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS        //   a2 = a1
        CSEL x17, x16, x17, LS       //   c2 = c1

        CMP x0, 4                    // if mr < 4
        ADD x11, x10, x4             // a3 = a2 + a_stride
        ADD x14, x17, x7             // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO       //   a3 = a2
        CSEL x14, x17, x14, LO       //   c3 = c2

        # Load min/max values
        // LD2R replicates the first 32-bit element at [x8] across v6 and the
        // second across v7; v6 is the FMAX operand and v7 the FMIN operand
        // at label 3.
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        // Pre-indexed STP lowers sp by 32; stack arguments are 32 bytes
        // further away from sp from here on.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        // Outer loop: one pass per 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32       // row 0: 8 floats from w, w += 32
        MOV v22.16b, v20.16b         // row 1 = row 0
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b         // row 2 = row 0
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b         // row 3 = row 0
        MOV v27.16b, v21.16b

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16              // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // v0 = {a0[0], a0[1], a1[0], a1[1]}, v1 = {a2[0], a2[1], a3[0], a3[1]}.
        // None of the loads below write the flags, so B.LO 2f tests the
        // second SUBS.
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Each group of FMAs consumes the A/B registers filled during the
        // previous group while loading the registers for the next one.
        // Per iteration: 16 bytes of each A row, 128 bytes of w.
        // NOTE(review): 128-bit vectors are assembled from a 64-bit LDR d, a
        // 64-bit LDR x4 and an INS; presumably a Cortex-A53 scheduling choice
        // - confirm against the template before changing the pattern.
1:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4             // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x4, [x5, 72]             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x4             // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4              // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4             // upper half of v18 (b, first set)
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x4, [x5, 120]            // upper half of v19, inserted in next BLOCK 0
        SUBS x0, x0, 16              // k -= 16 bytes
        ADD x5, x5, 128              // w += 128 (ADD leaves the flags alone)
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Same as one main-loop iteration except that the second group loads
        // nothing, so w only advances by 64 bytes.
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP                          // fma
        NOP
        NOP                          // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15                   // any bytes of k left? consumed by B.NE 4f

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64               // w += 64 (does not write the flags)

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f

        // Clamp and store; also the join point for both remainder paths.
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // sp was lowered by 32 at function entry, so the stack argument that
        // was at [sp] is now at [sp, 32]. x0 (the k counter) is overwritten.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags feed B.LO 6f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 6f                      // fewer than 8 columns left

        // Post-indexed by register: each C row pointer advances by cn_stride.
        // The A row pointers are rewound by kc for the next pass.
        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x3, x3, x2               // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x9, x9, x2               // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2             // a3 -= kc

        // ST1 and SUB do not write the flags: this still tests SUBS x1, x1, 8.
        B.HI 0b

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

        // Remainder of k. Reached with x0 = remaining bytes - 16 (or a later
        // multiple-of-16 reduction), whose low 4 bits equal those of the
        // remaining byte count.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8              // a0
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8       // a1
        LDR d1, [x10], 8             // a2
        LD1 {v1.d}[1], [x11], 8      // a3
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder- 1 float of A (4 bytes)
        // Rows 0/2 land in lane 0 and rows 1/3 in lane 2, matching the
        // .s[0] / .s[2] operands of the FMLAs below.
        LDR s0, [x3], 4              // a0
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4       // a1
        LDR s1, [x10], 4             // a2
        LD1 {v1.s}[2], [x11], 4      // a3
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so
        // they select 4, 2 and 1 remaining columns. After each partial store
        // the not-yet-stored columns are moved down to the low lanes.
6:
        TBZ x1, 2, 7f                // 4 columns?
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b

7:
        TBZ x1, 1, 8f                // 2 columns?
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]

8:
        TBZ x1, 0, 9f                // 1 column?
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif