// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Purpose: for each block of 8 output columns, start 4 x 8 float accumulators
# from the acc buffer, add the products of 4 rows of A with the B data read
# from w, clamp the result between the two params values, and store it to C.
#
# void xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
#
# x0 is reused three times: mr on entry, the remaining-k byte counter inside
# each column block, and cn_stride from the clamp/store stage onward.
# kc is in bytes (the code subtracts 16 per 4 floats of A).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Of those, this function only writes d12-d15, which it saves and restores.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register
# (holds a_stride only during pointer setup; afterwards it carries the upper
#  64 bits of a vector between a 64-bit LDR and the following INS)

# Vector register usage
# A0  v0           v3
# A1  v0[1]        v3[1]
# A2  v1           v4
# A3  v1[1]        v4[1]
# (rows 0 and 2 sit in the low 64 bits, rows 1 and 3 in the high 64 bits;
#  each half holds 2 consecutive floats of that row)

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# x7 c5
# A4  v2           v5
# A5  v2[1]        v5[1]
# C   v28 v29
# C   v30 v31
# NOTE(review): none of x12, x13, v2, v5, v8-v11 or v28-v31 is referenced in
# this function, and x7 is used below as cm_stride rather than as a c5
# pointer. This list looks like a leftover from a taller template variant;
# confirm against the template before relying on it.

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Load acc, params pointer
        # (cn_stride at [sp] is fetched later, once x0 is free)
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        # A row whose index is >= mr gets the same A and C pointers as the
        # row before it, so all four rows can always be processed.
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x14, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x14, x17, x14, LO          //   c3 = c2

        # Load min/max values
        # v6 = first float at params in every lane (FMAX operand below),
        # v7 = second float at params in every lane (FMIN operand below).
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        # This moves sp down by 32 bytes, so the stack arguments are 32 bytes
        # further from sp until the matching restore.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        # Start of one 8-column block.
0:
        # Load initial accumulators
        # 4 rows x 8 floats = 128 bytes read from acc; x15 advances past them.
        LDP q20, q21, [x15], 32         // row 0
        LDP q22, q23, [x15], 32         // row 1
        LDP q24, q25, [x15], 32         // row 2
        LDP q26, q27, [x15], 32         // row 3

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16                 // k = kc - 16
        B.LO 4f                         // kc < 16: remainder code only

        # Prologue - First group loads, no FMA
        # After this: v0 = a0[0..1] | a1[0..1], v1 = a2[0..1] | a3[0..1],
        # v16-v18 and the low half of v19 hold 64 bytes of B; the high half
        # of v19 is still in x4.
        LDR d0, [x3], 8                 // a0
        LDP q16, q17, [x5], 32          // b
        LDR d1, [x10], 8                // a2
        LD1 {v0.d}[1], [x9], 8          // a1
        LD1 {v1.d}[1], [x11], 8         // a3
        SUBS x0, x0, 16                 // k -= 16; flags are consumed by B.LO below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8                 // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # The two halves swap register sets: the first half multiplies v0/v1
        # by v16-v19 while loading v3/v4 and v12-v15, the second half does the
        # reverse. Every 128-bit register is filled in two steps: LDR d for
        # the low half, then LDR x4 followed by INS for the high half.
1:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8                 // a0
        INS v19.d[1], x4                // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8                 // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4                 // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]                 // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8                // a2
        INS v12.d[1], x4                // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8                // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4                 // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4                // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4                // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x3], 8                 // a0
        INS v15.d[1], x4                // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x4, [x9], 8                 // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4                 // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x4, [x5, 72]                // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        LDR d1, [x10], 8                // a2
        INS v16.d[1], x4                // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x4, [x11], 8                // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4                 // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        # BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4                // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]

        # BLOCK 5
        # NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4                // high half of v19 stays in x4 for the next BLOCK 0
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16                 // k -= 16
        ADD x5, x5, 128                 // skip the 2 x 64 bytes of B read above
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        # Same as one main-loop pass, except that the second half issues no
        # loads for a following pass.
2:
        # First group of 16 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8                 // a0
        INS v19.d[1], x4                // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8                 // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4                 // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]                 // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        # BLOCK 2
        LDR d4, [x10], 8                // a2
        INS v12.d[1], x4                // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8                // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        # BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4                 // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        # BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4                // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 5
        # NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP                             // fma
        NOP
        NOP                             // fma
        NOP

        # Second group of 16 FMA, no loads
        # BLOCK 0
        INS v15.d[1], x4                // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        # BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        # BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15                      // low 4 bits of k = leftover bytes of kc; used by B.NE below

        # BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64                  // skip the 64 bytes of B read in this epilogue

        # BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # ([sp] at function entry; +32 because of the d12-d15 save area)
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        SUBS x1, x1, 8                  // nc -= 8; flags feed both B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 6f                         // fewer than 8 columns were left

        # Rows are written from row 3 down to row 0. Where the C pointers
        # were clamped to the same address, the lower row is stored last.
        # Each C pointer then advances by cn_stride and each A pointer is
        # rewound by kc for the next column block.
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x9, x9, x2                  // a1 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x10, x10, x2                // a2 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x11, x11, x2                // a3 -= kc

        B.HI 0b                         // columns remain: next block

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

        # Remainder of kc. x0 is kc minus a multiple of 16 here, so bits 3
        # and 2 of x0 equal bits 3 and 2 of kc.
4:
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        LDR d0, [x3], 8                 // a0
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8          // a1
        LDR d1, [x10], 8                // a2
        LD1 {v1.d}[1], [x11], 8         // a3
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder - 1 float of A (4 bytes)
        # The single float of rows 1 and 3 goes to lane 2 so that the same
        # s[0] / s[2] lane pattern as above can be used.
        LDR s0, [x3], 4                 // a0
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4          // a1
        LDR s1, [x10], 4                // a2
        LD1 {v1.s}[2], [x11], 4         // a3
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width
        # x1 is nc minus a multiple of 8 here, so bits 2, 1 and 0 of x1 equal
        # those of nc and select a 4-, 2- and 1-column store in turn.
6:
        TBZ x1, 2, 7f
        STR q26, [x14], 16              // 4 columns of row 3
        MOV v26.16b, v27.16b            // bring columns 4-7 down for the next test
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 1, 8f
        STR d26, [x14], 8               // 2 columns of row 3
        STR d24, [x17], 8
        DUP d26, v26.d[1]               // bring the upper 2 floats down
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

8:
        TBZ x1, 0, 9f
        STR s26, [x14]                  // last single column
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif