// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0     v3
# A1  x9  v0[1]  v3[1]
# A2 x10  v1     v4
# A3 x11  v1[1]  v4[1]
# A4 x12  v2     v5
# A5  x4  v2[1]  v5[1]

# B   x5  v12 v13 v14 v15 second set of B
# B       v16 v17 v18 v19 first set

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x14  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register

# Overview
#
# Computes a tile of up to 6 rows by 8 columns of C per pass of the outer
# loop (label 0). The 12 accumulators v20-v31 start from the 8 floats read
# at the head of the current block of w (the bias), then accumulate
# A[row][k] * B[k][0..7] with FMLA by element for each k, are clamped with
# FMAX against v6 and FMIN against v7, and are stored to the six C rows.
#
# kc and the running counter k (x0) are byte counts: 16 bytes is 4 floats of
# each A row. Every k step consumes 8 floats (32 bytes) of B from w.
#
# Each A vector packs two rows: the low 64 bits hold two floats of the even
# row (lanes s[0], s[1]) and the high 64 bits hold two floats of the odd row
# (lanes s[2], s[3]). Lane s[0]/s[2] is therefore the first k of the pair and
# lane s[1]/s[3] the second k.
#
# x0 is reused three times: mr on entry, then k inside the K loop, then
# cn_stride once the tile is ready to be stored.
#
# Labels
#   0  outer loop over blocks of 8 columns (nc)
#   1  main loop, 4 floats of A per row per iteration, software pipelined
#   2  epilogue, final 4 floats without loading a further group
#   3  clamp and store of a full 6x8 tile
#   4  remainder of 2 floats of A
#   5  remainder of 1 float of A
#   6  store of 4 columns when fewer than 8 columns remain
#   7  store of 2 columns
#   8  store of 1 column
#   9  restore and return after a partial store
#
# NOTE(review): in the pipelined sections each 128-bit operand is assembled
# from a 64-bit LDR into the d register plus a 64-bit LDR into x8 that is
# later moved in with INS. Presumably this suits Cortex-A55 issue rules for
# loads next to FMLA - confirm against the template before changing order.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Load params pointer
        # (read before the stack pointer is moved, so the offset is still 8)
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row pointer for both A and C,
        # so such rows recompute and rewrite the same data as the row they alias.
        # Each CMP feeds two pairs of CSEL: the LO pair and, since ADD does not
        # alter the flags, the LS pair that follows.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here x4 holds a5 and x7 holds c5; a_stride and cm_stride are
        # no longer available.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        # LD2R reads two consecutive floats at params and broadcasts the first
        # across v6 and the second across v7. v6 is the FMAX operand (lower
        # bound) and v7 the FMIN operand (upper bound) at label 3.
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        # This moves sp down by 32 bytes, so cn_stride, passed at [sp] on
        # entry, is read from [sp, 32] at label 3.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        # v20/v21 receive the 8 bias floats; the MOVs below copy them into the
        # accumulators of rows 1-5. The flags set by SUBS are consumed by the
        # B.LO after the prefetches: LDP, MOV and PRFM leave the flags alone.
        LDP q20, q21, [x5], 32
        SUBS x0, x2, 16  // k = kc - 16
        PRFM PLDL1KEEP, [x3,   0]  // Prefetch A
        PRFM PLDL1KEEP, [x3,  64]
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x9,   0]
        PRFM PLDL1KEEP, [x9,  64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x12,  0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x4,   0]
        PRFM PLDL1KEEP, [x4,  64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # Taken when kc < 16; x0 is then negative but keeps the low 4 bits of
        # kc, which is what the bit tests at label 4 rely on.
        B.LO 4f

        # Prologue - First group loads, no FMA
        # Fills the first register set: A in v0-v2 and B in v16-v19. The upper
        # half of v19 is left in x8 and inserted by the first INS of BLOCK 0.
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LDR d2, [x12], 8             // a4
        LD1 {v0.d}[1],  [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        LD1 {v2.d}[1],  [x4], 8      // a5
        SUBS x0, x0, 16              // flags consumed by B.LO 2f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        # Two halves per iteration. The first half multiplies the first
        # register set (A v0-v2, B v16-v19) while loading the second set
        # (A v3-v5, B v12-v15); the second half does the reverse.
1:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x8             // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR x8, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s,  v2.s[0]
        INS v3.d[1], x8              // a1 ins
        FMLA v30.4s, v16.4s,  v2.s[2]
        LDR x8, [x5, 8]              // b

        # BLOCK 2
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s,  v0.s[2]
        INS v12.d[1], x8             // b ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x8, [x11], 8             // a3

        # BLOCK 3
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s,  v2.s[0]
        INS v4.d[1], x8              // a3 ins
        FMLA v31.4s, v17.4s,  v2.s[2]
        LDR x8, [x4], 8              // a5

        # BLOCK 4
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s,  v0.s[3]
        INS v5.d[1], x8              // a5 ins
        FMLA v24.4s, v18.4s,  v1.s[1]
        LDR x8, [x5, 24]

        # BLOCK 5
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s,  v2.s[1]
        INS v13.d[1], x8             // b
        FMLA v30.4s, v18.4s,  v2.s[3]
        LDR x8, [x5, 40]

        # BLOCK 6
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v14.d[1], x8             // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR x8, [x5, 56]

        # BLOCK 7
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        INS v15.d[1], x8
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        # B is addressed at offsets 64..120 from x5; x5 itself advances by 128
        # in BLOCK 7.
        # BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        LDR x8, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR d16, [x5, 64]
        FMLA v28.4s, v12.4s,  v5.s[0]
        INS v0.d[1], x8              // a1 ins
        FMLA v30.4s, v12.4s,  v5.s[2]
        LDR x8, [x5, 72]             // b

        # BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR d1, [x10], 8             // a2
        FMLA v23.4s, v13.4s,  v3.s[2]
        INS v16.d[1], x8             // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR x8, [x11], 8             // a3

        # BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR d2, [x12], 8             // a4
        FMLA v29.4s, v13.4s,  v5.s[0]
        INS v1.d[1], x8              // a3 ins
        FMLA v31.4s, v13.4s,  v5.s[2]
        LDR x8, [x4], 8              // a5

        # BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR d17, [x5, 80]
        FMLA v22.4s, v14.4s,  v3.s[3]
        INS v2.d[1], x8              // a5 ins
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR x8, [x5, 88]

        # BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR d18, [x5, 96]
        FMLA v28.4s, v14.4s,  v5.s[1]
        INS v17.d[1], x8             // b
        FMLA v30.4s, v14.4s,  v5.s[3]
        LDR x8, [x5, 104]

        # BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR d19, [x5, 112]
        FMLA v23.4s, v15.4s,  v3.s[3]
        INS v18.d[1], x8             // b
        FMLA v25.4s, v15.4s,  v4.s[1]
        LDR x8, [x5, 120]            // upper half of v19, inserted in next BLOCK 0

        # BLOCK 7
        # The flags set by SUBS reach B.HS unchanged: FMLA and ADD leave the
        # flags alone.
        FMLA v27.4s, v15.4s,  v4.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        # Same first half as the main loop. The second half issues no further
        # A or B loads; its load slots carry prefetches of the six C rows.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x8             // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR x8, [x9], 8              // a1

        # BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s,  v2.s[0]
        INS v3.d[1], x8              // a1 ins
        FMLA v30.4s, v16.4s,  v2.s[2]
        LDR x8, [x5, 8]              // b

        # BLOCK 2
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s,  v0.s[2]
        INS v12.d[1], x8             // b ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x8, [x11], 8             // a3

        # BLOCK 3
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s,  v2.s[0]
        INS v4.d[1], x8              // a3 ins
        FMLA v31.4s, v17.4s,  v2.s[2]
        LDR x8, [x4], 8              // a5

        # BLOCK 4
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s,  v0.s[3]
        INS v5.d[1], x8              // a5 ins
        FMLA v24.4s, v18.4s,  v1.s[1]
        LDR x8, [x5, 24]

        # BLOCK 5
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s,  v2.s[1]
        INS v13.d[1], x8             // b
        FMLA v30.4s, v18.4s,  v2.s[3]
        LDR x8, [x5, 40]

        # BLOCK 6
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v14.d[1], x8             // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR x8, [x5, 56]

        # BLOCK 7
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        INS v15.d[1], x8             // b
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        PRFM PSTL1KEEP, [x6]         // Prefetch C0
        FMLA v22.4s, v12.4s,  v3.s[2]
        PRFM PSTL1KEEP, [x16]        // Prefetch C1
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PSTL1KEEP, [x17]        // Prefetch C2

        # BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        PRFM PSTL1KEEP, [x14]        // Prefetch C3
        FMLA v28.4s, v12.4s,  v5.s[0]
        PRFM PSTL1KEEP, [x13]        // Prefetch C4
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PSTL1KEEP, [x7]         // Prefetch C5

        # BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        # BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        # BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        # BLOCK 5
        # TST checks the low 4 bits of k, i.e. kc modulo 16 bytes. The result
        # is consumed by B.NE after BLOCK 7; FMLA and ADD leave the flags alone.
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        TST x0, 15

        # BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64               // step over the 64 bytes of B read at [x5]..[x5, 56]

        # BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # Offset 32 accounts for the d12-d15 save area pushed on entry.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags feed both B.LO 6f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns were left: go to the partial stores instead.
        B.LO 6f

        # Each ST1 writes 8 floats and advances its C pointer by cn_stride (x0).
        # Each SUB rewinds an A pointer by kc so the next block of columns
        # rereads the same rows. ST1 and SUB leave the flags alone.
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB  x3,  x3, x2             // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB  x9,  x9, x2             // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2             // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB x12, x12, x2             // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x0
        SUB  x4,  x4, x2             // a5 -= kc

        # Loop while columns remain (nc was greater than 8 before the SUBS).
        B.HI 0b

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bit 3 of k is bit 3 of kc, because only multiples of 16 were
        # subtracted from it.
        # NOTE(review): when bit 3 is clear, label 5 runs without bit 2 being
        # tested. This assumes kc is a nonzero multiple of 4 bytes - confirm
        # with callers.
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        # v16/v17 hold B for the first k, v18/v19 for the second.
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        # The even row goes to lane 0 and the odd row to lane 2, which keeps
        # the s[0] / s[2] lane convention used by the FMLAs everywhere else.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 3b

        # Store odd width
        # Reached with x1 = nc - 8 below zero. Its low 3 bits equal those of
        # the remaining column count, so bits 2, 1 and 0 select stores of 4, 2
        # and 1 columns. After each partial store the data not yet written is
        # moved to the low end of the even numbered register of each row.
6:
        TBZ x1, 2, 7f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        STR d20,  [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]            // move upper two floats down for label 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30,  [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30,  [x7]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif