// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0   x3  v0     v3
# A1   x9  v0[1]  v3[1]
# A2  x10  v1     v4
# A3  x11  v1[1]  v4[1]
# A4  x12  v2     v5
# A5   x4  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15 second set of B
# B        v16 v17 v18 v19 first set

# C    x6  v20 v21
# C   x16  v22 v23
# C   x17  v24 v25
# C   x14  v26 v27
# C   x13  v28 v29
# C    x7  v30 v31

# Clamp v6 v7
# unused A   v8 v9 v10 v11
# x8 temporary vector shadow register

// Overview (derived from the code below):
//   - Each pass of the outer loop (label 0) produces a tile of up to 6 rows
//     by 8 columns of C. Accumulators v20-v31 start from 8 floats read from
//     w; FMLA then accumulates products of A elements with further vectors
//     read from w.
//   - Rows with index >= mr reuse the A and C pointers of the previous row
//     (CSEL), so they read the same A data and store to the same C row.
//   - Results are clamped with FMAX against v6 and FMIN against v7, the two
//     values LD2R loads from params and replicates to all lanes.
//   - x0 holds mr only during pointer setup. It is then reused as the
//     remaining-k byte counter and, from label 3 on, as cn_stride.
//   - x8 holds the params pointer only until LD2R. Afterwards it stages the
//     upper 64 bits of vectors, which are loaded into x8 and moved with INS.
//     NOTE(review): presumably done to suit Cortex-A53 load issue rules;
//     the source does not state the reason - confirm against the template.
//
// Local labels:
//   0  outer loop over groups of 8 output columns
//   1  main loop, 4 floats of A per row per iteration
//   2  epilogue, last 4 floats of A per row without preloading more
//   3  clamp and store
//   4  k remainder of 2 floats      5  k remainder of 1 float
//   6  store 4 columns   7  store 2 columns   8  store 1 column   9  return

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Load params pointer
        // Read before sp is moved below, hence offset 8 rather than 40.
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP serves two rows: LO guards the first and LS the second.
        // ADD and CSEL do not modify the flags in between.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // A1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // A2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // A3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // A4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // From here x4 and x7 stop being strides and become the row 5
        // pointers.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // A5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 is applied with FMAX and v7 with FMIN at label 3.
        LD2R {v6.4s, v7.4s}, [x8]

        # Save d12-d15 on stack
        // Moves sp down by 32 bytes; stack arguments are now at sp + 32.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        // All six rows start from the same 8 floats.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // A0
        LDP q16, q17, [x5], 32       // B
        LDR d1, [x10], 8             // A2
        LDR d2, [x12], 8             // A4
        LD1 {v0.d}[1], [x9], 8       // A1
        LD1 {v1.d}[1], [x11], 8      // A3
        LD1 {v2.d}[1], [x4], 8       // A5
        SUBS x0, x0, 16              // flags are consumed by B.LO 2f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
1:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8              // A0
        INS v19.d[1], x8             // B from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8              // A1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // A1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]              // B
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR d4, [x10], 8             // A2
        INS v12.d[1], x8             // B ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8             // A3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR d5, [x12], 8             // A4
        INS v4.d[1], x8              // A3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8              // A5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // A5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // B
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // B
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR d0, [x3], 8              // A0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x8, [x9], 8              // A1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x8              // A1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x8, [x5, 72]             // B
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        LDR d1, [x10], 8             // A2
        INS v16.d[1], x8             // B
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR x8, [x11], 8             // A3
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        LDR d2, [x12], 8             // A4
        INS v1.d[1], x8              // A3 ins
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR x8, [x4], 8              // A5
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x8              // A5 ins
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x8, [x5, 88]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x8             // B
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x8, [x5, 104]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]

        # BLOCK 6
        LDR d19, [x5, 112]
        INS v18.d[1], x8             // B
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x8, [x5, 120]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]

        # BLOCK 7
        // x8 now holds the upper half of v19; the INS happens in BLOCK 0 of
        // the next main-loop iteration or of the epilogue.
        SUBS x0, x0, 16              // LDR lands here
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR d3, [x3], 8              // A0
        INS v19.d[1], x8             // B from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8              // A1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        # BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // A1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]              // B
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        # BLOCK 2
        LDR d4, [x10], 8             // A2
        INS v12.d[1], x8             // B ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8             // A3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        # BLOCK 3
        LDR d5, [x12], 8             // A4
        INS v4.d[1], x8              // A3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8              // A5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        # BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // A5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        # BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // B
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // B
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        # BLOCK 7
        INS v15.d[1], x8             // B
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // No loads are issued in this group: nothing is preloaded past the
        // epilogue.
        # BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        # BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        # BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        # BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        # BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        # BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        // Only multiples of 16 were subtracted from x0, so its low 4 bits
        // still equal those of kc. The flags are consumed by B.NE 4f below;
        // FMLA and ADD do not modify them.
        TST x0, 15

        # BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        # BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // Passed at [sp] on entry; 32 bytes were pushed since. x0 is free
        // because the k counter is no longer needed for this tile.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags drive both B.LO 6f and B.HI 0b below; the
        // FMIN, ST1 and SUB instructions in between do not modify them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        // Fewer than 8 columns were left: take the partial store path.
        B.LO 6f

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc so the next group of columns rereads the same A rows.
        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x3, x3, x2           // A0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x9, x9, x2           // A1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2         // A2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2         // A3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB x12, x12, x2         // A4 -= kc
        ST1 {v30.16b, v31.16b}, [x7], x0
        SUB x4, x4, x2           // A5 -= kc

        // Loop while columns remain (nc was greater than 8 before SUBS).
        B.HI 0b

        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        // Reached with x0 negative, either from the kc < 16 check or from
        // the epilogue. Bits 3 and 2 of x0 still match those of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        // Odd rows go into lane 2 so the FMLA lane pattern (s[0] for even
        // rows, s[2] for odd rows) matches the other code paths.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 3b

        # Store odd width
        // x1 is nc - 8 here, so its low three bits equal those of nc. After
        // each partial store the not yet stored lanes are moved down to the
        // bottom of the even-numbered accumulators.
6:
        TBZ x1, 2, 7f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30, [x7]
9:
        # Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif