// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55r0.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Half-precision GEMM micro-kernel producing a tile of up to 6 rows by 16
# columns per pass of the outer loop (label 0). For every tile the accumulators
# are seeded with 16 halffloats read from w, updated with FMLA for every
# halffloat of k, clamped with FMAX/FMIN and stored to c.
#
# Units as used by the code below:
#   kc is a byte count (the k counter is stepped by 8 bytes = 4 halffloats and
#      the A pointers are rewound by kc after each tile).
#   nc is an element count (16 is subtracted per stored tile of 16 halffloats).
#   x0 holds mr on entry, then the k counter, then cn_stride while storing.
#   NOTE(review): the remainder code assumes kc is a nonzero multiple of
#   2 bytes - confirm against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel writes v12-v15, so d12-d15 are spilled to a 32-byte stack frame.

# Register usage
#         pointer   first group       second group
# A0      x3        v0.h[0..1]        v3.h[0..1]
# A1      x9        v0.h[4..5]        v3.h[4..5]
# A2      x10       v1.h[0..1]        v4.h[0..1]
# A3      x11       v1.h[4..5]        v4.h[4..5]
# A4      x12       v2.h[0..1]        v5.h[0..1]
# A5      x4        v2.h[4..5]        v5.h[4..5]

# B       x5        v16 v17 v18 v19   v12 v13 v14 v15

# C0      x6   v20 v21
# C1      x16  v22 v23
# C2      x17  v24 v25
# C3      x14  v26 v27
# C4      x13  v28 v29
# C5      x7   v30 v31

# Clamp   v6 (both bounds, loaded once), (v4) lower bound, (v5) upper bound
#         v4 and v5 are only overwritten with the bounds after the last FMLA.
# Unused  v7 v8 v9 v10 v11

# x8      params pointer on entry, afterwards a scratch register: 32-bit A
#         values and upper 64-bit halves of B are loaded into it and merged
#         into the vector registers with INS one block later.

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so all 6 rows can be
        # computed and stored unconditionally.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        # 4 bytes = 2 halffloats: h[0] is used with FMAX, h[1] with FMIN.
        LDR     s6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Save d12-d15 on stack
        # After this push the stack arguments sit 32 bytes higher.
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

        # Outer loop: one pass per tile of 16 output columns
0:
        # Load initial bias from w into accumulators
        # The same 16 bias values seed all 6 rows.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b


        # Is there at least 4 halffloats (8 bytes) for prologue + epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Prologue - First group loads, no FMA
        # Loads 2 halffloats per row (two rows packed per vector) and the
        # matching 64 bytes of B. The upper half of v19 is parked in x8.
        LDR     s0, [x3], 4             // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     s1, [x10], 4            // A2
        LDR     s2, [x12], 4            // A4
        LD1     {v0.s}[2], [x9], 4      // A1
        LD1     {v1.s}[2], [x11], 4     // A3
        LD1     {v2.s}[2], [x4], 4      // A5
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // ins is in BLOCK 0
        SUBS    x0, x0, 8

        # Is there at least 4 halffloats (8 bytes) for main loop?
        B.LO    2f

        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 12 LD32 A + 16 LD64 B (8 vectors of B)
1:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x3], 4             // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     w8, [x9], 4             // A1
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.8h, v16.8h, v1.h[4]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]

        # BLOCK 2
        LDR     s4, [x10], 4            // A2
        INS     v12.d[1], x8            // B  ins
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     w8, [x11], 4            // A3
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]

        # BLOCK 3
        LDR     s5, [x12], 4            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.8h, v17.8h, v1.h[4]
        LDR     w8, [x4], 4             // A5
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.8h, v18.8h, v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.8h, v18.8h, v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     s0, [x3], 4             // A0
        FMLA    v20.8h, v12.8h, v3.h[0]
        LDR     w8, [x9], 4             // A1
        FMLA    v22.8h, v12.8h, v3.h[4]
        FMLA    v24.8h, v12.8h, v4.h[0]

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // A1 ins
        FMLA    v26.8h, v12.8h, v4.h[4]
        LDR     x8, [x5, 72]            // B
        FMLA    v28.8h, v12.8h, v5.h[0]
        FMLA    v30.8h, v12.8h, v5.h[4]

        # BLOCK 2
        LDR     s1, [x10], 4            // A2
        INS     v16.d[1], x8            // B
        FMLA    v21.8h, v13.8h, v3.h[0]
        LDR     w8, [x11], 4            // A3
        FMLA    v23.8h, v13.8h, v3.h[4]
        FMLA    v25.8h, v13.8h, v4.h[0]

        # BLOCK 3
        LDR     s2, [x12], 4            // A4
        INS     v1.d[1], x8             // A3 ins
        FMLA    v27.8h, v13.8h, v4.h[4]
        LDR     w8, [x4], 4             // A5
        FMLA    v29.8h, v13.8h, v5.h[0]
        FMLA    v31.8h, v13.8h, v5.h[4]

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // A5 ins
        FMLA    v20.8h, v14.8h, v3.h[1]
        LDR     x8, [x5, 88]
        FMLA    v22.8h, v14.8h, v3.h[5]
        FMLA    v24.8h, v14.8h, v4.h[1]

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // B
        FMLA    v26.8h, v14.8h, v4.h[5]
        LDR     x8, [x5, 104]
        FMLA    v28.8h, v14.8h, v5.h[1]
        FMLA    v30.8h, v14.8h, v5.h[5]

        # BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // B
        FMLA    v21.8h, v15.8h, v3.h[1]
        LDR     x8, [x5, 120]           // upper half of v19, inserted in the next BLOCK 0
        FMLA    v23.8h, v15.8h, v3.h[5]
        FMLA    v25.8h, v15.8h, v4.h[1]

        # BLOCK 7
        SUBS    x0, x0, 8               // LDR lands here
        FMLA    v27.8h, v15.8h, v4.h[5]
        FMLA    v29.8h, v15.8h, v5.h[1]
        ADD     x5, x5, 128             // ADD and FMLA leave the SUBS flags intact
        FMLA    v31.8h, v15.8h, v5.h[5]
        B.HS    1b

        # Epilogue - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 LD32 A + 8 LD64 B (4 vectors of B)
        # Same schedule as the main loop, except that the second group does
        # not load anything for a following iteration.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x3], 4             // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     w8, [x9], 4             // A1
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.8h, v16.8h, v1.h[4]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]

        # BLOCK 2
        LDR     s4, [x10], 4            // A2
        INS     v12.d[1], x8            // B  ins
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     w8, [x11], 4            // A3
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]

        # BLOCK 3
        LDR     s5, [x12], 4            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.8h, v17.8h, v1.h[4]
        LDR     w8, [x4], 4             // A5
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.8h, v18.8h, v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.8h, v18.8h, v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.8h, v12.8h, v3.h[0]
        FMLA    v22.8h, v12.8h, v3.h[4]
        FMLA    v24.8h, v12.8h, v4.h[0]

        # BLOCK 1
        FMLA    v26.8h, v12.8h, v4.h[4]
        FMLA    v28.8h, v12.8h, v5.h[0]
        FMLA    v30.8h, v12.8h, v5.h[4]

        # BLOCK 2
        FMLA    v21.8h, v13.8h, v3.h[0]
        FMLA    v23.8h, v13.8h, v3.h[4]
        FMLA    v25.8h, v13.8h, v4.h[0]

        # BLOCK 3
        FMLA    v27.8h, v13.8h, v4.h[4]
        FMLA    v29.8h, v13.8h, v5.h[0]
        FMLA    v31.8h, v13.8h, v5.h[4]

        # BLOCK 4
        FMLA    v20.8h, v14.8h, v3.h[1]
        FMLA    v22.8h, v14.8h, v3.h[5]
        FMLA    v24.8h, v14.8h, v4.h[1]

        # BLOCK 5
        FMLA    v26.8h, v14.8h, v4.h[5]
        FMLA    v28.8h, v14.8h, v5.h[1]
        FMLA    v30.8h, v14.8h, v5.h[5]
        TST     x0, 7                   // flags are consumed by B.NE below

        # BLOCK 6
        FMLA    v21.8h, v15.8h, v3.h[1]
        FMLA    v23.8h, v15.8h, v3.h[5]
        FMLA    v25.8h, v15.8h, v4.h[1]
        ADD     x5, x5, 64              // skip the B data read at [x5, 0..63] above

        # BLOCK 7
        FMLA    v27.8h, v15.8h, v4.h[5]
        FMLA    v29.8h, v15.8h, v5.h[1]
        FMLA    v31.8h, v15.8h, v5.h[5]

        # Is there a remainder?- 2 halffloats of A (4 bytes) or less
        B.NE    4f

3:
        # Clamp
        # v4 = lower bound and v5 = upper bound, broadcast from v6.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        LDR     x0, [sp, 32]            // cn_stride (32 = size of the d12-d15 spill)
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # Fewer than 16 columns left: take the partial store path instead.
        B.LO    6f

        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x0
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b}, [x7], x0
        SUB     x4, x4, x2              // a5 -= kc

        # ST1 and SUB do not set flags: HI still means columns remain.
        B.HI    0b

        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

        # Remainder of k. x0 is the k counter biased by a multiple of 8, so
        # bits 2 and 1 still equal those of the remaining byte count.
4:
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBZ     x0, 2, 5f

        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0, [x3], 4             // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     s1, [x10], 4            // A2
        LDR     s2, [x12], 4            // A4
        LD1     {v0.s}[2], [x9], 4      // A1
        LD1     {v1.s}[2], [x11], 4     // A3
        LD1     {v2.s}[2], [x4], 4      // A5
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v1.h[4]
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]
        FMLA    v27.8h, v17.8h, v1.h[4]
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]
        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]
        FMLA    v26.8h, v18.8h, v1.h[5]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]
        FMLA    v21.8h, v19.8h, v0.h[1]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBZ     x0, 1, 3b
5:
        # Reached either by falling through (bit 1 set) or from the TBZ at
        # label 4 (bit 2 clear); bit 1 is not tested again in the second case.

        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0, [x3], 2             // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     h1, [x10], 2            // A2
        LDR     h2, [x12], 2            // A4
        LD1     {v0.h}[4], [x9], 2      // A1
        LD1     {v1.h}[4], [x11], 2     // A3
        LD1     {v2.h}[4], [x4], 2      // A5
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v1.h[4]
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]
        FMLA    v27.8h, v17.8h, v1.h[4]
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]
        B       3b

        # Store odd width
        # x1 = nc - 16 here, so its low 4 bits equal those of the remaining
        # column count. Each step stores the low part of every row and then
        # moves the not yet stored elements down to lane 0.
6:
        TBZ     x1, 3, 7f               // 8 columns
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

7:
        TBZ     x1, 2, 8f               // 4 columns
        STR     d20, [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

8:
        TBZ     x1, 1, 9f               // 2 columns
        STR     s20, [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30, [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

9:
        TBZ     x1, 0, 10f              // 1 column
        STR     h20, [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30, [x7]
10:
        # Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif