// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# Overview, as implemented below:
#   - Each pass of the nc loop (label 0) produces a tile of 6 rows by 16
#     half-precision columns, held in two 8-lane accumulators per row.
#   - The accumulators start from the 32 bytes read at w. Each step of the
#     ks loop (label 1) then reads 6 row pointers from a. A pointer equal to
#     zero is used unchanged, any other pointer has a_offset added to it.
#   - kc is consumed in bytes: 8 bytes (4 halffloats) per main loop pass and
#     per epilogue, then a 4 byte and a 2 byte remainder (labels 5 and 6).
#   - Results are clamped with FMAX against params h[0] and FMIN against
#     params h[1], then stored. A pass with fewer than 16 columns left takes
#     the partial store path (label 7) and returns.
#   - Row pointers for rows at or beyond mr are set equal to the previous row
#     pointer. Rows are stored from row 5 down to row 0, so when pointers
#     coincide the lowest row is the last one written.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses and preserves d12-d15 and x20-x23.

# Register usage
# A0  x14  v0     v3
# A1  x15  v0[1]  v3[1]
# A2  x20  v1     v4
# A3  x21  v1[1]  v4[1]
# A4  x22  v2     v5
# A5  x23  v2[1]  v5[1]

# B    x5  v12 v13 v14 v15 second set of B
# B        v16 v17 v18 v19 first set

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x10  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6, (v4), (v5)
# unused v7 v8 v9 v10 v11

# x8 temporary vector shadow register
# (x8 first holds the params pointer; once v6 is loaded it is reused to carry
#  the upper 64 bits of a B vector, or a 32 bit A value, into an INS.)

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        # Load params
        LDR     s6, [x8]                // v6.h[0], v6.h[1]: two halffloat clamp bounds

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3
        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load a_offset
        LDR     x11, [sp, 8]

        # Save x20-x23, d12-d15 on stack
        # sp drops by 64 here, so the stack arguments move up by 64 bytes
        # (cn_stride is read later from [sp, 64]).
        STP     d12, d13, [sp, -64]!
        STP     d14, d15, [sp, 16]
        STP     x20, x21, [sp, 32]
        STP     x22, x23, [sp, 48]

0:
        # nc loop entry: one pass per block of 16 output columns
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b        // rows 1-5 start from the same bias as row 0
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

1:
        # ks loop entry
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # CMP sets the flags, ADD leaves them alone, CSEL consumes them.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = (a5 == zero) ? zero : a5 + a_offset

        # Are there at least 4 halffloats (8 bytes) for prologue + epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f                      // kc < 8: remainder handling only

        # Prologue - First group loads, no FMA
        # Each A register holds two rows: lanes h[0..1] for the even row and
        # lanes h[4..5] for the odd row.
        LDR     s0, [x14], 4            // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     s1, [x20], 4            // A2
        LDR     s2, [x22], 4            // A4
        LD1     {v0.s}[2], [x15], 4     // A1
        LD1     {v1.s}[2], [x21], 4     // A3
        LD1     {v2.s}[2], [x23], 4     // A5
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8            // lower half of v19
        LDR     x8, [x5], 8             // upper half of v19; ins is in BLOCK 0
        SUBS    x0, x0, 8

        # Are there at least 4 halffloats (8 bytes) for main loop?
        B.LO    3f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 12 LD32 A + 8 LDR B
        # Each 128 bit B vector is loaded as LDR d (lower half) plus LDR x8
        # (upper half) and merged with INS in a later block.
        # NOTE(review): presumably this split suits the Cortex-A55 load
        # pipeline; that is not established by this file.
2:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x14], 4            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     w8, [x15], 4            // A1
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.8h, v16.8h, v1.h[4]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]

        # BLOCK 2
        LDR     s4, [x20], 4            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     w8, [x21], 4            // A3
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]

        # BLOCK 3
        LDR     s5, [x22], 4            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.8h, v17.8h, v1.h[4]
        LDR     w8, [x23], 4            // A5
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.8h, v18.8h, v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.8h, v18.8h, v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Second group of 24 FMA, First group of loads
        # BLOCK 0
        LDR     s0, [x14], 4            // A0
        FMLA    v20.8h, v12.8h, v3.h[0]
        LDR     w8, [x15], 4            // A1
        FMLA    v22.8h, v12.8h, v3.h[4]
        FMLA    v24.8h, v12.8h, v4.h[0]

        # BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // A1 ins
        FMLA    v26.8h, v12.8h, v4.h[4]
        LDR     x8, [x5, 72]            // B
        FMLA    v28.8h, v12.8h, v5.h[0]
        FMLA    v30.8h, v12.8h, v5.h[4]

        # BLOCK 2
        LDR     s1, [x20], 4            // A2
        INS     v16.d[1], x8            // B
        FMLA    v21.8h, v13.8h, v3.h[0]
        LDR     w8, [x21], 4            // A3
        FMLA    v23.8h, v13.8h, v3.h[4]
        FMLA    v25.8h, v13.8h, v4.h[0]

        # BLOCK 3
        LDR     s2, [x22], 4            // A4
        INS     v1.d[1], x8             // A3 ins
        FMLA    v27.8h, v13.8h, v4.h[4]
        LDR     w8, [x23], 4            // A5
        FMLA    v29.8h, v13.8h, v5.h[0]
        FMLA    v31.8h, v13.8h, v5.h[4]

        # BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // A5 ins
        FMLA    v20.8h, v14.8h, v3.h[1]
        LDR     x8, [x5, 88]
        FMLA    v22.8h, v14.8h, v3.h[5]
        FMLA    v24.8h, v14.8h, v4.h[1]

        # BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // B
        FMLA    v26.8h, v14.8h, v4.h[5]
        LDR     x8, [x5, 104]
        FMLA    v28.8h, v14.8h, v5.h[1]
        FMLA    v30.8h, v14.8h, v5.h[5]

        # BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // B
        FMLA    v21.8h, v15.8h, v3.h[1]
        LDR     x8, [x5, 120]           // upper half of v19; ins is in the next BLOCK 0
        FMLA    v23.8h, v15.8h, v3.h[5]
        FMLA    v25.8h, v15.8h, v4.h[1]

        # BLOCK 7
        SUBS    x0, x0, 8               // k -= 8; LDR lands here
        FMLA    v27.8h, v15.8h, v4.h[5]
        FMLA    v29.8h, v15.8h, v5.h[1]
        ADD     x5, x5, 128             // w advances past the 8 B vectors used above
        FMLA    v31.8h, v15.8h, v5.h[5]
        B.HS    2b                      // flags from the SUBS above (FMLA and ADD do not set flags)

        # Epilogue - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 LD32 A + 4 B vectors (second group loads only)
3:
        # First group of 24 FMA, Second group loads
        # BLOCK 0
        LDR     s3, [x14], 4            // A0
        INS     v19.d[1], x8            // B from second group
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     w8, [x15], 4            // A1
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]

        # BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // A1 ins
        FMLA    v26.8h, v16.8h, v1.h[4]
        LDR     x8, [x5, 8]             // B
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]

        # BLOCK 2
        LDR     s4, [x20], 4            // A2
        INS     v12.d[1], x8            // B ins
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     w8, [x21], 4            // A3
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]

        # BLOCK 3
        LDR     s5, [x22], 4            // A4
        INS     v4.d[1], x8             // A3 ins
        FMLA    v27.8h, v17.8h, v1.h[4]
        LDR     w8, [x23], 4            // A5
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]

        # BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // A5 ins
        FMLA    v20.8h, v18.8h, v0.h[1]
        LDR     x8, [x5, 24]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]

        # BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // B
        FMLA    v26.8h, v18.8h, v1.h[5]
        LDR     x8, [x5, 40]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]

        # BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // B
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     x8, [x5, 56]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]

        # BLOCK 7
        INS     v15.d[1], x8            // B
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Second group of 24 FMA, no loads
        # BLOCK 0
        FMLA    v20.8h, v12.8h, v3.h[0]
        FMLA    v22.8h, v12.8h, v3.h[4]
        FMLA    v24.8h, v12.8h, v4.h[0]

        # BLOCK 1
        FMLA    v26.8h, v12.8h, v4.h[4]
        FMLA    v28.8h, v12.8h, v5.h[0]
        FMLA    v30.8h, v12.8h, v5.h[4]

        # BLOCK 2
        FMLA    v21.8h, v13.8h, v3.h[0]
        FMLA    v23.8h, v13.8h, v3.h[4]
        FMLA    v25.8h, v13.8h, v4.h[0]

        # BLOCK 3
        FMLA    v27.8h, v13.8h, v4.h[4]
        FMLA    v29.8h, v13.8h, v5.h[0]
        FMLA    v31.8h, v13.8h, v5.h[4]

        # BLOCK 4
        FMLA    v20.8h, v14.8h, v3.h[1]
        FMLA    v22.8h, v14.8h, v3.h[5]
        FMLA    v24.8h, v14.8h, v4.h[1]

        # BLOCK 5
        FMLA    v26.8h, v14.8h, v4.h[5]
        FMLA    v28.8h, v14.8h, v5.h[1]
        FMLA    v30.8h, v14.8h, v5.h[5]
        TST     x0, 7                   // any of the low 3 bits of k set: remainder bytes left

        # BLOCK 6
        FMLA    v21.8h, v15.8h, v3.h[1]
        FMLA    v23.8h, v15.8h, v3.h[5]
        FMLA    v25.8h, v15.8h, v4.h[1]
        ADD     x5, x5, 64              // w advances past the 4 B vectors loaded in this epilogue

        # BLOCK 7
        FMLA    v27.8h, v15.8h, v4.h[5]
        FMLA    v29.8h, v15.8h, v5.h[1]
        FMLA    v31.8h, v15.8h, v5.h[5]

        # Is there a remainder?- 2 halffloats of A (4 bytes) or less
        B.NE    5f                      // flags from the TST above (FMLA and ADD do not set flags)

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b                      // repeat while p > 0

        # Clamp
        DUP     v4.8h, v6.h[0]          // bound applied with FMAX below
        DUP     v5.8h, v6.h[1]          // bound applied with FMIN below
        LDR     x0, [sp, 64]            // cn_stride (stack argument [sp], shifted by the 64 byte save area)
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags drive B.LO 7f and B.HI 0b below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO    7f                      // fewer than 16 columns were left: partial store

        # Rows are stored from row 5 down to row 0; each C pointer then
        # advances by cn_stride.
        ST1     {v30.16b, v31.16b}, [x7], x0
        ST1     {v28.16b, v29.16b}, [x13], x0
        ST1     {v26.16b, v27.16b}, [x10], x0
        ST1     {v24.16b, v25.16b}, [x17], x0
        ST1     {v22.16b, v23.16b}, [x16], x0
        ST1     {v20.16b, v21.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // flags still from SUBS x1 above (ST1 and SUB do not set flags)

        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

5:
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBZ     x0, 2, 6f               // bit 2 of k clear: no 4 byte remainder

        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0, [x14], 4            // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     s1, [x20], 4            // A2
        LDR     s2, [x22], 4            // A4
        LD1     {v0.s}[2], [x15], 4     // A1
        LD1     {v1.s}[2], [x21], 4     // A3
        LD1     {v2.s}[2], [x23], 4     // A5
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v1.h[4]
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]
        FMLA    v27.8h, v17.8h, v1.h[4]
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]
        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v22.8h, v18.8h, v0.h[5]
        FMLA    v24.8h, v18.8h, v1.h[1]
        FMLA    v26.8h, v18.8h, v1.h[5]
        FMLA    v28.8h, v18.8h, v2.h[1]
        FMLA    v30.8h, v18.8h, v2.h[5]
        FMLA    v21.8h, v19.8h, v0.h[1]
        FMLA    v23.8h, v19.8h, v0.h[5]
        FMLA    v25.8h, v19.8h, v1.h[1]
        FMLA    v27.8h, v19.8h, v1.h[5]
        FMLA    v29.8h, v19.8h, v2.h[1]
        FMLA    v31.8h, v19.8h, v2.h[5]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBZ     x0, 1, 4b               // bit 1 of k clear: back to the ks loop
6:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0, [x14], 2            // A0
        LDP     q16, q17, [x5], 32      // B
        LDR     h1, [x20], 2            // A2
        LDR     h2, [x22], 2            // A4
        LD1     {v0.h}[4], [x15], 2     // A1
        LD1     {v1.h}[4], [x21], 2     // A3
        LD1     {v2.h}[4], [x23], 2     // A5
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v0.h[4]
        FMLA    v24.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v1.h[4]
        FMLA    v28.8h, v16.8h, v2.h[0]
        FMLA    v30.8h, v16.8h, v2.h[4]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v0.h[4]
        FMLA    v25.8h, v17.8h, v1.h[0]
        FMLA    v27.8h, v17.8h, v1.h[4]
        FMLA    v29.8h, v17.8h, v2.h[0]
        FMLA    v31.8h, v17.8h, v2.h[4]
        B       4b

        # Store odd width
        # x1 holds nc - 16 here; bits 3..0 select chunks of 8, 4, 2 and 1
        # halffloats. After each chunk the remaining lanes are moved down to
        # the bottom of the first register of the row.
7:
        TBZ     x1, 3, 8f               // 8 halffloats (16 bytes) per row
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 2, 9f               // 4 halffloats (8 bytes) per row
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 1, 10f              // 2 halffloats (4 bytes) per row
        STR     s30, [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x10], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20, [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

10:
        TBZ     x1, 0, 11f              // 1 halffloat (2 bytes) per row
        STR     h30, [x7]
        STR     h28, [x13]
        STR     h26, [x10]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20, [x6]
11:
        # Restore x20-x23, d12-d15 from stack
        LDP     x22, x23, [sp, 48]
        LDP     x20, x21, [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 64
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif