// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Half-precision (f16) indirect GEMM microkernel with min/max clamping.
# Each pass of the nc loop produces an output tile of up to 6 rows by
# 16 halffloat columns: accumulators are seeded from w, then for every group
# of 6 row pointers read from a, kc bytes of each A row are multiplied with
# the packed B data that follows in w.
#
# Label map (numeric local labels):
#   0      nc loop head: seed accumulators from w, reset p = ks
#   1      ks loop head: fetch and adjust the next 6 A row pointers
#   2, 3   main loop and epilogue, 4 halffloats of K per pass
#   5, 6   K remainder of 2 halffloats and 1 halffloat
#   4      ks loop tail, clamp, full-width store, nc loop tail
#   7-11   partial-width store of 8/4/2/1 columns, then return

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x14  v0
# A1  x15  v1
# A2  x20  v2
# A3  x21  v3
# A4  x22  v4
# A5  x23  v5

# B    x5  v16 v17 v18 v19

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x10  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6, (v4), (v5)
# unused      v7
# unused A    v8 v9 v10 v11
# unused B    v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load zero, params pointer
        # Stack arguments are read relative to the incoming sp, i.e. before
        # the register save below moves sp. x8 holds the params pointer only
        # until the LDR s6 below; it is then reused for cn_stride.
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers
        # A row index at or beyond mr reuses the pointer of the row before it.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        # Load params
        # One 32-bit load brings in two halffloats: v6.h[0] is later used as
        # the FMAX operand (lower bound) and v6.h[1] as the FMIN operand
        # (upper bound).
        LDR     s6, [x8]

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3
        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDP     x8, x11, [sp]           // load cn_stride, a_offset

        # Save x20-x23 on stack
        # They hold the A2-A5 row pointers below. 32 bytes are reserved:
        # x20/x21 at [sp], x22/x23 at [sp, 16].
        STP     x20, x21, [sp, -32]!
        STP     x22, x23, [sp, 16]

0:
        # Load initial bias from w into accumulators
        # v20/v21 are read from w (16 halffloats) and copied into the
        # accumulator pairs of the other five rows. The copies are
        # interleaved with prefetches of the B data that follows in w.
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

1:
        # Load next 6 A pointers
        # x4 advances by 48 bytes (6 pointers) per ks iteration.
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A row pointer equal to the zero argument is used as is; every other
        # row pointer has a_offset added.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Is there at least 4 halffloats (8 bytes)?
        # When this branch is taken x0 = kc - 8; subtracting 8 leaves bits 1
        # and 2 unchanged, which is all the remainder code at 5 tests.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded at the top of the main loop / epilogue body.

        LDR     d0, [x14], 8            // A0
        LDR     q16, [x5], 16           // B0
        LDR     q17, [x5], 16           // B1
        LDR     d1, [x15], 8            // A1
        LDR     d2, [x20], 8            // A2
        LDR     d3, [x21], 8            // A3

        # Is there at least 4 halffloats for main loop?
        SUBS    x0, x0, 8
        B.LO    3f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 LDR d (64-bit) A + 8 LDR q B
        # The A0-A3 and B0/B1 loads near the end of the body fetch data for
        # the next pass (or for the epilogue).
2:
        // K element 0: v16/v17 x A.h[0]
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     d4, [x22], 8            // A4
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v23.8h, v17.8h, v1.h[0]
        LDR     d5, [x23], 8            // A5
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v25.8h, v17.8h, v2.h[0]
        LDR     q18, [x5], 16           // B2
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        LDR     q19, [x5], 16           // B3
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]
        SUBS    x0, x0, 8               // sets the flags tested by B.HS below

        // K element 1: v18/v19 x A.h[1]
        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     q16, [x5], 16           // B4
        FMLA    v22.8h, v18.8h, v1.h[1]
        FMLA    v23.8h, v19.8h, v1.h[1]
        LDR     q17, [x5], 16           // B5
        FMLA    v24.8h, v18.8h, v2.h[1]
        FMLA    v25.8h, v19.8h, v2.h[1]
        FMLA    v26.8h, v18.8h, v3.h[1]
        FMLA    v27.8h, v19.8h, v3.h[1]
        FMLA    v28.8h, v18.8h, v4.h[1]
        FMLA    v29.8h, v19.8h, v4.h[1]
        FMLA    v30.8h, v18.8h, v5.h[1]
        FMLA    v31.8h, v19.8h, v5.h[1]

        // K element 2: v16/v17 x A.h[2]
        FMLA    v20.8h, v16.8h, v0.h[2]
        FMLA    v21.8h, v17.8h, v0.h[2]
        LDR     q18, [x5], 16           // B6
        FMLA    v22.8h, v16.8h, v1.h[2]
        FMLA    v23.8h, v17.8h, v1.h[2]
        LDR     q19, [x5], 16           // B7
        FMLA    v24.8h, v16.8h, v2.h[2]
        FMLA    v25.8h, v17.8h, v2.h[2]
        FMLA    v26.8h, v16.8h, v3.h[2]
        FMLA    v27.8h, v17.8h, v3.h[2]
        FMLA    v28.8h, v16.8h, v4.h[2]
        FMLA    v29.8h, v17.8h, v4.h[2]
        FMLA    v30.8h, v16.8h, v5.h[2]
        FMLA    v31.8h, v17.8h, v5.h[2]

        // K element 3: v18/v19 x A.h[3], interleaved with next-pass loads
        LDR     q16, [x5], 16           // B0
        FMLA    v20.8h, v18.8h, v0.h[3]
        FMLA    v21.8h, v19.8h, v0.h[3]
        LDR     q17, [x5], 16           // B1
        FMLA    v22.8h, v18.8h, v1.h[3]
        FMLA    v23.8h, v19.8h, v1.h[3]
        LDR     d0, [x14], 8            // A0
        FMLA    v24.8h, v18.8h, v2.h[3]
        FMLA    v25.8h, v19.8h, v2.h[3]
        LDR     d1, [x15], 8            // A1
        FMLA    v26.8h, v18.8h, v3.h[3]
        FMLA    v27.8h, v19.8h, v3.h[3]
        LDR     d2, [x20], 8            // A2
        FMLA    v28.8h, v18.8h, v4.h[3]
        FMLA    v29.8h, v19.8h, v4.h[3]
        LDR     d3, [x21], 8            // A3
        FMLA    v30.8h, v18.8h, v5.h[3]
        FMLA    v31.8h, v19.8h, v5.h[3]
        B.HS    2b

        # Epilogue - same as main loop but no loads for next loop
3:
        // K element 0: v16/v17 x A.h[0]
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     d4, [x22], 8            // A4
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v23.8h, v17.8h, v1.h[0]
        LDR     d5, [x23], 8            // A5
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v25.8h, v17.8h, v2.h[0]
        LDR     q18, [x5], 16           // B2
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        LDR     q19, [x5], 16           // B3
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]
        ADDS    x0, x0, 8               // remaining k bytes; sets the flags tested by B.NE below

        // K element 1: v18/v19 x A.h[1]
        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v21.8h, v19.8h, v0.h[1]
        LDR     q16, [x5], 16           // B4
        FMLA    v22.8h, v18.8h, v1.h[1]
        FMLA    v23.8h, v19.8h, v1.h[1]
        LDR     q17, [x5], 16           // B5
        FMLA    v24.8h, v18.8h, v2.h[1]
        FMLA    v25.8h, v19.8h, v2.h[1]
        FMLA    v26.8h, v18.8h, v3.h[1]
        FMLA    v27.8h, v19.8h, v3.h[1]
        FMLA    v28.8h, v18.8h, v4.h[1]
        FMLA    v29.8h, v19.8h, v4.h[1]
        FMLA    v30.8h, v18.8h, v5.h[1]
        FMLA    v31.8h, v19.8h, v5.h[1]

        // K element 2: v16/v17 x A.h[2]
        FMLA    v20.8h, v16.8h, v0.h[2]
        FMLA    v21.8h, v17.8h, v0.h[2]
        LDR     q18, [x5], 16           // B6
        FMLA    v22.8h, v16.8h, v1.h[2]
        FMLA    v23.8h, v17.8h, v1.h[2]
        LDR     q19, [x5], 16           // B7
        FMLA    v24.8h, v16.8h, v2.h[2]
        FMLA    v25.8h, v17.8h, v2.h[2]
        FMLA    v26.8h, v16.8h, v3.h[2]
        FMLA    v27.8h, v17.8h, v3.h[2]
        FMLA    v28.8h, v16.8h, v4.h[2]
        FMLA    v29.8h, v17.8h, v4.h[2]
        FMLA    v30.8h, v16.8h, v5.h[2]
        FMLA    v31.8h, v17.8h, v5.h[2]

        // K element 3: v18/v19 x A.h[3]
        FMLA    v20.8h, v18.8h, v0.h[3]
        FMLA    v21.8h, v19.8h, v0.h[3]
        FMLA    v22.8h, v18.8h, v1.h[3]
        FMLA    v23.8h, v19.8h, v1.h[3]
        FMLA    v24.8h, v18.8h, v2.h[3]
        FMLA    v25.8h, v19.8h, v2.h[3]
        FMLA    v26.8h, v18.8h, v3.h[3]
        FMLA    v27.8h, v19.8h, v3.h[3]
        FMLA    v28.8h, v18.8h, v4.h[3]
        FMLA    v29.8h, v19.8h, v4.h[3]
        FMLA    v30.8h, v18.8h, v5.h[3]
        FMLA    v31.8h, v19.8h, v5.h[3]

        # Is there a remainder?- 1-3 halffloats of A (2-6 bytes)
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        # Broadcast the two bounds, then apply FMAX with v4 followed by FMIN
        # with v5 to all 12 accumulators. The SUBS on nc is placed between
        # them; its flags are consumed by B.LO and B.HI further down.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # Fewer than 16 columns left: take the partial-width path at 7.
        B.LO    7f

        # Rows are written from C5 down to C0, each pointer then advancing by
        # cn_stride. When row pointers alias (mr < 6) the lowest-numbered
        # aliased row is therefore the last one written.
        ST1     {v30.16b, v31.16b}, [x7], x8
        ST1     {v28.16b, v29.16b}, [x13], x8
        ST1     {v26.16b, v27.16b}, [x10], x8
        ST1     {v24.16b, v25.16b}, [x17], x8
        ST1     {v22.16b, v23.16b}, [x16], x8
        ST1     {v20.16b, v21.16b}, [x6], x8

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of the SUBS on x1 above.
        B.HI    0b

        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

        # Remainder- 1-3 halffloats of A (2-6 bytes)
        # Bit 2 of x0 selects a 2-halffloat step, bit 1 a 1-halffloat step.
5:
        TBZ     x0, 2, 6f
        LDR     s0, [x14], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v1.h[0]
        FMLA    v25.8h, v17.8h, v2.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]

        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v22.8h, v18.8h, v1.h[1]
        FMLA    v24.8h, v18.8h, v2.h[1]
        FMLA    v26.8h, v18.8h, v3.h[1]
        FMLA    v28.8h, v18.8h, v4.h[1]
        FMLA    v30.8h, v18.8h, v5.h[1]
        FMLA    v21.8h, v19.8h, v0.h[1]
        FMLA    v23.8h, v19.8h, v1.h[1]
        FMLA    v25.8h, v19.8h, v2.h[1]
        FMLA    v27.8h, v19.8h, v3.h[1]
        FMLA    v29.8h, v19.8h, v4.h[1]
        FMLA    v31.8h, v19.8h, v5.h[1]
        TBZ     x0, 1, 4b               // no single trailing halffloat: back to ks loop tail

        # Remainder - 1 halffloat of A (2 bytes)
6:
        LDR     h0, [x14], 2
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     h1, [x15], 2
        LDR     h2, [x20], 2
        LDR     h3, [x21], 2
        LDR     h4, [x22], 2
        LDR     h5, [x23], 2
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v1.h[0]
        FMLA    v25.8h, v17.8h, v2.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]
        B       4b

        # Store odd width
        # x1 is nc - 16 here; its low four bits give the number of columns
        # still to write. Each step stores the low part of the first
        # accumulator of every row and then shifts the unwritten part down.
7:
        TBZ     x1, 3, 8f               // 8 columns (16 bytes per row)
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 2, 9f               // 4 columns (8 bytes per row)
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 1, 10f              // 2 columns (4 bytes per row)
        STR     s30, [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x10], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20, [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

10:
        TBZ     x1, 0, 11f              // 1 column (2 bytes per row)
        STR     h30, [x7]
        STR     h28, [x13]
        STR     h26, [x10]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20, [x6]
11:
        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif