// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
$else:
  #     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1  x9 v1
# A2 x10 v2
# A3 x11 v3
# A4 x12 v4
# A5  x4 v5

# B   x5 v16 v17 v18 v19

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x14 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6, (v4), (v5)
# unused v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      // a1 = a0
        CSEL x16, x6, x16, LO    // c1 = c0

        # Load params
        LDR s6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    // a2 = a1
        CSEL x17, x16, x17, LS   // c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   // a3 = a2
        CSEL x14, x17, x14, LO   // c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   // a4 = a3
        CSEL x13, x14, x13, LS   // c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     // a5 = a4
        CSEL x7, x13, x7, LO     // c5 = c4

        LDR x8, [sp]             // load cn_stride

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          MOV v23.16b, v21.16b
          MOV v24.16b, v20.16b
          MOV v25.16b, v21.16b
          MOV v26.16b, v20.16b
          MOV v27.16b, v21.16b
          MOV v28.16b, v20.16b
          MOV v29.16b, v21.16b
          MOV v30.16b, v20.16b
          MOV v31.16b, v21.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        # Prologue - load 4 A and 2 B

        LDR d0, [x3], 8          // A0
        LDR q16, [x5], 16        // B0
        LDR q17, [x5], 16        // B1
        LDR d1, [x9], 8          // A1
        LDR d2, [x10], 8         // A2
        LDR d3, [x11], 8         // A3
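
        # The main loop below is software pipelined: the prologue above preloads
        # A0-A3 and B0-B1, and each iteration reloads them for the next one.
        # The epilogue at 2: repeats the loop body without those reloads.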

        # Are there at least 4 halffloats for main loop?
        SUBS x0, x0, 8
        B.LO 2f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 ld64 A + 8 LDR B
1:
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8         // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8          // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16        // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16        // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        SUBS x0, x0, 8

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16        // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16        // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16        // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16        // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        LDR q16, [x5], 16        // B0
        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        LDR q17, [x5], 16        // B1
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        LDR d0, [x3], 8          // A0
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        LDR d1, [x9], 8          // A1
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        LDR d2, [x10], 8         // A2
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        LDR d3, [x11], 8         // A3
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]
        B.HS 1b

        # Epilogue - same as main loop but no loads for next loop
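        # ADDS below restores k (bytes of A remaining), so the B.NE after the
        # final FMA block branches to the remainder code when 1-3 halffloats
        # of A are left over.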
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8         // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8          // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16        // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16        // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        ADDS x0, x0, 8

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16        // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16        // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16        // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16        // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]

        # Is there a remainder? - 1-3 halffloats of A (2-6 bytes)
        B.NE 4f

3:
        # Clamp
        DUP v4.8h, v6.h[0]
        DUP v5.8h, v6.h[1]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 6f

        $if INC:
          ST1 {v30.16b, v31.16b}, [x7], x8
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x8
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x8
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x8
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x8
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x8
          SUB x4, x4, x2           // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x8
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x8
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x8
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x8
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x8
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x8
          SUB x4, x4, x2           // a5 -= kc

        B.HI 0b
        RET

        # Remainder - 1-3 halffloats of A (2-6 bytes)
4:
        TBZ x0, 2, 5f
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        TBZ x0, 1, 3b
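
        # Remainder - 1 halffloat of A (2 bytes)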
5:
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
6:
        TBZ x1, 3, 7f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

7:
        TBZ x1, 2, 8f
        $if INC:
          STR d30, [x7], 8
          STR d28, [x13], 8
          DUP d30, v30.d[1]
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          STR d24, [x17], 8
          DUP d26, v26.d[1]
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          STR d20, [x6], 8
          DUP d22, v22.d[1]
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          STR d22, [x16], 8
          DUP d20, v20.d[1]
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          STR d26, [x14], 8
          DUP d24, v24.d[1]
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          STR d30, [x7], 8
          DUP d28, v28.d[1]
          DUP d30, v30.d[1]

8:
        TBZ x1, 1, 9f
        $if INC:
          STR s30, [x7], 4
          STR s28, [x13], 4
          DUP s30, v30.s[1]
          DUP s28, v28.s[1]
          STR s26, [x14], 4
          STR s24, [x17], 4
          DUP s26, v26.s[1]
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          STR s20, [x6], 4
          DUP s22, v22.s[1]
          DUP s20, v20.s[1]
        $else:
          STR s20, [x6], 4
          STR s22, [x16], 4
          DUP s20, v20.s[1]
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          STR s26, [x14], 4
          DUP s24, v24.s[1]
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          STR s30, [x7], 4
          DUP s28, v28.s[1]
          DUP s30, v30.s[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR h30, [x7]
          STR h28, [x13]
          STR h26, [x14]
          STR h24, [x17]
          STR h22, [x16]
          STR h20, [x6]
        $else:
          STR h20, [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x14]
          STR h28, [x13]
          STR h30, [x7]
10:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif