// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

// Summary (derived from the code below):
//   Each pass of the outer loop (label 0) produces one tile of up to 6 rows by
//   16 halffloat columns. The 12 accumulators v20-v31 are seeded from the
//   first 32 bytes at w (every row starts from the same 16 values). For each
//   step along k, 16 more halffloats are read from w and multiplied by one
//   halffloat taken from each of the 6 A rows. The result is clamped with the
//   two halffloats read from params and stored to the 6 C rows.
//
// Units as used by the code:
//   kc         bytes of one A row (2 bytes per halffloat); it is compared
//              against 8 for 4 halffloats and subtracted from the A pointers.
//   nc         halffloat columns; 16 are consumed per full 32-byte store.
//   a_stride, cm_stride, cn_stride
//              byte offsets, added directly to pointers.
//
// Row handling:
//   Rows at index >= mr are not skipped. Their A and C pointers are clamped to
//   the previous row, so they recompute and rewrite that row.
//
// Register reuse:
//   x0 holds mr only during pointer setup and is then the k counter.
//   x8 holds the params pointer until params are loaded, then cn_stride.
//   x4 and x7 (a_stride, cm_stride) become the row 5 pointers a5 and c5.
//   v4 and v5 hold A4 and A5 while accumulating and the clamp bounds afterwards.
//
// Weights:
//   w (x5) only ever advances, so the data for the next 16 columns is read
//   from directly behind the data of the current tile.
//
// NOTE(review): kc == 0 is not special-cased (label 5 would still consume one
//   halffloat), and bit 0 of kc is never tested. Presumably callers guarantee
//   kc != 0 and kc being a multiple of 2 - confirm.
// NOTE(review): the placement of loads between the FMLAs is presumably tuned
//   for Cortex-A75, as the kernel name suggests. Keep the instruction order.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1  x9 v1
# A2 x10 v2
# A3 x11 v3
# A4 x12 v4
# A5  x4 v5

# B   x5 v16 v17 v18 v19

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x14 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs: LO for the odd row (1, 3, 5) and
        // LS for the even row that follows (2, 4). The ADD and LDR instructions
        // in between do not write the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params
        // 32-bit load: lane h[0] is the lower clamp bound and lane h[1] the
        // upper clamp bound (see FMAX / FMIN at label 3).
        LDR s6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 no longer holds a_stride)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride (params pointer is no longer needed)

        // Outer loop over blocks of 16 columns.
0:
        # Load initial bias from w into accumulators
        // All 6 rows start from the same 16 halffloats.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 4 halffloats (8 bytes)?
        // On the branch x0 is negative, but subtracting 8 leaves bits 2..0
        // equal to those of kc, which is all the remainder code tests.
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        # Prologue - load 4 A and 2 B
        // A4 and A5 are not loaded here; they are loaded at the top of the
        // main loop body and of the epilogue.

        LDR d0, [x3], 8          // A0
        LDR q16, [x5], 16        // B0
        LDR q17, [x5], 16        // B1
        LDR d1, [x9], 8          // A1
        LDR d2, [x10], 8         // A2
        LDR d3, [x11], 8         // A3

        # Is there at least 4 halffloats for main loop?
        // The main loop preloads data for the following group, so it only runs
        // while another full group of 4 halffloats exists beyond the one
        // already loaded.
        SUBS x0, x0, 8
        B.LO 2f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # Per iteration: 48 FMLA + 6 LDR d of A (64-bit each) + 8 LDR q of B
1:
        // k step 0: B0/B1 (v16, v17) times lane 0 of every A row.
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8         // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8          // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16        // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16        // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        SUBS x0, x0, 8           // flags are consumed by B.HS at the loop end;
                                 // FMLA and LDR in between do not write them

        // k step 1: B2/B3 (v18, v19) times lane 1.
        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16        // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16        // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        // k step 2: B4/B5 (v16, v17) times lane 2.
        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16        // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16        // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        // k step 3: B6/B7 (v18, v19) times lane 3. B0/B1 and A0-A3 of the next
        // group are loaded here; each A register is reloaded only after its
        // last use in this step.
        LDR q16, [x5], 16        // B0
        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        LDR q17, [x5], 16        // B1
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        LDR d0, [x3], 8          // A0
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        LDR d1, [x9], 8          // A1
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        LDR d2, [x10], 8         // A2
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        LDR d3, [x11], 8         // A3
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]
        B.HS 1b                  // repeat while the SUBS above did not borrow

        # Epilogue - same as main loop but no loads for next loop
        // Entered with A0-A3 and B0/B1 already loaded, either by the prologue
        // or by the last main loop iteration.
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8         // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8          // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16        // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16        // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        ADDS x0, x0, 8           // x0 = leftover bytes of A; Z is set when none
                                 // remain and is consumed by B.NE below

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16        // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16        // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16        // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16        // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]

        # Is there a remainder?- 1-3 halffloats of A (2-6 bytes)
        B.NE 4f                  // flags from ADDS x0, x0, 8 above

3:
        # Clamp
        // v4 and v5 are free again here (A4/A5 are fully consumed), so the two
        // bounds are broadcast into them on every pass through this label.
        DUP v4.8h, v6.h[0]       // lower bound in all 8 lanes
        DUP v5.8h, v6.h[1]       // upper bound in all 8 lanes
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16; flags feed B.LO and B.HI below
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 6f                  // fewer than 16 columns were left: partial store

        // Each C pointer advances by cn_stride and each A pointer is rewound by
        // kc so the same A rows are reused for the next block of columns.
        ST1 {v20.16b, v21.16b}, [x6], x8
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v30.16b, v31.16b}, [x7], x8
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // columns remain (flags still from SUBS x1, x1, 16)
        RET

        # Remainder- 1-3 halffloats of A (2-6 bytes)
        // Only bits 2 and 1 of x0 are tested. Bit 2 selects a group of two
        // halffloats (4 bytes), bit 1 a single halffloat (2 bytes).
4:
        TBZ x0, 2, 5f            // no group of two: go to the single halffloat
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        TBZ x0, 1, 3b            // no single halffloat left: clamp and store

        // Single remaining halffloat of A per row.
5:
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
        // x1 is negative here (16 was already subtracted), but bits 3..0 still
        // equal those of the remaining column count. Each step stores the low
        // part of the even-numbered accumulator of every row and then moves
        // the not yet stored part down for the next step.
6:
        TBZ x1, 3, 7f            // 8 columns
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

7:
        TBZ x1, 2, 8f            // 4 columns
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

8:
        TBZ x1, 1, 9f            // 2 columns
        STR s20, [x6], 4
        STR s22, [x16], 4
        DUP s20, v20.s[1]
        DUP s22, v22.s[1]
        STR s24, [x17], 4
        STR s26, [x14], 4
        DUP s24, v24.s[1]
        DUP s26, v26.s[1]
        STR s28, [x13], 4
        STR s30, [x7], 4
        DUP s28, v28.s[1]
        DUP s30, v30.s[1]

9:
        TBZ x1, 0, 10f           // 1 column
        STR h20, [x6]
        STR h22, [x16]
        STR h24, [x17]
        STR h26, [x14]
        STR h28, [x13]
        STR h30, [x7]
10:
        // A partial block is always the last one, so return without looping.
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif