// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# Register usage
# A0  x14 v0
# A1  x15 v1
# A2  x20 v2
# A3  x21 v3
# A4  x22 v4
# A5  x23 v5

# B   x5  v16 v17 v18 v19

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x10 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6, (v4), (v5)
# unused v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Clamp C pointers: rows beyond mr alias the previous row so stores
        # for them are harmless duplicates.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Load params (min/max as two consecutive halffloats -> s6)
        LDR s6, [x8]

        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3
        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDP x8, x11, [sp]        // load cn_stride, a_offset

        # Save x20-x23 on stack
        STP x20, x21, [sp, -32]!
        STP x22, x23, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV x9, x3               // p = ks
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # Pointers equal to `zero` stay at the zero buffer; all others get
        # a_offset added.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 += a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 += a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 += a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 += a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 += a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 += a_offset

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes) per iteration
        # 48 FMA + 6 ld64 A + 8 LDR B
2:
        LDR d0, [x14], 8         // A0
        LDR q16, [x5], 16        // B
        LDR q17, [x5], 16        // B
        LDR d1, [x15], 8         // A1
        LDR d2, [x20], 8         // A2
        LDR d3, [x21], 8         // A3
        LDR d4, [x22], 8         // A4
        LDR d5, [x23], 8         // A5
        LDR q18, [x5], 16        // B
        LDR q19, [x5], 16        // B
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        SUBS x0, x0, 8

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]
        B.HS 2b

        # Is there a remainder? - 1-3 halffloats of A (2-6 bytes)
        ADDS x0, x0, 8
        B.NE 4f

3:
        # ks loop
        SUBS x9, x9, 48          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp accumulators to [min, max] from params
        DUP v4.8h, v6.h[0]
        DUP v5.8h, v6.h[1]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 6f

        # Rows stored in reverse (c5..c0) to match IGEMM convention
        ST1 {v30.16b, v31.16b}, [x7], x8
        ST1 {v28.16b, v29.16b}, [x13], x8
        ST1 {v26.16b, v27.16b}, [x10], x8
        ST1 {v24.16b, v25.16b}, [x17], x8
        ST1 {v22.16b, v23.16b}, [x16], x8
        ST1 {v20.16b, v21.16b}, [x6], x8

        SUB x4, x4, x3           // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20-x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

        # Remainder - 1-3 halffloats of A (2-6 bytes)
4:
        TBZ x0, 2, 5f
        # Remainder - 2 halffloats of A (4 bytes)
        LDR s0, [x14], 4         // A0
        LDR q16, [x5], 16        // B
        LDR q17, [x5], 16        // B
        LDR s1, [x15], 4         // A1
        LDR s2, [x20], 4         // A2
        LDR s3, [x21], 4         // A3
        LDR s4, [x22], 4         // A4
        LDR s5, [x23], 4         // A5
        LDR q18, [x5], 16        // B
        LDR q19, [x5], 16        // B
        SUBS x0, x0, 4
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

5:
        TBZ x0, 1, 3b
        # Remainder - 1 halffloat of A (2 bytes)
        LDR h0, [x14], 2         // A0
        LDR q16, [x5], 16        // B
        LDR q17, [x5], 16        // B
        LDR h1, [x15], 2         // A1
        LDR h2, [x20], 2         // A2
        LDR h3, [x21], 2         // A3
        LDR h4, [x22], 2         // A4
        LDR h5, [x23], 2         // A5
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width (nc < 16): write progressively smaller pieces,
        # shifting the surviving high halves down after each store.
6:
        TBZ x1, 3, 7f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
7:
        TBZ x1, 2, 8f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

8:
        TBZ x1, 1, 9f
        STR s30, [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x10], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20, [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

9:
        TBZ x1, 0, 10f
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x10]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
10:
        # Restore x20-x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif