// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# Half-precision indirect GEMM micro-kernel with output clamping.
#
# Each pass of the nc loop accumulates a tile of up to 6 rows x 16 columns
# of halffloats (two 8-lane vectors per row), clamps it, and stores it
# through six row pointers derived from c and cm_stride.
#
# Loop structure, outermost first:
#   0: nc loop - one 16-column tile per pass. The accumulators are seeded
#      from the first 32 bytes at w.
#   1: ks loop - consumes 6 A pointers (48 bytes of the a array) per pass.
#   2: k loop  - consumes 4 bytes (2 halffloats) of every A row per pass,
#      with a 2-byte remainder handled at label 5.
#
# Notes derived from the code below:
#   - x0 holds mr on entry and is reused as the k counter once the C
#     pointers have been clamped.
#   - x8 first holds the params pointer and is then overwritten with
#     cn_stride.
#   - An A pointer equal to zero is used as is. Every other A pointer has
#     a_offset added to it.
#   - params is read as one 32-bit word. Halffloat 0 is the operand of FMAX
#     (lower bound) and halffloat 1 is the operand of FMIN (upper bound).
#   - kc and ks are treated as byte counts. NOTE(review): the remainder test
#     assumes kc is a multiple of 2 bytes - confirm against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x14  v0
# A1  x15  v1
# A2  x20  v2
# A3  x21  v3
# A4  x22  v4
# A5  x23  v5

# B    x5 v16 v17 v18 v19

# C0   x6 v20 v21
# C1  x16 v22 v23
# C2  x17 v24 v25
# C3  x10 v26 v27
# C4  x13 v28 v29
# C5   x7 v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

        # Load zero, params pointer.
        # All stack arguments are read before the pre-indexed STP further
        # down moves sp.
        LDP     x12, x8, [sp, 16]

        # Clamp C pointers: rows at or beyond mr alias the previous row pointer
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        # Load params: 32 bits holding two halffloats, v6.h[0] and v6.h[1]
        LDR     s6, [x8]

        CMP     x0, 4                   // if mr < 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3
        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDP     x8, x11, [sp]           // load cn_stride, a_offset (x8 no longer holds params)

        # Save x20-x23 on stack (callee-saved registers used as A2-A5 pointers)
        STP     x20, x21, [sp, -32]!
        STP     x22, x23, [sp, 16]

0:
        # Load initial bias from w into accumulators.
        # Row 0 (v20, v21) is loaded from memory and copied into rows 1-5.
        # The copies are interleaved with prefetches of the B data that follows.
        LDP     q20, q21, [x5], 32
        MOV     x9, x3                  // p = ks
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

1:
        # Load next 6 A pointers (advances a by 48 bytes)
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        # A pointer equal to zero is kept unchanged, any other gets a_offset added
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Are there at least 2 halffloats (4 bytes)?
        # From here on x0 is the k counter, mr is no longer needed.
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    5f                      // kc < 4: only the 1 halffloat remainder

        # Prologue - load 4 A and 2 B
        LDR     s0, [x14], 4            // A0
        LDR     q16, [x5], 16           // B
        LDR     q17, [x5], 16           // B
        LDR     s1, [x15], 4            // A1
        LDR     s2, [x20], 4            // A2
        LDR     s3, [x21], 4            // A3

        # Are there at least 2 more halffloats for the main loop?
        SUBS    x0, x0, 4
        B.LO    3f                      // no: run the epilogue on what was just loaded

        .p2align 3
        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 8 ld64 B (4 LDR d + 4 LD1 upper-lane loads)
        # First half multiplies by lane h[0] using v16/v17 while v18/v19 are
        # loaded. Second half multiplies by lane h[1] using v18/v19 while
        # v16/v17 and A0-A3 are loaded for the next iteration.
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x22], 4            // A4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5, [x23], 4            // A5
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     d18, [x5], 8            // B0
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LD1     {v18.d}[1], [x5], 8     // B1
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     d19, [x5], 8            // B2
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LD1     {v19.d}[1], [x5], 8     // B3
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        SUBS    x0, x0, 4               // flags consumed by B.HS at the loop end

        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     d16, [x5], 8            // B0
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LD1     {v16.d}[1], [x5], 8     // B1
        FMLA    v22.8h, v18.8h,  v1.h[1]
        LDR     d17, [x5], 8            // B2
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LD1     {v17.d}[1], [x5], 8     // B3
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        LDR     s0, [x14], 4            // A0
        FMLA    v28.8h, v18.8h,  v4.h[1]
        LDR     s1, [x15], 4            // A1
        FMLA    v29.8h, v19.8h,  v4.h[1]
        LDR     s2, [x20], 4            // A2
        FMLA    v30.8h, v18.8h,  v5.h[1]
        LDR     s3, [x21], 4            // A3
        FMLA    v31.8h, v19.8h,  v5.h[1]
        B.HS    2b

        # Epilogue - same as main loop but no loads for next loop
        # (A4, A5 and the second pair of B vectors are still loaded here)
3:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x22], 4            // A4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5, [x23], 4            // A5
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     d18, [x5], 8            // B0
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LD1     {v18.d}[1], [x5], 8     // B1
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     d19, [x5], 8            // B2
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LD1     {v19.d}[1], [x5], 8     // B3
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        # x0 is negative here, bit 1 is set when 2 bytes of kc are left over.
        TBNZ    x0, 1, 5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp: v4 = broadcast of params halffloat 0 (lower bound, FMAX),
        #        v5 = broadcast of params halffloat 1 (upper bound, FMIN).
        # v4/v5 are free to reuse because the A data is no longer needed.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16, flags are used by B.LO and B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16 (fewer than 16 columns left: go to the odd width path)
        B.LO    6f

        # Rows are stored from row 5 down to row 0. Each C pointer is then
        # advanced by cn_stride.
        ST1     {v30.16b, v31.16b},  [x7], x8
        ST1     {v28.16b, v29.16b}, [x13], x8
        ST1     {v26.16b, v27.16b}, [x10], x8
        ST1     {v24.16b, v25.16b}, [x17], x8
        ST1     {v22.16b, v23.16b}, [x16], x8
        ST1     {v20.16b, v21.16b},  [x6], x8

        SUB     x4, x4, x3              // a -= ks (rewind the A pointer array, flags untouched)

        # nc loop (still testing the flags from SUBS x1 above)
        B.HI    0b

        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

5:
        # Remainder- 1 halffloat of A (2 bytes)
        # Reached either directly when kc < 4 or after the epilogue. Loads
        # one halffloat per A row and two fresh B vectors, then rejoins the
        # ks loop at label 4.
        LDR     h0, [x14], 2            // A0
        LDR     q16, [x5], 16           // B
        LDR     q17, [x5], 16           // B
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     h1, [x15], 2            // A1
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     h2, [x20], 2            // A2
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     h3, [x21], 2            // A3
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LDR     h4, [x22], 2            // A4
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     h5, [x23], 2            // A5
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       4b

        # Store odd width
        # The low 4 bits of x1 select stores of 8, 4, 2 and 1 halffloats per
        # row. After each partial store the remaining data is moved down into
        # the low part of the first register of the row.
6:
        TBZ     x1, 3, 7f               // 8 columns (16 bytes)?
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
7:
        TBZ     x1, 2, 8f               // 4 columns (8 bytes)?
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

8:
        TBZ     x1, 1, 9f               // 2 columns (4 bytes)?
        STR     s30,  [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x10], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20,  [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

9:
        TBZ     x1, 0, 10f              // 1 column (2 bytes)?
        STR     h30,  [x7]
        STR     h28, [x13]
        STR     h26, [x10]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20,  [x6]
10:
        # Restore x20-x23 from stack
        LDP     x22, x23, [sp, 16]
        LDP     x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif