// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/8x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const void*restrict a,     x3
//     size_t a_stride,           x4
//     const void*restrict w,     x5
//     void*restrict c,           x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> (x8)
//     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
//
// Register reuse to be aware of:
//   x0  holds mr only while the row pointers are being set up; afterwards it
//       is the running k counter (kc - 8, decremented by 8 per main-loop pass).
//   x8  holds the params pointer until the clamp bounds are loaded, and is
//       then reloaded with cn_stride for the rest of the function.
//   x4  becomes the a7 row pointer and x7 the c7 row pointer, so a_stride and
//       cm_stride are no longer available once setup is complete.
//
// Units as used by the code below: kc is consumed in bytes (8 bytes = 4
// halffloats per main-loop pass); nc is consumed in output elements (8 per
// full-width tile, bits 2/1/0 select the 4/2/1 element tail stores).

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel saves and restores x19-x22 and does not touch v8-v15.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3
// x12 a4
// x19 a5
// x20 a6
//  x4 a7

// C pointers
//  x6 c0
// x16 c1
// x17 c2
// x14 c3
// x13 c4
// x21 c5
// x22 c6
//  x7 c7

// Vector register usage
// A0   v0
// A1   v1
// A2   v2
// A3   v3
// A4   v4
// A5   v5
// A6   v6
// A7   v7
// B   v16 v17 v18 v19
// C   v24
// C   v25
// C   v26
// C   v27
// C   v28
// C   v29
// C   v30
// C   v31
// Clamp v20 (lower bound) v21 (upper bound)
// unused A   v8 v9 v10 v11
// unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

        // Fetch the params pointer before sp moves (second stack argument)
        LDR     x8, [sp, 8]

        // Spill callee-saved x19-x22; the 32-byte frame keeps sp 16-byte aligned
        STP     x19, x20, [sp, -32]!
        STP     x21, x22, [sp, 16]

        // Build the eight A and C row pointers. A row index that is >= mr
        // aliases the previous row, so every later load and store stays
        // inside rows the caller actually supplied. Each CMP feeds two
        // pairs of CSELs: LO for the odd row, LS for the following even row.
        CMP     x0, 2                   // flags for rows 1 and 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          // mr < 2:  a1 = a0
        CSEL    x16, x6, x16, LO        // mr < 2:  c1 = c0

        // Broadcast the two consecutive halfwords at params:
        // v20 = first (applied with FMAX), v21 = second (applied with FMIN)
        LD2R    {v20.8h, v21.8h}, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        CSEL    x10, x9, x10, LS        // mr <= 2: a2 = a1
        CSEL    x17, x16, x17, LS       // mr <= 2: c2 = c1

        CMP     x0, 4                   // flags for rows 3 and 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       // mr < 4:  a3 = a2
        CSEL    x14, x17, x14, LO       // mr < 4:  c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
        CSEL    x12, x11, x12, LS       // mr <= 4: a4 = a3
        CSEL    x13, x14, x13, LS       // mr <= 4: c4 = c3

        CMP     x0, 6                   // flags for rows 5 and 6
        ADD     x19, x12, x4            // a5 = a4 + a_stride
        ADD     x21, x13, x7            // c5 = c4 + cm_stride
        CSEL    x19, x12, x19, LO       // mr < 6:  a5 = a4
        CSEL    x21, x13, x21, LO       // mr < 6:  c5 = c4

        ADD     x20, x19, x4            // a6 = a5 + a_stride
        ADD     x22, x21, x7            // c6 = c5 + cm_stride
        CSEL    x20, x19, x20, LS       // mr <= 6: a6 = a5
        CSEL    x22, x21, x22, LS       // mr <= 6: c6 = c5

        CMP     x0, 8                   // flags for row 7
        ADD     x4, x20, x4             // a7 = a6 + a_stride (x4 stops being a_stride)
        ADD     x7, x22, x7             // c7 = c6 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x20, x4, LO         // mr < 8:  a7 = a6
        CSEL    x7, x22, x7, LO         // mr < 8:  c7 = c6

        // cn_stride was at [sp] on entry; it sits above the 32-byte frame now
        LDR     x8, [sp, 32]

// Outer loop: one pass per tile of up to 8 output columns
0:
        // The first 16 bytes of each packed-weight tile seed all eight row
        // accumulators with the same 8 halffloat bias values
        LDR     q24, [x5], 16
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v24.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v24.16b

        // Are there at least 4 halffloats (8 bytes) of K to process?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        // Main loop - 4 halffloats of A (8 bytes) per row per pass
        // 32 FMA + 8 ld64 A + 4 LDR B
1:
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x19], 8
        LDR     d6, [x20], 8
        LDR     d7, [x4], 8
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SUBS    x0, x0, 8               // flags stay live until B.HS below
        // k + 0: acc[m] += B(v16) * A[m].h[0]
        FMLA    v24.8h, v16.8h, v0.h[0]
        FMLA    v25.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v2.h[0]
        FMLA    v27.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v16.8h, v5.h[0]
        FMLA    v30.8h, v16.8h, v6.h[0]
        FMLA    v31.8h, v16.8h, v7.h[0]

        // k + 1: acc[m] += B(v17) * A[m].h[1]
        FMLA    v24.8h, v17.8h, v0.h[1]
        FMLA    v25.8h, v17.8h, v1.h[1]
        FMLA    v26.8h, v17.8h, v2.h[1]
        FMLA    v27.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v29.8h, v17.8h, v5.h[1]
        FMLA    v30.8h, v17.8h, v6.h[1]
        FMLA    v31.8h, v17.8h, v7.h[1]

        // k + 2: acc[m] += B(v18) * A[m].h[2]
        FMLA    v24.8h, v18.8h, v0.h[2]
        FMLA    v25.8h, v18.8h, v1.h[2]
        FMLA    v26.8h, v18.8h, v2.h[2]
        FMLA    v27.8h, v18.8h, v3.h[2]
        FMLA    v28.8h, v18.8h, v4.h[2]
        FMLA    v29.8h, v18.8h, v5.h[2]
        FMLA    v30.8h, v18.8h, v6.h[2]
        FMLA    v31.8h, v18.8h, v7.h[2]

        // k + 3: acc[m] += B(v19) * A[m].h[3]
        FMLA    v24.8h, v19.8h, v0.h[3]
        FMLA    v25.8h, v19.8h, v1.h[3]
        FMLA    v26.8h, v19.8h, v2.h[3]
        FMLA    v27.8h, v19.8h, v3.h[3]
        FMLA    v28.8h, v19.8h, v4.h[3]
        FMLA    v29.8h, v19.8h, v5.h[3]
        FMLA    v30.8h, v19.8h, v6.h[3]
        FMLA    v31.8h, v19.8h, v7.h[3]
        B.HS    1b                      // another full 8 bytes of K left

        // x0 is negative here; subtracting multiples of 8 never touched its
        // bits 2 and 1, so they still give the leftover byte count of kc.
        // Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        // Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f

// Epilogue for one tile: clamp, then store
2:
        // Clamp every accumulator to [v20, v21]
        FMAX    v24.8h, v24.8h, v20.8h
        FMAX    v25.8h, v25.8h, v20.8h
        FMAX    v26.8h, v26.8h, v20.8h
        FMAX    v27.8h, v27.8h, v20.8h
        FMAX    v28.8h, v28.8h, v20.8h
        FMAX    v29.8h, v29.8h, v20.8h
        FMAX    v30.8h, v30.8h, v20.8h
        FMAX    v31.8h, v31.8h, v20.8h
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO and B.HI below
        FMIN    v24.8h, v24.8h, v21.8h
        FMIN    v25.8h, v25.8h, v21.8h
        FMIN    v26.8h, v26.8h, v21.8h
        FMIN    v27.8h, v27.8h, v21.8h
        FMIN    v28.8h, v28.8h, v21.8h
        FMIN    v29.8h, v29.8h, v21.8h
        FMIN    v30.8h, v30.8h, v21.8h
        FMIN    v31.8h, v31.8h, v21.8h

        // Fewer than 8 columns left: take the partial-width store path
        B.LO    6f

        // Store full 8 x 8. Each C pointer advances by cn_stride, and each A
        // pointer is rewound by kc so the next column tile re-reads the same
        // rows. Neither ST1 nor SUB writes the flags.
        ST1     {v24.16b}, [x6], x8
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v25.16b}, [x16], x8
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v26.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v29.16b}, [x21], x8
        SUB     x19, x19, x2            // a5 -= kc
        ST1     {v30.16b}, [x22], x8
        SUB     x20, x20, x2            // a6 -= kc
        ST1     {v31.16b}, [x7], x8
        SUB     x4, x4, x2              // a7 -= kc

        B.HI    0b                      // columns remain after this tile

        // Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

// kc < 8 bytes: go straight to the remainder handlers
3:
        TBZ     x0, 2, 5f               // no 4-byte piece, so handle the 2-byte piece
4:
        // Remainder - 2 halffloats of A (4 bytes)
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x19], 4
        LDR     s6, [x20], 4
        LDR     s7, [x4], 4

        FMLA    v24.8h, v16.8h, v0.h[0]
        FMLA    v25.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v2.h[0]
        FMLA    v27.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v16.8h, v5.h[0]
        FMLA    v30.8h, v16.8h, v6.h[0]
        FMLA    v31.8h, v16.8h, v7.h[0]

        FMLA    v24.8h, v17.8h, v0.h[1]
        FMLA    v25.8h, v17.8h, v1.h[1]
        FMLA    v26.8h, v17.8h, v2.h[1]
        FMLA    v27.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v29.8h, v17.8h, v5.h[1]
        FMLA    v30.8h, v17.8h, v6.h[1]
        FMLA    v31.8h, v17.8h, v7.h[1]
        TBZ     x0, 1, 2b               // no final halffloat: go clamp and store

5:
        // Remainder - 1 halffloat of A (2 bytes)
        LDR     h0, [x3], 2
        LDR     q16, [x5], 16
        LDR     h1, [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5, [x19], 2
        LDR     h6, [x20], 2
        LDR     h7, [x4], 2

        FMLA    v24.8h, v16.8h, v0.h[0]
        FMLA    v25.8h, v16.8h, v1.h[0]
        FMLA    v26.8h, v16.8h, v2.h[0]
        FMLA    v27.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v16.8h, v5.h[0]
        FMLA    v30.8h, v16.8h, v6.h[0]
        FMLA    v31.8h, v16.8h, v7.h[0]
        B       2b

        // Store odd width. x1 is negative here (nc - 8), but its low three
        // bits are unchanged by the subtraction, so bits 2/1/0 select the
        // 4/2/1 element stores. After each partial store the not-yet-written
        // upper lanes are moved down to lane 0 for the next, narrower store.
6:
        TBZ     x1, 2, 7f
        // 4 halffloats (8 bytes) per row
        STR     d24, [x6], 8
        STR     d25, [x16], 8
        DUP     d24, v24.d[1]
        DUP     d25, v25.d[1]
        STR     d26, [x17], 8
        STR     d27, [x14], 8
        DUP     d26, v26.d[1]
        DUP     d27, v27.d[1]
        STR     d28, [x13], 8
        STR     d29, [x21], 8
        DUP     d28, v28.d[1]
        DUP     d29, v29.d[1]
        STR     d30, [x22], 8
        STR     d31, [x7], 8
        DUP     d30, v30.d[1]
        DUP     d31, v31.d[1]
7:
        TBZ     x1, 1, 8f
        // 2 halffloats (4 bytes) per row
        STR     s24, [x6], 4
        STR     s25, [x16], 4
        DUP     s24, v24.s[1]
        DUP     s25, v25.s[1]
        STR     s26, [x17], 4
        STR     s27, [x14], 4
        DUP     s26, v26.s[1]
        DUP     s27, v27.s[1]
        STR     s28, [x13], 4
        STR     s29, [x21], 4
        DUP     s28, v28.s[1]
        DUP     s29, v29.s[1]
        STR     s30, [x22], 4
        STR     s31, [x7], 4
        DUP     s30, v30.s[1]
        DUP     s31, v31.s[1]

8:
        TBZ     x1, 0, 9f
        // 1 halffloat (2 bytes) per row; no pointer update, nothing follows
        STR     h24, [x6]
        STR     h25, [x16]
        STR     h26, [x17]
        STR     h27, [x14]
        STR     h28, [x13]
        STR     h29, [x21]
        STR     h30, [x22]
        STR     h31, [x7]
9:
        // Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif