// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128(
//     size_t mr,                         x0
//     size_t nc,                         x1
//     size_t kc,                         x2 / x0
//     size_t ks,                         x3 / x9
//     const float**restrict a,           x4
//     const void*restrict w,             x5
//     uint8_t*restrict c,                x6
//     size_t cm_stride,                  x7
//     size_t cn_stride,                  [sp] -> (x0)
//     size_t a_offset,                   [sp + 8] -> x11
//     const float* zero,                 [sp + 16] -> x12
//     const xnn_f32_minmax_params params [sp + 24] -> x8
//
// Stack offsets above are relative to sp on entry. Once x20-x23 have been
// pushed, every incoming stack argument sits 32 bytes higher.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// A pointers (reloaded from the indirection buffer on every ks step)
//   x14 a0
//   x15 a1
//   x20 a2
//   x21 a3
//   x22 a4
//   x23 a5

// C pointers
//   x6  c0
//   x16 c1
//   x17 c2
//   x10 c3
//   x13 c4
//   x7  c5

// Vector register usage
//   A0   v0
//   A1   v1
//   A2   v2
//   A3   v3
//   A4   v4
//   A5   v5
//   B   v16 v17 v18 v19
//   C   v20 v21            (row 0)
//   C   v22 v23            (row 1)
//   C   v24 v25            (row 2)
//   C   v26 v27            (row 3)
//   C   v28 v29            (row 4)
//   C   v30 v31            (row 5)
//   Clamp v6 v7            (v6 = lower bound, v7 = upper bound)
//   unused A   v8 v9 v10 v11
//   unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128

        // Pick up the zero pointer (x12) and the params pointer (x8) from the
        // incoming stack arguments at [sp + 16] and [sp + 24]
        LDP x12, x8, [sp, 16]

        // Derive the six C row pointers. A row index at or beyond mr reuses
        // the pointer of the row before it.
        CMP x0, 2                // mr < 2 ?
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   then c1 = c0

        // Replicate the two consecutive 32-bit values at params across all
        // lanes: first value -> v6, second value -> v7
        LD2R {v6.4s, v7.4s}, [x8]

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // mr <= 2 ? (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   then c2 = c1

        // Push callee-saved x20/x21 and reserve 32 bytes of stack
        STP x20, x21, [sp, -32]!

        CMP x0, 4                // mr < 4 ?
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   then c3 = c2

        // Callee-saved x22/x23 go in the upper half of the reserved area
        STP x22, x23, [sp, 16]

        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // mr <= 4 ? (flags still from CMP x0, 4)
        CSEL x13, x10, x13, LS   //   then c4 = c3

        // a_offset: was [sp + 8] on entry, now 32 bytes higher
        LDR x11, [sp, 40]

        CMP x0, 6                // mr < 6 ?
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   then c5 = c4

0:
        // nc loop entry: read 8 floats from w and copy them into the
        // accumulators of all six rows
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3               // p = ks

1:
        // ks loop entry: fetch the next six A row pointers from the
        // indirection buffer
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        // Apply a_offset to every row pointer except those equal to zero,
        // which are used as-is
        CMP x14, x12             // a0 == zero ?
        ADD x14, x14, x11        // a0 + a_offset
        CSEL x14, x12, x14, EQ   // a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP x15, x12             // a1 == zero ?
        ADD x15, x15, x11        // a1 + a_offset
        CSEL x15, x12, x15, EQ   // a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP x20, x12             // a2 == zero ?
        ADD x20, x20, x11        // a2 + a_offset
        CSEL x20, x12, x20, EQ   // a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP x21, x12             // a3 == zero ?
        ADD x21, x21, x11        // a3 + a_offset
        CSEL x21, x12, x21, EQ   // a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP x22, x12             // a4 == zero ?
        ADD x22, x22, x11        // a4 + a_offset
        CSEL x22, x12, x22, EQ   // a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP x23, x12             // a5 == zero ?
        ADD x23, x23, x11        // a5 + a_offset
        CSEL x23, x12, x23, EQ   // a5 = (a5 == zero) ? zero : a5 + a_offset

        // Skip the main loop when kc covers fewer than 4 floats (16 bytes)
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f

        // Main loop: each pass consumes 4 floats (16 bytes) from every A row
        // 48 FMA + 6 ld128 A + 4 LDP B
2:
        LDP  q16,  q17, [x5], 32
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16

        // k element 0 with B in v16/v17
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        // k element 1 with B in v18/v19
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        // k element 2 with B in v16/v17
        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        // k element 3 with B in v18/v19
        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16          // k -= 16; sets the flags for B.HS below
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 2b

        // Any of the low 4 bits of k set means 1 to 3 floats of A remain
        TST x0, 15
        B.NE 4f

3:
        // ks loop: one step consumed 6 pointers of 8 bytes each
        SUBS x9, x9, 48          // p -= MR * sizeof(void*)
        B.HI 1b

        // Clamp: lower bound (v6) for all accumulators, then upper bound (v7)
        FMAX v20.4s, v20.4s, v6.4s
        // cn_stride: was [sp] on entry, now 32 bytes higher
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO and B.HI
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        // Fewer than 8 columns were left: take the partial-width store path
        B.LO 6f

        // Full 6 x 8 store; each C pointer then advances by cn_stride
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21,  [x6]
        ADD x6, x6, x0

        SUB x4, x4, x3           // a -= ks (rewind the indirection pointer)

        // nc loop: continue while columns remain (flags from SUBS x1 above)
        B.HI 0b

        // Pop x20-x23 and release the 32 bytes of stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

4:
        // Bit 3 of k clear means no 2-float piece; go to the 1-float piece
        TBZ x0, 3, 5f

        // Remainder: 2 floats (8 bytes) from every A row
        LDR   d0, [x14], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1, [x15], 8
        LDR   d2, [x20], 8
        LDR   d3, [x21], 8
        LDR   d4, [x22], 8
        LDR   d5, [x23], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        // Bit 2 of k clear means no 1-float piece; return to the ks loop
        TBZ x0, 2, 3b

        // Remainder: 1 float (4 bytes) from every A row
5:
        LDR   s0, [x14], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1, [x15], 4
        LDR   s2, [x20], 4
        LDR   s3, [x21], 4
        LDR   s4, [x22], 4
        LDR   s5, [x23], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 3b

        // Partial-width store: bits 2, 1 and 0 of x1 select 4, 2 and 1 columns
6:
        // 4 columns: store the low accumulator of each row, then move the
        // high accumulator down so the next steps store from the same register
        TBZ x1, 2, 7f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
7:
        // 2 columns: store the low 64 bits, then bring the upper 64 bits down
        TBZ x1, 1, 8f
        STR d30,  [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20,  [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

8:
        // 1 column: store the lowest float of each row
        TBZ x1, 0, 9f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
9:
        // Pop x20-x23 and release the 32 bytes of stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif