// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Syntax: GNU as, AArch64 (file is run through the C preprocessor).
//
// void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
//     size_t mr,                         x0
//     size_t nc,                         x1
//     size_t kc,                         x2 / x0
//     size_t ks,                         x3 / x9
//     const float** a,                   x4
//     const void* w,                     x5
//     uint8_t* c,                        x6
//     size_t cm_stride,                  x7
//     size_t cn_stride,                  [sp] -> (x0)
//     size_t a_offset,                   [sp + 8] -> x11
//     const float* zero,                 [sp + 16] -> x12
//     const xnn_f32_minmax_params params [sp + 24] -> x8
//
// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel saves and restores d8-d15 and x20-x23 in a 96 byte frame, so
// the stack arguments listed above are read at [sp + 96 + offset].
//
// x0 is recycled: it holds mr on entry, then the k byte counter inside the
// ks loop, then cn_stride while a tile is being stored.
//
// A pointers
//   x14 a0
//   x15 a1
//   x20 a2
//   x21 a3
//   x22 a4
//   x23 a5
//
// C pointers
//   x6  c0
//   x16 c1
//   x17 c2
//   x10 c3
//   x13 c4
//   x7  c5
//
// Vector register usage
//   A0   v0  v6
//   A1   v1  v7
//   A2   v2  v8
//   A3   v3  v9
//   A4   v4 v10
//   A5   v5 v11
//   B   v12 v13 v14 v15
//   B   v16 v17 v18 v19
//   C   v20 v21
//   C   v22 v23
//   C   v24 v25
//   C   v26 v27
//   C   v28 v29
//   C   v30 v31
//   Clamp v6 v7   (shared with the second A set, so reloaded before clamping)
//
// NOTE: the interleaving of loads and FMLA below is a deliberate schedule.
// Do not reorder instructions when editing comments.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        // Clamp C pointers / Save d8-d15 on stack.
        // Each CMP feeds two CSELs: LO handles the odd row, LS the even row.
        CMP x0, 2                // if mr < 2
        STP  d8,  d9, [sp, -96]! //   allocate 96 byte frame
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        STP d10, d11, [sp, 16]
        CSEL x16, x6, x16, LO    //   c1 = c0
        STP d12, d13, [sp, 32]

        ADD x17, x16, x7         // c2 = c1 + cm_stride
        STP d14, d15, [sp, 48]
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1
        STP x20, x21, [sp, 64]

        CMP x0, 4                // if mr < 4
        STP x22, x23, [sp, 80]
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3

        // Load zero, params pointer
        LDP x12, x8, [sp, 112]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        LDR x11, [sp, 104]       // Load a_offset
        CSEL x7, x13, x7, LO     //   c5 = c4

        // nc loop entry: one pass per tile of up to 8 output columns.
0:
        // Load initial bias from w into accumulators.
        // All 6 rows start from the same 8 bias values.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3               // p = ks

        // ks loop entry: one pass per group of 6 A pointers.
1:
        // Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        // Pointers equal to zero are used as is; all others get a_offset added.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = (a5 == zero) ? zero : a5 + a_offset

        // Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // Only multiples of 32 are ever subtracted from x0, so its low 5 bits
        // always match kc and drive the remainder tests at labels 5, 6 and 7.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f

        // Prologue - loads for main loop of 96 FMA
        LDR q0, [x14], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        // Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        // Main loop - 8 floats of A (32 bytes) per row
        // 96 FMA + 12 LDR A + 8 LDP B
        // 64 float weights = 256 bytes.  4 cache lines.
2:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR q0, [x14], 16        // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x15], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR q2, [x20], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x21], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR q4, [x22], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x23], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        // Epilogue - 8 floats of A (32 bytes) per row
        // 96 FMA + 6 LDR A + 5 LDP B
        // First block same as main loop.  Second block has no preloads.
3:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]


        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]

        // Is there a remainder?- 4 floats of A (16 bytes) or less
        // The flags set here are consumed by B.NE after the last FMLA.
        TST x0, 31

        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        // v6/v7 are no longer needed as A data from this point on.
        LD2R {v6.4s, v7.4s}, [x8]   // Load min/max values
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

4:
        // ks loop
        SUBS x9, x9, 48          // ks -= MR * sizeof(void*)
        B.HI 1b

        // Clamp: raise to v6 first, then cap at v7.
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        LDR x0, [sp, 96]         // Load cn_stride
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8.  Nothing below changes the flags until B.LO and B.HI.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        // Store full 6 x 8 (fewer than 8 columns left goes to label 8)
        B.LO 8f

        STP q30, q31,  [x7]
        ADD x7, x7, x0           // c5 += cn_stride
        STP q28, q29, [x13]
        ADD x13, x13, x0         // c4 += cn_stride
        STP q26, q27, [x10]
        ADD x10, x10, x0         // c3 += cn_stride
        STP q24, q25, [x17]
        ADD x17, x17, x0         // c2 += cn_stride
        STP q22, q23, [x16]
        ADD x16, x16, x0         // c1 += cn_stride
        STP q20, q21,  [x6]
        ADD x6,  x6, x0          // c0 += cn_stride

        SUB x4, x4, x3           // a -= ks

        // nc loop
        B.HI 0b

        // Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96   // release the frame
        RET

        // k remainder.  Reached either directly from the ks loop entry or
        // from the end of the epilogue.
5:
        // Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        // Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        // Remainder- 4 floats of A (16 bytes)
        // Load A
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        // Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        // Remainder- 2 floats of A (8 bytes)
        // Load A
        LDR d0, [x14], 8
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        // Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        // Remainder- 1 float of A (4 bytes)
        // Load A
        LDR s0, [x14], 4
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        // Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 4b

        // Store odd width.
        // x1 already had 8 subtracted, which leaves its low 3 bits unchanged,
        // so bits 2, 1 and 0 select the 4, 2 and 1 column pieces of the tail.
8:
        TBZ x1, 2, 9f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b     // shift the upper 4 columns down
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30,  [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]        // shift the upper 2 columns down
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20,  [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        // Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96   // release the frame
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif