// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
# )
#
# Summary of what the code below does:
#   For each block of 8 output columns, ten accumulators (5 rows x 2 vectors of
#   4 floats) are seeded from the first 32 bytes read from w, updated with FMLA
#   products of A values (reached through the pointer array a) and B values
#   streamed from w, clamped between the two floats loaded from params, and
#   stored through the row pointers c0-c4.
#   x0 arrives holding mr; once the C pointers are clamped it is reused as the
#   remaining-k byte counter.  x8 first holds the params pointer and is later
#   reused as the a4 pointer.

# NOTE: the list below names a sixth row (a5 / c5), so it appears to describe
# what was removed from a 6-row kernel (presumably the 6x8 variant - confirm
# against the template).  In this file x7 holds c4, x13 holds c3, v30/v31 hold
# the clamp bounds, and x23, v10, v11 are never referenced.
# 5x8 strips the following out of 5x8
# x23 a5
# x7 c5 x13 unused
# A5  v10 v11
# C   v30 v31

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
#  x8 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75

        # Clamp C pointers / Save d8-d9, d12-d15 on stack
        # The pre-indexed STP reserves a 64 byte frame:
        #   [sp, 0]  d8,  d9
        #   [sp, 16] d12, d13
        #   [sp, 32] d14, d15
        #   [sp, 48] x20, x21
        # The stack arguments from the signature are therefore read at
        # [sp, 64] and above from here on.  d10/d11 are not saved because
        # v10/v11 are never written by this function.
        STP     d8, d9, [sp, -64]!
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d12, d13, [sp, 16]
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d14, d15, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x13, x17, x7            // c3 = c2 + cm_stride
        CSEL    x13, x17, x13, LO       //   c3 = c2

        # Load zero, params pointer
        LDP     x12, x8, [sp, 80]
        ADD     x7, x13, x7             // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x7, x13, x7, LS         //   c4 = c3

        # Save x20,x21 on stack
        STP     x20, x21, [sp, 48]

        # Load clamp values
        # LD2R reads two consecutive floats at [x8] and broadcasts the first to
        # every lane of v30 and the second to every lane of v31.  v30 is used
        # below as the FMAX operand (lower bound) and v31 as the FMIN operand
        # (upper bound).
        LD2R    {v30.4s, v31.4s}, [x8]

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp, 64]

0:
        # Load initial bias from w into accumulators
        # Every row starts from the same 8 values (v20/v21), copied to the
        # other four rows.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]

        MOV     x9, x3                  // p = ks

1:
        # Load next 5 A pointers
        # x4 advances by 5 * 8 = 40 bytes per pass, matching the 40 subtracted
        # from x9 at label 4.
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDR     x8, [x4], 8

        # Each pointer equal to zero (x12) is left pointing at zero; every
        # other pointer is advanced by a_offset.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP     x8, x12                 // if a4 == zero
        ADD     x8, x8, x11             // a4 += a_offset
        CSEL    x8, x12, x8, EQ         //   a4 = (a4 == zero) ? zero : a4 + a_offset

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 only ever has multiples of 32 subtracted from kc, so its low five
        # bits stay equal to those of kc; the remainder code at labels 5-7
        # relies on that when it tests bits 4, 3 and 2.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 80 FMA
        LDR     q0, [x14], 16
        LDR     q2, [x15], 16
        LDR     q4, [x20], 16
        LDR     q6, [x21], 16
        LDR     q8, [x8], 16
        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
        # Loads are interleaved between the FMLAs; each load overwrites a
        # register only after its last FMLA use in the preceding group.
2:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        PRFM    PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        LDR     q1, [x14], 16           // Load next 5 A

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        LDR     q3, [x15], 16
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        LDR     q5, [x20], 16
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        LDR     q7, [x21], 16
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        LDR     q9, [x8], 16
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v1.s[0]
        FMLA    v22.4s, v12.4s, v3.s[0]
        FMLA    v24.4s, v12.4s, v5.s[0]
        LDR     q0, [x14], 16           // Load next 5 A
        FMLA    v26.4s, v12.4s, v7.s[0]
        FMLA    v28.4s, v12.4s, v9.s[0]
        FMLA    v21.4s, v13.4s, v1.s[0]
        LDR     q2, [x15], 16
        FMLA    v23.4s, v13.4s, v3.s[0]
        FMLA    v25.4s, v13.4s, v5.s[0]
        FMLA    v27.4s, v13.4s, v7.s[0]
        LDR     q4, [x20], 16
        FMLA    v29.4s, v13.4s, v9.s[0]

        FMLA    v20.4s, v14.4s, v1.s[1]
        FMLA    v22.4s, v14.4s, v3.s[1]
        LDR     q6, [x21], 16
        FMLA    v24.4s, v14.4s, v5.s[1]
        FMLA    v26.4s, v14.4s, v7.s[1]
        FMLA    v28.4s, v14.4s, v9.s[1]
        LDR     q8, [x8], 16
        FMLA    v21.4s, v15.4s, v1.s[1]
        FMLA    v23.4s, v15.4s, v3.s[1]
        FMLA    v25.4s, v15.4s, v5.s[1]
        LDP     q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s, v7.s[1]
        FMLA    v29.4s, v15.4s, v9.s[1]

        FMLA    v20.4s, v16.4s, v1.s[2]
        FMLA    v22.4s, v16.4s, v3.s[2]
        FMLA    v24.4s, v16.4s, v5.s[2]
        FMLA    v26.4s, v16.4s, v7.s[2]
        FMLA    v28.4s, v16.4s, v9.s[2]
        FMLA    v21.4s, v17.4s, v1.s[2]
        FMLA    v23.4s, v17.4s, v3.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v25.4s, v17.4s, v5.s[2]
        FMLA    v27.4s, v17.4s, v7.s[2]
        FMLA    v29.4s, v17.4s, v9.s[2]
        LDP     q16, q17, [x5], 32

        FMLA    v20.4s, v18.4s, v1.s[3]
        FMLA    v22.4s, v18.4s, v3.s[3]
        SUBS    x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA    v24.4s, v18.4s, v5.s[3]
        FMLA    v26.4s, v18.4s, v7.s[3]
        FMLA    v28.4s, v18.4s, v9.s[3]
        FMLA    v21.4s, v19.4s, v1.s[3]
        FMLA    v23.4s, v19.4s, v3.s[3]
        FMLA    v25.4s, v19.4s, v5.s[3]
        FMLA    v27.4s, v19.4s, v7.s[3]
        FMLA    v29.4s, v19.4s, v9.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        PRFM    PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        LDR     q1, [x14], 16           // Load next 5 A

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        LDR     q3, [x15], 16
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        LDR     q5, [x20], 16
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        LDR     q7, [x21], 16
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        LDR     q9, [x8], 16
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v1.s[0]
        FMLA    v22.4s, v12.4s, v3.s[0]
        FMLA    v24.4s, v12.4s, v5.s[0]
        FMLA    v26.4s, v12.4s, v7.s[0]
        FMLA    v28.4s, v12.4s, v9.s[0]
        FMLA    v21.4s, v13.4s, v1.s[0]
        FMLA    v23.4s, v13.4s, v3.s[0]
        FMLA    v25.4s, v13.4s, v5.s[0]
        FMLA    v27.4s, v13.4s, v7.s[0]
        FMLA    v29.4s, v13.4s, v9.s[0]

        FMLA    v20.4s, v14.4s, v1.s[1]
        FMLA    v22.4s, v14.4s, v3.s[1]
        FMLA    v24.4s, v14.4s, v5.s[1]
        FMLA    v26.4s, v14.4s, v7.s[1]
        FMLA    v28.4s, v14.4s, v9.s[1]
        FMLA    v21.4s, v15.4s, v1.s[1]
        FMLA    v23.4s, v15.4s, v3.s[1]
        FMLA    v25.4s, v15.4s, v5.s[1]
        FMLA    v27.4s, v15.4s, v7.s[1]
        FMLA    v29.4s, v15.4s, v9.s[1]

        FMLA    v20.4s, v16.4s, v1.s[2]
        FMLA    v22.4s, v16.4s, v3.s[2]
        FMLA    v24.4s, v16.4s, v5.s[2]
        FMLA    v26.4s, v16.4s, v7.s[2]
        FMLA    v28.4s, v16.4s, v9.s[2]
        FMLA    v21.4s, v17.4s, v1.s[2]
        FMLA    v23.4s, v17.4s, v3.s[2]
        FMLA    v25.4s, v17.4s, v5.s[2]
        FMLA    v27.4s, v17.4s, v7.s[2]
        FMLA    v29.4s, v17.4s, v9.s[2]

        FMLA    v20.4s, v18.4s, v1.s[3]
        FMLA    v22.4s, v18.4s, v3.s[3]
        FMLA    v24.4s, v18.4s, v5.s[3]
        FMLA    v26.4s, v18.4s, v7.s[3]
        FMLA    v28.4s, v18.4s, v9.s[3]
        FMLA    v21.4s, v19.4s, v1.s[3]
        FMLA    v23.4s, v19.4s, v3.s[3]
        FMLA    v25.4s, v19.4s, v5.s[3]
        FMLA    v27.4s, v19.4s, v7.s[3]
        FMLA    v29.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Any of the low five bits of k set means fewer than 32 bytes remain.
        TST     x0, 31
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 40              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        # Lower bound first (FMAX against v30), then upper bound (FMIN against v31).
        FMAX    v20.4s, v20.4s, v30.4s
        FMAX    v21.4s, v21.4s, v30.4s
        FMAX    v22.4s, v22.4s, v30.4s
        FMAX    v23.4s, v23.4s, v30.4s
        FMAX    v24.4s, v24.4s, v30.4s
        FMAX    v25.4s, v25.4s, v30.4s
        FMAX    v26.4s, v26.4s, v30.4s
        FMAX    v27.4s, v27.4s, v30.4s
        FMAX    v28.4s, v28.4s, v30.4s
        FMAX    v29.4s, v29.4s, v30.4s
        FMIN    v20.4s, v20.4s, v31.4s
        FMIN    v21.4s, v21.4s, v31.4s
        FMIN    v22.4s, v22.4s, v31.4s
        FMIN    v23.4s, v23.4s, v31.4s
        FMIN    v24.4s, v24.4s, v31.4s
        FMIN    v25.4s, v25.4s, v31.4s
        FMIN    v26.4s, v26.4s, v31.4s
        FMIN    v27.4s, v27.4s, v31.4s
        FMIN    v28.4s, v28.4s, v31.4s
        FMIN    v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        # The flags set by this SUBS are also consumed by the B.HI at the end
        # of the store sequence; STP, ADD and SUB in between do not set flags.
        SUBS    x1, x1, 8
        B.LO    8f

        # Rows are stored from c4 down to c0; each C pointer then advances by
        # cn_stride.
        STP     q28, q29, [x7]
        ADD     x7, x7, x10
        STP     q26, q27, [x13]
        ADD     x13, x13, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21, [x6]
        ADD     x6, x6, x10

        # Rewind the A pointer array for the next block of 8 columns.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Taken while columns remain after the 8 just stored.
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q2, [x15], 16
        LDR     q4, [x20], 16
        LDR     q6, [x21], 16
        LDR     q8, [x8], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d2, [x15], 8
        LDR     d4, [x20], 8
        LDR     d6, [x21], 8
        LDR     d8, [x8], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        FMLA    v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s2, [x15], 4
        LDR     s4, [x20], 4
        LDR     s6, [x21], 4
        LDR     s8, [x8], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        B       4b

        # Store odd width
        # Reached when fewer than 8 columns remain.  x1 has had 8 subtracted,
        # which leaves its low three bits equal to those of the remaining
        # column count, so bits 2, 1 and 0 select stores of 4, 2 and 1 floats.
        # After each partial store the not-yet-stored values are moved down
        # into the register or lanes that the next step stores from.
8:
        TBZ     x1, 2, 9f
        STR     q28, [x7], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x13], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d28, [x7], 8
        STR     d26, [x13], 8
        DUP     d28, v28.d[1]
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        STR     d22, [x16], 8
        DUP     d24, v24.d[1]
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s28, [x7]
        STR     s26, [x13]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif