// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Overview (derived from the code below):
#   For each group of 8 output columns the 5x8 accumulators are seeded from
#   the 8 floats at w, updated with A * B over kc bytes of every A row,
#   clamped with FMAX against v30 and FMIN against v31, then stored.
#   - kc is counted in bytes (32 bytes = 8 floats per unrolled step). Only
#     bits 2-4 of kc select remainder steps; bits 0-1 are never tested.
#   - Row pointers at or beyond mr alias the previous row, so all 5 rows are
#     always computed and stored.
#   - x0 (mr) is overwritten with the k byte counter once the row pointers
#     have been set up.
#   - NOTE(review): this file is generated; mirror any comment edits in the
#     template named at the top.

# Not referenced by this kernel: v10 v11 (d10-d11 are therefore not saved).
# NOTE(review): this block previously listed x4, x7, v30 and v31 as unused,
# but x4 is a_stride, x7 becomes c4 and v30/v31 hold the clamp bounds.
# Presumably inherited from a 6-row variant - confirm against the template.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # (48-byte frame; d10-d11 are not used by this kernel)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer cm_stride)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values: first float at params is replicated into v30
        # (operand of FMAX), second into v31 (operand of FMIN)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16            // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32             // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # Totals: 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31                  // any k remainder? consumed by B.NE below

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8              // nc -= 8; flags feed B.LO 7f and B.HI 0b
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8 (fewer than 8 columns left goes to 7)
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2              // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2              // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2            // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x11, x11, x2            // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x12, x12, x2            // a4 -= kc

        # Columns remain: next group of 8 (flags still from SUBS x1, x1, 8)
        B.HI 0b

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # K remainder.  Only multiples of 32 were subtracted from x0, so its
        # low 5 bits still equal kc % 32 on every path that reaches here.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width.  x1 holds nc - 8 here; subtracting 8 leaves the low
        # 3 bits of nc intact, so bits 2/1/0 select 4/2/1 trailing floats.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b        // upper 4 columns move down for later stores
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]           // high pair moves down for the 1-float store
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x13], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif