// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float** a,                   x4
#     const void* w,                     x5
#     uint8_t* c,                        x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// Overview (as implemented below):
//   For every block of up to 8 output columns (nc loop) the 12 accumulators
//   v20-v31 (6 rows x 8 floats) are seeded from w.  For every group of 6 A
//   pointers read from the indirection buffer a (ks loop), kc bytes of each
//   A row are multiplied against B values streamed from w and accumulated
//   with FMLA.  The accumulators are then clamped against the two values
//   loaded from params and stored to the 6 C rows.
//
// Local labels:
//   0      nc loop: seed accumulators
//   1      ks loop: load and adjust 6 A pointers
//   2      main loop, 8 floats of A per row per iteration
//   3      epilogue, last full group of 8 floats
//   4      ks loop tail, clamp, full-width store
//   5/6/7  k remainder of 4 / 2 / 1 floats
//   8/9/10 partial-width store of 4 / 2 / 1 columns
//   11     restore callee-saved registers and return

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75

        # Clamp C pointers / Save d8-d15 on stack
        // Frame after the 96-byte push: d8-d15 at [sp, 0..63], x20-x23 at
        // [sp, 64..95].  The caller's stack arguments therefore start at
        // [sp, 96]: cn_stride +96, a_offset +104, zero +112, params +120.
        // The STP/ADD instructions interleaved below do not modify the flags,
        // so each CSEL still sees the result of the preceding CMP.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -96]!
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        STP     d10, d11, [sp, 16]
        CSEL    x16, x6, x16, LO        //   c1 = c0
        STP     d12, d13, [sp, 32]

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d14, d15, [sp, 48]
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        STP     x20, x21, [sp, 64]

        CMP     x0, 4                   // if mr < 4
        STP     x22, x23, [sp, 80]
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        # Load zero, params pointer
        LDP     x12, x8, [sp, 112]

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        LDR     x11, [sp, 104]          // Load a_offset
        CSEL    x7, x13, x7, LO         //   c5 = c4

0:
        # Load initial bias from w into accumulators
        // v20/v21 hold the 8 bias floats; every other row starts as a copy.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 256]
        MOV     v28.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 320]
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // A pointer equal to `zero` is used as-is; any other pointer gets
        // a_offset added.  The ADD does not set flags, so the CSEL selects on
        // the comparison of the original pointer.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 96 FMA
        LDR     q0, [x14], 16
        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # 64 float weights = 256 bytes.  4 cache lines.
        // Loop-carried state on entry: v0-v5 hold 4 floats of each A row and
        // v12-v17 hold the first 3 B pairs; v18/v19 are loaded inside.
2:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        PRFM    PLDL1KEEP, [x5, 384]
        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        PRFM    PLDL1KEEP, [x5, 448]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // Load next 6 A
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        // Consumes v6-v11 while v0-v5 and v12-v17 are refilled for the next
        // pass (either another iteration of this loop or the epilogue).
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        LDR     q0, [x14], 16           // Load next 6 A
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        LDR     q1, [x15], 16
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        LDR     q2, [x20], 16
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]
        LDR     q3, [x21], 16

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        LDR     q4, [x22], 16
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        LDR     q5, [x23], 16
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        LDP     q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32              // flags consumed by B.HS below
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]
        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        PRFM    PLDL1KEEP, [x5, 384]
        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        PRFM    PLDL1KEEP, [x5, 448]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // Load next 6 A
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]


        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // x0 has only ever been changed by subtracting 32, so its low 5 bits
        // still equal kc & 31.  The flags set here are consumed by B.NE below.
        TST     x0, 31

        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]
        FMLA    v25.4s, v19.4s, v8.s[3]
        // v6/v7 are free from here on: the two FMLAs above were their last
        // use as A0/A1, so they can be overwritten with the clamp values.
        LD2R    {v6.4s, v7.4s}, [x8]    // Load min/max values
        FMLA    v27.4s, v19.4s, v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        // v6 is applied with FMAX (lower bound), v7 with FMIN (upper bound).
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        LDR     x0, [sp, 96]            // Load cn_stride
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        // Flags from this SUBS feed both B.LO 8f and the later B.HI 0b; no
        // instruction in between sets flags.
        SUBS    x1, x1, 8
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    8f                      // fewer than 8 columns left: partial store

        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21, [x6]
        ADD     x6, x6, x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

5:
        // k remainder.  Reached either directly from the ks loop when
        // kc < 32 bytes, or from the epilogue when kc & 31 != 0.  Bits 4, 3
        // and 2 of x0 select the 16-, 8- and 4-byte steps below.
        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        B       4b

        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so
        // they still give the number of remaining columns (4 + 2 + 1).  After
        // each partial store the upper part of the row is moved down so the
        // next step always stores from the low end of v20/v22/.../v30.
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif