// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Overview (derived from the code in this file):
//   Template for an f32 indirect-GEMM micro-kernel with min/max clamping that
//   produces a tile of up to 6 rows by 8 columns per pass of the nc loop.
//   - The 12 accumulators v20-v31 (two q registers per row) are initialised
//     from the first 32 bytes at w, then updated with FMLA by-element using
//     A values (v0-v11) and B values (v12-v19) streamed from w.
//   - Each pass of the ks loop reads 6 row pointers from a. A pointer equal
//     to zero is used unchanged; any other pointer has a_offset added.
//   - The k loop consumes 32 bytes of every A row per iteration; remainders
//     of 16, 8 and 4 bytes are handled separately.
//   - Results are clamped with FMAX against v6 and FMIN against v7 (both
//     loaded from params with LD2R) and stored through c0-c5.
//   - When the template variable PREFETCH is set, PRFM hints on w are emitted
//     and the function name gets the _prfm suffix.

# LINT.IfChange
# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float** a,                   x4
#     const void* w,                     x5
#     uint8_t* c,                        x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
# x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
# x7 c5

# Vector register usage
# A0 v0 v6
# A1 v1 v7
# A2 v2 v8
# A3 v3 v9
# A4 v4 v10
# A5 v5 v11
# B v12 v13 v14 v15
# B v16 v17 v18 v19
# C v20 v21
# C v22 v23
# C v24 v25
# C v26 v27
# C v28 v29
# C v30 v31
# Clamp v6 v7

// Register reuse worth knowing when reading the body:
//   x0  holds mr on entry, then the remaining-k byte counter, then cn_stride.
//   x7  holds cm_stride on entry and becomes the c5 pointer.
//   v6/v7 hold A data inside the main loop and epilogue and are reloaded with
//         the clamp values before every path that reaches label 4.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75

        // Frame layout after the first STP below (96 bytes, offsets from sp):
        //   [sp,  0] d8,  d9      [sp, 16] d10, d11
        //   [sp, 32] d12, d13     [sp, 48] d14, d15
        //   [sp, 64] x20, x21     [sp, 80] x22, x23
        // The incoming stack arguments are therefore read at sp + 96 and up:
        //   [sp, 96] cn_stride   [sp, 104] a_offset
        //   [sp, 112] zero       [sp, 120] params
        //
        // Each CMP feeds two CSELs (LO, then LS); the ADD and STP instructions
        // scheduled in between do not modify the condition flags.

        # Clamp C pointers / Save d8-d15 on stack
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -96]!
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        STP     d10, d11, [sp, 16]
        CSEL    x16, x6, x16, LO        //   c1 = c0
        STP     d12, d13, [sp, 32]

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d14, d15, [sp, 48]
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        STP     x20, x21, [sp, 64]

        CMP     x0, 4                   // if mr < 4
        STP     x22, x23, [sp, 80]
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       //   c3 = c2

        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x10, x13, LS       //   c4 = c3

        # Load zero, params pointer
        LDP     x12, x8, [sp, 112]

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        LDR     x11, [sp, 104]          // Load a_offset
        CSEL    x7, x13, x7, LO         //   c5 = c4

        // Label 0: top of the nc loop, entered once per group of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 64]
        MOV     v25.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        MOV     v26.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]
        MOV     v27.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        MOV     v28.16b, v20.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 320]
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

        // Label 1: top of the ks loop. Each pass consumes 6 pointers (48 bytes)
        // from the indirection array at x4.
1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // For every row: the sum is computed unconditionally and CSEL then
        // keeps the zero pointer (x12) when the loaded pointer compared equal
        // to it, so the zero buffer is never offset by a_offset.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else += a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else += a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else += a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else += a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else += a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else += a5 + a_offset

        // From here on x0 no longer holds mr; it is the remaining-k counter in
        // bytes. It is only ever reduced by multiples of 32, so bits 4..2 of
        // x0 always equal bits 4..2 of kc; the remainder code at labels 5, 6
        // and 7 relies on this.

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 96 FMA
        LDR     q0, [x14], 16
        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # 64 float weights = 256 bytes. 4 cache lines.
        // NOTE: counting the instructions in the loop body below gives
        // 96 FMLA, 12 LDR of A (q6-q11, then q0-q5) and 8 LDP of B.
        // The loop is software pipelined: the first group uses v0-v5 while
        // loading v6-v11, the second group uses v6-v11 while loading v0-v5 and
        // the first three B pairs for the next iteration.
2:
        # First group of 4 A. 48 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 384]
        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 448]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // Load next 6 A
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A. 48 FMA.
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        LDR     q0, [x14], 16           // Load next 6 A
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        LDR     q1, [x15], 16
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        LDR     q2, [x20], 16
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]
        LDR     q3, [x21], 16

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        LDR     q4, [x22], 16
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        LDR     q5, [x23], 16
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        LDP     q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32              // flags consumed by B.HS below; FMLA leaves them intact
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]
        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop. Second block has no preloads.
        // NOTE: counting the instructions in the epilogue below gives
        // 96 FMLA, 6 LDR of A (q6-q11) and 5 LDP of B.
3:
        # First group of 4 A. 48 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 384]
        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 448]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // Load next 6 A
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A. 48 FMA.
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]


        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]

        // The flags set by TST are consumed by B.NE seven instructions later;
        // FMLA and LD2R do not modify them. The LD2R that overwrites v6/v7
        // with the clamp values is placed after the last FMLA that reads
        // v6 and v7 as A data (the remaining FMLAs read v9-v11).

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST     x0, 31

        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]
        FMLA    v25.4s, v19.4s, v8.s[3]
        LD2R    {v6.4s, v7.4s}, [x8]    // Load min/max values
        FMLA    v27.4s, v19.4s, v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

        // Label 4: end of one ks pass. Reached by fall-through from the
        // epilogue or by a backward branch from the remainder code at label 7.
4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.HI    1b

        // v6 is the FMAX operand and v7 the FMIN operand for all 12 accumulators.
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        LDR     x0, [sp, 96]            // Load cn_stride
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed both B.LO 8f and B.HI 0b below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    8f

        // NOTE(review): rows are written from c5 down to c0. Presumably this
        // is so that row 0 is written last when the clamped row pointers
        // alias each other (mr < 6) - confirm against the reference kernel.
        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21, [x6]
        ADD     x6, x6, x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

        // Label 5: remainder of k. Entered either directly when kc < 32 bytes
        // (nothing preloaded yet) or from the epilogue when kc is not a
        // multiple of 32 bytes. Bits 4, 3 and 2 of x0 select the 16, 8 and
        // 4 byte steps; A and B are loaded here without software pipelining.
5:
        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        B       4b

        // Label 8: fewer than 8 columns remain. x1 already had 8 subtracted,
        // which leaves its low three bits unchanged, so bits 2, 1 and 0 select
        // stores of 4, 2 and 1 floats per row. After each partial store the
        // not-yet-stored lanes are moved down into the low part of the even
        // numbered accumulator so the next step can store from it again.
        # Store odd width
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20, [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
# LINT.ThenChange(6x8-aarch64-neonfma-cortex-a75.cc,upto6x8-aarch64-neonfma-cortex-a75.cc)

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif