// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8)

// F32 indirect GEMM micro-kernel producing a tile of up to 6 rows by 8 columns
// with the result clamped between the two values read from params.
//
// Structure of the code below:
//   0:  nc loop. Accumulators v20-v31 are initialised from the first 8 floats
//       of w, so every row starts from the same bias.
//   1:  ks loop. Six A row pointers are read from the indirection buffer a.
//       A pointer equal to zero is used as is, any other pointer gets
//       a_offset added.
//   2:  main loop over kc, 32 bytes (8 floats) of each A row per iteration,
//       software pipelined with a prologue (before 2:) and an epilogue (3:).
//   5:, 6:, 7:  remainders of 16, 8 and 4 bytes of kc.
//   4:  end of one ks step, then clamp and store.
//   8:-10:  partial store when fewer than 8 columns remain.
//
// kc is counted in bytes of A, ks in bytes of the pointer array (48 bytes,
// i.e. 6 pointers, are consumed per ks step).
//
// x0 is reused: mr on entry, then the kc counter k, then cn_stride.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// Stack frame after the 96 byte push performed on entry:
//   [sp,  0]  d8,  d9
//   [sp, 16]  d10, d11
//   [sp, 32]  d12, d13
//   [sp, 48]  d14, d15
//   [sp, 64]  x20, x21
//   [sp, 80]  x22, x23
//   [sp, 96]  cn_stride (first stack argument of the caller)

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

// v6 and v7 serve two purposes: inside the pipelined main loop and epilogue
// they hold the second 4 floats of A0 and A1, everywhere else they hold the
// clamp values. The epilogue reloads them from params before leaving.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

        // The stack arguments are read once, before sp is moved, so the
        // offsets match the register map in the header above.

        # Load a_offset
        LDR x11, [sp, 8]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Clamp C pointers
        // Rows at index mr and above alias the previous row, so their stores
        // land on memory that a valid row writes as well.
        // The register saves are interleaved with the pointer setup.
        STP d8, d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // Two consecutive floats at params, each broadcast to all lanes:
        // v6 is the FMAX operand and v7 the FMIN operand of the clamp below.
        LD2R {v6.4s, v7.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b}, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3               // p = ks

1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        // Each CMP tests the pointer as loaded, the ADD that follows does not
        // touch the flags, and the CSEL then picks zero or pointer + a_offset.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 + a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 + a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 + a_offset

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x14], 32
        LDP q1, q7, [x15], 32
        LDP q2, q8, [x20], 32
        LDP q3, q9, [x21], 32
        LDP q4, q10, [x22], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
2:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x23], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. Loads A0 - A4

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        // Lanes 2 and 3 are processed row by row here so that each A register
        // pair can be reloaded for the next iteration right after the last
        // FMLA that reads its second half.
        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x14], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x15], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x20], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x21], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x22], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop. Second block has no preloads.
3:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x23], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        // The two FMLA above were the last readers of v6 and v7 as A data, so
        // the clamp values can be restored while the remaining FMLA issue.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        TST x0, 31               // any kc remainder below 32 bytes?
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

        .p2align 3
4:
        # ks loop
        SUBS x9, x9, 48          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 96]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8, flags used by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                  // fewer than 8 columns left: partial store

        STP q30, q31, [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21, [x6]
        ADD x6, x6, x0

        SUB x4, x4, x3           // a -= ks

        # nc loop
        // Nothing since the SUBS on x1 has written the flags.
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

        .p2align 3
5:
        // Only multiples of 32 have been subtracted from x0, so its bits 4, 3
        // and 2 still equal those of kc and select the 16, 8 and 4 byte
        // remainders. v6 and v7 hold the clamp values on every path here.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 4b

        # Store odd width
        // x1 is nc - 8 here, so its bits 2, 1 and 0 equal those of the
        // remaining column count and select the 4, 2 and 1 column stores.
        // After each partial store the unwritten columns are moved down to
        // the low lanes of the even numbered accumulator.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
