// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Overview (derived from the code below):
#   Each pass through label 0 produces one tile of 6 rows x 8 columns of C.
#   The 12 accumulators are seeded with 8 floats read from w; then, for every
#   float of A (kc bytes per row), 8 floats of B are read from w and
#   multiply-accumulated against that A element.  The results are clamped
#   against the two floats found at params and stored.  After a full-width
#   store the C pointers advance by cn_stride, the A pointers rewind by kc,
#   and another pass runs while more than 8 columns remained.
#
#   Rows with index >= mr reuse the A and C pointers of the previous row, so
#   all 6 rows are always computed and stored; the surplus rows simply hit the
#   same addresses as the last valid row.
#
#   x0 is recycled: mr on entry, then the remaining-k byte counter inside a
#   pass, then cn_stride once the clamp section (label 3) reloads it.
#
#   NOTE(review): the remainder handling only tests bits 4, 3 and 2 of the k
#   counter, so kc is presumably a multiple of sizeof(float) - confirm with
#   the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Load params pointer
        # (must happen before the pre-indexed STP below moves sp)
        LDR     x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # The register saves are interleaved with the pointer setup; the
        # 64-byte frame holds d8-d15 at sp+0 .. sp+63.
        STP     d8, d9, [sp, -64]!
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d10, d11, [sp, 16]
        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d12, d13, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        STP     d14, d15, [sp, 48]
        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # All six rows start from the same 8 floats (v20/v21 copied to the
        # other row accumulators); prefetches are interleaved with the copies.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x3]         // Prefetch A
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x9]
        MOV     v28.16b, v20.16b
        PRFM    PLDL1KEEP, [x10]
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x11]
        MOV     v30.16b, v20.16b
        PRFM    PLDL1KEEP, [x12]
        MOV     v31.16b, v21.16b
        PRFM    PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP     q0, q6, [x3], 32
        LDP     q1, q7, [x9], 32
        LDP     q2, q8, [x10], 32
        LDP     q3, q9, [x11], 32
        LDP     q4, q10, [x12], 32
        # load first set of B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x4], 32
        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP     q16, q17, [x5], 32
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        # From here the FMAs are grouped per row so that each row's second A
        # register (v6..v10) is fully consumed before the LDP that reloads it.
        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v20.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v21.4s, v19.4s, v6.s[3]
        LDP     q0, q6, [x3], 32
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v23.4s, v19.4s, v7.s[3]
        LDP     q1, q7, [x9], 32
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v25.4s, v19.4s, v8.s[3]
        LDP     q2, q8, [x10], 32
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v27.4s, v19.4s, v9.s[3]
        LDP     q3, q9, [x11], 32
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v29.4s, v19.4s, v10.s[3]
        LDP     q4, q10, [x12], 32
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32
        FMLA    v31.4s, v17.4s, v11.s[2]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x4], 32
        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  No A Loads, No last B load

        LDP     q16, q17, [x5], 32
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # This overwrites v6/v7, which is safe only because the two FMAs just
        # above were the last readers of the A0/A1 data held in them.
        LD2R    {v6.4s, v7.4s}, [x8]

        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only multiples of 32 were subtracted from x0, so its low 5 bits
        # still equal kc mod 32.
        TST     x0, 31
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    4f

        .p2align 3

        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
        # The flags set by SUBS x1 below stay live until B.LO / B.HI: nothing
        # in between writes the condition flags.
3:
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # (the stack argument sits at sp + 64 now that d8-d15 were pushed)
        LDR     x0, [sp, 64]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # (fewer than 8 columns left -> partial store at label 7)
        B.LO    7f

        STP     q20, q21, [x6]
        ADD     x6, x6, x0
        SUB     x3, x3, x2              // a0 -= kc
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        SUB     x9, x9, x2              // a1 -= kc
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        SUB     x10, x10, x2            // a2 -= kc
        STP     q26, q27, [x14]
        ADD     x14, x14, x0
        SUB     x11, x11, x2            // a3 -= kc
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        SUB     x12, x12, x2            // a4 -= kc
        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        SUB     x4, x4, x2              // a5 -= kc

        NOP
        B.HI    0b                      // more columns remain -> next tile

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        .p2align 3
4:
        # Load min/max values
        # (repeated here because this label is also entered straight from
        # label 0 when kc < 32, before the epilogue's load has run)
        LD2R    {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x3], 16
        LDR     q1, [x9], 16
        LDR     q2, [x10], 16
        LDR     q3, [x11], 16
        LDR     q4, [x12], 16
        LDR     q5, [x4], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x3], 8
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x4], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ     x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x3], 4
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x4], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        B       3b

        .p2align 3

        # Store odd width
        # x1 had 8 subtracted, which leaves its low 3 bits unchanged, so bits
        # 2, 1 and 0 select a 4-, 2- and 1-column store.  After each partial
        # store the not-yet-written columns are shifted down into the low
        # lanes of the even-numbered accumulator.
7:
        TBZ     x1, 2, 8f
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
8:
        TBZ     x1, 1, 9f
        STR     d20, [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s20, [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x14]
        STR     s28, [x13]
        STR     s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif