// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t* a,         x3
#     size_t a_stride,          x4
#     const void* w,            x5
#     uint8_t* c,               x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const xnn_f32_minmax_params params  [sp + 8] -> x8

# Summary
#   For each group of 8 output columns the kernel seeds 6 rows of
#   accumulators from the first 8 floats at w (the bias), accumulates
#   A * B with FMLA over kc bytes of every A row, clamps the result with
#   FMAX against v6 and FMIN against v7 (two consecutive floats broadcast
#   from the params pointer), and stores up to 6 rows of 8 floats.
#   kc is a byte count: the loops consume 32 bytes (8 floats) of each A row
#   per iteration, and the A pointers are rewound by kc after every tile.
#
# Register x0 is reused three times:
#   - on entry it holds mr and drives the A/C pointer clamping,
#   - inside each tile it is the remaining-k byte counter,
#   - from the clamp stage until the next tile it holds cn_stride.
#
# Stack layout after the 64-byte pre-decrement that saves d8-d15:
#   [sp,  0..63]  saved d8-d15
#   [sp, 64]      cn_stride   (caller [sp])
#   [sp, 72]      params      (caller [sp + 8])

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at or beyond mr alias the previous row, so they recompute and
        # rewrite the same output instead of touching memory past row mr - 1.
        CMP x0, 2                // if mr < 2
        STP d8, d9, [sp, -64]!
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load params pointer (caller [sp + 8], now 64 bytes higher)
        LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 no longer holds a_stride)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all 6 rows.
        LDP q20, q21, [x5], 32
        SUBS x0, x2, 32          // k = kc - 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b
        B.LO 4f                  // fewer than 8 floats of k: remainder only

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # 64 float weights = 256 bytes.  4 cache lines.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        # Each B pair below is reloaded only after its last use in this group.
        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32          // FMLA leaves the flags alone until B.HS
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only multiples of 32 were subtracted from x0, so its low 5 bits
        # still equal those of kc.
        TST x0, 31

        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        // v6 and v7 are dead as A operands from here on and become the clamp bounds.
        LD2R {v6.4s, v7.4s}, [x8]   // Load min/max values
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        LDR x0, [sp, 64]            // Load cn_stride
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8              // nc -= 8; flags feed B.LO 7f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8 (fewer than 8 columns left: partial store at 7)
        B.LO 7f

        # Each row is stored, its C pointer advanced by cn_stride, and its
        # A pointer rewound by kc so the next column tile rereads the same A.
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x3, x3, x2              // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x9, x9, x2              // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x10, x10, x2            // a2 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x11, x11, x2            // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x12, x12, x2            // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x4, x4, x2              // a5 -= kc

        // Flags are still those of SUBS x1, x1, 8: loop while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load min/max values
        # Required when arriving from the kc < 32 branch at label 0, where
        # v6/v7 have not been loaded yet.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-float store per row;
        # after each store the not-yet-written lanes move to the low end.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif