// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Summary (from the code below): each pass of the outer loop at label 0
# produces one tile of up to 4 rows by 12 floats of C. The 12 accumulators per
# row start from 12 floats read from w; then, for every float of A in a row,
# 12 further floats streamed from w are scaled by it and accumulated. The
# result is clamped to the range [v4, v5] and stored.
# kc is consumed as a byte count (16 bytes = 4 floats per main-loop pass);
# nc is counted in floats, 12 per tile.
# Rows at or beyond mr alias the previous row, so they read and write the same
# addresses as that row.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# x8 temporary vector shadow register
#    (first holds the params pointer; afterwards it carries the upper 64 bits
#     of a vector between a 64-bit LDR and the INS that completes the vector)

# Vector register usage and GPR shadows
# vN[1] denotes the upper 64 bits of vN.
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.
#
# The A layout above applies to the prologue, main loop and epilogue. The
# remainder code at labels 4 and 5 keeps one row per register instead
# (a0..a3 in v0..v3).

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        # (read at the incoming sp, before the STP below moves sp)
        LDP x14, x8, [sp]

        # Load min/max values
        # v4 is the operand of every FMAX and v5 of every FMIN at label 3.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        # v8-v11, v14 and v15 are overwritten below as B registers.
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride (x4 stops being a_stride)
        ADD x7, x10, x7          // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # Row 0 (v20-v22) is loaded; rows 1-3 are copies of it. The copies are
        # interleaved with prefetches of the A rows and of w.
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV v23.16b, v20.16b
        PRFM PLDL1KEEP, [x3, 0]        // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v24.16b, v21.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v22.16b
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]        // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v28.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        MOV v29.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v30.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v31.16b, v22.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f                  // fewer than 4 floats: remainder code only

        # x0 = kc - 32. The flags set here are consumed by B.LO 2f below; the
        # loads in between do not change them.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8          // a0
        LDR d1, [x12], 8         // a2
        LD1 {v0.d}[1], [x11], 8  // a1
        LD1 {v1.d}[1], [x4], 8   // a3

        # Read 96 bytes of B into v6-v11. The upper half of v11 is left in x8
        # and inserted by the first INS of the main loop or epilogue.
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8

        # Is there at least 4 floats (16 bytes) for main loop?
        # LO here means fewer than 8 floats in total, so only the epilogue runs.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load
        # NOTE(review): as written, each LDR into x8 is consumed by the INS in
        # the following block (one block later); the 4-block figure above does
        # not match that - confirm against the template.

        # BLOCK 0
        LDR d2, [x3], 8          // a0
        INS v11.d[1], x8         // completes v11 (x8 from prologue or previous pass)
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8         // a2
        INS v2.d[1], x8          // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]            // vb0x0123
        INS v3.d[1], x8          // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]        // vb0x4567
        INS v14.d[1], x8         // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]        // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]      // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
        # A is loaded for 1st group into v0/v1
        # B for the next first group is loaded into v6-v11 from x5 + 96..184.

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8         // a2
        INS v0.d[1], x8          // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]         // vb0x0123
        INS v1.d[1], x8          // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]        // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]        // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]        // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]       // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16          // k -= 16; flags consumed by B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]       // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]        // upper half of v11, inserted by the next BLOCK 0
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192          // 2 groups x 96 bytes of B consumed per pass
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop. Second block has no loads.
        # The first group here also omits the PRFM instructions of the main loop.
2:
        # BLOCK 0
        LDR d2, [x3], 8          // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8         // a2
        INS v2.d[1], x8          // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]            // vb0x0123
        INS v3.d[1], x8          // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]        // vb0x4567
        INS v14.d[1], x8         // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]        // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma. No loads in this group: A (v2/v3) and
        # B (v14-v19) were loaded by the first group above. Only the pending
        # upper half of v19 is inserted from x8.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15               // low 4 bits of k = leftover bytes of A per row

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96           // only the first group loaded B (x5 + 0..88)
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # (flags from TST above; the FMLA and ADD in between leave them alone)
        B.NE 4f

3:
        # Clamp
        # FMAX against v4 first, then FMIN against v5.
        FMAX v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12          // nc -= 12; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v4.4s
        FMAX v22.4s, v22.4s, v4.4s
        FMAX v23.4s, v23.4s, v4.4s
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v20.4s, v20.4s, v5.4s
        FMIN v21.4s, v21.4s, v5.4s
        FMIN v22.4s, v22.4s, v5.4s
        FMIN v23.4s, v23.4s, v5.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        B.LO 6f                  // fewer than 12 columns left: partial store

        # Each C pointer advances by cn_stride (x14); each A pointer is rewound
        # by kc so the next tile reads the same rows of A again.
        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
        SUB x11, x11, x2         // a1 -= kc
        ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
        SUB x12, x12, x2         // a2 -= kc
        ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
        SUB x4, x4, x2           // a3 -= kc

        B.HI 0b                  // more columns remain (flags from SUBS x1 above)

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bits 3 and 2 of x0 equal bits 3 and 2 of kc, because only multiples
        # of 16 have been subtracted from it.
        TBZ x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        # One row per register here: v0 = a0, v1 = a1, v2 = a2, v3 = a3.
        LDR d0, [x3], 8          // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8         // a1
        LDR d2, [x12], 8         // a2
        LDR d3, [x4], 8          // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        TBZ x0, 2, 3b            // no single float left: go clamp and store
5:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4          // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4         // a1
        LDR s2, [x12], 4         // a2
        LDR s3, [x4], 4          // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 3b

6:
        # Partial tile: undo the SUBS at label 3 to recover the number of
        # columns left (fewer than 12), then store 8, 4, 2 and 1 floats per row
        # according to bits 3..0 of that count.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 7f
        # After each store the next unstored data is moved down into
        # v20/v23/v26/v29 so the smaller stores below always use those.
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b
        STP q23, q24, [x9], 32
        MOV v23.16b, v25.16b
        STP q26, q27, [x10], 32
        MOV v26.16b, v28.16b
        STP q29, q30, [x7], 32
        MOV v29.16b, v31.16b

7:
        # With fewer than 12 columns, bits 3 and 2 are never both set, so
        # v21/v24/v27/v30 still hold unstored data whenever this path runs.
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q23, [x9], 16
        MOV v23.16b, v24.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q29, [x7], 16
        MOV v29.16b, v30.16b

8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]        // move the upper two floats down for label 9
        STR d23, [x9], 8
        DUP d23, v23.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d29, [x7], 8
        DUP d29, v29.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s23, [x9]
        STR s26, [x10]
        STR s29, [x7]
10:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif