// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

# Overview (derived from the code below):
#   For every group of 12 output columns, the 4x12 accumulator tile is loaded
#   from acc (x15), products of A lanes and B vectors are accumulated with
#   FMLA, the tile is clamped to [v4, v5] and stored to the four C row
#   pointers.  kc is a byte count (16 bytes = 4 floats of A per row).
#   w (x5) supplies 12 floats (48 bytes) of B per float of A consumed.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# x8 temporary vector shadow register
#    (64-bit GPR load followed by INS into the upper half of a vector)

# Vector register usage and GPR shadows
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5      (v4 is the FMAX operand = lower bound, v5 is the FMIN operand = upper bound)
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        # (read before sp is moved by the register save below)
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one group of 12 output columns per iteration.
0:
        # Load initial accumulators
        # (4 rows x 12 floats, x15 advances by 192 bytes per iteration)
        LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
        LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
        LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
        LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
        PRFM PLDL1KEEP, [x3, 0]     // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f

        # Flags from this SUBS are consumed by B.LO 2f after the prologue
        # loads; none of the loads in between modify the flags.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8             // a0
        LDR d1, [x12], 8            // a2
        LD1 {v0.d}[1], [x11], 8     // a1
        LD1 {v1.d}[1], [x4], 8      // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8             // upper half of v11, inserted in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 1 block (4 cycles) after the LDR x8 that feeds it

        # BLOCK 0
        LDR d2, [x3], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]   // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8            // a2
        INS v2.d[1], x8             // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]  // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]               // vb0x0123
        INS v3.d[1], x8             // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]  // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]           // vb0x4567
        INS v14.d[1], x8            // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]   // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]           // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]   // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]           // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]   // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]           // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]   // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]           // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8             // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8            // a2
        INS v0.d[1], x8             // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]            // vb0x0123
        INS v1.d[1], x8             // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]           // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]           // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]           // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]          // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16             // k -= 16; flags consumed by B.HS 1b
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]          // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]           // upper half of v11 for the next BLOCK 0
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8            // a2
        INS v2.d[1], x8             // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]               // vb0x0123
        INS v3.d[1], x8             // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]           // vb0x4567
        INS v14.d[1], x8            // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]           // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]           // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]           // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]           // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads in this group; only the pending
        # INS that completes v19 from x8.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15                  // low 4 bits of k: leftover bytes of A

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96              // skip the 96 bytes of B read via [x5, 0..88] above
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # (flags from TST above; FMLA and ADD do not modify them)
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v4.4s
        # nc -= 12.  The flags are used by B.LO 6f and again by B.HI 0b;
        # nothing in between (FMAX/FMIN/ST1/SUB) modifies them.
        SUBS x1, x1, 12
        FMAX v21.4s, v21.4s, v4.4s
        FMAX v22.4s, v22.4s, v4.4s
        FMAX v23.4s, v23.4s, v4.4s
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v20.4s, v20.4s, v5.4s
        FMIN v21.4s, v21.4s, v5.4s
        FMIN v22.4s, v22.4s, v5.4s
        FMIN v23.4s, v23.4s, v5.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # (fewer than 12 columns left: take the partial store path instead)
        B.LO 6f

        ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
        SUB x3, x3, x2              // a0 -= kc
        ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
        SUB x11, x11, x2            // a1 -= kc
        ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
        SUB x12, x12, x2            // a2 -= kc
        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x4, x4, x2              // a3 -= kc

        # More columns remaining (nc > 0 after the subtraction)
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Only bits 3 and 2 of x0 are examined here; they equal those of kc
        # because x0 differs from kc by a multiple of 16.
        TBZ x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8             // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8            // a1
        LDR d2, [x12], 8            // a2
        LDR d3, [x4], 8             // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # No single trailing float: go clamp and store
        TBZ x0, 2, 3b
5:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4             // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4            // a1
        LDR s2, [x12], 4            // a2
        LDR s3, [x4], 4             // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 3b

6:
        # Undo the SUBS at label 3: x1 is again the number of columns left (< 12)
        ADD x1, x1, 12
        # Store odd channels
        # Bit 3 of nc: store 8 floats per row, then move the third vector down
        TBZ x1, 3, 7f
        STP q29, q30, [x7], 32
        MOV v29.16b, v31.16b
        STP q26, q27, [x10], 32
        MOV v26.16b, v28.16b
        STP q23, q24, [x9], 32
        MOV v23.16b, v25.16b
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

7:
        # Bit 2 of nc: store 4 floats per row, then move the second vector down
        TBZ x1, 2, 8f
        STR q29, [x7], 16
        MOV v29.16b, v30.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q23, [x9], 16
        MOV v23.16b, v24.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

8:
        # Bit 1 of nc: store 2 floats per row, then move the upper half down
        TBZ x1, 1, 9f
        STR d29, [x7], 8
        DUP d29, v29.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d23, [x9], 8
        DUP d23, v23.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        # Bit 0 of nc: store the last float per row
        TBZ x1, 0, 10f
        STR s29, [x7]
        STR s26, [x10]
        STR s23, [x9]
        STR s20, [x6]
10:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif