// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55(
#     size_t channels,                   x0, x20
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3, x19
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               x7
#     const float* zero,                 [sp + 64] -> x17
#     const xnn_f32_minmax_params params [sp + 72] -> (x16)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# inputs
# i0  x8
# i1  x9
# i2  x10
# i3  x11
# i4  x12
# i5  x13
# i6  x14
# i7  x15
# i8  x16

# weights.  Bias and 9 weights.
# x19

# accumulators
# v0-v3

# Input and weight paired values.
# Inputs are even and weights are odd registers
# v4  v5
# v6  v7
# v10 v11
# v12 v13
# v14 v15
# v16 v17
# v18 v19
# v20 v21
# v22 v23
# v24 v25
# v26 v27
# v28 v29

# Clamp v30 v31

# unused v8 v9
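
# Roughly equivalent scalar C (illustrative sketch only; input_stride,
# output_increment and input_offset are byte offsets, and weights are
# packed per 4-channel group as bias[4], k0[4], ..., k8[4]):
#
#   const float vmin = ((const float*) params)[0];
#   const float vmax = ((const float*) params)[1];
#   do {
#     const float* i[9];
#     for (size_t k = 0; k < 9; k++) {
#       i[k] = (input[k] == zero) ? zero : (const float*) ((uintptr_t) input[k] + input_offset);
#     }
#     input = (const float**) ((uintptr_t) input + input_stride);
#     const float* w = weights;
#     for (size_t c = 0; c < channels; c += 4) {
#       float acc[4];
#       memcpy(acc, w, sizeof(acc));  w += 4;   // bias
#       for (size_t k = 0; k < 9; k++) {
#         for (size_t j = 0; j < 4; j++) acc[j] += i[k][c + j] * w[j];
#         w += 4;
#       }
#       for (size_t j = 0; j < 4 && c + j < channels; j++) {
#         *output++ = fminf(fmaxf(acc[j], vmin), vmax);
#       }
#     }
#     output = (float*) ((uintptr_t) output + output_increment);
#   } while (--output_width != 0);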

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55

        # Load zero, params pointer
        LDP x17, x16, [sp]

        # Save x19-x20, d10-d15 on stack
        STP x19, x20, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Load min/max values
        LD2R {v30.4s, v31.4s}, [x16]

0:
        # Load 9 input pointers
        LDP x8, x9, [x2]
        LDP x10, x11, [x2, 16]
        LDP x12, x13, [x2, 32]
        LDP x14, x15, [x2, 48]
        LDR x16, [x2, 64]

        CMP x8, x17                   // if i0 == zero
        ADD x8, x8, x7                // i0 += input_offset
        CSEL x8, x17, x8, EQ          //   i0 = zero, else i0 + input_offset
        CMP x9, x17                   // if i1 == zero
        ADD x9, x9, x7                // i1 += input_offset
        CSEL x9, x17, x9, EQ          //   i1 = zero, else i1 + input_offset
        CMP x10, x17                  // if i2 == zero
        ADD x10, x10, x7              // i2 += input_offset
        CSEL x10, x17, x10, EQ        //   i2 = zero, else i2 + input_offset
        CMP x11, x17                  // if i3 == zero
        ADD x11, x11, x7              // i3 += input_offset
        CSEL x11, x17, x11, EQ        //   i3 = zero, else i3 + input_offset
        CMP x12, x17                  // if i4 == zero
        ADD x12, x12, x7              // i4 += input_offset
        CSEL x12, x17, x12, EQ        //   i4 = zero, else i4 + input_offset
        CMP x13, x17                  // if i5 == zero
        ADD x13, x13, x7              // i5 += input_offset
        CSEL x13, x17, x13, EQ        //   i5 = zero, else i5 + input_offset
        CMP x14, x17                  // if i6 == zero
        ADD x14, x14, x7              // i6 += input_offset
        CSEL x14, x17, x14, EQ        //   i6 = zero, else i6 + input_offset
        CMP x15, x17                  // if i7 == zero
        ADD x15, x15, x7              // i7 += input_offset
        CSEL x15, x17, x15, EQ        //   i7 = zero, else i7 + input_offset
        CMP x16, x17                  // if i8 == zero
        ADD x16, x16, x7              // i8 += input_offset
        CSEL x16, x17, x16, EQ        //   i8 = zero, else i8 + input_offset

        # input += input_stride
        ADD x2, x2, x5

        # x20 := c = channels
        # c -= 8
        SUBS x20, x0, 8
        # x19 := w = weights
        MOV x19, x3

        # skip main loop if c < 8
        B.LO 3f
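
        # The main loop is software pipelined (SWP) for the in-order
        # Cortex-A55: each iteration retires 8 channels as two groups of 4,
        # pairing each 2S FMLA with a 64-bit load so the NEON pipe and the
        # load pipe stay busy.  The prologue below primes the pipeline; the
        # epilogue at label 2 drains the final in-flight group.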

        # SWP prologue

        # Load vbias.lo
        LD1 {v0.2S}, [x19], 8

        # Load vbias.hi
        LD1 {v1.2S}, [x19], 8

        # Load vi0.lo
        LD1 {v4.2S}, [x8], 8

        # Load vk0.lo
        LD1 {v5.2S}, [x19], 8

        # Load vi0.hi
        LD1 {v6.2S}, [x8], 8

        # Load vk0.hi
        LD1 {v7.2S}, [x19], 8

        # Load vi1.lo
        LD1 {v28.2S}, [x9], 8

        # Load vk1.lo
        LD1 {v29.2S}, [x19], 8

        # Load vi1.hi
        LD1 {v10.2S}, [x9], 8

        # Load vk1.hi
        LD1 {v11.2S}, [x19], 8

        # Load vi2.lo
        LD1 {v12.2S}, [x10], 8

        # Load vk2.lo
        LD1 {v13.2S}, [x19], 8

        # Load vi2.hi
        LD1 {v14.2S}, [x10], 8

        # Load vk2.hi
        LD1 {v15.2S}, [x19], 8

        # Load vi3.lo
        LD1 {v16.2S}, [x11], 8

        # Load vk3.lo
        LD1 {v17.2S}, [x19], 8

        # Load vi3.hi
        LD1 {v18.2S}, [x11], 8

        # Load vk3.hi
        LD1 {v19.2S}, [x19], 8

        # Load vi4.lo
        LD1 {v20.2S}, [x12], 8

        # Load vk4.lo
        LD1 {v21.2S}, [x19], 8

        # Load vi4.hi
        LD1 {v22.2S}, [x12], 8

        # Load vk4.hi
        LD1 {v23.2S}, [x19], 8

        # Load vi5.lo
        LD1 {v24.2S}, [x13], 8

        # Load vk5.lo
        LD1 {v25.2S}, [x19], 8

        # Load vi5.hi
        LD1 {v26.2S}, [x13], 8

        # Load vk5.hi
        LD1 {v27.2S}, [x19], 8

        # vacc.lo += vi0.lo * vk0.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi6.lo
        LD1 {v4.2S}, [x14], 8

        # Load vk6.lo
        LD1 {v5.2S}, [x19], 8

        # vacc.hi += vi0.hi * vk0.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi6.hi
        LD1 {v6.2S}, [x14], 8

        # Load vk6.hi
        LD1 {v7.2S}, [x19], 8

        # vacc.lo += vi1.lo * vk1.lo
        FMLA v0.2S, v28.2S, v29.2S
        # Load vi7.lo
        LD1 {v28.2S}, [x15], 8

        # Load vk7.lo
        LD1 {v29.2S}, [x19], 8

        # vacc.hi += vi1.hi * vk1.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi7.hi
        LD1 {v10.2S}, [x15], 8

        # Load vk7.hi
        LD1 {v11.2S}, [x19], 8

        # vacc.lo += vi2.lo * vk2.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi8.lo
        LD1 {v12.2S}, [x16], 8

        # Load vk8.lo
        LD1 {v13.2S}, [x19], 8

        # vacc.hi += vi2.hi * vk2.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi8.hi
        LD1 {v14.2S}, [x16], 8

        # Load vk8.hi
        LD1 {v15.2S}, [x19], 8

        # Load vbias_next.lo
        LD1 {v2.2S}, [x19], 8

        # Load vbias_next.hi
        LD1 {v3.2S}, [x19], 8

        # vacc.lo += vi3.lo * vk3.lo
        FMLA v0.2S, v16.2S, v17.2S
        # Load vi0_next.lo
        LD1 {v16.2S}, [x8], 8

        # Load vk0_next.lo
        LD1 {v17.2S}, [x19], 8

        # vacc.hi += vi3.hi * vk3.hi
        FMLA v1.2S, v18.2S, v19.2S
        # Load vi0_next.hi
        LD1 {v18.2S}, [x8], 8

        # Load vk0_next.hi
        LD1 {v19.2S}, [x19], 8

        # vacc.lo += vi4.lo * vk4.lo
        FMLA v0.2S, v20.2S, v21.2S
        # Load vi1_next.lo
        LD1 {v20.2S}, [x9], 8

        # Load vk1_next.lo
        LD1 {v21.2S}, [x19], 8

        # vacc.hi += vi4.hi * vk4.hi
        FMLA v1.2S, v22.2S, v23.2S
        # Load vi1_next.hi
        LD1 {v22.2S}, [x9], 8

        # Load vk1_next.hi
        LD1 {v23.2S}, [x19], 8

        # vacc.lo += vi5.lo * vk5.lo
        FMLA v0.2S, v24.2S, v25.2S
        # Load vi2_next.lo
        LD1 {v24.2S}, [x10], 8

        # Load vk2_next.lo
        LD1 {v25.2S}, [x19], 8

        # vacc.hi += vi5.hi * vk5.hi
        FMLA v1.2S, v26.2S, v27.2S
        # Load vi2_next.hi
        LD1 {v26.2S}, [x10], 8

        # Load vk2_next.hi
        LD1 {v27.2S}, [x19], 8

        # vacc.lo += vi6.lo * vk6.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi3_next.lo
        LD1 {v4.2S}, [x11], 8

        # Load vk3_next.lo
        LD1 {v5.2S}, [x19], 8

        # vacc.hi += vi6.hi * vk6.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi3_next.hi
        LD1 {v6.2S}, [x11], 8

        # Load vk3_next.hi
        LD1 {v7.2S}, [x19], 8

        # vacc.lo += vi7.lo * vk7.lo
        FMLA v0.2S, v28.2S, v29.2S
        # Load vi4_next.lo
        LD1 {v28.2S}, [x12], 8

        # Load vk4_next.lo
        LD1 {v29.2S}, [x19], 8

        # vacc.hi += vi7.hi * vk7.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi4_next.hi
        LD1 {v10.2S}, [x12], 8

        # Load vk4_next.hi
        LD1 {v11.2S}, [x19], 8

        # vacc.lo += vi8.lo * vk8.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi5_next.lo
        LD1 {v12.2S}, [x13], 8

        # Load vk5_next.lo
        LD1 {v13.2S}, [x19], 8

        # vacc.hi += vi8.hi * vk8.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi5_next.hi
        LD1 {v14.2S}, [x13], 8

        # Load vk5_next.hi
        LD1 {v15.2S}, [x19], 8

        # vacc_next.lo += vi0_next.lo * vk0_next.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi6_next.lo
        LD1 {v16.2S}, [x14], 8

        # vacc.lo = max(vacc.lo, vmin)
        FMAX v0.2S, v0.2S, v30.2S
        # Load vk6_next.lo
        LD1 {v17.2S}, [x19], 8

        # vacc_next.hi += vi0_next.hi * vk0_next.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi6_next.hi
        LD1 {v18.2S}, [x14], 8

        # vacc.hi = max(vacc.hi, vmin)
        FMAX v1.2S, v1.2S, v30.2S
        # Load vk6_next.hi
        LD1 {v19.2S}, [x19], 8

        # vacc_next.lo += vi1_next.lo * vk1_next.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi7_next.lo
        LD1 {v20.2S}, [x15], 8

        # vacc.lo = min(vacc.lo, vmax)
        FMIN v0.2S, v0.2S, v31.2S
        # Load vk7_next.lo
        LD1 {v21.2S}, [x19], 8

        # vacc_next.hi += vi1_next.hi * vk1_next.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi7_next.hi
        LD1 {v22.2S}, [x15], 8

        # vacc.hi = min(vacc.hi, vmax)
        FMIN v1.2S, v1.2S, v31.2S
        # Load vk7_next.hi
        LD1 {v23.2S}, [x19], 8

        # vacc_next.lo += vi2_next.lo * vk2_next.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi8_next.lo
        LD1 {v24.2S}, [x16], 8

        # Load vk8_next.lo
        LD1 {v25.2S}, [x19], 8

        # vacc_next.hi += vi2_next.hi * vk2_next.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi8_next.hi
        LD1 {v26.2S}, [x16], 8

        # Store vacc
        STP d0, d1, [x4], 16

        # c -= 8
        SUBS x20, x20, 8
        # Load vk8_next.hi
        LD1 {v27.2S}, [x19], 8

        B.LO 2f
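
        # Steady state: each pass retires 8 channels.  It drains the group
        # left in flight in v2/v3 (taps 3-8), computes a complete group in
        # v0/v1, and starts the next in-flight group in v2/v3 with its bias
        # and taps 0-2.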

1:
        # SWP iteration

        # Load vbias.lo
        LD1 {v0.2S}, [x19], 8

        # Load vbias.hi
        LD1 {v1.2S}, [x19], 8

        # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
        FMLA v2.2S, v4.2S, v5.2S
        # Load vi0.lo
        LD1 {v4.2S}, [x8], 8

        # Load vk0.lo
        LD1 {v5.2S}, [x19], 8

        # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
        FMLA v3.2S, v6.2S, v7.2S
        # Load vi0.hi
        LD1 {v6.2S}, [x8], 8

        # Load vk0.hi
        LD1 {v7.2S}, [x19], 8

        # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
        FMLA v2.2S, v28.2S, v29.2S
        # Load vi1.lo
        LD1 {v28.2S}, [x9], 8

        # Load vk1.lo
        LD1 {v29.2S}, [x19], 8

        # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
        FMLA v3.2S, v10.2S, v11.2S
        # Load vi1.hi
        LD1 {v10.2S}, [x9], 8

        # Load vk1.hi
        LD1 {v11.2S}, [x19], 8

        # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
        FMLA v2.2S, v12.2S, v13.2S
        # Load vi2.lo
        LD1 {v12.2S}, [x10], 8

        # Load vk2.lo
        LD1 {v13.2S}, [x19], 8

        # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
        FMLA v3.2S, v14.2S, v15.2S
        # Load vi2.hi
        LD1 {v14.2S}, [x10], 8

        # Load vk2.hi
        LD1 {v15.2S}, [x19], 8

        # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi3.lo
        LD1 {v16.2S}, [x11], 8

        # Load vk3.lo
        LD1 {v17.2S}, [x19], 8

        # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi3.hi
        LD1 {v18.2S}, [x11], 8

        # Load vk3.hi
        LD1 {v19.2S}, [x19], 8

        # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi4.lo
        LD1 {v20.2S}, [x12], 8

        # Load vk4.lo
        LD1 {v21.2S}, [x19], 8

        # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi4.hi
        LD1 {v22.2S}, [x12], 8

        # Load vk4.hi
        LD1 {v23.2S}, [x19], 8

        # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi5.lo
        LD1 {v24.2S}, [x13], 8

        # Load vk5.lo
        LD1 {v25.2S}, [x19], 8

        # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi5.hi
        LD1 {v26.2S}, [x13], 8

        # Load vk5.hi
        LD1 {v27.2S}, [x19], 8

        # vacc.lo += vi0.lo * vk0.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi6.lo
        LD1 {v4.2S}, [x14], 8

        # vacc_prev.lo = max(vacc_prev.lo, vmin)
        FMAX v2.2S, v2.2S, v30.2S
        # Load vk6.lo
        LD1 {v5.2S}, [x19], 8

        # vacc.hi += vi0.hi * vk0.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi6.hi
        LD1 {v6.2S}, [x14], 8

        # vacc_prev.hi = max(vacc_prev.hi, vmin)
        FMAX v3.2S, v3.2S, v30.2S
        # Load vk6.hi
        LD1 {v7.2S}, [x19], 8

        # vacc.lo += vi1.lo * vk1.lo
        FMLA v0.2S, v28.2S, v29.2S
        # Load vi7.lo
        LD1 {v28.2S}, [x15], 8

        # vacc_prev.lo = min(vacc_prev.lo, vmax)
        FMIN v2.2S, v2.2S, v31.2S
        # Load vk7.lo
        LD1 {v29.2S}, [x19], 8

        # vacc.hi += vi1.hi * vk1.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi7.hi
        LD1 {v10.2S}, [x15], 8

        # vacc_prev.hi = min(vacc_prev.hi, vmax)
        FMIN v3.2S, v3.2S, v31.2S
        # Load vk7.hi
        LD1 {v11.2S}, [x19], 8

        # vacc.lo += vi2.lo * vk2.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi8.lo
        LD1 {v12.2S}, [x16], 8

        # Load vk8.lo
        LD1 {v13.2S}, [x19], 8

        # vacc.hi += vi2.hi * vk2.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi8.hi
        LD1 {v14.2S}, [x16], 8

        # Store vacc_prev
        STP d2, d3, [x4], 16

        # Load vk8.hi
        LD1 {v15.2S}, [x19], 8

        # Load vbias_next.lo
        LD1 {v2.2S}, [x19], 8

        # Load vbias_next.hi
        LD1 {v3.2S}, [x19], 8

        # vacc.lo += vi3.lo * vk3.lo
        FMLA v0.2S, v16.2S, v17.2S
        # Load vi0_next.lo
        LD1 {v16.2S}, [x8], 8

        # Load vk0_next.lo
        LD1 {v17.2S}, [x19], 8

        # vacc.hi += vi3.hi * vk3.hi
        FMLA v1.2S, v18.2S, v19.2S
        # Load vi0_next.hi
        LD1 {v18.2S}, [x8], 8

        # Load vk0_next.hi
        LD1 {v19.2S}, [x19], 8

        # vacc.lo += vi4.lo * vk4.lo
        FMLA v0.2S, v20.2S, v21.2S
        # Load vi1_next.lo
        LD1 {v20.2S}, [x9], 8

        # Load vk1_next.lo
        LD1 {v21.2S}, [x19], 8

        # vacc.hi += vi4.hi * vk4.hi
        FMLA v1.2S, v22.2S, v23.2S
        # Load vi1_next.hi
        LD1 {v22.2S}, [x9], 8

        # Load vk1_next.hi
        LD1 {v23.2S}, [x19], 8

        # vacc.lo += vi5.lo * vk5.lo
        FMLA v0.2S, v24.2S, v25.2S
        # Load vi2_next.lo
        LD1 {v24.2S}, [x10], 8

        # Load vk2_next.lo
        LD1 {v25.2S}, [x19], 8

        # vacc.hi += vi5.hi * vk5.hi
        FMLA v1.2S, v26.2S, v27.2S
        # Load vi2_next.hi
        LD1 {v26.2S}, [x10], 8

        # Load vk2_next.hi
        LD1 {v27.2S}, [x19], 8

        # vacc.lo += vi6.lo * vk6.lo
        FMLA v0.2S, v4.2S, v5.2S
        # Load vi3_next.lo
        LD1 {v4.2S}, [x11], 8

        # Load vk3_next.lo
        LD1 {v5.2S}, [x19], 8

        # vacc.hi += vi6.hi * vk6.hi
        FMLA v1.2S, v6.2S, v7.2S
        # Load vi3_next.hi
        LD1 {v6.2S}, [x11], 8

        # Load vk3_next.hi
        LD1 {v7.2S}, [x19], 8

        # vacc.lo += vi7.lo * vk7.lo
        FMLA v0.2S, v28.2S, v29.2S
        # Load vi4_next.lo
        LD1 {v28.2S}, [x12], 8

        # Load vk4_next.lo
        LD1 {v29.2S}, [x19], 8

        # vacc.hi += vi7.hi * vk7.hi
        FMLA v1.2S, v10.2S, v11.2S
        # Load vi4_next.hi
        LD1 {v10.2S}, [x12], 8

        # Load vk4_next.hi
        LD1 {v11.2S}, [x19], 8

        # vacc.lo += vi8.lo * vk8.lo
        FMLA v0.2S, v12.2S, v13.2S
        # Load vi5_next.lo
        LD1 {v12.2S}, [x13], 8

        # Load vk5_next.lo
        LD1 {v13.2S}, [x19], 8

        # vacc.hi += vi8.hi * vk8.hi
        FMLA v1.2S, v14.2S, v15.2S
        # Load vi5_next.hi
        LD1 {v14.2S}, [x13], 8

        # Load vk5_next.hi
        LD1 {v15.2S}, [x19], 8

        # vacc_next.lo += vi0_next.lo * vk0_next.lo
        FMLA v2.2S, v16.2S, v17.2S
        # Load vi6_next.lo
        LD1 {v16.2S}, [x14], 8

        # vacc.lo = max(vacc.lo, vmin)
        FMAX v0.2S, v0.2S, v30.2S
        # Load vk6_next.lo
        LD1 {v17.2S}, [x19], 8

        # vacc_next.hi += vi0_next.hi * vk0_next.hi
        FMLA v3.2S, v18.2S, v19.2S
        # Load vi6_next.hi
        LD1 {v18.2S}, [x14], 8

        # vacc.hi = max(vacc.hi, vmin)
        FMAX v1.2S, v1.2S, v30.2S
        # Load vk6_next.hi
        LD1 {v19.2S}, [x19], 8

        # vacc_next.lo += vi1_next.lo * vk1_next.lo
        FMLA v2.2S, v20.2S, v21.2S
        # Load vi7_next.lo
        LD1 {v20.2S}, [x15], 8

        # vacc.lo = min(vacc.lo, vmax)
        FMIN v0.2S, v0.2S, v31.2S
        # Load vk7_next.lo
        LD1 {v21.2S}, [x19], 8

        # vacc_next.hi += vi1_next.hi * vk1_next.hi
        FMLA v3.2S, v22.2S, v23.2S
        # Load vi7_next.hi
        LD1 {v22.2S}, [x15], 8

        # vacc.hi = min(vacc.hi, vmax)
        FMIN v1.2S, v1.2S, v31.2S
        # Load vk7_next.hi
        LD1 {v23.2S}, [x19], 8

        # vacc_next.lo += vi2_next.lo * vk2_next.lo
        FMLA v2.2S, v24.2S, v25.2S
        # Load vi8_next.lo
        LD1 {v24.2S}, [x16], 8

        # Load vk8_next.lo
        LD1 {v25.2S}, [x19], 8

        # vacc_next.hi += vi2_next.hi * vk2_next.hi
        FMLA v3.2S, v26.2S, v27.2S
        # Load vi8_next.hi
        LD1 {v26.2S}, [x16], 8

        # Store vacc
        STP d0, d1, [x4], 16

        # c -= 8
        SUBS x20, x20, 8
        # Load vk8_next.hi
        LD1 {v27.2S}, [x19], 8

        B.HS 1b
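
        # Pipeline drain: the last group started above still needs taps 3-8
        # in v2/v3, so only the remaining FMLAs, the clamp, and the final
        # store are issued; no further loads.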

2:
        # SWP epilogue

        # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
        FMLA v2.2S, v4.2S, v5.2S

        # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
        FMLA v3.2S, v6.2S, v7.2S

        # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
        FMLA v2.2S, v28.2S, v29.2S

        # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
        FMLA v3.2S, v10.2S, v11.2S

        # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
        FMLA v2.2S, v12.2S, v13.2S

        # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
        FMLA v3.2S, v14.2S, v15.2S

        # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
        FMLA v2.2S, v16.2S, v17.2S

        # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
        FMLA v3.2S, v18.2S, v19.2S

        # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
        FMLA v2.2S, v20.2S, v21.2S

        # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
        FMLA v3.2S, v22.2S, v23.2S

        # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
        FMLA v2.2S, v24.2S, v25.2S

        # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
        FMLA v3.2S, v26.2S, v27.2S

        # vacc_prev.lo = max(vacc_prev.lo, vmin)
        FMAX v2.2S, v2.2S, v30.2S

        # vacc_prev.hi = max(vacc_prev.hi, vmin)
        FMAX v3.2S, v3.2S, v30.2S

        # vacc_prev.lo = min(vacc_prev.lo, vmax)
        FMIN v2.2S, v2.2S, v31.2S

        # vacc_prev.hi = min(vacc_prev.hi, vmax)
        FMIN v3.2S, v3.2S, v31.2S

        # Store vacc_prev
        STP d2, d3, [x4], 16
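
        # Remainder: bit 2 of c selects one more full block of 4 channels;
        # bits 1:0 select 1-3 trailing channels, which are computed with
        # full 4-wide vectors but stored lane by lane (a D store for 2
        # channels, then an S store for 1) so only valid channels are
        # written.  The low 3 bits of x20 still equal c mod 8 here, since
        # the loop only ever subtracted multiples of 8.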

3:
        # Is there a remainder? - 4 channels
        TBZ x20, 2, 4f

        LDR q10, [x8], 16             // load 9 inputs
        LDP q0, q1, [x19], 32         // load bias and 9 weights
        LDR q11, [x9], 16
        LDR q12, [x10], 16
        LDR q13, [x11], 16
        LDR q14, [x12], 16
        LDR q15, [x13], 16
        LDR q16, [x14], 16
        LDR q17, [x15], 16
        LDR q18, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q28, q29, [x19], 32

        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v28.4S, v17.4S
        FMLA v0.4S, v29.4S, v18.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        STR q0, [x4], 16

4:
        # Is there a remainder? - 1 to 3 channels
        TST x20, 3
        B.EQ 6f

        LDR q10, [x8], 16             // load 9 inputs
        LDP q0, q1, [x19], 32         // load bias and 9 weights
        LDR q11, [x9], 16
        LDR q12, [x10], 16
        LDR q13, [x11], 16
        LDR q14, [x12], 16
        LDR q15, [x13], 16
        LDR q16, [x14], 16
        LDR q17, [x15], 16
        LDR q18, [x16], 16
        LDP q2, q3, [x19], 32
        LDP q4, q5, [x19], 32
        LDP q6, q7, [x19], 32
        LDP q28, q29, [x19], 32

        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v28.4S, v17.4S
        FMLA v0.4S, v29.4S, v18.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        TBZ x20, 1, 5f

        STR d0, [x4], 8
        DUP d0, v0.D[1]
        TBZ x20, 0, 6f
5:
        STR s0, [x4], 4
6:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore x19-x20, d10-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP x19, x20, [sp], 64
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif