/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
    vmslt.vx v0, \vec_tmp1, zero
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
    vmmv.m v1, v0

    vmslt.vx v0, \vec_tmp2, zero
    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vsra.vx \vec1, \vec_tmp1, \shift
    vsra.vx \vec2, \vec_tmp2, \shift

    vrsub.vx \vec1, \vec1, \strength
    vrsub.vx \vec2, \vec2, \strength

    vmax.vx \vec1, \vec1, zero
    vmax.vx \vec2, \vec2, zero

    vmin.vv \vec_tmp1, \vec1, \vec_tmp1
    vmin.vv \vec_tmp2, \vec2, \vec_tmp2

    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vmmv.m v0, v1
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm

.macro padding_fn w, h
    li t5, -32768 # INT16_MIN

    andi t4, a7, 4
    li t2, -2 # y_start

.if \w == 4
    vsetivli zero, \w + 4, e16, m1, ta, ma
.else
    vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
    vmv.v.x v0, t5
    bnez t4, L(top_done_\w\()x\h)

    slli t5, a1, 1
    addi t5, t5, 2
    slli t5, t5, 1
    sub t5, a0, t5

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    li t2, 0

L(top_done_\w\()x\h):
    andi t4, a7, 8
    li t3, 2 + \h # y_end
    bnez t4, L(bottom_done_\w\()x\h)

    li t5, \h
    mul t5, a1, t5
    addi t5, t5, -2
    sh1add t5, t5, a0

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    addi t3, t3, -2

L(bottom_done_\w\()x\h):
    andi t4, a7, 1
    li t0, -2 # x_start

.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    bnez t4, L(left_done_\w\()x\h)

    mul t5, a1, t2
    addi t5, t5, -2
    sh1add t5, t5, a0

    sub t0, t3, t2

3:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t0, t0, -1
    bnez t0, 3b

L(left_done_\w\()x\h):

    andi t4, a7, 2
    li t1, 2 + \w # x_end
    bnez t4, L(right_done_\w\()x\h)

    mul t5, t2, a1
    addi t5, t5, \w
    sh1add t5, t5, a0

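    # No CDEF_HAVE_RIGHT edge: t5 now points at the first of the two padding
    # columns right of the block; the loop below floods them with INT16_MIN
    # for y_end - y_start rows, then x_end is reset to the block width.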
    sub t1, t3, t2

4:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t1, t1, -1
    bnez t1, 4b

    li t1, \w

L(right_done_\w\()x\h):

    beqz t2, L(top_skip_\w\()x\h)

    mul t5, a1, t2
    add t5, t0, t5
    sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
    add a5, a5, t0

    sub t5, t1, t0 # x_end - x_start
    slli t6, t0, 1
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

5:
    vle8.v v0, (a5)
    addi t2, t2, 1
    vzext.vf2 v2, v0
    add a5, a3, a5
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez t2, 5b

    sub a0, a0, t6 # tmp -= x_start

L(top_skip_\w\()x\h):

    li a5, \h
    beqz t0, L(left_skip_\w\()x\h)

    sh1add a0, t0, a0 # tmp += x_start

7:
.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    vle8.v v0, (a4)
    addi a5, a5, -1
    vzext.vf2 v2, v0
    addi a4, a4, 2
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez a5, 7b

    li a5, \h
    mul t5, a1, a5
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= h * tmp_stride + x_start

L(left_skip_\w\()x\h):

8:
.if \w == 4
    vsetvli zero, t1, e16, m1, ta, ma
.else
    vsetvli zero, t1, e16, m2, ta, ma
.endif

    vle8.v v0, (a2)
    vzext.vf2 v2, v0
    vse16.v v2, (a0)
    add a2, a3, a2
    sh1add a0, a1, a0
    addi a5, a5, -1
    bnez a5, 8b


    li a5, \h
    sh1add a0, t0, a0 # tmp += x_start
    add a6, a6, t0 # bottom += x_start
    beq a5, t3, L(bottom_skip_\w\()x\h)

    sub t5, t1, t0
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

9:
    vle8.v v0, (a6)
    add a6, a3, a6
    vzext.vf2 v2, v0
    addi a5, a5, 1
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bne a5, t3, 9b

L(bottom_skip_\w\()x\h):
    li t6, \h
    mul t6, a3, t6
    sub a2, a2, t6 # src -= h * src_stride
    mul t5, a1, t3
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm


.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb"
    csrw vxrm, zero

    addi sp, sp, -32 - 144*2
    sd a5, 24(sp) # pri_strength
    sd a6, 16(sp) # sec_strength
    sd a7, 8(sp) # dir


    ld a7, 8 + 32 + 144*2(sp) # edges
    mv a6, a4 # bottom
    mv a5, a3 # top
    mv a4, a2 # left
    mv a3, a1 # dst_stride
    mv a2, a0 # dst
    li a1, 12 # tmp_stride
    addi a0, sp, 32 + 2*(2*12+2)
    padding_fn \w, \h

    ld a4, 32 + 2*144(sp) # damping
    ld a5, 24(sp) # pri_strength
    ld a6, 16(sp) # sec_strength
    ld a7, 8(sp) # dir

    beqz a5, cdef_filter_sec_only_\w\()x\h

    bnez a6, cdef_filter_pri_sec_\w\()x\h

    andi t0, a5, 1
    li t1, 4
    sub t4, t1, t0

    li t1, 63
    clz t2, a5
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

    la t2, dav1d_cdef_directions
    addi t3, a7, 2
    sh1add t2, t3, t2

    blt zero, t1, 1f
    mv t1, zero
1:
    vsetivli zero, \w, e16, m1, ta, mu

    lb t3, 0(t2)

    vle8.v v0, (a2)
    vzext.vf2 v2, v0

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    lb t3, 1(t2)
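
    # k == 1 primary taps: t3 holds dav1d_cdef_directions[dir + 2][1]; the
    # tap weight drops from pri_tap to (pri_tap & 3) | 2, i.e. 4,2 for even
    # pri_strength and 3,3 for odd.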
    andi t5, t4, 3
    ori t5, t5, 2

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vsetvli zero, zero, e16, m1, ta, mu

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmacc.vx v28, t5, v16
    vmacc.vx v28, t5, v8

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 1b

    addi sp, sp, 32 + 144*2
    ret

cdef_filter_sec_only_\w\()x\h:
    li t1, 63
    clz t2, a6
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

    la t2, dav1d_cdef_directions
    addi t3, a7, 4
    sh1add t3, t3, t2
    sh1add t2, a7, t2

2:
    vsetivli zero, \w, e16, m1, ta, mu

    lb t4, 0(t3)
    lb t5, 0(t2)

    vle8.v v0, (a2)
    vzext.vf2 v2, v0

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t4, 2
    constrain_vectors v4, v6, v12, a6, t1, v12, v14
    constrain_vectors v8, v10, v14, a6, t1, v16, v18

    vmul.vx v28, v18, t4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v14
    vmacc.vx v28, t4, v12


    lb t4, 1(t3)
    lb t5, 1(t2)

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vsetvli zero, zero, e16, m1, ta, mu

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t1, v12, v14
    constrain_vectors v8, v10, v14, a6, t1, v16, v18

    vadd.vv v4, v28, v12
    vadd.vv v28, v4, v14
    vadd.vv v4, v28, v16
    vadd.vv v28, v4, v18

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 2b

    addi sp, sp, 32 + 144*2
    ret
cdef_filter_pri_sec_\w\()x\h:

    li t1, 63
    clz t2, a5
    clz t3, a6
    sub t2, t1, t2
    sub t3, t1, t3
    sub t1, a4, t2
    sub t2, a4, t3

    li t0, \h

    la t3, dav1d_cdef_directions

    blt zero, t1, 3f
    mv t1, zero
3:
    vsetivli zero, \w, e16, m1, ta, ma

    li t4, 4
    andi t6, a5, 1
    addi t5, a7, 2
    sub t4, t4, t6

    sh1add t5, t5, t3

    vle8.v v0, (a2)

    lb t6, 0(t5)

    vzext.vf2 v2, v0

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v2
    vmax.vv v24, v4, v2
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

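    # k == 0 primary taps: the widened differences in v8/v16 are constrained
    # by pri_strength (a5) and pri_shift (t1), then accumulated into v28 with
    # weight t4 = 4 - (pri_strength & 1).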
    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    lb t6, 1(t5)

    andi t4, t4, 3
    ori t4, t4, 2


    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    addi t5, a7, 4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v8

    sh1add t5, t5, t3

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    sh1add t5, a7, t3

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vsetvli zero, zero, e16, m1, ta, ma

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, mu

    vnclip.wi v16, v28, 4

    vadd.vv v28, v2, v16

    vmslt.vv v0, v20, v28
    vmerge.vvm v4, v20, v28, v0

    vmslt.vv v0, v4, v24
    vmerge.vvm v28, v24, v4, v0

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 3b

    addi sp, sp, 32 + 144*2
    ret
endfunc
.endm

cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8
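
# CDEF filters 8x8 luma blocks; chroma blocks are 8x8, 4x8 or 4x4 depending
# on chroma subsampling, so these three instantiations cover every 8 bpc case.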