/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// constrain(diff, strength, shift): clamp |diff| to
// max(0, strength - (|diff| >> shift)), then restore the sign of diff.
// vec_tmp1/vec_tmp2 hold the two signed differences; v0 and v1 are
// clobbered as sign masks, vec1/vec2 as scratch.
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
        vmslt.vx v0, \vec_tmp1, zero
        vneg.v \vec_tmp1, \vec_tmp1, v0.t
        vmmv.m v1, v0

        vmslt.vx v0, \vec_tmp2, zero
        vneg.v \vec_tmp2, \vec_tmp2, v0.t

        vsra.vx \vec1, \vec_tmp1, \shift
        vsra.vx \vec2, \vec_tmp2, \shift

        vrsub.vx \vec1, \vec1, \strength
        vrsub.vx \vec2, \vec2, \strength

        vmax.vx \vec1, \vec1, zero
        vmax.vx \vec2, \vec2, zero

        vmin.vv \vec_tmp1, \vec1, \vec_tmp1
        vmin.vv \vec_tmp2, \vec2, \vec_tmp2

        vneg.v \vec_tmp2, \vec_tmp2, v0.t

        vmmv.m v0, v1
        vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm

// Pad the (w+4)x(h+4) int16 tmp buffer: sides flagged as unavailable in the
// edges bitmask (a7) are filled with INT16_MIN, the remaining area is copied
// in from top (a5), left (a4), src (a2) and bottom (a6).
.macro padding_fn w, h
        li t5, -32768 # INT16_MIN

        andi t4, a7, 4 # CDEF_HAVE_TOP
        li t2, -2 # y_start

.if \w == 4
        vsetivli zero, \w + 4, e16, m1, ta, ma
.else
        vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
        vmv.v.x v0, t5
        bnez t4, L(top_done_\w\()x\h)

        slli t5, a1, 1
        addi t5, t5, 2
        slli t5, t5, 1
        sub t5, a0, t5

        sh1add t4, a1, t5
        vse16.v v0, (t5)
        vse16.v v0, (t4)
        li t2, 0

L(top_done_\w\()x\h):
        andi t4, a7, 8 # CDEF_HAVE_BOTTOM
        li t3, 2 + \h # y_end
        bnez t4, L(bottom_done_\w\()x\h)

        li t5, \h
        mul t5, a1, t5
        addi t5, t5, -2
        sh1add t5, t5, a0

        sh1add t4, a1, t5
        vse16.v v0, (t5)
        vse16.v v0, (t4)
        addi t3, t3, -2

L(bottom_done_\w\()x\h):
        andi t4, a7, 1 # CDEF_HAVE_LEFT
        li t0, -2 # x_start

.if \w == 4
        vsetivli zero, 2, e16, m1, ta, ma
.else
        vsetivli zero, 2, e16, m2, ta, ma
.endif

        bnez t4, L(left_done_\w\()x\h)

        mul t5, a1, t2
        addi t5, t5, -2
        sh1add t5, t5, a0

        sub t0, t3, t2

3:
        vse16.v v0, (t5)
        sh1add t5, a1, t5
        addi t0, t0, -1
        bnez t0, 3b

L(left_done_\w\()x\h):

        andi t4, a7, 2 # CDEF_HAVE_RIGHT
        li t1, 2 + \w # x_end
        bnez t4, L(right_done_\w\()x\h)

        mul t5, t2, a1
        addi t5, t5, \w
        sh1add t5, t5, a0
        sub t1, t3, t2

4:
        vse16.v v0, (t5)
        sh1add t5, a1, t5
        addi t1, t1, -1
        bnez t1, 4b

        li t1, \w

L(right_done_\w\()x\h):

        beqz t2, L(top_skip_\w\()x\h)

        mul t5, a1, t2
        add t5, t0, t5
        sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
        sh1add a5, t0, a5 # top += x_start

        sub t5, t1, t0
        slli t6, t0, 1
.if \w == 4
        vsetvli zero, t5, e16, m1, ta, ma
.else
        vsetvli zero, t5, e16, m2, ta, ma
.endif

5:
        vle16.v v2, (a5)
        addi t2, t2, 1
        add a5, a3, a5
        vse16.v v2, (a0)
        sh1add a0, a1, a0
        bnez t2, 5b

        sub a0, a0, t6 # tmp -= x_start

L(top_skip_\w\()x\h):

        li a5, \h
        beqz t0, L(left_skip_\w\()x\h)

        sh1add a0, t0, a0 # tmp += x_start

7:
.if \w == 4
        vsetivli zero, 2, e16, m1, ta, ma
.else
        vsetivli zero, 2, e16, m2, ta, ma
.endif

        vle16.v v2, (a4)
        addi a5, a5, -1
        addi a4, a4, 4
        vse16.v v2, (a0)
        sh1add a0, a1, a0
        bnez a5, 7b

        li a5, \h
        mul t5, a1, a5
        add t5, t5, t0
        slli t5, t5, 1
        sub a0, a0, t5 # tmp -= h * tmp_stride + x_start

L(left_skip_\w\()x\h):

8:
.if \w == 4
        vsetvli zero, t1, e16, m1, ta, ma
.else
        vsetvli zero, t1, e16, m2, ta, ma
.endif

        vle16.v v2, (a2)
        add a2, a3, a2
        vse16.v v2, (a0)
        sh1add a0, a1, a0
        addi a5, a5, -1
        bnez a5, 8b

        li a5, \h
        sh1add a0, t0, a0 # tmp += x_start
        sh1add a6, t0, a6 # bottom += x_start
        beq a5, t3, L(bottom_skip_\w\()x\h)

        sub t5, t1, t0
.if \w == 4
        vsetvli zero, t5, e16, m1, ta, ma
.else
        vsetvli zero, t5, e16, m2, ta, ma
.endif

9:
        vle16.v v2, (a6)
        add a6, a3, a6
        addi a5, a5, 1
        vse16.v v2, (a0)
        sh1add a0, a1, a0
        bne a5, t3, 9b

L(bottom_skip_\w\()x\h):
        li t6, \h
        mul t6, a3, t6
        sub a2, a2, t6 # src -= h * PXSTRIDE(src_stride)
        mul t5, a1, t3
        add t5, t5, t0
        slli t5, t5, 1
        sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm

.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_16bpc_rvv, export=1, ext="v,zba,zbb"
        csrw vxrm, zero # round-to-nearest-up for vnclip

        addi sp, sp, -32 - 144*2 # 32-byte spill area + 12x12 int16 tmp buffer
        sd a5, 24(sp) # pri_strength
        sd a6, 16(sp) # sec_strength
        sd a7, 8(sp) # dir

        ld a7, 8 + 32 + 144*2(sp) # edges
        mv a6, a4 # bottom
        mv a5, a3 # top
        mv a4, a2 # left
        mv a3, a1 # dst_stride
        mv a2, a0 # dst
        li a1, 12 # tmp_stride
        addi a0, sp, 32 + 2*(2*12+2) # tmp = buf + 2*tmp_stride + 2

        padding_fn \w, \h

        ld a4, 32 + 2*144(sp) # damping
        ld a5, 24(sp) # pri_strength
        ld a6, 16(sp) # sec_strength
        ld a7, 8(sp) # dir

        beqz a5, cdef_filter_sec_only_\w\()x\h

        bnez a6, cdef_filter_pri_sec_\w\()x\h

        // primary-only filtering
        li t1, 64-8
        ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
        clz t4, t4
        sub t4, t1, t4 # bitdepth_min_8
        sra t4, a5, t4
        andi t0, t4, 1
        li t1, 4
        sub t4, t1, t0 # pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1)

        li t1, 63
        clz t2, a5
        sub t1, t1, t2 # ulog2(pri_strength)
        sub t1, a4, t1 # pri_shift = damping - ulog2(pri_strength)

        li t0, \h

        la t2, dav1d_cdef_directions
        addi t3, a7, 2
        sh1add t2, t3, t2 # &dav1d_cdef_directions[dir + 2]

        vsetivli zero, \w, e16, m1, ta, ma
        blt zero, t1, 1f
        mv t1, zero # clamp pri_shift to >= 0
1:
        lb t3, 0(t2)

        vle16.v v2, (a2)

        sh1add t6, t3, a0 # tmp + off
        slli t3, t3, 1
        sub t3, a0, t3 # tmp - off

        vle16.v v4, (t6)
        vle16.v v6, (t3)

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a5, t1, v8, v16

        vmul.vx v28, v16, t4
        vmacc.vx v28, t4, v8
        lb t3, 1(t2)

        andi t5, t4, 3
        ori t5, t5, 2 # second primary tap: (pri_tap & 3) | 2

        sh1add t6, t3, a0
        slli t3, t3, 1
        sub t3, a0, t3

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (t6)
        vle16.v v6, (t3)

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a5, t1, v8, v16

        vmacc.vx v28, t5, v16
        vmacc.vx v28, t5, v8

        vmslt.vx v0, v28, zero
        vadd.vi v28, v28, -1, v0.t # sum -= (sum < 0)

        vsetvli zero, zero, e16, m1, ta, ma

        vnclip.wi v24, v28, 4 # (sum + 8) >> 4, rounding from vxrm

        vadd.vv v28, v2, v24 # px + ((8 + sum - (sum < 0)) >> 4)

        vse16.v v28, (a2)

        add a2, a2, a3
        sh1add a0, a1, a0

        addi t0, t0, -1
        bnez t0, 1b

        addi sp, sp, 32 + 144*2
        ret

cdef_filter_sec_only_\w\()x\h:
        li t1, 63
        clz t2, a6
        sub t1, t1, t2
        sub t1, a4, t1 # sec_shift = damping - ulog2(sec_strength)

        li t0, \h

        la t2, dav1d_cdef_directions
        addi t3, a7, 4
        sh1add t3, t3, t2 # &dav1d_cdef_directions[dir + 4]
        sh1add t2, a7, t2 # &dav1d_cdef_directions[dir]

        vsetivli zero, \w, e16, m1, ta, ma
2:

        lb t4, 0(t3)
        lb t5, 0(t2)

        vle16.v v2, (a2)

        sh1add t6, t4, a0
        slli t4, t4, 1
        sub t4, a0, t4

        vle16.v v4, (t6)
        vle16.v v6, (t4)

        sh1add t4, t5, a0
        slli t5, t5, 1
        sub t5, a0, t5

        vle16.v v8, (t4)
        vle16.v v10, (t5)

        vwsub.vv v12, v4, v2
        vwsub.vv v14, v6, v2
        vwsub.vv v16, v8, v2
        vwsub.vv v18, v10, v2

        vsetvli zero, zero, e32, m2, ta, mu

        li t4, 2 # first secondary tap
        constrain_vectors v4, v6, v2, a6, t1, v12, v14
        constrain_vectors v8, v10, v2, a6, t1, v16, v18

        vmul.vx v28, v18, t4
        vmacc.vx v28, t4, v16
        vmacc.vx v28, t4, v14
        vmacc.vx v28, t4, v12

        lb t4, 1(t3)
        lb t5, 1(t2)

        sh1add t6, t4, a0
        slli t4, t4, 1
        sub t4, a0, t4

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (t6)
        vle16.v v6, (t4)

        sh1add t4, t5, a0
        slli t5, t5, 1
        sub t5, a0, t5

        vle16.v v8, (t4)
        vle16.v v10, (t5)

        vwsub.vv v12, v4, v2
        vwsub.vv v14, v6, v2
        vwsub.vv v16, v8, v2
        vwsub.vv v18, v10, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a6, t1, v12, v14
        constrain_vectors v8, v10, v2, a6, t1, v16, v18

        vadd.vv v4, v28, v12 # second secondary tap is 1
        vadd.vv v28, v4, v14
        vadd.vv v4, v28, v16
        vadd.vv v28, v4, v18

        vmslt.vx v0, v28, zero
        vadd.vi v28, v28, -1, v0.t # sum -= (sum < 0)

        vsetvli zero, zero, e16, m1, ta, ma

        vnclip.wi v24, v28, 4 # (sum + 8) >> 4, rounding from vxrm

        vadd.vv v28, v2, v24

        vse16.v v28, (a2)

        add a2, a2, a3
        sh1add a0, a1, a0

        addi t0, t0, -1
        bnez t0, 2b

        addi sp, sp, 32 + 144*2
        ret
cdef_filter_pri_sec_\w\()x\h:

        li t1, 63
        clz t2, a5
        clz t3, a6
        sub t2, t1, t2
        sub t3, t1, t3
        sub t1, a4, t2 # pri_shift = damping - ulog2(pri_strength)
        sub t2, a4, t3 # sec_shift = damping - ulog2(sec_strength)

        li t0, \h

        la t3, dav1d_cdef_directions

        vsetivli zero, \w, e16, m1, ta, ma
        blt zero, t1, 3f
        mv t1, zero # clamp pri_shift to >= 0
3:
        li t5, 64-8
        ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
        clz t4, t4
        sub t4, t5, t4 # bitdepth_min_8
        sra t4, a5, t4
        li t6, 4
        andi t5, t4, 1
        sub t4, t6, t5 # pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1)

        addi t5, a7, 2

        sh1add t5, t5, t3 # &dav1d_cdef_directions[dir + 2]

        vle16.v v2, (a2)

        lb t6, 0(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v2 # running min/max of the sampled pixels;
        vmax.vv v24, v4, v2  # unsigned min ignores the INT16_MIN padding
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu
        constrain_vectors v4, v6, v2, a5, t1, v8, v16

        vmul.vx v28, v16, t4
        vmacc.vx v28, t4, v8

        andi t4, t4, 3
        ori t4, t4, 2 # second primary tap: (pri_tap & 3) | 2

        lb t6, 1(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v20
        vmax.vv v24, v4, v24
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a5, t1, v8, v16

        addi t5, a7, 4
        vmacc.vx v28, t4, v16
        vmacc.vx v28, t4, v8

        sh1add t5, t5, t3 # &dav1d_cdef_directions[dir + 4]

        lb t6, 0(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v20
        vmax.vv v24, v4, v24
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        li t6, 2 # first secondary tap
        constrain_vectors v4, v6, v2, a6, t2, v8, v16

        vmacc.vx v28, t6, v16
        vmacc.vx v28, t6, v8

        lb t6, 1(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v20
        vmax.vv v24, v4, v24
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a6, t2, v8, v16

        sh1add t5, a7, t3 # &dav1d_cdef_directions[dir]

        vadd.vv v4, v28, v8 # second secondary tap is 1
        vadd.vv v28, v4, v16

        vsetvli zero, zero, e16, m1, ta, ma

        lb t6, 0(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v20
        vmax.vv v24, v4, v24
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        li t6, 2 # first secondary tap
        constrain_vectors v4, v6, v2, a6, t2, v8, v16

        vmacc.vx v28, t6, v16
        vmacc.vx v28, t6, v8

        lb t6, 1(t5)

        sh1add a4, t6, a0
        slli t6, t6, 1
        sub t6, a0, t6

        vsetvli zero, zero, e16, m1, ta, ma

        vle16.v v4, (a4)
        vle16.v v6, (t6)

        vminu.vv v20, v4, v20
        vmax.vv v24, v4, v24
        vminu.vv v20, v6, v20
        vmax.vv v24, v6, v24

        vwsub.vv v8, v4, v2
        vwsub.vv v16, v6, v2

        vsetvli zero, zero, e32, m2, ta, mu

        constrain_vectors v4, v6, v2, a6, t2, v8, v16

        vadd.vv v4, v28, v8 # second secondary tap is 1
        vadd.vv v28, v4, v16

        vmslt.vx v0, v28, zero
        vadd.vi v28, v28, -1, v0.t # sum -= (sum < 0)

        vsetvli zero, zero, e16, m1, ta, ma

        vnclip.wi v16, v28, 4 # (sum + 8) >> 4, rounding from vxrm

        vadd.vv v28, v2, v16

        vmslt.vv v0, v20, v28 # clip the result into [min, max]
        vmerge.vvm v4, v20, v28, v0

        vmslt.vv v0, v4, v24
        vmerge.vvm v28, v24, v4, v0

        vse16.v v28, (a2)

        add a2, a2, a3
        sh1add a0, a1, a0

        addi t0, t0, -1
        bnez t0, 3b

        addi sp, sp, 32 + 144*2
        ret
endfunc
.endm

cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8