/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function inv_txfm_add_4x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        jalr t0, a4

        vmv.v.x v4, zero

        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

itx_4x4_end:
        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, mf2, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
        li t1, (5793-4096)*8
        vsmul.vx v4, v0, t1
        vsmul.vx v5, v1, t1
        vsmul.vx v6, v2, t1
        vsmul.vx v7, v3, t1

        vsadd.vv v0, v0, v4
        vsadd.vv v1, v1, v5
        vsadd.vv v2, v2, v6
        vsadd.vv v3, v3, v7

        jr t0
endfunc

.macro iwht_4
        vadd.vv v0, v0, v1
        vsub.vv v5, v2, v3
        vsub.vv v4, v0, v5
        vsra.vi v4, v4, 1
        vsub.vv v2, v4, v1
        vsub.vv v1, v4, v3
        vadd.vv v3, v5, v2
        vsub.vv v0, v0, v1
.endm
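
// Note: idct_4 below is the 4-point inverse DCT butterfly. The constants
// appear to be 12-bit fixed-point trig values (2896 ~ 4096/sqrt(2),
// 1567/3784 ~ 4096*cos/sin(3*pi/8)); products are widened to 32 bits with
// vwmul/vwmacc and narrowed back with a rounding vnclip by 12 (vxrm is set
// to round-to-nearest-up at function entry).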
.macro idct_4 o0, o1, o2, o3
        li t1, 2896
        li t2, 1567
        li t3, 3784

        vwmul.vx v16, \o0, t1
        vwmul.vx v18, \o0, t1
        vwmacc.vx v16, t1, \o2
        neg t1, t1
        vwmacc.vx v18, t1, \o2

        vwmul.vx v20, \o1, t3
        neg t3, t3
        vwmul.vx v22, \o1, t2
        vwmacc.vx v20, t2, \o3
        vwmacc.vx v22, t3, \o3

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12

        vsadd.vv \o0, v16, v20
        vsadd.vv \o1, v18, v22
        vssub.vv \o2, v18, v22
        vssub.vv \o3, v16, v20
.endm

.macro iadst_4 o0, o1, o2, o3, lm2, lm
        li t1, 1321
        li t2, 3803
        li t3, 2482

        vwmul.vx v16, v0, t1
        vwmul.vx v18, v0, t3
        neg t1, t1
        vwmacc.vx v16, t2, v2
        vwmacc.vx v18, t1, v2
        neg t2, t2
        vwmacc.vx v16, t3, v3
        vwmacc.vx v18, t2, v3

        vwsub.vv v20, v0, v2
        vwadd.wv v20, v20, v3

        li t1, 3344
        vwmul.vx v22, v1, t1

        vsetvli zero, zero, e32, \lm2, ta, ma

        vmul.vx v20, v20, t1

        vadd.vv v24, v16, v18
        vadd.vv v16, v16, v22
        vadd.vv v18, v18, v22
        vsub.vv v22, v24, v22

        vsetvli zero, zero, e16, \lm, ta, ma

        vnclip.wi \o0, v16, 12
        vnclip.wi \o1, v18, 12
        vnclip.wi \o2, v20, 12
        vnclip.wi \o3, v22, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
        idct_4 v0, v1, v2, v3
        jr t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m1, mf2
        jr t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m1, mf2
        jr t0
endfunc

function inv_adst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m2, m1
        jr t0
endfunc

function inv_flipadst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m2, m1
        jr t0
endfunc

function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        vsra.vi v0, v0, 2
        vsra.vi v1, v1, 2
        vsra.vi v2, v2, 2
        vsra.vi v3, v3, 2

        iwht_4

        vmv.v.x v4, zero

        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        iwht_4

        j itx_4x4_end
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a4, inv_\txfm1\()_e16_x4_rvv
        la a5, inv_\txfm2\()_e16_x4_rvv
        j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        j itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst
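
// Note: the shared 8x8 path below runs two passes: the first (a4) on rows,
// then vsseg8e16/vle16 through the coefficient buffer to transpose the 8x8
// tile, the second (a5) on columns, followed by a >>4 and the add to the
// destination in itx_8x8_end. The identity_ variant skips the first pass,
// since the x8 identity (x2) and the inter-pass >>1 cancel out.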
.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

.ifc \variant, identity_
        // The identity vsadd.vv and downshift vssra.vi 1 cancel out

        j L(itx_8x8_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1
        vssra.vi v4, v4, 1
        vssra.vi v5, v5, 1
        vssra.vi v6, v6, 1
        vssra.vi v7, v7, 1

L(itx_8x8_epilog):
        vsseg8e16.v v0, (a2)
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4
        vssra.vi v4, v4, 4
        vssra.vi v5, v5, 4
        vssra.vi v6, v6, 4
        vssra.vi v7, v7, 4

        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v8, zero
        vse16.v v8, (a2)

itx_8x8_end:
        vsetivli zero, 8, e8, mf2, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
        add t0, t0, a1
        vle8.v v10, (t0)
        add t0, t0, a1
        vle8.v v11, (t0)
        add t0, t0, a1
        vle8.v v12, (t0)
        add t0, t0, a1
        vle8.v v13, (t0)
        add t0, t0, a1
        vle8.v v14, (t0)
        add t0, t0, a1
        vle8.v v15, (t0)

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero
        vmax.vx v4, v4, zero
        vmax.vx v5, v5, zero
        vmax.vx v6, v6, zero
        vmax.vx v7, v7, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
        add a0, a0, a1
        vse8.v v9, (a0)
        add a0, a0, a1
        vse8.v v10, (a0)
        add a0, a0, a1
        vse8.v v11, (a0)
        add a0, a0, a1
        vse8.v v12, (a0)
        add a0, a0, a1
        vse8.v v13, (a0)
        add a0, a0, a1
        vse8.v v14, (a0)
        add a0, a0, a1
        vse8.v v15, (a0)

        ret
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

function inv_identity_e16_x8_rvv, export=1, ext=v
        vsadd.vv v0, v0, v0
        vsadd.vv v1, v1, v1
        vsadd.vv v2, v2, v2
        vsadd.vv v3, v3, v3
        vsadd.vv v4, v4, v4
        vsadd.vv v5, v5, v5
        vsadd.vv v6, v6, v6
        vsadd.vv v7, v7, v7

        jr t0
endfunc
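
// Note: idct_8 reuses idct_4 for the even inputs and rotates the odd inputs
// with 799/4017 and 3406/2276 (roughly 4096*cos/sin(7*pi/16) and
// 4096*cos/sin(3*pi/16)), with a final 2896 (~4096/sqrt(2)) butterfly
// before the symmetric output additions.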
.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
        idct_4 \o0, \o2, \o4, \o6

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v22, \o1, t2
        neg t2, t2
        vwmul.vx v16, \o1, t1
        vwmacc.vx v22, t1, \o7
        vwmacc.vx v16, t2, \o7

        vwmul.vx v20, \o5, t4
        neg t4, t4
        vwmul.vx v18, \o5, t3
        vwmacc.vx v20, t3, \o3
        vwmacc.vx v18, t4, \o3

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12

        vssub.vv \o7, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv \o1, v16, v18
        vsadd.vv v16, v16, v18

        li t2, 2896

        vwmul.vx v18, \o7, t2
        vwmul.vx v20, \o7, t2
        vwmacc.vx v20, t2, \o1
        neg t2, t2
        vwmacc.vx v18, t2, \o1

        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12

        vssub.vv \o7, \o0, v22
        vsadd.vv \o0, \o0, v22
        vssub.vv v17, \o2, v20
        vsadd.vv \o1, \o2, v20
        vssub.vv \o5, \o4, v18
        vsadd.vv \o2, \o4, v18
        vssub.vv \o4, \o6, v16
        vsadd.vv \o3, \o6, v16
        vmv.v.v \o6, v17
.endm

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        li t1, 4076
        li t2, 401
        li t3, 3612
        li t4, 1931
        li t5, 2598
        li t6, 3166

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v18, v7, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v5, t3
        neg t3, t3
        vwmul.vx v22, v5, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        vwmul.vx v24, v3, t5
        neg t5, t5
        vwmul.vx v26, v3, t6
        vwmacc.vx v24, t6, v4
        vwmacc.vx v26, t5, v4

        li t2, 1189
        li t3, 3920
        li t4, 1567
        li t5, 3784
        li t6, 2896

        vwmul.vx v28, v1, t2
        neg t2, t2
        vwmul.vx v30, v1, t3
        vwmacc.vx v28, t3, v6
        vwmacc.vx v30, t2, v6

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v28, v28, 12
        vnclip.wi v30, v30, 12

        vssub.vv v4, v16, v24
        vsadd.vv v16, v16, v24
        vsadd.vv v1, v18, v26
        vsadd.vv v2, v20, v28
        vsadd.vv v3, v22, v30
        vssub.vv v5, v18, v26
        vssub.vv v6, v20, v28
        vssub.vv v30, v22, v30

        vsadd.vv \o0, v16, v2
        vsadd.vv \o7, v1, v3
        vssub.vv v2, v16, v2
        vssub.vv v3, v1, v3

        vwmul.vx v16, v4, t5
        vwmul.vx v18, v4, t4
        vwmul.vx v20, v30, t5
        vwmul.vx v22, v30, t4
        vwmacc.vx v16, t4, v5
        neg t4, t4
        vwmacc.vx v22, t5, v6
        neg t5, t5
        vwmacc.vx v20, t4, v6
        vwmacc.vx v18, t5, v5

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12

        vsadd.vv \o1, v16, v20
        vsadd.vv \o6, v18, v22
        vssub.vv v16, v16, v20
        vssub.vv v17, v18, v22

        vwmul.vx v18, v2, t6
        vwmul.vx v20, v2, t6
        vwmul.vx v22, v16, t6
        vwmul.vx v24, v16, t6
        vwmacc.vx v18, t6, v3
        vwmacc.vx v22, t6, v17
        neg t6, t6
        vwmacc.vx v20, t6, v3
        vwmacc.vx v24, t6, v17

        vnclip.wi \o3, v18, 12
        vnclip.wi \o4, v20, 12
        vnclip.wi \o2, v22, 12
        vnclip.wi \o5, v24, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
.endm

function inv_dct_e16_x8_rvv, export=1, ext=v
        idct_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_adst_e16_x8_rvv, export=1, ext=v
        iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
        iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc
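
// Note: def_fn_8x8 only selects the pass functions in a4/a5 and tail-calls
// the shared 8x8 path. For dct_dct with eob == 0, the DC-only shortcut
// below scales the single coefficient by 2896/4096 once per dimension and
// broadcasts it before the final add.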
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
        j inv_txfm_identity_add_8x8_rvv
.else
        la a4, inv_\txfm1\()_e16_x8_rvv
        j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vssra.vi v0, v0, 1
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        vmv.v.v v4, v0
        vmv.v.v v5, v0
        vmv.v.v v6, v0
        vmv.v.v v7, v0
        j itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

function inv_txfm_add_4x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)

        li t1, 2896*8
.irp i, 0, 1, 2, 3
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg4e16.v v0, (a2)

        vsetivli zero, 4, e16, mf2, ta, ma
        vmv.v.x v8, zero
        vle16.v v0, (a2)
        vse16.v v8, (a2)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi a2, a2, 8
        vle16.v v\i, (a2)
        vse16.v v8, (a2)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
.irp i, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
endfunc
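
// Note: as in the 4x8 path above, the vsmul.vx by 2896*8 below pre-scales
// the coefficients by roughly 1/sqrt(2), the extra scaling AV1 applies to
// rectangular blocks with a 2:1 aspect ratio.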
function inv_txfm_add_8x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg8e16.v v0, (a2)

        vsetivli zero, 8, e16, m1, ta, ma
        vmv.v.x v4, zero
        vle16.v v0, (a2)
        vse16.v v4, (a2)
.irp i, 1, 2, 3
        addi a2, a2, 16
        vle16.v v\i, (a2)
        vse16.v v4, (a2)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

/* Define symbols added in .if statement */
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.else
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
        j inv_txfm_add_\w\()x\h\()_rvv
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

function inv_identity_e16_x16_rvv, export=1, ext=v
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsadd.vv v\i, v\i, v\i
        vsadd.vv v\i, v\i, v16
.endr
        jr t0
endfunc
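
// Note: the 16-point DCT below follows the same recursive pattern: idct_8
// on the even-indexed inputs plus four extra rotations of the odd inputs
// (401/4076, 3166/2598, 1931/3612, 3920/1189), again with 32-bit
// intermediates and a rounding >>12 narrow.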
function inv_dct_e16_x16_rvv, export=1, ext=v
        idct_8 v0, v2, v4, v6, v8, v10, v12, v14

        li t1, 401
        li t2, 4076
        li t3, 3166
        li t4, 2598

        vwmul.vx v30, v1, t2
        neg t2, t2
        vwmul.vx v16, v1, t1
        vwmacc.vx v30, t1, v15
        vwmacc.vx v16, t2, v15

        vwmul.vx v28, v9, t4
        neg t4, t4
        vwmul.vx v18, v9, t3
        vwmacc.vx v28, t3, v7
        vwmacc.vx v18, t4, v7

        li t1, 1931
        li t2, 3612
        li t3, 3920
        li t4, 1189

        vwmul.vx v26, v5, t2
        neg t2, t2
        vwmul.vx v20, v5, t1
        vwmacc.vx v26, t1, v11
        vwmacc.vx v20, t2, v11

        vwmul.vx v24, v13, t4
        neg t4, t4
        vwmul.vx v22, v13, t3
        vwmacc.vx v24, t3, v3
        vwmacc.vx v22, t4, v3

        li t2, 2896
        li t3, 1567
        li t4, 3784

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v28, v28, 12
        vnclip.wi v30, v30, 12

        vssub.vv v3, v16, v18
        vsadd.vv v16, v16, v18
        vssub.vv v5, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv v11, v24, v26
        vsadd.vv v24, v24, v26
        vssub.vv v13, v30, v28
        vsadd.vv v30, v30, v28

        vwmul.vx v28, v13, t4
        neg t4, t4
        vwmul.vx v18, v13, t3
        vwmul.vx v26, v11, t3
        vwmacc.vx v28, t3, v3
        neg t3, t3
        vwmul.vx v20, v11, t4
        vwmacc.vx v18, t4, v3
        vwmacc.vx v20, t3, v5
        vwmacc.vx v26, t4, v5

        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v28, v28, 12

        vssub.vv v5, v18, v20
        vsadd.vv v18, v18, v20
        vssub.vv v11, v28, v26
        vsadd.vv v28, v28, v26

        vssub.vv v7, v16, v22
        vsadd.vv v16, v16, v22
        vssub.vv v9, v30, v24
        vsadd.vv v30, v30, v24

        vwmul.vx v20, v11, t2
        vwmul.vx v22, v9, t2
        vwmul.vx v24, v9, t2
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v7
        vwmacc.vx v26, t2, v5
        neg t2, t2
        vwmacc.vx v20, t2, v5
        vwmacc.vx v22, t2, v7

        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v26, v26, 12

        vssub.vv v15, v0, v30
        vsadd.vv v0, v0, v30
        vssub.vv v17, v2, v28
        vsadd.vv v1, v2, v28
        vssub.vv v13, v4, v26
        vsadd.vv v2, v4, v26
        vssub.vv v19, v6, v24
        vsadd.vv v3, v6, v24
        vssub.vv v11, v8, v22
        vsadd.vv v4, v8, v22
        vsadd.vv v5, v10, v20
        vssub.vv v10, v10, v20
        vssub.vv v9, v12, v18
        vsadd.vv v6, v12, v18
        vssub.vv v8, v14, v16
        vsadd.vv v7, v14, v16
        vmv.v.v v14, v17
        vmv.v.v v12, v19

        jr t0
endfunc

.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
        li t1, 4091
        li t2, 201
        li t3, 3973
        li t4, 995

        vwmul.vx v16, v15, t1
        neg t1, t1
        vwmul.vx v18, v15, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v13, t3
        neg t3, t3
        vwmul.vx v22, v13, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        li t1, 3703
        li t2, 1751
        li t3, 3290
        li t4, 2440

        vwmul.vx v24, v11, t1
        neg t1, t1
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v4
        vwmacc.vx v26, t1, v4

        vwmul.vx v28, v9, t3
        neg t3, t3
        vwmul.vx v30, v9, t4
        vwmacc.vx v28, t4, v6
        vwmacc.vx v30, t3, v6

        vnclip.wi v0, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v2, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v4, v24, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v6, v28, 12
        vnclip.wi v30, v30, 12

        li t1, 2751
        li t2, 3035
        li t3, 2106
        li t4, 3513

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v20, v7, t2
        vwmacc.vx v16, t2, v8
        vwmacc.vx v20, t1, v8

        vwmul.vx v24, v5, t3
        neg t3, t3
        vwmul.vx v28, v5, t4
        vwmacc.vx v24, t4, v10
        vwmacc.vx v28, t3, v10

        vnclip.wi v16, v16, 12
        vnclip.wi v9, v20, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v11, v28, 12

        vssub.vv v8, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v10, v2, v24
        vsadd.vv v2, v2, v24

        li t1, 1380
        li t2, 3857
        li t3, 601
        li t4, 4052

        vwmul.vx v16, v3, t1
        neg t1, t1
        vwmul.vx v20, v3, t2
        vwmacc.vx v16, t2, v12
        vwmacc.vx v20, t1, v12

        vwmul.vx v24, v1, t3
        neg t3, t3
        vwmul.vx v28, v1, t4
        vwmacc.vx v24, t4, v14
        vwmacc.vx v28, t3, v14

        vnclip.wi v16, v16, 12
        vnclip.wi v13, v20, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v15, v28, 12

        vssub.vv v12, v4, v16
        vsadd.vv v16, v4, v16
        vssub.vv v14, v6, v24
        vsadd.vv v20, v6, v24

        vsadd.vv v1, v18, v9
        vssub.vv v9, v18, v9
        vsadd.vv v3, v22, v11
        vssub.vv v11, v22, v11
        vsadd.vv v18, v26, v13
        vssub.vv v13, v26, v13
        vsadd.vv v22, v30, v15
        vssub.vv v15, v30, v15

        vssub.vv v4, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v5, v1, v18
        vsadd.vv v1, v1, v18
        vssub.vv v6, v2, v20
        vsadd.vv v2, v2, v20
        vssub.vv v7, v3, v22
        vsadd.vv v3, v3, v22

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t1
        vwmul.vx v20, v10, t4
        vwmul.vx v22, v10, t3
        vwmul.vx v24, v13, t2
        vwmul.vx v26, v13, t1
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t1, v9
        neg t1, t1
        vwmacc.vx v20, t3, v11
        neg t3, t3
        vwmacc.vx v26, t2, v12
        neg t2, t2
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v18, t2, v9
        vwmacc.vx v22, t4, v11
        vwmacc.vx v24, t1, v12
        vwmacc.vx v28, t3, v14

        li t2, 2896
        li t3, 1567
        li t4, 3784

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v28, v28, 12
        vnclip.wi v30, v30, 12

        vsadd.vv v8, v16, v24
        vsadd.vv v9, v18, v26
        vsadd.vv v10, v20, v28
        vsadd.vv v11, v22, v30
        vssub.vv v12, v16, v24
        vssub.vv v13, v18, v26
        vssub.vv v14, v20, v28
        vssub.vv v15, v22, v30

        vwmul.vx v16, v4, t4
        vwmul.vx v18, v4, t3
        vwmul.vx v20, v7, t4
        vwmul.vx v22, v7, t3
        vwmul.vx v24, v12, t4
        vwmul.vx v26, v12, t3
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t3, v5
        vwmacc.vx v22, t4, v6
        vwmacc.vx v24, t3, v13
        neg t3, t3
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v20, t3, v6
        vwmacc.vx v28, t3, v14
        vwmacc.vx v18, t4, v5
        vwmacc.vx v26, t4, v13

        vnclip.wi v16, v16, 12
        vnclip.wi v18, v18, 12
        vnclip.wi v20, v20, 12
        vnclip.wi v22, v22, 12
        vnclip.wi v24, v24, 12
        vnclip.wi v26, v26, 12
        vnclip.wi v28, v28, 12
        vnclip.wi v30, v30, 12

.ifc \o0, v0
        vsadd.vv \o14, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
.else
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
        vsadd.vv v2, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vmv.v.v \o14, v2
.endif

        vsadd.vv \o3, v16, v20
        vssub.vv v6, v16, v20
        vsadd.vv \o12, v18, v22
        vssub.vv v7, v18, v22
        vsadd.vv \o2, v24, v28
        vssub.vv v24, v24, v28
        vsadd.vv \o13, v26, v30
        vssub.vv v26, v26, v30

        neg t3, t2

        vwmul.vx v28, v24, t2
        vwmul.vx v30, v24, t2
        vwmacc.vx v28, t2, v26
        vwmacc.vx v30, t3, v26

        vwmul.vx v24, v10, t2
        vwmul.vx v26, v10, t2
        vwmacc.vx v24, t2, v11
        vwmacc.vx v26, t3, v11

        vwmul.vx v20, v6, t2
        vwmul.vx v22, v6, t2
        vwmacc.vx v20, t2, v7
        vwmacc.vx v22, t3, v7

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t2
        vwmacc.vx v16, t2, v9
        vwmacc.vx v18, t3, v9

        vnclip.wi \o7, v16, 12
        vnclip.wi \o8, v18, 12
        vnclip.wi \o4, v20, 12
        vnclip.wi \o11, v22, 12
        vnclip.wi \o6, v24, 12
        vnclip.wi \o9, v26, 12
        vnclip.wi \o5, v28, 12
        vnclip.wi \o10, v30, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
        vssub.vv \o9, v16, \o9
        vssub.vv \o11, v16, \o11
        vssub.vv \o13, v16, \o13
        vssub.vv \o15, v16, \o15
.endm
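
// Note: adst and flipadst share iadst_16; flipadst just names the output
// registers in reverse order, which is what the .ifc \o0, v0 branch in the
// macro above accounts for.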
function inv_adst_e16_x16_rvv, export=1, ext=v
        iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
        jr t0
endfunc

function inv_flipadst_e16_x16_rvv, export=1, ext=v
        iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc

.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
        vmv.v.x v16, zero
        vle16.v v0, (t4)
        vse16.v v16, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
        vse16.v v16, (t4)
.endr
.ifc \variant, _identity
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsra.vi v16, v16, 1
        vaadd.vv v\i, v\i, v16
.endr
        j L(horz_16x8_epilog)
.else
        jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 2
.endr
L(horz_16x8_epilog):
        vsse16.v v0, (t5), t6
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t5, t5, 2
        vsse16.v v\i, (t5), t6
.endr
        jr a7
.endif
endfunc
.endm

def_horz_16 _identity
def_horz_16

function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
        vsetivli zero, 8, e16, m1, ta, ma

        vle16.v v0, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetivli zero, 8, e8, mf2, ta, ma

        vle8.v v16, (t5)
        add t0, t5, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (t5)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t5, t5, a1
        vse8.v v\i, (t5)
.endr

        jr a7
endfunc
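
// Note: inv_txfm_add_16x16_rvv works in 8-lane halves through a 16x16
// int16 scratch buffer on the stack; the horizontal pass of the half gated
// by the blt a3, a7 check is skipped (its scratch is zero-filled instead)
// when the eob in a3 is below the per-function threshold passed in a7.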
function inv_txfm_add_16x16_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -16*32
.irp i, 8, 0
        addi t4, a2, \i*2
        addi t5, sp, \i*16*2
.if \i == 8
        blt a3, a7, 1f
.endif
        li t6, 16*2
        jalr a7, a6
.if \i == 8
        j 2f
1:
        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v0, zero
        vse16.v v0, (t5)
        addi t5, t5, 128
        vse16.v v0, (t5)
        vsetivli zero, 8, e16, m1, ta, ma
2:
.endif
.endr
.irp i, 0, 8
        addi t4, sp, \i*2
        addi t5, a0, \i
        li t6, 16*2
        jal a7, inv_txfm_add_vert_8x16_rvv
.endr
        addi sp, sp, 16*32
        ret
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
.ifc \txfm1, identity
        la a6, inv_txfm_horz_identity_16x8_rvv
.else
        la a6, inv_txfm_horz_16x8_rvv
        la a4, inv_\txfm1\()_e16_x16_rvv
.endif
        la a5, inv_\txfm2\()_e16_x16_rvv
        li a7, \eob_half
        j inv_txfm_add_16x16_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 16, e16, m2, ta, ma
        lh t2, (a2)
        li t3, 2896*8
        li t4, 1<<14
        li t5, 0xFFFF
        li t6, -0x10000

        sh x0, (a2)

        mul t2, t2, t3
        add t2, t2, t4
        srai t2, t2, 15
        ble t2, t5, 3f
        mv t2, t5
3:
        ble t6, t2, 4f
        mv t2, t6
4:
        addi t2, t2, 2
        srai t2, t2, 2
        mul t2, t2, t3
        add t2, t2, t4
        srai t2, t2, 15
        ble t2, t5, 5f
        mv t2, t5
5:
        ble t6, t2, 6f
        mv t2, t6
6:
        addi t2, t2, 8
        srai t2, t2, 4
        vmv.v.x v24, t2

        vsetvli zero, zero, e8, m1, ta, ma
        add t2, a1, a1
        li t3, 16
2:
        add t0, a0, a1
        vle8.v v16, (a0)
        vle8.v v17, (t0)

        vwaddu.wv v0, v24, v16
        vwaddu.wv v2, v24, v17

        addi t3, t3, -2 # loop counter

        vsetvli zero, zero, e16, m2, ta, ma
.irp i, 0, 2
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, m1, ta, ma

        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v2, 0

        add t0, a0, a1
        vse8.v v16, (a0)
        add a0, a0, t2
        vse8.v v17, (t0)

        bnez t3, 2b

        ret
.endif
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

.macro def_fn_416_base variant
function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        addi t0, a2, 16
        vle16.v v0, (t0)
        addi t0, t0, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
        vsmul.vx v8, v0, t1
        vaadd.vv v4, v0, v8
        vsmul.vx v8, v1, t1
        vaadd.vv v5, v1, v8
        vsmul.vx v8, v2, t1
        vaadd.vv v6, v2, v8
        vsmul.vx v8, v3, t1
        vaadd.vv v7, v3, v8
.else
        jalr t0, a4

        vssra.vi v4, v0, 1
        vssra.vi v5, v1, 1
        vssra.vi v6, v2, 1
        vssra.vi v7, v3, 1
.endif

        j 2f

1:
.irp i, 4, 5, 6, 7
        vmv.v.x v\i, zero
.endr

2:
        vle16.v v0, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
.irp i, 0, 1, 2, 3
        vsmul.vx v8, v\i, t1
        vaadd.vv v\i, v\i, v8
.endr

        j L(itx_4x16_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1

L(itx_4x16_epilog):
        vsseg4e16.v v0, (a2)
        addi t0, a2, 64
        vsseg4e16.v v4, (t0)

        vsetivli zero, 4, e16, mf2, ta, ma

        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vle8.v v16, (a0)
        add t0, a0, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma

        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (a0)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr

.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr

        j L(itx_16x4_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x4_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma

        vmv.v.x v4, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v4, (t0)
.irp i, 1, 2, 3
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v4, (t0)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v4, (t0)
        add t0, t0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        addi t0, a0, \j
        vse8.v v4, (t0)
        add t0, t0, a1
        vse8.v v5, (t0)
        add t0, t0, a1
        vse8.v v6, (t0)
        add t0, t0, a1
        vse8.v v7, (t0)
.endr

        ret
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.elseif \txfm1 != identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
.if \w == 4
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4
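
// Note: the 8x16/16x8 bases below follow the same scheme as 4x16/16x4,
// including the 2896*8 (~1/sqrt(2)) rectangular pre-scale. The first pass
// is split into two groups of eight vectors; when eob (a3) is below the
// threshold in a6, the group behind the blt a3, a6 check is zeroed instead
// of transformed. The 8x16 vertical pass reuses inv_txfm_add_vert_8x16_rvv
// with a scratch buffer below the stack pointer.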
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        vmv.v.x v16, zero
        addi t0, a2, 16
        vle16.v v0, (t0)
        vse16.v v16, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
.ifc \variant, identity_
        vsmul.vx v8, v0, t1
        vsmul.vx v9, v1, t1
        vsmul.vx v10, v2, t1
        vsmul.vx v11, v3, t1
        vsmul.vx v12, v4, t1
        vsmul.vx v13, v5, t1
        vsmul.vx v14, v6, t1
        vsmul.vx v15, v7, t1
.else
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vssra.vi v8, v0, 1
        vssra.vi v9, v1, 1
        vssra.vi v10, v2, 1
        vssra.vi v11, v3, 1
        vssra.vi v12, v4, 1
        vssra.vi v13, v5, 1
        vssra.vi v14, v6, 1
        vssra.vi v15, v7, 1
.endif

        j 2f

1:
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
        vmv.v.x v\i, zero
.endr

2:
        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        j L(itx_8x16_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 1
.endr

L(itx_8x16_epilog):
        addi t4, sp, -8*32
        vsseg8e16.v v0, (t4)
        addi t0, t4, 8*16
        vsseg8e16.v v8, (t0)

        mv t5, a0
        li t6, 16
        jal a7, inv_txfm_add_vert_8x16_rvv

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 16
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr

        j L(itx_16x8_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x8_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma

        vmv.v.x v8, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v8, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v8, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        addi t0, a0, \j
        vse8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vse8.v v\i, (t0)
.endr
.endr

        ret
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.ifnc \txfm1, identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.if \w == 8
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8