/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function blend_vl256_8bpc_rvv, export=1, ext=zbb
        ctz t0, a3
        addi t0, t0, 0xc3
        j L(blend_epilog)
endfunc

function blend_8bpc_rvv, export=1, ext="v,zbb"
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_epilog):
        csrw vxrm, zero
        andi t0, t0, 0xc7
        vsetvl zero, a3, t0
        li t1, 64
1:
        addi a4, a4, -2
        vle8.v v4, (a2)
        add a2, a2, a3
        vle8.v v6, (a2)
        add a2, a2, a3
        vle8.v v8, (a5)
        add a5, a5, a3
        vle8.v v10, (a5)
        add a5, a5, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v2, (t0)
        vwmulu.vv v16, v4, v8
        vwmulu.vv v20, v6, v10
        vrsub.vx v8, v8, t1
        vrsub.vx v10, v10, t1
        vwmaccu.vv v16, v0, v8
        vwmaccu.vv v20, v2, v10
        vnclipu.wi v0, v16, 6
        vnclipu.wi v2, v20, 6
        vse8.v v0, (a0)
        vse8.v v2, (t0)
        add a0, t0, a1
        bnez a4, 1b
        ret
endfunc

function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
        srai t0, a3, 2
        li t2, 64
        ctz t0, t0
        addi t0, t0, 0xc5
        j L(blend_h_epilog)
endfunc

function blend_h_8bpc_rvv, export=1, ext="v,zbb"
        li t2, 64
        bgt a3, t2, 128f
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_h_epilog):
        csrw vxrm, zero
        andi t0, t0, 0xc7
        vsetvl zero, a3, t0
        la t1, dav1d_obmc_masks
        srai t0, a4, 2
        add t1, t1, a4
        sub a4, a4, t0
0:
        mv t5, ra
1:
        addi a4, a4, -2
        lbu t3, (t1)
        addi t1, t1, 1
        lbu t4, (t1)
        addi t1, t1, 1
        vle8.v v8, (a2)
        add a2, a2, a3
        vle8.v v12, (a2)
        add a2, a2, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v4, (t0)
        vwmulu.vx v16, v8, t3
        vwmulu.vx v24, v12, t4
        sub t3, t2, t3
        sub t4, t2, t4
        vwmaccu.vx v16, t3, v0
        vwmaccu.vx v24, t4, v4
        vnclipu.wi v0, v16, 6
        vnclipu.wi v4, v24, 6
        vse8.v v0, (a0)
        vse8.v v4, (t0)
        add a0, t0, a1
        bgtz a4, 1b
        jr t5
128:
        csrw vxrm, zero
        vsetvli zero, t2, e8, m4, ta, ma
        la t1, dav1d_obmc_masks
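        # Note added for clarity: this w == 128 branch mirrors the setup of
        # the narrow path above (mask pointer = dav1d_obmc_masks + h, blended
        # rows = h - h/4, i.e. 3*h/4), then runs the 64-wide row loop at 1b
        # twice: once over the left 64 columns and once over the right 64
        # columns, with dst and tmp advanced by 64 in between.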
        srai t0, a4, 2
        add t1, t1, a4
        sub a4, a4, t0
        mv a5, a0
        mv a6, a2
        mv a7, a4
        jal t5, 1b
        add t1, t1, a4
        add a0, a5, t2
        add a2, a6, t2
        mv a4, a7
        sub t1, t1, a4
        j 0b
endfunc

function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
        srai t0, a3, 2
        ctz t0, t0
        addi t0, t0, 0xc5
        j L(blend_v_epilog)
endfunc

function blend_v_8bpc_rvv, export=1, ext="v,zbb"
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_v_epilog):
        andi t0, t0, 0xc7
        vsetvl zero, a3, t0
        csrw vxrm, zero
        la t1, dav1d_obmc_masks
        add t1, t1, a3
        vle8.v v8, (t1)
        li t0, 64
        vrsub.vx v10, v8, t0
1:
        addi a4, a4, -2
        vle8.v v4, (a2)
        add a2, a2, a3
        vle8.v v6, (a2)
        add a2, a2, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v2, (t0)
        vwmulu.vv v12, v4, v8
        vwmulu.vv v16, v6, v8
        vwmaccu.vv v12, v0, v10
        vwmaccu.vv v16, v2, v10
        vnclipu.wi v0, v12, 6
        vnclipu.wi v2, v16, 6
        vse8.v v0, (a0)
        vse8.v v2, (t0)
        add a0, t0, a1
        bnez a4, 1b
        ret
endfunc

.macro avg va, vb, vm
        vadd.vv \va, \va, \vb
.endm

.macro w_avg va, vb, vm
        vwmul.vx v24, \va, a6
        vwmacc.vx v24, a7, \vb
        vnclip.wi \va, v24, 8
.endm

.macro mask va, vb, vm
        vwmul.vv v24, \va, \vm
        vrsub.vx \vm, \vm, a7
        vwmacc.vv v24, \vb, \vm
        vnclip.wi \va, v24, 10
.endm

.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
        li a7, 16
        sub a7, a7, a6
.endif
.ifc \type, mask
        li a7, 64
.endif
        li t0, 4
        csrw vxrm, zero
        beq t0, a4, 4f
        csrr t0, vlenb
        ctz t1, a4
        ctz t0, t0
        li t2, 1
        sub t0, t1, t0
        li t4, -3
        bgt t0, t2, 2f
        max t0, t0, t4
        andi t1, t0, 0x7
        addi t0, t1, 1 # may overflow into E16 bit
        ori t0, t0, MA | TA | E16
        ori t1, t1, MA | TA | E8
1:
        addi a5, a5, -4
.rept 2
        vsetvl zero, a4, t0
        sh1add t3, a4, a2
        vle16.v v0, (a2)
        sh1add a2, a4, t3
        vle16.v v4, (t3)
        sh1add t3, a4, a3
        vle16.v v8, (a3)
        sh1add a3, a4, t3
        vle16.v v12, (t3)
.ifc \type, mask
        add t3, a4, a6
        vle8.v v24, (a6)
        add a6, a4, t3
        vle8.v v26, (t3)
        vzext.vf2 v16, v24
        vzext.vf2 v20, v26
.endif
        \type v0, v8, v16
        \type v4, v12, v20
        vmax.vx v8, v0, zero
        vmax.vx v12, v4, zero
        vsetvl zero, zero, t1
        vnclipu.wi v0, v8, \shift
        vnclipu.wi v2, v12, \shift
        add t3, a1, a0
        vse8.v v0, (a0)
        add a0, a1, t3
        vse8.v v2, (t3)
.endr
        bnez a5, 1b
        ret
2:
        mv t0, a0
        neg t4, a4
        add a0, a1, a0
        addi a5, a5, -1
20:
        vsetvli t2, a4, e16, m4, ta, ma
        sh1add t4, t2, t4
        sh1add t3, t2, a2
        vle16.v v0, (a2)
        sh1add a2, t2, t3
        vle16.v v4, (t3)
        sh1add t3, t2, a3
        vle16.v v8, (a3)
        sh1add a3, t2, t3
        vle16.v v12, (t3)
.ifc \type, mask
        add t3, t2, a6
        vle8.v v24, (a6)
        add a6, t2, t3
        vle8.v v26, (t3)
        vzext.vf2 v16, v24
        vzext.vf2 v20, v26
.endif
        \type v0, v8, v16
        \type v4, v12, v20
        vmax.vx v8, v0, zero
        vmax.vx v12, v4, zero
        vsetvli zero, zero, e8, m2, ta, ma
        vnclipu.wi v0, v8, \shift
        vnclipu.wi v2, v12, \shift
        add t3, t2, t0
        vse8.v v0, (t0)
        add t0, t2, t3
        vse8.v v2, (t3)
        bnez t4, 20b
        bnez a5, 2b
        ret
4:
        slli t0, a5, 2
        vsetvli t1, t0, e16, m4, ta, ma
        vle16.v v0, (a2)
        sh1add a2, t1, a2
        vle16.v v4, (a3)
        sh1add a3, t1, a3
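        # Note added for clarity: in this w == 4 path the two 16-bit
        # intermediate buffers are consumed 4*h elements at a time (vl capped
        # by vsetvli), so several 4-pixel rows are blended per iteration; the
        # strided vsse32.v below then stores each 32-bit element (one 4-pixel
        # row) to dst, with consecutive rows spaced a1 bytes apart.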
.ifc \type, mask
        vle8.v v16, (a6)
        add a6, t1, a6
        vzext.vf2 v8, v16
.endif
        \type v0, v4, v8
        vmax.vx v8, v0, zero
        vsetvli zero, zero, e8, m2, ta, ma
        vnclipu.wi v0, v8, \shift
        vsetvli t1, a5, e32, m2, ta, ma
        vsse32.v v0, (a0), a1
        ctz t0, t1
        sub a5, a5, t1
        sll t0, a1, t0
        add a0, t0, a0
        bnez a5, 4b
        ret
endfunc
.endm

bidir_fn avg, 5
bidir_fn w_avg, 0
bidir_fn mask, 0

function warp_8x8_8bpc_rvv, export=1, ext="v"
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -2*15*8
        mv t5, sp
        li t0, 3
        mul t0, a3, t0
        sub a2, a2, t0
        addi a2, a2, -3

        li t0, 64
        addi a3, a3, -8
        li t1, 15
        la t2, dav1d_mc_warp_filter

        lh t6, (a4)
        lh t4, 2(a4)
        vid.v v30
        vwmul.vx v28, v30, t6
1:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a5
        add a5, a5, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle8.v v10, (a2)
        addi a2, a2, 1

        vsext.vf2 v14, v\i
        vzext.vf2 v16, v10

.if \i == 2
        vwmulsu.vv v12, v14, v16
.else
        vwmaccsu.vv v12, v14, v16
.endif
.endr
        vnclip.wi v10, v12, 3

        add a2, a2, a3
        vse16.v v10, (t5)
        addi t5, t5, 16

        bnez t1, 1b

        mv t5, sp
        li t1, 8

        lh t6, 4(a4)
        lh t4, 6(a4)
        vwmul.vx v28, v30, t6
2:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a6
        add a6, a6, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24
        vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle16.v v10, (t5)
        addi t5, t5, 16

        vsext.vf2 v14, v\i

.if \i == 2
        vwmul.vv v12, v14, v10
.else
        vwmacc.vv v12, v14, v10
.endif
.endr
        addi t5, t5, -16*7
        vnclip.wi v10, v12, 11

        vmax.vx v10, v10, zero
        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v12, v10, 0

        vse8.v v12, (a0)
        add a0, a0, a1

        bnez t1, 2b

        addi sp, sp, 2*15*8

        ret
endfunc

function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -2*15*8
        mv t5, sp
        li t0, 3
        mul t0, a3, t0
        sub a2, a2, t0
        addi a2, a2, -3

        li t0, 64
        addi a3, a3, -8
        li t1, 15
        la t2, dav1d_mc_warp_filter

        lh t6, (a4)
        lh t4, 2(a4)
        vid.v v30
        vwmul.vx v28, v30, t6
1:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a5
        add a5, a5, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle8.v v10, (a2)
        addi a2, a2, 1

        vsext.vf2 v14, v\i
        vzext.vf2 v16, v10

.if \i == 2
        vwmulsu.vv v12, v14, v16
.else
        vwmaccsu.vv v12, v14, v16
.endif
.endr
        vnclip.wi v10, v12, 3

        add a2, a2, a3
        vse16.v v10, (t5)
        addi t5, t5, 16

        bnez t1, 1b

        mv t5, sp
        li t1, 8

        lh t6, 4(a4)
        lh t4, 6(a4)
        vwmul.vx v28, v30, t6
2:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
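        # Note added for clarity: as in the horizontal pass above, each lane i
        # computes a filter index as the rounded value of (a6 + i*t6) >> 10,
        # plus 64, where a6 starts at my and advances by the value loaded from
        # 6(a4) per output row; shifting it left by 3 below turns it into a
        # byte offset into the 8-tap rows of dav1d_mc_warp_filter gathered by
        # vluxseg8ei32.v.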
        vadd.vx v4, v28, a6
        add a6, a6, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24
        vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle16.v v10, (t5)
        addi t5, t5, 16

        vsext.vf2 v14, v\i

.if \i == 2
        vwmul.vv v12, v14, v10
.else
        vwmacc.vv v12, v14, v10
.endif
.endr
        addi t5, t5, -16*7
        vnclip.wi v10, v12, 7

        vse16.v v10, (a0)
        sh1add a0, a1, a0

        bnez t1, 2b

        addi sp, sp, 2*15*8

        ret
endfunc