/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

# Sums the top and left edges and derives the DC value.
# a0 = topleft, a1 = width, a2 = height; returns the DC value in a0.
# Called with jal t0, so it returns through t0.
function dc_gen_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_8bpc_rvv
    add t1, a1, a2
    srli t5, t1, 1          # rounding term: (width + height) / 2
    mv t1, a1
    addi t2, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:  # accumulate the top edge into v0 (widened to 16 bit)
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (t2)
    vwaddu.wv v0, v0, v4

    sub t1, t1, t3
    add t2, t2, t3
    bnez t1, 1b

    mv t1, a2
    mv t2, a0
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v8, zero
2:  # accumulate the left edge into v8
    vsetvli t3, t1, e8, m2, tu, ma
    sub t2, t2, t3
    vle8.v v4, (t2)
    vwaddu.wv v8, v8, v4
    sub t1, t1, t3

    bnez t1, 2b

    vsetvli zero, zero, e32, m8, ta, ma
    vmv.s.x v16, t5
    vmv.s.x v12, zero
    vsetvli zero, a1, e16, m4, ta, ma
    vwredsum.vs v24, v0, v16
    vsetvli zero, a2, e16, m4, ta, ma
    vwredsum.vs v16, v8, v12
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v24
    vmv.x.s t1, v16
    add t5, t5, t1

    add t1, a1, a2
    ctz t1, t1

    srl a0, t5, t1

    # If width != height the shift above only divided by the power-of-two
    # factor of width + height; finish the division by 3 (2:1 blocks) or
    # 5 (4:1 blocks) with a 16-bit fixed-point multiply.
    beq a1, a2, 5f
    slli t1, a1, 1
    sltu t2, t1, a2
    slli t3, a2, 1
    sltu t1, t3, a1
    or t1, t1, t2
    bnez t1, 3f

    li t1, 0x5556           # ~2^16 / 3
    j 4f
3:
    li t1, 0x3334           # ~2^16 / 5
4:
    mul a0, a0, t1
    srli a0, a0, 16
5:
    jr t0
endfunc

# a0 = topleft, a1 = width; returns the DC value of the top edge in a0.
function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_top_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1          # rounding term: width / 2
    addi a0, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3

    add a0, a0, t3
    bnez t1, 1b
    j dc_gen_sum_up_8bpc_rvv
endfunc

# a0 = topleft, a1 = height; returns the DC value of the left edge in a0.
function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_left_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1          # rounding term: height / 2
    vsetvli t2, t1, e16, m4, ta, ma
    vmv.v.x v0, zero

1:
    vsetvli t3, t1, e8, m2, tu, ma
    sub a0, a0, t3
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3
    bnez t1, 1b
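
    # Note: at this point v0 holds the widened partial sums of the left-edge
    # pixels and t5 the height / 2 rounding term; the shared tail reduces v0,
    # adds t5 and shifts right by log2(height) to form the DC value in a0.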
    j dc_gen_sum_up_8bpc_rvv
endfunc

# Shared tail for dc_gen_top/dc_gen_left: reduces the partial sums in v0,
# adds the rounding term in t5 and divides by the edge length in a1.
function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_sum_up_8bpc_rvv
    vsetvli zero, a1, e32, m8, ta, ma
    vmv.s.x v4, t5
    vsetvli zero, zero, e16, m4, ta, ma
    vwredsum.vs v8, v0, v4
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v8

    ctz t1, a1

    srl a0, t5, t1
    jr t0
endfunc

# Chroma-from-luma prediction: dst[x] = clip(dc + ((|alpha * ac[x]| + 32) >> 6)
# with the sign of alpha * ac[x] restored before adding dc).
# a0 = dst, a1 = stride, a2 = width, a3 = height, a4 = dc, a5 = ac, a6 = alpha
function cfl_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
1:
    li t2, 0
    mv t3, a2
2:
    vsetvli t0, t3, e16, m2, ta, ma
    add t4, a0, t2
    vle16.v v0, (a5)
    sh1add a5, t0, a5

    vwmul.vx v4, v0, a6
    vsetvli zero, zero, e32, m4, ta, mu
    vneg.v v8, v4
    vmslt.vx v0, v4, x0
    vmax.vv v12, v8, v4
    vssra.vi v16, v12, 6
    vneg.v v16, v16, v0.t
    vadd.vx v20, v16, a4
    vmax.vx v0, v20, zero
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v4, v0, 0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v4, 0
    vse8.v v0, (t4)
    add t2, t0, t2
    sub t3, t3, t0
    bnez t3, 2b
    addi a3, a3, -1
    add a0, a0, a1

    bnez a3, 1b
    ret
endfunc

function ipred_cfl_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    mv a2, a4 # height
    jal t0, dc_gen_8bpc_rvv
    mv a2, a3 # width
    mv a3, a4 # height
    mv a4, a0 # dc
    mv a0, t6 # dst
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba"
    # dc = 128, then just rearrange registers
    mv a2, a3
    mv a3, a4
    li a4, 128

    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_top_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    jal t0, dc_gen_top_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc_gen_top result
    mv a0, t6 # dst
    mv a2, a1 # width
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba"
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a4 # height
    mv a2, a3 # width
    jal t0, dc_gen_left_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc_gen_left result
    mv a1, t4 # stride
    mv a0, t6 # dst
    j cfl_pred_8bpc_rvv
endfunc

# Paeth prediction: pick left, top or topleft, whichever is closest to
# left + top - topleft.
# a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_paeth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    li t0, 0
    mv t3, a2
    lbu t1, (a2)            # topleft pixel
    addi a6, a2, -1         # left column, walked downwards
    addi a2, a2, 1          # top row
1:
    lbu t2, (a6)            # left pixel for this row
    mv t3, a3
2:
    sub t5, a3, t3
    add t5, a2, t5
    vsetvli t6, t3, e8, m1, ta, ma
    vle8.v v2, (t5)         # top pixels
    vwaddu.vx v4, v2, t2
    vsetvli zero, zero, e16, m2, ta, ma
    vwsub.vx v8, v4, t1     # base = left + top - topleft

    vsetvli zero, zero, e32, m4, ta, mu
    vzext.vf4 v24, v2
    # absolute differences of base against topleft (v12), left (v16), top (v20)
    vsub.vx v12, v8, t1
    vmslt.vx v0, v12, zero
    vneg.v v12, v12, v0.t
    vsub.vx v16, v8, t2
    vmslt.vx v0, v16, zero
    vneg.v v16, v16, v0.t
    vsub.vv v20, v8, v24
    vmslt.vx v0, v20, zero
    vneg.v v20, v20, v0.t

    sub t5, a3, t3
    vmsleu.vv v4, v16, v20
    vmsleu.vv v5, v16, v12
    vmsgtu.vv v0, v20, v12
    vmand.mm v6, v4, v5

    vsetvli zero, zero, e8, m1, ta, ma
    vmerge.vxm v8, v2, t1, v0
    vmmv.m v0, v6
    add t5, a0, t5
    sub t3, t3, t6
    vmerge.vxm v4, v8, t2, v0

    vse8.v v4, (t5)

    bnez t3, 2b

    addi a4, a4, -1
    addi a6, a6, -1
    add a0, a0, a1
    bnez a4, 1b
    ret
endfunc

# Smooth prediction, blending towards the top-right and bottom-left pixels.
# a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
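    # Note: dav1d_sm_weights is indexed by block dimension, so t1 below points
    # at the width-indexed weights (blending the left pixel with the top-right
    # pixel) and t0 at the height-indexed weights (blending the top row with
    # the bottom-left pixel).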
    add t1, t0, a3          # &dav1d_sm_weights[width]
    add t2, a2, a3
    add t0, t0, a4          # &dav1d_sm_weights[height]
    lbu t2, (t2)            # top-right pixel
    sub t3, a2, a4
    addi a6, a2, -1
    addi a2, a2, 1
    lbu t3, (t3)            # bottom-left pixel
1:
    mv t6, a3

    lbu a7, (a6)            # left pixel for this row
    lbu t4, (t0)            # vertical weight for this row
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)         # horizontal weights
    add t1, t1, t5
    vle8.v v4, (a2)         # top pixels
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3

    vadd.vx v4, v8, a5      # vertical blend: top * w_v + bottom_left * (256 - w_v)
    vsetvli zero, zero, e8, m1, ta, ma
    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12  # horizontal blend: left * w_h + top_right * (256 - w_h)
    vsetvli zero, zero, e16, m2, ta, ma
    vwaddu.vv v12, v4, v8

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vnclipu.wi v2, v12, 9   # (vertical + horizontal + 256) >> 9
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

# Smooth-vertical prediction: blend the top row with the bottom-left pixel.
# a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t2, a2, a3
    add t0, t0, a4          # &dav1d_sm_weights[height]
    sub t3, a2, a4
    addi a2, a2, 1
    lbu t3, (t3)            # bottom-left pixel
1:
    mv t6, a3

    lbu t4, (t0)            # vertical weight for this row
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v4, (a2)         # top pixels
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3
    vwaddu.vx v4, v8, a5    # top * w_v + bottom_left * (256 - w_v)

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v4, 8
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    bnez a4, 1b

    ret
endfunc

# Smooth-horizontal prediction: blend the left pixel of each row with the
# top-right pixel.
# a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3          # &dav1d_sm_weights[width]
    add t2, a2, a3
    lbu t2, (t2)            # top-right pixel
    addi a6, a2, -1
1:
    mv t6, a3

    lbu a7, (a6)            # left pixel for this row
2:
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)         # horizontal weights
    add t1, t1, t5

    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12  # left * w_h + top_right * (256 - w_h)

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v8, 8
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    addi a4, a4, -1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

# Palette prediction.
# a0 = dst, a1 = stride, a2 = palette (8 entries), a3 = packed indices
# (two per byte), a4 = width, a5 = height
function pal_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    vsetivli t5, 8, e8, m1, ta, ma
    vle8.v v30, (a2)        # load the 8-entry palette
    li t0, 2
    srli t1, a4, 1          # index bytes per row: two pixels packed per byte
1:
    mv t4, a4
2:
    vsetvli t5, t1, e8, m1, ta, ma
    vle8.v v0, (a3)
    add a3, a3, t5
    vsrl.vi v2, v0, 4       # high nibble: odd pixels
    sub t6, a4, t4
    vand.vi v1, v0, 7       # low bits: even pixels
    add t6, a0, t6
    vrgather.vv v3, v30, v1 # palette lookup
    addi t2, t6, 1
    vrgather.vv v4, v30, v2
    slli t5, t5, 1
    vsse8.v v3, (t6), t0    # interleave with stride-2 stores
    sub t4, t4, t5
    vsse8.v v4, (t2), t0

    bnez t4, 2b
    addi a5, a5, -1
    add a0, a0, a1
    bnez a5, 1b
    ret
endfunc