/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

# dc_gen: average of the top row and the left column.
# a0 = topleft, a1 = width, a2 = height; returns the DC value in a0, return address in t0.
function dc_gen_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav1d_dc_gen_16bpc_rvv
        add t1, a1, a2
        srli t5, t1, 1          # rounding term: (width + height) >> 1
        mv t1, a1
        addi t2, a0, 2          # top row starts one pixel after topleft
        vsetvli zero, t1, e32, m8, ta, ma
        vmv.v.x v0, zero
1:      # accumulate the top row into v0 (widened to 32 bit)
        vsetvli t3, t1, e16, m4, tu, ma
        vle16.v v8, (t2)
        vwaddu.wv v0, v0, v8
        sub t1, t1, t3

        sh1add t2, t3, t2
        bnez t1, 1b

        mv t1, a2
        mv t2, a0
        vsetvli zero, t1, e32, m8, ta, ma
        vmv.v.x v16, zero
2:      # accumulate the left column into v16, walking downwards from topleft
        vsetvli t3, t1, e16, m4, tu, ma
        sub t1, t1, t3
        slli t3, t3, 1
        sub t2, t2, t3
        vle16.v v8, (t2)
        vwaddu.wv v16, v16, v8

        bnez t1, 2b

        vsetvli zero, a1, e32, m8, ta, ma
        vmv.s.x v24, t5
        vmv.s.x v25, zero
        vredsum.vs v8, v0, v24
        vsetvli zero, a2, e32, m8, ta, ma
        vredsum.vs v0, v16, v25
        vmv.x.s t5, v8
        vmv.x.s t1, v0
        add t5, t5, t1

        add t1, a1, a2
        ctz t1, t1

        srl a0, t5, t1

        # for non-square blocks width + height is 3 or 5 times a power of two,
        # so the shift above is not a full division; finish it with a Q17
        # fixed-point multiply by 1/3 or 1/5
        beq a1, a2, 5f
        slli t1, a1, 1
        sltu t2, t1, a2
        slli t3, a2, 1
        sltu t1, t3, a1
        or t1, t1, t2
        bnez t1, 3f

        li t1, 0xAAAB           # 0xAAAB / 2^17 ~= 1/3 (2:1 and 1:2 blocks)
        j 4f
3:
        li t1, 0x6667           # 0x6667 / 2^17 ~= 1/5 (4:1 and 1:4 blocks)
4:
        mul a0, a0, t1
        li t1, 17
        srl a0, a0, t1
5:
        jr t0
endfunc

# dc_gen_top: average of the top row only.
# a0 = topleft, a1 = width; returns the DC value in a0, return address in t0.
function dc_gen_top_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav1d_dc_gen_top_16bpc_rvv
        mv t1, a1
        srli t5, a1, 1          # rounding term: width >> 1
        addi a0, a0, 2
        vsetvli zero, t1, e32, m2, ta, ma
        vmv.v.x v0, zero
1:
        vsetvli t3, t1, e16, m1, tu, ma
        vle16.v v4, (a0)
        vwaddu.wv v0, v0, v4

        sh1add a0, t3, a0
        sub t1, t1, t3
        bnez t1, 1b

        j dc_gen_sum_up_16bpc_rvv
endfunc

# dc_gen_left: average of the left column only.
# a0 = topleft, a1 = height; returns the DC value in a0, return address in t0.
function dc_gen_left_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav1d_dc_gen_left_16bpc_rvv
        mv t1, a1
        srli t5, a1, 1          # rounding term: height >> 1
        vsetvli zero, t1, e32, m2, ta, ma
        vmv.v.x v0, zero
1:
        vsetvli t3, t1, e16, m1, tu, ma
        sub t1, t1, t3
        slli t3, t3, 1
        sub a0, a0, t3
        vle16.v v4, (a0)
        vwaddu.wv v0, v0, v4

        bnez t1, 1b

        j dc_gen_sum_up_16bpc_rvv
endfunc

# shared tail for dc_gen_top/dc_gen_left: reduce the partial sums in v0,
# add the rounding term in t5 and divide by the power-of-two count in a1.
function dc_gen_sum_up_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav1d_dc_gen_sum_up_16bpc_rvv

        vsetvli zero, a1, e32, m2, ta, ma
        vmv.s.x v4, t5
        vredsum.vs v8, v0, v4
        vmv.x.s t5, v8

        ctz t1, a1

        srl a0, t5, t1
        jr t0
endfunc

# cfl_pred: dst[x] = clip(dc + sign(alpha * ac[x]) * ((|alpha * ac[x]| + 32) >> 6),
#                         0, bitdepth_max)
# a0 = dst, a1 = stride, a2 = width, a3 = height, a4 = dc, a5 = ac,
# a6 = alpha, a7 = bitdepth_max
function cfl_pred_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero         # fixed-point rounding mode: round-to-nearest-up
1:
        li t2, 0
        mv t3, a2
2:
        vsetvli t0, t3, e16, m2, ta, ma
        sh1add t4, t2, a0
        vle16.v v0, (a5)
        sh1add a5, t0, a5

        vwmul.vx v4, v0, a6     # alpha * ac
        vsetvli zero, zero, e32, m4, ta, mu
        vneg.v v8, v4
        vmslt.vx v0, v4, x0
        vmax.vv v12, v8, v4     # |alpha * ac|
        vssra.vi v16, v12, 6    # rounded >> 6
        vneg.v v16, v16, v0.t   # restore the sign
        vadd.vx v20, v16, a4    # + dc
        vmax.vx v0, v20, zero
        vmin.vx v0, v0, a7      # clamp to [0, bitdepth_max]
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v4, v0, 0
        vse16.v v4, (t4)
        add t2, t0, t2
        sub t3, t3, t0
        bnez t3, 2b
        addi a3, a3, -1
        add a0, a0, a1

        bnez a3, 1b
        ret
endfunc

function ipred_cfl_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a3 # width
        mv a2, a4 # height
        jal t0, dc_gen_16bpc_rvv
        mv a2, a3 # width
        mv a3, a4 # height
        mv a4, a0 # dc
        mv a0, t6 # dst
        mv a1, t4 # stride
        j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_128_16bpc_rvv, export=1, ext="v,zba"
        # dc = (bitdepth_max + 1) >> 1, then just rearrange registers
        mv a2, a3
        mv a3, a4
        addi a4, a7, 1
        srli a4, a4, 1

        j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_top_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a3 # width
        jal t0, dc_gen_top_16bpc_rvv
        mv a3, a4 # height
        mv a4, a0 # dc
        mv a0, t6 # dst
        mv a2, a1 # width
        mv a1, t4 # stride
        j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_left_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a4 # height
        mv a2, a3 # width
        jal t0, dc_gen_left_16bpc_rvv
        mv a3, a4 # height
        mv a4, a0 # dc
        mv a1, t4 # stride
        mv a0, t6 # dst
        j cfl_pred_16bpc_rvv
endfunc

# ipred_paeth: a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_paeth_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        li t0, 0
        mv t3, a2
        lhu t1, (a2)            # t1 = topleft pixel
        addi a6, a2, -2         # a6 = left column, walked downwards
        addi a2, a2, 2          # a2 = top row
1:
        lhu t2, (a6)            # t2 = left[y]
        mv t3, a3
2:
        sub t5, a3, t3
        sh1add t5, t5, a2
        vsetvli t6, t3, e16, m2, ta, ma
        vle16.v v2, (t5)        # v2 = top[x]
        vwaddu.vx v4, v2, t2

        vsetvli zero, zero, e32, m4, ta, mu
        vsub.vx v8, v4, t1      # base = left + top - topleft
        vzext.vf2 v24, v2
        # absolute differences: |base - topleft| (v12), |base - left| (v16),
        # |base - top| (v20)
        vsub.vx v12, v8, t1
        vmslt.vx v0, v12, zero
        vneg.v v12, v12, v0.t
        vsub.vx v16, v8, t2
        vmslt.vx v0, v16, zero
        vneg.v v16, v16, v0.t
        vsub.vv v20, v8, v24
        vmslt.vx v0, v20, zero
        vneg.v v20, v20, v0.t

        sub t5, a3, t3
        # pick whichever of left, top and topleft is closest to base
        vmsleu.vv v4, v16, v20
        vmsleu.vv v5, v16, v12
        vmsgtu.vv v0, v20, v12
        vmand.mm v6, v4, v5

        vsetvli zero, zero, e16, m2, ta, ma
        vmerge.vxm v8, v2, t1, v0
        vmmv.m v0, v6
        sh1add t5, t5, a0
        sub t3, t3, t6
        vmerge.vxm v4, v8, t2, v0

        vse16.v v4, (t5)

        bnez t3, 2b

        addi a4, a4, -1
        addi a6, a6, -2
        add a0, a0, a1
        bnez a4, 1b
        ret
endfunc

# ipred_smooth: a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        la t0, dav1d_sm_weights
        add t1, t0, a3          # t1 = horizontal weights (indexed by x)
        sh1add t2, a3, a2
        slli t3, a4, 1
        add t0, t0, a4          # t0 = vertical weights (indexed by y)
        lhu t2, (t2)            # t2 = right pixel: topleft[width]
        sub t3, a2, t3
        addi a6, a2, -2
        addi a2, a2, 2
        lhu t3, (t3)            # t3 = bottom pixel: topleft[-height]
1:
        mv t6, a3

        lhu a7, (a6)            # a7 = left[y]
        lbu t4, (t0)            # t4 = vertical weight for this row
2:
        li a5, 256
        vsetvli t5, t6, e16, m2, ta, ma
        vle8.v v2, (t1)         # v2 = horizontal weights for these columns
        add t1, t1, t5
        vle16.v v4, (a2)        # v4 = top[x]
        sh1add a2, t5, a2
        sub a5, a5, t4

        vwmul.vx v8, v4, t4     # vertical blend: top * w_ver ...
        mul a5, a5, t3

        vsetvli zero, zero, e32, m4, ta, ma
        vadd.vx v4, v8, a5      # ... + bottom * (256 - w_ver)

        li a5, 256
        vzext.vf4 v12, v2
        vmul.vx v8, v12, a7     # horizontal blend: left * w_hor ...

        vrsub.vx v12, v12, a5
        vmacc.vx v8, t2, v12    # ... + right * (256 - w_hor)
        vadd.vv v12, v4, v8
        vsetvli zero, zero, e32, m4, ta, ma

        sub a5, a3, t6
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v12, 9   # (sum + 256) >> 9: average of the two blends
        vse16.v v2, (a5)

        bnez t6, 2b

        sub t1, t1, a3
        slli t6, a3, 1
        add a0, a0, a1
        sub a2, a2, t6
        addi a4, a4, -1
        addi t0, t0, 1
        addi a6, a6, -2
        bnez a4, 1b

        ret
endfunc

# ipred_smooth_v: a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_v_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        la t0, dav1d_sm_weights
        slli t3, a4, 1
        add t0, t0, a4          # t0 = vertical weights (indexed by y)
        sub t3, a2, t3
        addi a2, a2, 2
        lhu t3, (t3)            # t3 = bottom pixel: topleft[-height]
1:
        mv t6, a3

        lbu t4, (t0)            # t4 = vertical weight for this row
2:
        li a5, 256
        vsetvli t5, t6, e16, m2, ta, ma
        vle16.v v4, (a2)        # v4 = top[x]
        sh1add a2, t5, a2
        sub a5, a5, t4

        vwmul.vx v8, v4, t4     # top * w_ver ...
        mul a5, a5, t3

        vsetvli zero, zero, e32, m4, ta, ma
        vadd.vx v4, v8, a5      # ... + bottom * (256 - w_ver)
        vsetvli zero, zero, e32, m4, ta, ma

        sub a5, a3, t6
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v4, 8    # (sum + 128) >> 8
        vse16.v v2, (a5)

        bnez t6, 2b

        slli t6, a3, 1
        add a0, a0, a1
        sub a2, a2, t6
        addi a4, a4, -1
        addi t0, t0, 1
        bnez a4, 1b

        ret
endfunc

# ipred_smooth_h: a0 = dst, a1 = stride, a2 = topleft, a3 = width, a4 = height
function ipred_smooth_h_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        la t0, dav1d_sm_weights
        add t1, t0, a3          # t1 = horizontal weights (indexed by x)
        sh1add t2, a3, a2
        lhu t2, (t2)            # t2 = right pixel: topleft[width]
        addi a6, a2, -2
1:
        mv t6, a3

        lhu a7, (a6)            # a7 = left[y]
2:
        vsetvli t5, t6, e16, m2, ta, ma
        vle8.v v2, (t1)         # v2 = horizontal weights for these columns
        add t1, t1, t5

        li a5, 256
        vsetvli zero, zero, e32, m4, ta, ma
        vzext.vf4 v12, v2
        vmul.vx v8, v12, a7     # left * w_hor ...

        vrsub.vx v12, v12, a5
        vmacc.vx v8, t2, v12    # ... + right * (256 - w_hor)

        sub a5, a3, t6
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v8, 8    # (sum + 128) >> 8
        vse16.v v2, (a5)

        bnez t6, 2b

        sub t1, t1, a3
        add a0, a0, a1
        addi a4, a4, -1
        addi a6, a6, -2
        bnez a4, 1b

        ret
endfunc

# pal_pred: a0 = dst, a1 = stride, a2 = palette (8 x u16), a3 = packed indices
# (two 4-bit indices per byte), a4 = width, a5 = height
function pal_pred_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        vsetivli t5, 8, e16, m1, ta, ma
        vle16.v v30, (a2)       # keep the palette in v30
        li t0, 4                # store stride: 4 bytes = every other 16-bit pixel
        srli t1, a4, 1          # index bytes per row
        li t2, 1
1:
        mv t4, a4
2:
        vsetvli t5, t1, e8, mf2, ta, ma
        vle8.v v0, (a3)
        add a3, a3, t5
        vand.vi v1, v0, 7       # even pixels: index in the low half of each byte
        sub t6, a4, t4
        vsrl.vi v2, v0, 4       # odd pixels: index in the high half of each byte
        vwmul.vx v4, v1, t2     # widen the indices to e16 for vrgather
        vwmul.vx v6, v2, t2
        vsetvli zero, zero, e16, m1, ta, ma
        sh1add t6, t6, a0
        vrgather.vv v8, v30, v4
        addi t3, t6, 2
        vrgather.vv v10, v30, v6
        slli t5, t5, 1
        vsse16.v v8, (t6), t0   # interleave even/odd pixels with strided stores
        vsse16.v v10, (t3), t0

        sub t4, t4, t5
        bnez t4, 2b
        add a0, a0, a1
        addi a5, a5, -1
        bnez a5, 1b
        ret
endfunc