1/* 2 * Copyright © 2023, VideoLAN and dav1d authors 3 * Copyright © 2023, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "src/loongarch/loongson_asm.S" 29 30/* 31static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, 32 const pixel *src, const ptrdiff_t src_stride, 33 const int16_t *const abcd, int mx, int my 34 HIGHBD_DECL_SUFFIX) 35*/ 36.macro vld_filter_row dst, src, inc 37 addi.w t3, \src, 512 38 srai.w t3, t3, 10 39 add.w \src, \src, \inc 40 addi.w t3, t3, 64 41 slli.w t3, t3, 3 42 fldx.d \dst, t4, t3 43.endm 44 45.macro warp_filter_horz_lsx 46 addi.w t5, a5, 0 47 vld vr10, a2, 0 48 add.d a2, a2, a3 49 50 vld_filter_row f0, t5, t0 51 vld_filter_row f1, t5, t0 52 vld_filter_row f2, t5, t0 53 vld_filter_row f3, t5, t0 54 vld_filter_row f4, t5, t0 55 vld_filter_row f5, t5, t0 56 vld_filter_row f6, t5, t0 57 vld_filter_row f7, t5, t0 58 59 vxor.v vr10, vr10, vr20 60 61 vbsrl.v vr8, vr10, 1 62 vbsrl.v vr9, vr10, 2 63 vilvl.d vr8, vr8, vr10 64 vilvl.d vr0, vr1, vr0 65 vmulwev.h.b vr11, vr8, vr0 66 vmulwod.h.b vr12, vr8, vr0 67 vbsrl.v vr8, vr10, 3 68 vbsrl.v vr19, vr10, 4 69 vilvl.d vr8, vr8, vr9 70 vilvl.d vr2, vr3, vr2 71 vmulwev.h.b vr13, vr8, vr2 72 vmulwod.h.b vr14, vr8, vr2 73 vbsrl.v vr8, vr10, 5 74 vbsrl.v vr9, vr10, 6 75 vilvl.d vr8, vr8, vr19 76 vilvl.d vr4, vr5, vr4 77 vmulwev.h.b vr15, vr8, vr4 78 vmulwod.h.b vr16, vr8, vr4 79 vbsrl.v vr8, vr10, 7 80 vilvl.d vr8, vr8, vr9 81 vilvl.d vr6, vr7, vr6 82 vmulwev.h.b vr17, vr8, vr6 83 vmulwod.h.b vr18, vr8, vr6 84 85 vadd.h vr11, vr11, vr12 86 vadd.h vr13, vr13, vr14 87 vadd.h vr15, vr15, vr16 88 vadd.h vr17, vr17, vr18 89 vpickev.h vr12, vr13, vr11 90 vpickod.h vr14, vr13, vr11 91 vpickev.h vr16, vr17, vr15 92 vpickod.h vr18, vr17, vr15 93 vadd.h vr11, vr12, vr14 94 vadd.h vr15, vr16, vr18 95 vpickev.h vr12, vr15, vr11 96 vpickod.h vr14, vr15, vr11 97 vadd.h vr11, vr12, vr14 98 99 add.d a5, a5, t1 100.endm 101 102.macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7 103 vilvl.b \in0, \in1, \in0 104 vilvl.b \in2, \in3, \in2 105 vilvl.b \in4, \in5, \in4 106 vilvl.b \in6, \in7, \in6 107 108 vpackev.h \in1, \in2, \in0 109 vpackod.h \in3, \in2, \in0 110 
vpackev.h \in5, \in6, \in4 111 vpackod.h \in7, \in6, \in4 112 113 vpackev.w \in0, \in5, \in1 114 vpackod.w \in2, \in5, \in1 115 vpackev.w \in1, \in7, \in3 116 vpackod.w \in3, \in7, \in3 117 118 vexth.h.b \in4, \in0 119 vsllwil.h.b \in0, \in0, 0 120 vexth.h.b \in5, \in1 121 vsllwil.h.b \in1, \in1, 0 122 vexth.h.b \in6, \in2 123 vsllwil.h.b \in2, \in2, 0 124 vexth.h.b \in7, \in3 125 vsllwil.h.b \in3, \in3, 0 126.endm 127 128.macro warp t, shift 129function warp_affine_8x8\t\()_8bpc_lsx 130 addi.d sp, sp, -64 131 fst.d f24, sp, 0 132 fst.d f25, sp, 8 133 fst.d f26, sp, 16 134 fst.d f27, sp, 24 135 fst.d f28, sp, 32 136 fst.d f29, sp, 40 137 fst.d f30, sp, 48 138 fst.d f31, sp, 56 139 140 ld.h t0, a4, 0 141 ld.h t1, a4, 2 142 ld.h t2, a4, 4 143 ld.h a4, a4, 6 144 145 li.d t7, 8 146 alsl.w t3, a3, a3, 1 147 sub.d a2, a2, t3 148 addi.d a2, a2, -3 149 la.local t4, dav1d_mc_warp_filter 150 151.ifnb \t 152 slli.d a1, a1, 1 153.endif 154 155 li.w t3, 128 156 vreplgr2vr.b vr20, t3 157.ifb \t 158 vreplgr2vr.h vr21, t3 159.else 160 li.w t3, 2048 161 vreplgr2vr.h vr21, t3 162.endif 163 warp_filter_horz_lsx 164 vsrari.h vr24, vr11, 3 165 warp_filter_horz_lsx 166 vsrari.h vr25, vr11, 3 167 warp_filter_horz_lsx 168 vsrari.h vr26, vr11, 3 169 warp_filter_horz_lsx 170 vsrari.h vr27, vr11, 3 171 warp_filter_horz_lsx 172 vsrari.h vr28, vr11, 3 173 warp_filter_horz_lsx 174 vsrari.h vr29, vr11, 3 175 warp_filter_horz_lsx 176 vsrari.h vr30, vr11, 3 177 1781: 179 addi.d t6, a6, 0 180 warp_filter_horz_lsx 181 vsrari.h vr31, vr11, 3 182 183 vld_filter_row f0, t6, t2 184 vld_filter_row f1, t6, t2 185 vld_filter_row f2, t6, t2 186 vld_filter_row f3, t6, t2 187 vld_filter_row f4, t6, t2 188 vld_filter_row f5, t6, t2 189 vld_filter_row f6, t6, t2 190 vld_filter_row f7, t6, t2 191 192 transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 193 194 vmulwev.w.h vr16, vr24, vr0 195 vmulwod.w.h vr17, vr24, vr0 196 vmaddwev.w.h vr16, vr25, vr1 197 vmaddwod.w.h vr17, vr25, vr1 198 vmaddwev.w.h vr16, vr26, vr2 199 vmaddwod.w.h vr17, vr26, vr2 200 vmaddwev.w.h vr16, vr27, vr3 201 vmaddwod.w.h vr17, vr27, vr3 202 vmaddwev.w.h vr16, vr28, vr4 203 vmaddwod.w.h vr17, vr28, vr4 204 vmaddwev.w.h vr16, vr29, vr5 205 vmaddwod.w.h vr17, vr29, vr5 206 vmaddwev.w.h vr16, vr30, vr6 207 vmaddwod.w.h vr17, vr30, vr6 208 vmaddwev.w.h vr16, vr31, vr7 209 vmaddwod.w.h vr17, vr31, vr7 210 211 vssrarni.h.w vr16, vr16, \shift 212 vssrarni.h.w vr17, vr17, \shift 213 vilvl.h vr16, vr17, vr16 214 vadd.h vr16, vr16, vr21 215 216 vor.v vr24, vr25, vr25 217 vor.v vr25, vr26, vr26 218 vor.v vr26, vr27, vr27 219 vor.v vr27, vr28, vr28 220 vor.v vr28, vr29, vr29 221 vor.v vr29, vr30, vr30 222 vor.v vr30, vr31, vr31 223 224.ifb \t 225 vssrarni.bu.h vr16, vr16, 0 226.endif 227 228 addi.d t7, t7, -1 229.ifnb \t 230 vst vr16, a0, 0 231.else 232 vstelm.d vr16, a0, 0, 0 233.endif 234 add.d a0, a1, a0 235 236 add.d a6, a6, a4 237 blt zero, t7, 1b 238 239 fld.d f24, sp, 0 240 fld.d f25, sp, 8 241 fld.d f26, sp, 16 242 fld.d f27, sp, 24 243 fld.d f28, sp, 32 244 fld.d f29, sp, 40 245 fld.d f30, sp, 48 246 fld.d f31, sp, 56 247 addi.d sp, sp, 64 248endfunc 249.endm 250 251warp , 11 252warp t, 7 253 254.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 255 xvshuf.b xr2, \in0, \in0, \in2 256 257 addi.w t4, \in1, 512 258 srai.w t4, t4, 10 259 addi.w t4, t4, 64 260 slli.w t4, t4, 3 261 vldx vr3, t5, t4 262 add.w t3, t3, t0 // tmx += abcd[0] 263 264 addi.w t4, t3, 512 265 srai.w t4, t4, 10 266 addi.w t4, t4, 64 267 slli.w t4, t4, 3 268 vldx vr4, 
t5, t4 269 add.w t3, t3, t0 // tmx += abcd[0] 270 271 addi.w t4, t3, 512 272 srai.w t4, t4, 10 273 addi.w t4, t4, 64 274 slli.w t4, t4, 3 275 vldx vr5, t5, t4 276 add.w t3, t3, t0 // tmx += abcd[0] 277 278 addi.w t4, t3, 512 279 srai.w t4, t4, 10 280 addi.w t4, t4, 64 281 slli.w t4, t4, 3 282 vldx vr6, t5, t4 283 add.w t3, t3, t0 // tmx += abcd[0] 284 285 xvinsve0.d xr3, xr5, 1 286 xvinsve0.d xr3, xr4, 2 287 xvinsve0.d xr3, xr6, 3 288 289 xvmulwev.h.bu.b xr4, xr2, xr3 290 xvmulwod.h.bu.b xr5, xr2, xr3 291 xvilvl.d xr2, xr5, xr4 292 xvilvh.d xr3, xr5, xr4 293 xvhaddw.w.h xr2, xr2, xr2 294 xvhaddw.w.h xr3, xr3, xr3 295 xvhaddw.d.w xr2, xr2, xr2 296 xvhaddw.d.w xr3, xr3, xr3 297 xvhaddw.q.d xr2, xr2, xr2 298 xvhaddw.q.d xr3, xr3, xr3 299 300 xvextrins.w \out0, xr2, \out1 301 xvextrins.w \out2, xr3, \out3 302.endm 303 304.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 305 add.w \in0, \in0, \in1 306 addi.w t6, \in0, 512 307 srai.w t6, t6, 10 308 addi.w t6, t6, 64 309 slli.w t6, t6, 3 310 fldx.d f1, t5, t6 311 312 add.w t2, t2, t7 313 addi.w t6, t2, 512 314 srai.w t6, t6, 10 315 addi.w t6, t6, 64 316 slli.w t6, t6, 3 317 fldx.d f2, t5, t6 318 319 vilvl.d vr0, vr2, vr1 320 vext2xv.h.b xr0, xr0 321 xvmulwev.w.h xr3, \in2, xr0 322 xvmaddwod.w.h xr3, \in2, xr0 323 xvhaddw.d.w xr3, xr3, xr3 324 xvhaddw.q.d xr3, xr3, xr3 325 xvextrins.w \out0, xr3, \out1 326.endm 327 328const shuf0 329.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 330.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 331endconst 332 333const warp_sh 334.rept 2 335.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 336.endr 337.rept 2 338.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 339.endr 340endconst 341 342.macro warp_lasx t, shift 343function warp_affine_8x8\t\()_8bpc_lasx 344 addi.d sp, sp, -16 345 ld.h t0, a4, 0 // abcd[0] 346 ld.h t1, a4, 2 // abcd[1] 347 fst.d f24, sp, 0 348 fst.d f25, sp, 8 349 350 alsl.w t2, a3, a3, 1 351 addi.w t3, a5, 0 352 la.local t4, warp_sh 353 la.local t5, dav1d_mc_warp_filter 354 sub.d a2, a2, t2 355 addi.d a2, a2, -3 356 vld vr0, a2, 0 357 xvld xr24, t4, 0 358 xvld xr25, t4, 32 359 la.local t2, shuf0 360 xvld xr1, t2, 0 361 xvpermi.q xr0, xr0, 0x00 362 xvaddi.bu xr9, xr1, 4 363 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 364 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 365 366 add.w a5, a5, t1 367 or t3, a5, a5 368 add.d a2, a2, a3 369 vld vr0, a2, 0 370 xvpermi.q xr0, xr0, 0x00 371 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 372 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 373 374 add.w a5, a5, t1 375 or t3, a5, a5 376 add.d a2, a2, a3 377 vld vr0, a2, 0 378 xvpermi.q xr0, xr0, 0x00 379 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 380 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 381 382 add.w a5, a5, t1 383 or t3, a5, a5 384 add.d a2, a2, a3 385 vld vr0, a2, 0 386 xvpermi.q xr0, xr0, 0x00 387 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 388 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 389 390 add.w a5, a5, t1 391 or t3, a5, a5 392 add.d a2, a2, a3 393 vld vr0, a2, 0 394 xvpermi.q xr0, xr0, 0x00 395 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 396 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00 397 398 add.w a5, a5, t1 399 or t3, a5, a5 400 add.d a2, a2, a3 401 vld vr0, a2, 0 402 xvpermi.q xr0, xr0, 0x00 403 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 404 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 
0x10 405 406 add.w a5, a5, t1 407 or t3, a5, a5 408 add.d a2, a2, a3 409 vld vr0, a2, 0 410 xvpermi.q xr0, xr0, 0x00 411 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 412 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 413 414 add.w a5, a5, t1 415 or t3, a5, a5 416 add.d a2, a2, a3 417 vld vr0, a2, 0 418 xvpermi.q xr0, xr0, 0x00 419 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 420 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 421 422 xvsrarni.h.w xr12, xr7, 3 423 xvsrarni.h.w xr13, xr8, 3 424 xvsrarni.h.w xr14, xr10, 3 425 xvsrarni.h.w xr15, xr11, 3 426 427 add.w a5, a5, t1 428 or t3, a5, a5 429 add.d a2, a2, a3 430 vld vr0, a2, 0 431 xvpermi.q xr0, xr0, 0x00 432 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 433 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 434 435 add.w a5, a5, t1 436 or t3, a5, a5 437 add.d a2, a2, a3 438 vld vr0, a2, 0 439 xvpermi.q xr0, xr0, 0x00 440 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 441 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 442 443 add.w a5, a5, t1 444 or t3, a5, a5 445 add.d a2, a2, a3 446 vld vr0, a2, 0 447 xvpermi.q xr0, xr0, 0x00 448 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 449 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 450 451 add.w a5, a5, t1 452 or t3, a5, a5 453 add.d a2, a2, a3 454 vld vr0, a2, 0 455 xvpermi.q xr0, xr0, 0x00 456 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 457 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 458 459 add.w a5, a5, t1 460 or t3, a5, a5 461 add.d a2, a2, a3 462 vld vr0, a2, 0 463 xvpermi.q xr0, xr0, 0x00 464 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 465 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 466 467 add.w a5, a5, t1 468 or t3, a5, a5 469 add.d a2, a2, a3 470 vld vr0, a2, 0 471 xvpermi.q xr0, xr0, 0x00 472 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 473 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 474 475 add.w a5, a5, t1 476 or t3, a5, a5 477 add.d a2, a2, a3 478 vld vr0, a2, 0 479 xvpermi.q xr0, xr0, 0x00 480 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 481 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 482 483 xvsrarni.h.w xr16, xr7, 3 484 xvsrarni.h.w xr17, xr8, 3 485 xvsrarni.h.w xr18, xr10, 3 486 xvsrarni.h.w xr19, xr11, 3 487 488 addi.w t2, a6, 0 // my 489 ld.h t7, a4, 4 // abcd[2] 490 ld.h t8, a4, 6 // abcd[3] 491 492.ifnb \t 493 slli.d a1, a1, 1 494.endif 495 496 // y = 0 497 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 498 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 499 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 500 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 501 502 xvshuf.b xr12, xr16, xr12, xr24 503 xvshuf.b xr13, xr17, xr13, xr24 504 xvshuf.b xr14, xr18, xr14, xr24 505 xvshuf.b xr15, xr19, xr15, xr24 506 xvextrins.h xr24, xr25, 0x70 507 508 add.w a6, a6, t8 509 addi.w t2, a6, 0 510 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 511 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 512 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 513 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 514 515.ifnb \t 516 xvssrarni.h.w xr21, xr20, \shift 517 xvpermi.q xr22, xr21, 0x01 518 vilvl.h vr23, vr22, vr21 519 vilvh.h vr21, vr22, vr21 520 vst vr23, a0, 0 521 vstx vr21, a0, a1 522.else 523 xvssrarni.hu.w xr21, xr20, \shift 524 xvssrlni.bu.h xr22, xr21, 0 525 xvpermi.q xr23, xr22, 0x01 526 vilvl.b vr21, vr23, vr22 527 fst.d f21, a0, 0 528 
add.d a0, a0, a1 529 vstelm.d vr21, a0, 0, 1 530.endif 531 532 xvaddi.bu xr25, xr25, 2 533 xvshuf.b xr12, xr16, xr12, xr24 534 xvshuf.b xr13, xr17, xr13, xr24 535 xvshuf.b xr14, xr18, xr14, xr24 536 xvshuf.b xr15, xr19, xr15, xr24 537 xvextrins.h xr24, xr25, 0x70 538 539 add.w a6, a6, t8 540 addi.w t2, a6, 0 541 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 542 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 543 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 544 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 545 546 xvaddi.bu xr25, xr25, 2 547 xvshuf.b xr12, xr16, xr12, xr24 548 xvshuf.b xr13, xr17, xr13, xr24 549 xvshuf.b xr14, xr18, xr14, xr24 550 xvshuf.b xr15, xr19, xr15, xr24 551 xvextrins.h xr24, xr25, 0x70 552 553 add.w a6, a6, t8 554 addi.w t2, a6, 0 555 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 556 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 557 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 558 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 559 560.ifnb \t 561 xvssrarni.h.w xr21, xr20, \shift 562 alsl.d a0, a1, a0, 1 563 xvpermi.q xr22, xr21, 0x01 564 vilvl.h vr23, vr22, vr21 565 vilvh.h vr21, vr22, vr21 566 vst vr23, a0, 0 567 vstx vr21, a0, a1 568.else 569 xvssrarni.hu.w xr21, xr20, 11 570 xvssrlni.bu.h xr22, xr21, 0 571 xvpermi.q xr23, xr22, 0x01 572 vilvl.b vr21, vr23, vr22 573 add.d a0, a0, a1 574 fst.d f21, a0, 0 575 add.d a0, a0, a1 576 vstelm.d vr21, a0, 0, 1 577.endif 578 579 xvaddi.bu xr25, xr25, 2 580 xvshuf.b xr12, xr16, xr12, xr24 581 xvshuf.b xr13, xr17, xr13, xr24 582 xvshuf.b xr14, xr18, xr14, xr24 583 xvshuf.b xr15, xr19, xr15, xr24 584 xvextrins.h xr24, xr25, 0x70 585 586 add.w a6, a6, t8 587 addi.w t2, a6, 0 588 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 589 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 590 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 591 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 592 593 xvaddi.bu xr25, xr25, 2 594 xvshuf.b xr12, xr16, xr12, xr24 595 xvshuf.b xr13, xr17, xr13, xr24 596 xvshuf.b xr14, xr18, xr14, xr24 597 xvshuf.b xr15, xr19, xr15, xr24 598 xvextrins.h xr24, xr25, 0x70 599 600 add.w a6, a6, t8 601 addi.w t2, a6, 0 602 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 603 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 604 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 605 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 606 607.ifnb \t 608 xvssrarni.h.w xr21, xr20, \shift 609 alsl.d a0, a1, a0, 1 610 xvpermi.q xr22, xr21, 0x01 611 vilvl.h vr23, vr22, vr21 612 vilvh.h vr21, vr22, vr21 613 vst vr23, a0, 0 614 vstx vr21, a0, a1 615.else 616 xvssrarni.hu.w xr21, xr20, 11 617 xvssrlni.bu.h xr22, xr21, 0 618 xvpermi.q xr23, xr22, 0x01 619 vilvl.b vr21, vr23, vr22 620 add.d a0, a0, a1 621 fst.d f21, a0, 0 622 add.d a0, a0, a1 623 vstelm.d vr21, a0, 0, 1 624.endif 625 626 xvaddi.bu xr25, xr25, 2 627 xvshuf.b xr12, xr16, xr12, xr24 628 xvshuf.b xr13, xr17, xr13, xr24 629 xvshuf.b xr14, xr18, xr14, xr24 630 xvshuf.b xr15, xr19, xr15, xr24 631 xvextrins.h xr24, xr25, 0x70 632 633 add.w a6, a6, t8 634 addi.w t2, a6, 0 635 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 636 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 637 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 638 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 639 640 xvshuf.b xr12, xr16, xr12, xr24 641 xvshuf.b xr13, xr17, xr13, xr24 642 xvshuf.b xr14, xr18, xr14, xr24 643 xvshuf.b xr15, xr19, xr15, xr24 644 645 add.w a6, a6, t8 646 addi.w t2, a6, 0 647 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 648 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 649 
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 650 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 651 652.ifnb \t 653 xvssrarni.h.w xr21, xr20, \shift 654 alsl.d a0, a1, a0, 1 655 xvpermi.q xr22, xr21, 0x01 656 vilvl.h vr23, vr22, vr21 657 vilvh.h vr21, vr22, vr21 658 vst vr23, a0, 0 659 vstx vr21, a0, a1 660.else 661 xvssrarni.hu.w xr21, xr20, 11 662 xvssrlni.bu.h xr22, xr21, 0 663 xvpermi.q xr23, xr22, 0x01 664 vilvl.b vr21, vr23, vr22 665 add.d a0, a0, a1 666 fst.d f21, a0, 0 667 add.d a0, a0, a1 668 vstelm.d vr21, a0, 0, 1 669.endif 670 fld.d f24, sp, 0 671 fld.d f25, sp, 8 672 addi.d sp, sp, 16 673endfunc 674.endm 675 676warp_lasx , 11 677warp_lasx t, 7 678 679/* 680static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, 681 const int16_t *tmp1, const int16_t *tmp2, 682 const int w, int h, 683 const int weight HIGHBD_DECL_SUFFIX) 684*/ 685 686#define bpc8_sh 5 // sh = intermediate_bits + 1 687#define bpcw8_sh 8 // sh = intermediate_bits + 4 688 689#define bpc_sh bpc8_sh 690#define bpcw_sh bpcw8_sh 691 692function avg_8bpc_lsx 693 addi.d t8, a0, 0 694 695 clz.w t0, a4 696 li.w t1, 24 697 sub.w t0, t0, t1 698 la.local t1, .AVG_LSX_JRTABLE 699 alsl.d t0, t0, t1, 1 700 ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE 701 add.d t1, t1, t2 // Get absolute address 702 jirl $r0, t1, 0 703 704 .align 3 705.AVG_LSX_JRTABLE: 706 .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE 707 .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE 708 .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE 709 .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE 710 .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE 711 .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE 712 713.AVG_W4_LSX: 714 vld vr0, a2, 0 715 vld vr1, a3, 0 716 vadd.h vr2, vr0, vr1 717 vssrarni.bu.h vr3, vr2, bpc_sh 718 vstelm.w vr3, a0, 0, 0 719 add.d a0, a0, a1 720 vstelm.w vr3, a0, 0, 1 721 addi.w a5, a5, -2 722 addi.d a2, a2, 16 723 addi.d a3, a3, 16 724 add.d a0, a0, a1 725 blt zero, a5, .AVG_W4_LSX 726 b .AVG_END_LSX 727 728.AVG_W8_LSX: 729 vld vr0, a2, 0 730 vld vr2, a2, 16 731 vld vr1, a3, 0 732 vld vr3, a3, 16 733 vadd.h vr4, vr0, vr1 734 vadd.h vr5, vr2, vr3 735 vssrarni.bu.h vr5, vr4, bpc_sh 736 addi.w a5, a5, -2 737 addi.d a2, a2, 32 738 vstelm.d vr5, a0, 0, 0 739 add.d a0, a0, a1 740 vstelm.d vr5, a0, 0, 1 741 addi.d a3, a3, 32 742 add.d a0, a0, a1 743 blt zero, a5, .AVG_W8_LSX 744 b .AVG_END_LSX 745 746.AVG_W16_LSX: 747 vld vr0, a2, 0 748 vld vr2, a2, 16 749 vld vr1, a3, 0 750 vld vr3, a3, 16 751 vadd.h vr4, vr0, vr1 752 vadd.h vr5, vr2, vr3 753 vssrarni.bu.h vr5, vr4, bpc_sh 754 addi.w a5, a5, -1 755 addi.d a2, a2, 32 756 vst vr5, a0, 0 757 addi.d a3, a3, 32 758 add.d a0, a0, a1 759 blt zero, a5, .AVG_W16_LSX 760 b .AVG_END_LSX 761 762.AVG_W32_LSX: 763 vld vr0, a2, 0 764 vld vr2, a2, 16 765 vld vr4, a2, 32 766 vld vr6, a2, 48 767 vld vr1, a3, 0 768 vld vr3, a3, 16 769 vld vr5, a3, 32 770 vld vr7, a3, 48 771 vadd.h vr0, vr0, vr1 772 vadd.h vr2, vr2, vr3 773 vadd.h vr4, vr4, vr5 774 vadd.h vr6, vr6, vr7 775 vssrarni.bu.h vr2, vr0, bpc_sh 776 vssrarni.bu.h vr6, vr4, bpc_sh 777 addi.w a5, a5, -1 778 addi.d a2, a2, 64 779 vst vr2, a0, 0 780 vst vr6, a0, 16 781 addi.d a3, a3, 64 782 add.d a0, a0, a1 783 blt zero, a5, .AVG_W32_LSX 784 b .AVG_END_LSX 785 786.AVG_W64_LSX: 787.rept 4 788 vld vr0, a2, 0 789 vld vr2, a2, 16 790 vld vr1, a3, 0 791 vld vr3, a3, 16 792 vadd.h vr0, vr0, vr1 793 vadd.h vr2, vr2, vr3 794 vssrarni.bu.h vr2, vr0, bpc_sh 795 addi.d a2, a2, 32 796 addi.d a3, a3, 32 797 vst vr2, a0, 0 798 addi.d a0, a0, 16 799.endr 800 addi.w a5, a5, -1 801 add.d t8, t8, a1 802 add.d 
a0, t8, zero 803 blt zero, a5, .AVG_W64_LSX 804 b .AVG_END_LSX 805 806.AVG_W128_LSX: 807.rept 8 808 vld vr0, a2, 0 809 vld vr2, a2, 16 810 vld vr1, a3, 0 811 vld vr3, a3, 16 812 vadd.h vr0, vr0, vr1 813 vadd.h vr2, vr2, vr3 814 vssrarni.bu.h vr2, vr0, bpc_sh 815 addi.d a2, a2, 32 816 addi.d a3, a3, 32 817 vst vr2, a0, 0 818 addi.d a0, a0, 16 819.endr 820 addi.w a5, a5, -1 821 add.d t8, t8, a1 822 add.d a0, t8, zero 823 blt zero, a5, .AVG_W128_LSX 824.AVG_END_LSX: 825endfunc 826 827function avg_8bpc_lasx 828 clz.w t0, a4 829 li.w t1, 24 830 sub.w t0, t0, t1 831 la.local t1, .AVG_LASX_JRTABLE 832 alsl.d t0, t0, t1, 1 833 ld.h t2, t0, 0 834 add.d t1, t1, t2 835 jirl $r0, t1, 0 836 837 .align 3 838.AVG_LASX_JRTABLE: 839 .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE 840 .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE 841 .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE 842 .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE 843 .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE 844 .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE 845 846.AVG_W4_LASX: 847 vld vr0, a2, 0 848 vld vr1, a3, 0 849 vadd.h vr0, vr0, vr1 850 vssrarni.bu.h vr1, vr0, bpc_sh 851 vstelm.w vr1, a0, 0, 0 852 add.d a0, a0, a1 853 vstelm.w vr1, a0, 0, 1 854 addi.w a5, a5, -2 855 addi.d a2, a2, 16 856 addi.d a3, a3, 16 857 add.d a0, a0, a1 858 blt zero, a5, .AVG_W4_LASX 859 b .AVG_END_LASX 860.AVG_W8_LASX: 861 xvld xr0, a2, 0 862 xvld xr1, a3, 0 863 xvadd.h xr2, xr0, xr1 864 xvssrarni.bu.h xr1, xr2, bpc_sh 865 xvstelm.d xr1, a0, 0, 0 866 add.d a0, a0, a1 867 xvstelm.d xr1, a0, 0, 2 868 addi.w a5, a5, -2 869 addi.d a2, a2, 32 870 addi.d a3, a3, 32 871 add.d a0, a1, a0 872 blt zero, a5, .AVG_W8_LASX 873 b .AVG_END_LASX 874.AVG_W16_LASX: 875 xvld xr0, a2, 0 876 xvld xr2, a2, 32 877 xvld xr1, a3, 0 878 xvld xr3, a3, 32 879 xvadd.h xr4, xr0, xr1 880 xvadd.h xr5, xr2, xr3 881 xvssrarni.bu.h xr5, xr4, bpc_sh 882 xvpermi.d xr2, xr5, 0xd8 883 xvpermi.d xr3, xr5, 0x8d 884 vst vr2, a0, 0 885 vstx vr3, a0, a1 886 addi.w a5, a5, -2 887 addi.d a2, a2, 64 888 addi.d a3, a3, 64 889 alsl.d a0, a1, a0, 1 890 blt zero, a5, .AVG_W16_LASX 891 b .AVG_END_LASX 892.AVG_W32_LASX: 893 xvld xr0, a2, 0 894 xvld xr2, a2, 32 895 xvld xr1, a3, 0 896 xvld xr3, a3, 32 897 xvadd.h xr4, xr0, xr1 898 xvadd.h xr5, xr2, xr3 899 xvssrarni.bu.h xr5, xr4, bpc_sh 900 xvpermi.d xr6, xr5, 0xd8 901 xvst xr6, a0, 0 902 addi.w a5, a5, -1 903 addi.d a2, a2, 64 904 addi.d a3, a3, 64 905 add.d a0, a0, a1 906 blt zero, a5, .AVG_W32_LASX 907 b .AVG_END_LASX 908.AVG_W64_LASX: 909 xvld xr0, a2, 0 910 xvld xr2, a2, 32 911 xvld xr4, a2, 64 912 xvld xr6, a2, 96 913 xvld xr1, a3, 0 914 xvld xr3, a3, 32 915 xvld xr5, a3, 64 916 xvld xr7, a3, 96 917 xvadd.h xr0, xr0, xr1 918 xvadd.h xr2, xr2, xr3 919 xvadd.h xr4, xr4, xr5 920 xvadd.h xr6, xr6, xr7 921 xvssrarni.bu.h xr2, xr0, bpc_sh 922 xvssrarni.bu.h xr6, xr4, bpc_sh 923 xvpermi.d xr1, xr2, 0xd8 924 xvpermi.d xr3, xr6, 0xd8 925 xvst xr1, a0, 0 926 xvst xr3, a0, 32 927 addi.w a5, a5, -1 928 addi.d a2, a2, 128 929 addi.d a3, a3, 128 930 add.d a0, a0, a1 931 blt zero, a5, .AVG_W64_LASX 932 b .AVG_END_LASX 933.AVG_W128_LASX: 934 xvld xr0, a2, 0 935 xvld xr2, a2, 32 936 xvld xr4, a2, 64 937 xvld xr6, a2, 96 938 xvld xr8, a2, 128 939 xvld xr10, a2, 160 940 xvld xr12, a2, 192 941 xvld xr14, a2, 224 942 xvld xr1, a3, 0 943 xvld xr3, a3, 32 944 xvld xr5, a3, 64 945 xvld xr7, a3, 96 946 xvld xr9, a3, 128 947 xvld xr11, a3, 160 948 xvld xr13, a3, 192 949 xvld xr15, a3, 224 950 xvadd.h xr0, xr0, xr1 951 xvadd.h xr2, xr2, xr3 952 xvadd.h xr4, xr4, xr5 953 xvadd.h xr6, xr6, xr7 954 xvadd.h xr8, xr8, 
xr9 955 xvadd.h xr10, xr10, xr11 956 xvadd.h xr12, xr12, xr13 957 xvadd.h xr14, xr14, xr15 958 xvssrarni.bu.h xr2, xr0, bpc_sh 959 xvssrarni.bu.h xr6, xr4, bpc_sh 960 xvssrarni.bu.h xr10, xr8, bpc_sh 961 xvssrarni.bu.h xr14, xr12, bpc_sh 962 xvpermi.d xr1, xr2, 0xd8 963 xvpermi.d xr3, xr6, 0xd8 964 xvpermi.d xr5, xr10, 0xd8 965 xvpermi.d xr7, xr14, 0xd8 966 xvst xr1, a0, 0 967 xvst xr3, a0, 32 968 xvst xr5, a0, 64 969 xvst xr7, a0, 96 970 addi.w a5, a5, -1 971 addi.d a2, a2, 256 972 addi.d a3, a3, 256 973 add.d a0, a0, a1 974 blt zero, a5, .AVG_W128_LASX 975.AVG_END_LASX: 976endfunc 977 978function w_avg_8bpc_lsx 979 addi.d t8, a0, 0 980 li.w t2, 16 981 sub.w t2, t2, a6 // 16 - weight 982 vreplgr2vr.h vr21, a6 983 vreplgr2vr.h vr22, t2 984 985 clz.w t0, a4 986 li.w t1, 24 987 sub.w t0, t0, t1 988 la.local t1, .W_AVG_LSX_JRTABLE 989 alsl.d t0, t0, t1, 1 990 ld.h t2, t0, 0 991 add.d t1, t1, t2 992 jirl $r0, t1, 0 993 994 .align 3 995.W_AVG_LSX_JRTABLE: 996 .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE 997 .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE 998 .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE 999 .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE 1000 .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE 1001 .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE 1002 1003.W_AVG_W4_LSX: 1004 vld vr0, a2, 0 1005 vld vr1, a3, 0 1006 vmulwev.w.h vr2, vr0, vr21 1007 vmulwod.w.h vr3, vr0, vr21 1008 vmaddwev.w.h vr2, vr1, vr22 1009 vmaddwod.w.h vr3, vr1, vr22 1010 vssrarni.hu.w vr3, vr2, bpcw_sh 1011 vssrlni.bu.h vr1, vr3, 0 1012 vpickod.w vr4, vr2, vr1 1013 vilvl.b vr0, vr4, vr1 1014 fst.s f0, a0, 0 1015 add.d a0, a0, a1 1016 vstelm.w vr0, a0, 0, 1 1017 addi.w a5, a5, -2 1018 addi.d a2, a2, 16 1019 addi.d a3, a3, 16 1020 add.d a0, a1, a0 1021 blt zero, a5, .W_AVG_W4_LSX 1022 b .W_AVG_END_LSX 1023.W_AVG_W8_LSX: 1024 vld vr0, a2, 0 1025 vld vr1, a3, 0 1026 vmulwev.w.h vr2, vr0, vr21 1027 vmulwod.w.h vr3, vr0, vr21 1028 vmaddwev.w.h vr2, vr1, vr22 1029 vmaddwod.w.h vr3, vr1, vr22 1030 vssrarni.hu.w vr3, vr2, bpcw_sh 1031 vssrlni.bu.h vr1, vr3, 0 1032 vpickod.w vr4, vr2, vr1 1033 vilvl.b vr0, vr4, vr1 1034 fst.d f0, a0, 0 1035 addi.w a5, a5, -1 1036 addi.d a2, a2, 16 1037 addi.d a3, a3, 16 1038 add.d a0, a0, a1 1039 blt zero, a5, .W_AVG_W8_LSX 1040 b .W_AVG_END_LSX 1041.W_AVG_W16_LSX: 1042 vld vr0, a2, 0 1043 vld vr2, a2, 16 1044 vld vr1, a3, 0 1045 vld vr3, a3, 16 1046 vmulwev.w.h vr4, vr0, vr21 1047 vmulwod.w.h vr5, vr0, vr21 1048 vmulwev.w.h vr6, vr2, vr21 1049 vmulwod.w.h vr7, vr2, vr21 1050 vmaddwev.w.h vr4, vr1, vr22 1051 vmaddwod.w.h vr5, vr1, vr22 1052 vmaddwev.w.h vr6, vr3, vr22 1053 vmaddwod.w.h vr7, vr3, vr22 1054 vssrarni.hu.w vr6, vr4, bpcw_sh 1055 vssrarni.hu.w vr7, vr5, bpcw_sh 1056 vssrlrni.bu.h vr7, vr6, 0 1057 vshuf4i.w vr8, vr7, 0x4E 1058 vilvl.b vr0, vr8, vr7 1059 vst vr0, a0, 0 1060 addi.w a5, a5, -1 1061 addi.d a2, a2, 32 1062 addi.d a3, a3, 32 1063 add.d a0, a0, a1 1064 blt zero, a5, .W_AVG_W16_LSX 1065 b .W_AVG_END_LSX 1066.W_AVG_W32_LSX: 1067.rept 2 1068 vld vr0, a2, 0 1069 vld vr2, a2, 16 1070 vld vr1, a3, 0 1071 vld vr3, a3, 16 1072 vmulwev.w.h vr4, vr0, vr21 1073 vmulwod.w.h vr5, vr0, vr21 1074 vmulwev.w.h vr6, vr2, vr21 1075 vmulwod.w.h vr7, vr2, vr21 1076 vmaddwev.w.h vr4, vr1, vr22 1077 vmaddwod.w.h vr5, vr1, vr22 1078 vmaddwev.w.h vr6, vr3, vr22 1079 vmaddwod.w.h vr7, vr3, vr22 1080 vssrarni.hu.w vr6, vr4, bpcw_sh 1081 vssrarni.hu.w vr7, vr5, bpcw_sh 1082 vssrlrni.bu.h vr7, vr6, 0 1083 vshuf4i.w vr8, vr7, 0x4E 1084 vilvl.b vr0, vr8, vr7 1085 vst vr0, a0, 0 1086 addi.d a2, a2, 32 1087 addi.d a3, a3, 32 
1088 addi.d a0, a0, 16 1089.endr 1090 addi.w a5, a5, -1 1091 add.d t8, t8, a1 1092 add.d a0, t8, zero 1093 blt zero, a5, .W_AVG_W32_LSX 1094 b .W_AVG_END_LSX 1095 1096.W_AVG_W64_LSX: 1097.rept 4 1098 vld vr0, a2, 0 1099 vld vr2, a2, 16 1100 vld vr1, a3, 0 1101 vld vr3, a3, 16 1102 vmulwev.w.h vr4, vr0, vr21 1103 vmulwod.w.h vr5, vr0, vr21 1104 vmulwev.w.h vr6, vr2, vr21 1105 vmulwod.w.h vr7, vr2, vr21 1106 vmaddwev.w.h vr4, vr1, vr22 1107 vmaddwod.w.h vr5, vr1, vr22 1108 vmaddwev.w.h vr6, vr3, vr22 1109 vmaddwod.w.h vr7, vr3, vr22 1110 vssrarni.hu.w vr6, vr4, bpcw_sh 1111 vssrarni.hu.w vr7, vr5, bpcw_sh 1112 vssrlrni.bu.h vr7, vr6, 0 1113 vshuf4i.w vr8, vr7, 0x4E 1114 vilvl.b vr0, vr8, vr7 1115 vst vr0, a0, 0 1116 addi.d a2, a2, 32 1117 addi.d a3, a3, 32 1118 addi.d a0, a0, 16 1119.endr 1120 addi.w a5, a5, -1 1121 add.d t8, t8, a1 1122 add.d a0, t8, zero 1123 blt zero, a5, .W_AVG_W64_LSX 1124 b .W_AVG_END_LSX 1125 1126.W_AVG_W128_LSX: 1127.rept 8 1128 vld vr0, a2, 0 1129 vld vr2, a2, 16 1130 vld vr1, a3, 0 1131 vld vr3, a3, 16 1132 vmulwev.w.h vr4, vr0, vr21 1133 vmulwod.w.h vr5, vr0, vr21 1134 vmulwev.w.h vr6, vr2, vr21 1135 vmulwod.w.h vr7, vr2, vr21 1136 vmaddwev.w.h vr4, vr1, vr22 1137 vmaddwod.w.h vr5, vr1, vr22 1138 vmaddwev.w.h vr6, vr3, vr22 1139 vmaddwod.w.h vr7, vr3, vr22 1140 vssrarni.hu.w vr6, vr4, bpcw_sh 1141 vssrarni.hu.w vr7, vr5, bpcw_sh 1142 vssrlrni.bu.h vr7, vr6, 0 1143 vshuf4i.w vr8, vr7, 0x4E 1144 vilvl.b vr0, vr8, vr7 1145 vst vr0, a0, 0 1146 addi.d a2, a2, 32 1147 addi.d a3, a3, 32 1148 addi.d a0, a0, 16 1149.endr 1150 addi.w a5, a5, -1 1151 add.d t8, t8, a1 1152 add.d a0, t8, zero 1153 blt zero, a5, .W_AVG_W128_LSX 1154.W_AVG_END_LSX: 1155endfunc 1156 1157function w_avg_8bpc_lasx 1158 addi.d t8, a0, 0 1159 li.w t2, 16 1160 sub.w t2, t2, a6 // 16 - weight 1161 xvreplgr2vr.h xr21, a6 1162 xvreplgr2vr.h xr22, t2 1163 1164 clz.w t0, a4 1165 li.w t1, 24 1166 sub.w t0, t0, t1 1167 la.local t1, .W_AVG_LASX_JRTABLE 1168 alsl.d t0, t0, t1, 1 1169 ld.h t2, t0, 0 1170 add.d t1, t1, t2 1171 jirl $r0, t1, 0 1172 1173 .align 3 1174.W_AVG_LASX_JRTABLE: 1175 .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE 1176 .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE 1177 .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE 1178 .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE 1179 .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE 1180 .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE 1181 1182.W_AVG_W4_LASX: 1183 vld vr0, a2, 0 1184 vld vr1, a3, 0 1185 xvpermi.d xr2, xr0, 0xD8 1186 xvpermi.d xr3, xr1, 0xD8 1187 xvilvl.h xr4, xr3, xr2 1188 xvmulwev.w.h xr0, xr4, xr21 1189 xvmaddwod.w.h xr0, xr4, xr22 1190 xvssrarni.hu.w xr1, xr0, bpcw_sh 1191 xvssrlni.bu.h xr0, xr1, 0 1192 fst.s f0, a0, 0 1193 add.d a0, a0, a1 1194 xvstelm.w xr0, a0, 0, 4 1195 addi.w a5, a5, -2 1196 addi.d a2, a2, 16 1197 addi.d a3, a3, 16 1198 add.d a0, a1, a0 1199 blt zero, a5, .W_AVG_W4_LASX 1200 b .W_AVG_END_LASX 1201 1202.W_AVG_W8_LASX: 1203 xvld xr0, a2, 0 1204 xvld xr1, a3, 0 1205 xvmulwev.w.h xr2, xr0, xr21 1206 xvmulwod.w.h xr3, xr0, xr21 1207 xvmaddwev.w.h xr2, xr1, xr22 1208 xvmaddwod.w.h xr3, xr1, xr22 1209 xvssrarni.hu.w xr3, xr2, bpcw_sh 1210 xvssrlni.bu.h xr1, xr3, 0 1211 xvpickod.w xr4, xr2, xr1 1212 xvilvl.b xr0, xr4, xr1 1213 xvstelm.d xr0, a0, 0, 0 1214 add.d a0, a0, a1 1215 xvstelm.d xr0, a0, 0, 2 1216 addi.w a5, a5, -2 1217 addi.d a2, a2, 32 1218 addi.d a3, a3, 32 1219 add.d a0, a0, a1 1220 blt zero, a5, .W_AVG_W8_LASX 1221 b .W_AVG_END_LASX 1222 1223.W_AVG_W16_LASX: 1224 xvld xr0, a2, 0 1225 xvld xr1, a3, 0 1226 xvmulwev.w.h xr2, 
xr0, xr21 1227 xvmulwod.w.h xr3, xr0, xr21 1228 xvmaddwev.w.h xr2, xr1, xr22 1229 xvmaddwod.w.h xr3, xr1, xr22 1230 xvssrarni.hu.w xr3, xr2, bpcw_sh 1231 xvssrlni.bu.h xr1, xr3, 0 1232 xvpickod.w xr4, xr2, xr1 1233 xvilvl.b xr0, xr4, xr1 1234 xvpermi.d xr1, xr0, 0xD8 1235 vst vr1, a0, 0 1236 addi.w a5, a5, -1 1237 addi.d a2, a2, 32 1238 addi.d a3, a3, 32 1239 add.d a0, a0, a1 1240 blt zero, a5, .W_AVG_W16_LASX 1241 b .W_AVG_END_LSX 1242 1243.W_AVG_W32_LASX: 1244 xvld xr0, a2, 0 1245 xvld xr2, a2, 32 1246 xvld xr1, a3, 0 1247 xvld xr3, a3, 32 1248 xvmulwev.w.h xr4, xr0, xr21 1249 xvmulwod.w.h xr5, xr0, xr21 1250 xvmulwev.w.h xr6, xr2, xr21 1251 xvmulwod.w.h xr7, xr2, xr21 1252 xvmaddwev.w.h xr4, xr1, xr22 1253 xvmaddwod.w.h xr5, xr1, xr22 1254 xvmaddwev.w.h xr6, xr3, xr22 1255 xvmaddwod.w.h xr7, xr3, xr22 1256 xvssrarni.hu.w xr6, xr4, bpcw_sh 1257 xvssrarni.hu.w xr7, xr5, bpcw_sh 1258 xvssrlni.bu.h xr7, xr6, 0 1259 xvshuf4i.w xr8, xr7, 0x4E 1260 xvilvl.b xr9, xr8, xr7 1261 xvpermi.d xr0, xr9, 0xD8 1262 xvst xr0, a0, 0 1263 addi.w a5, a5, -1 1264 addi.d a2, a2, 64 1265 addi.d a3, a3, 64 1266 add.d a0, a0, a1 1267 blt zero, a5, .W_AVG_W32_LASX 1268 b .W_AVG_END_LASX 1269 1270.W_AVG_W64_LASX: 1271.rept 2 1272 xvld xr0, a2, 0 1273 xvld xr2, a2, 32 1274 xvld xr1, a3, 0 1275 xvld xr3, a3, 32 1276 xvmulwev.w.h xr4, xr0, xr21 1277 xvmulwod.w.h xr5, xr0, xr21 1278 xvmulwev.w.h xr6, xr2, xr21 1279 xvmulwod.w.h xr7, xr2, xr21 1280 xvmaddwev.w.h xr4, xr1, xr22 1281 xvmaddwod.w.h xr5, xr1, xr22 1282 xvmaddwev.w.h xr6, xr3, xr22 1283 xvmaddwod.w.h xr7, xr3, xr22 1284 xvssrarni.hu.w xr6, xr4, bpcw_sh 1285 xvssrarni.hu.w xr7, xr5, bpcw_sh 1286 xvssrlni.bu.h xr7, xr6, 0 1287 xvshuf4i.w xr8, xr7, 0x4E 1288 xvilvl.b xr9, xr8, xr7 1289 xvpermi.d xr0, xr9, 0xD8 1290 xvst xr0, a0, 0 1291 addi.d a2, a2, 64 1292 addi.d a3, a3, 64 1293 addi.d a0, a0, 32 1294.endr 1295 addi.w a5, a5, -1 1296 add.d t8, t8, a1 1297 add.d a0, t8, zero 1298 blt zero, a5, .W_AVG_W64_LASX 1299 b .W_AVG_END_LASX 1300 1301.W_AVG_W128_LASX: 1302.rept 4 1303 xvld xr0, a2, 0 1304 xvld xr2, a2, 32 1305 xvld xr1, a3, 0 1306 xvld xr3, a3, 32 1307 xvmulwev.w.h xr4, xr0, xr21 1308 xvmulwod.w.h xr5, xr0, xr21 1309 xvmulwev.w.h xr6, xr2, xr21 1310 xvmulwod.w.h xr7, xr2, xr21 1311 xvmaddwev.w.h xr4, xr1, xr22 1312 xvmaddwod.w.h xr5, xr1, xr22 1313 xvmaddwev.w.h xr6, xr3, xr22 1314 xvmaddwod.w.h xr7, xr3, xr22 1315 xvssrarni.hu.w xr6, xr4, bpcw_sh 1316 xvssrarni.hu.w xr7, xr5, bpcw_sh 1317 xvssrlni.bu.h xr7, xr6, 0 1318 xvshuf4i.w xr8, xr7, 0x4E 1319 xvilvl.b xr9, xr8, xr7 1320 xvpermi.d xr0, xr9, 0xD8 1321 xvst xr0, a0, 0 1322 addi.d a2, a2, 64 1323 addi.d a3, a3, 64 1324 addi.d a0, a0, 32 1325.endr 1326 1327 addi.w a5, a5, -1 1328 add.d t8, t8, a1 1329 add.d a0, t8, zero 1330 blt zero, a5, .W_AVG_W128_LASX 1331.W_AVG_END_LASX: 1332endfunc 1333 1334#undef bpc_sh 1335#undef bpcw_sh 1336 1337#define mask_sh 10 1338/* 1339static void mask_c(pixel *dst, const ptrdiff_t dst_stride, 1340 const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 1341 const uint8_t *mask HIGHBD_DECL_SUFFIX) 1342*/ 1343function mask_8bpc_lsx 1344 vldi vr21, 0x440 // 64 1345 vxor.v vr19, vr19, vr19 1346 addi.d t8, a0, 0 1347 clz.w t0, a4 1348 li.w t1, 24 1349 sub.w t0, t0, t1 1350 la.local t1, .MASK_LSX_JRTABLE 1351 alsl.d t0, t0, t1, 1 1352 ld.h t2, t0, 0 1353 add.d t1, t1, t2 1354 jirl $r0, t1, 0 1355 1356 .align 3 1357.MASK_LSX_JRTABLE: 1358 .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE 1359 .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE 1360 .hword .MASK_W32_LSX - 
.MASK_LSX_JRTABLE 1361 .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE 1362 .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE 1363 .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE 1364 1365.MASK_W4_LSX: 1366 vld vr0, a2, 0 1367 vld vr1, a3, 0 1368 fld.d f22, a6, 0 1369 1370 vilvl.b vr2, vr19, vr22 1371 vsub.h vr3, vr21, vr2 1372 1373 vmulwev.w.h vr4, vr0, vr2 1374 vmulwod.w.h vr5, vr0, vr2 1375 vmaddwev.w.h vr4, vr1, vr3 1376 vmaddwod.w.h vr5, vr1, vr3 1377 vssrarni.hu.w vr5, vr4, mask_sh 1378 vssrlrni.bu.h vr1, vr5, 0 1379 vpickod.w vr4, vr2, vr1 1380 vilvl.b vr0, vr4, vr1 1381 fst.s f0, a0, 0 1382 add.d a0, a0, a1 1383 vstelm.w vr0, a0, 0, 1 1384 addi.d a2, a2, 16 1385 addi.d a3, a3, 16 1386 addi.d a6, a6, 8 1387 add.d a0, a0, a1 1388 addi.w a5, a5, -2 1389 blt zero, a5, .MASK_W4_LSX 1390 b .MASK_END_LSX 1391.MASK_W8_LSX: 1392 vld vr0, a2, 0 1393 vld vr10, a2, 16 1394 vld vr1, a3, 0 1395 vld vr11, a3, 16 1396 vld vr22, a6, 0 1397 1398 vilvl.b vr2, vr19, vr22 1399 vilvh.b vr12, vr19, vr22 1400 vsub.h vr3, vr21, vr2 1401 vsub.h vr13, vr21, vr12 1402 1403 vmulwev.w.h vr4, vr0, vr2 1404 vmulwod.w.h vr5, vr0, vr2 1405 vmulwev.w.h vr14, vr10, vr12 1406 vmulwod.w.h vr15, vr10, vr12 1407 vmaddwev.w.h vr4, vr1, vr3 1408 vmaddwod.w.h vr5, vr1, vr3 1409 vmaddwev.w.h vr14, vr11, vr13 1410 vmaddwod.w.h vr15, vr11, vr13 1411 vssrarni.hu.w vr14, vr4, mask_sh 1412 vssrarni.hu.w vr15, vr5, mask_sh 1413 vssrlrni.bu.h vr15, vr14, 0 1414 vshuf4i.w vr6, vr15, 0x4E 1415 vilvl.b vr0, vr6, vr15 1416 fst.d f0, a0, 0 1417 add.d a0, a0, a1 1418 vstelm.d vr0, a0, 0, 1 1419 addi.d a2, a2, 32 1420 addi.d a3, a3, 32 1421 addi.d a6, a6, 16 1422 add.d a0, a0, a1 1423 addi.w a5, a5, -2 1424 blt zero, a5, .MASK_W8_LSX 1425 b .MASK_END_LSX 1426 1427.MASK_W16_LSX: 1428 vld vr0, a2, 0 1429 vld vr10, a2, 16 1430 vld vr1, a3, 0 1431 vld vr11, a3, 16 1432 vld vr22, a6, 0 1433 1434 vilvl.b vr2, vr19, vr22 1435 vilvh.b vr12, vr19, vr22 1436 vsub.h vr3, vr21, vr2 1437 vsub.h vr13, vr21, vr12 1438 1439 vmulwev.w.h vr4, vr0, vr2 1440 vmulwod.w.h vr5, vr0, vr2 1441 vmulwev.w.h vr14, vr10, vr12 1442 vmulwod.w.h vr15, vr10, vr12 1443 vmaddwev.w.h vr4, vr1, vr3 1444 vmaddwod.w.h vr5, vr1, vr3 1445 vmaddwev.w.h vr14, vr11, vr13 1446 vmaddwod.w.h vr15, vr11, vr13 1447 vssrarni.hu.w vr14, vr4, mask_sh 1448 vssrarni.hu.w vr15, vr5, mask_sh 1449 vssrlrni.bu.h vr15, vr14, 0 1450 vshuf4i.w vr6, vr15, 0x4E 1451 vilvl.b vr0, vr6, vr15 1452 vst vr0, a0, 0 1453 addi.d a2, a2, 32 1454 addi.d a3, a3, 32 1455 addi.d a6, a6, 16 1456 add.d a0, a0, a1 1457 addi.w a5, a5, -1 1458 blt zero, a5, .MASK_W16_LSX 1459 b .MASK_END_LSX 1460.MASK_W32_LSX: 1461.rept 2 1462 vld vr0, a2, 0 1463 vld vr10, a2, 16 1464 vld vr1, a3, 0 1465 vld vr11, a3, 16 1466 vld vr22, a6, 0 1467 vilvl.b vr2, vr19, vr22 1468 vilvh.b vr12, vr19, vr22 1469 vsub.h vr3, vr21, vr2 1470 vsub.h vr13, vr21, vr12 1471 vmulwev.w.h vr4, vr0, vr2 1472 vmulwod.w.h vr5, vr0, vr2 1473 vmulwev.w.h vr14, vr10, vr12 1474 vmulwod.w.h vr15, vr10, vr12 1475 vmaddwev.w.h vr4, vr1, vr3 1476 vmaddwod.w.h vr5, vr1, vr3 1477 vmaddwev.w.h vr14, vr11, vr13 1478 vmaddwod.w.h vr15, vr11, vr13 1479 vssrarni.hu.w vr14, vr4, mask_sh 1480 vssrarni.hu.w vr15, vr5, mask_sh 1481 vssrlrni.bu.h vr15, vr14, 0 1482 vshuf4i.w vr6, vr15, 0x4E 1483 vilvl.b vr0, vr6, vr15 1484 vst vr0, a0, 0 1485 addi.d a2, a2, 32 1486 addi.d a3, a3, 32 1487 addi.d a6, a6, 16 1488 addi.d a0, a0, 16 1489.endr 1490 add.d t8, t8, a1 1491 add.d a0, t8, zero 1492 addi.w a5, a5, -1 1493 blt zero, a5, .MASK_W32_LSX 1494 b .MASK_END_LSX 1495.MASK_W64_LSX: 1496.rept 4 1497 vld 
vr0, a2, 0 1498 vld vr10, a2, 16 1499 vld vr1, a3, 0 1500 vld vr11, a3, 16 1501 vld vr22, a6, 0 1502 vilvl.b vr2, vr19, vr22 1503 vilvh.b vr12, vr19, vr22 1504 vsub.h vr3, vr21, vr2 1505 vsub.h vr13, vr21, vr12 1506 vmulwev.w.h vr4, vr0, vr2 1507 vmulwod.w.h vr5, vr0, vr2 1508 vmulwev.w.h vr14, vr10, vr12 1509 vmulwod.w.h vr15, vr10, vr12 1510 vmaddwev.w.h vr4, vr1, vr3 1511 vmaddwod.w.h vr5, vr1, vr3 1512 vmaddwev.w.h vr14, vr11, vr13 1513 vmaddwod.w.h vr15, vr11, vr13 1514 vssrarni.hu.w vr14, vr4, mask_sh 1515 vssrarni.hu.w vr15, vr5, mask_sh 1516 vssrlrni.bu.h vr15, vr14, 0 1517 vshuf4i.w vr6, vr15, 0x4E 1518 vilvl.b vr0, vr6, vr15 1519 vst vr0, a0, 0 1520 addi.d a2, a2, 32 1521 addi.d a3, a3, 32 1522 addi.d a6, a6, 16 1523 addi.d a0, a0, 16 1524.endr 1525 add.d t8, t8, a1 1526 add.d a0, t8, zero 1527 addi.w a5, a5, -1 1528 blt zero, a5, .MASK_W64_LSX 1529 b .MASK_END_LSX 1530.MASK_W128_LSX: 1531.rept 8 1532 vld vr0, a2, 0 1533 vld vr10, a2, 16 1534 vld vr1, a3, 0 1535 vld vr11, a3, 16 1536 vld vr22, a6, 0 1537 vilvl.b vr2, vr19, vr22 1538 vilvh.b vr12, vr19, vr22 1539 vsub.h vr3, vr21, vr2 1540 vsub.h vr13, vr21, vr12 1541 vmulwev.w.h vr4, vr0, vr2 1542 vmulwod.w.h vr5, vr0, vr2 1543 vmulwev.w.h vr14, vr10, vr12 1544 vmulwod.w.h vr15, vr10, vr12 1545 vmaddwev.w.h vr4, vr1, vr3 1546 vmaddwod.w.h vr5, vr1, vr3 1547 vmaddwev.w.h vr14, vr11, vr13 1548 vmaddwod.w.h vr15, vr11, vr13 1549 vssrarni.hu.w vr14, vr4, mask_sh 1550 vssrarni.hu.w vr15, vr5, mask_sh 1551 vssrlrni.bu.h vr15, vr14, 0 1552 vshuf4i.w vr6, vr15, 0x4E 1553 vilvl.b vr0, vr6, vr15 1554 vst vr0, a0, 0 1555 addi.d a2, a2, 32 1556 addi.d a3, a3, 32 1557 addi.d a6, a6, 16 1558 addi.d a0, a0, 16 1559.endr 1560 add.d t8, t8, a1 1561 add.d a0, t8, zero 1562 addi.w a5, a5, -1 1563 blt zero, a5, .MASK_W128_LSX 1564.MASK_END_LSX: 1565endfunc 1566 1567function mask_8bpc_lasx 1568 xvldi xr21, 0x440 // 64 1569 xvxor.v xr19, xr19, xr19 1570 addi.d t8, a0, 0 1571 clz.w t0, a4 1572 li.w t1, 24 1573 sub.w t0, t0, t1 1574 la.local t1, .MASK_LASX_JRTABLE 1575 alsl.d t0, t0, t1, 1 1576 ld.h t2, t0, 0 1577 add.d t1, t1, t2 1578 jirl $r0, t1, 0 1579 1580 .align 3 1581.MASK_LASX_JRTABLE: 1582 .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE 1583 .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE 1584 .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE 1585 .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE 1586 .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE 1587 .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE 1588 1589.MASK_W4_LASX: 1590 vld vr0, a2, 0 1591 vld vr1, a3, 0 1592 fld.d f22, a6, 0 1593 1594 vilvl.h vr4, vr1, vr0 1595 vilvh.h vr14, vr1, vr0 1596 vilvl.b vr2, vr19, vr22 1597 vsub.h vr3, vr21, vr2 1598 xvpermi.q xr14, xr4, 0x20 1599 vilvl.h vr5, vr3, vr2 1600 vilvh.h vr15, vr3, vr2 1601 xvpermi.q xr15, xr5, 0x20 1602 xvmulwev.w.h xr0, xr14, xr15 1603 xvmaddwod.w.h xr0, xr14, xr15 1604 xvssrarni.hu.w xr1, xr0, mask_sh 1605 xvssrlni.bu.h xr2, xr1, 0 1606 fst.s f2, a0, 0 1607 add.d a0, a0, a1 1608 xvstelm.w xr2, a0, 0, 4 1609 1610 addi.d a2, a2, 16 1611 addi.d a3, a3, 16 1612 addi.d a6, a6, 8 1613 add.d a0, a0, a1 1614 addi.w a5, a5, -2 1615 blt zero, a5, .MASK_W4_LASX 1616 b .MASK_END_LASX 1617 1618.MASK_W8_LASX: 1619 xvld xr0, a2, 0 1620 xvld xr1, a3, 0 1621 vld vr22, a6, 0 1622 1623 vext2xv.hu.bu xr2, xr22 1624 xvsub.h xr3, xr21, xr2 1625 xvmulwev.w.h xr4, xr0, xr2 1626 xvmulwod.w.h xr5, xr0, xr2 1627 xvmaddwev.w.h xr4, xr1, xr3 1628 xvmaddwod.w.h xr5, xr1, xr3 1629 xvssrarni.hu.w xr5, xr4, mask_sh 1630 xvssrlni.bu.h xr1, xr5, 0 1631 xvpickod.w xr4, xr2, xr1 1632 xvilvl.b xr0, 
xr4, xr1 1633 fst.d f0, a0, 0 1634 add.d a0, a0, a1 1635 xvstelm.d xr0, a0, 0, 2 1636 1637 addi.d a2, a2, 32 1638 addi.d a3, a3, 32 1639 addi.d a6, a6, 16 1640 add.d a0, a0, a1 1641 addi.w a5, a5, -2 1642 blt zero, a5, .MASK_W8_LASX 1643 b .MASK_END_LASX 1644 1645.MASK_W16_LASX: 1646 xvld xr0, a2, 0 1647 xvld xr1, a3, 0 1648 vld vr22, a6, 0 1649 1650 vext2xv.hu.bu xr2, xr22 1651 xvsub.h xr3, xr21, xr2 1652 xvmulwev.w.h xr4, xr0, xr2 1653 xvmulwod.w.h xr5, xr0, xr2 1654 xvmaddwev.w.h xr4, xr1, xr3 1655 xvmaddwod.w.h xr5, xr1, xr3 1656 xvssrarni.hu.w xr5, xr4, mask_sh 1657 xvssrlni.bu.h xr1, xr5, 0 1658 xvpickod.w xr4, xr2, xr1 1659 xvilvl.b xr0, xr4, xr1 1660 xvpermi.d xr1, xr0, 0xD8 1661 vst vr1, a0, 0 1662 1663 addi.d a2, a2, 32 1664 addi.d a3, a3, 32 1665 addi.d a6, a6, 16 1666 add.d a0, a0, a1 1667 addi.w a5, a5, -1 1668 blt zero, a5, .MASK_W16_LASX 1669 b .MASK_END_LASX 1670.MASK_W32_LASX: 1671 xvld xr0, a2, 0 1672 xvld xr10, a2, 32 1673 xvld xr1, a3, 0 1674 xvld xr11, a3, 32 1675 xvld xr22, a6, 0 1676 vext2xv.hu.bu xr2, xr22 1677 xvpermi.q xr4, xr22, 0x01 1678 vext2xv.hu.bu xr12, xr4 1679 xvsub.h xr3, xr21, xr2 1680 xvsub.h xr13, xr21, xr12 1681 1682 xvmulwev.w.h xr4, xr0, xr2 1683 xvmulwod.w.h xr5, xr0, xr2 1684 xvmulwev.w.h xr14, xr10, xr12 1685 xvmulwod.w.h xr15, xr10, xr12 1686 xvmaddwev.w.h xr4, xr1, xr3 1687 xvmaddwod.w.h xr5, xr1, xr3 1688 xvmaddwev.w.h xr14, xr11, xr13 1689 xvmaddwod.w.h xr15, xr11, xr13 1690 xvssrarni.hu.w xr14, xr4, mask_sh 1691 xvssrarni.hu.w xr15, xr5, mask_sh 1692 xvssrlni.bu.h xr15, xr14, 0 1693 xvshuf4i.w xr6, xr15, 0x4E 1694 xvilvl.b xr1, xr6, xr15 1695 xvpermi.d xr0, xr1, 0xD8 1696 xvst xr0, a0, 0 1697 1698 addi.d a2, a2, 64 1699 addi.d a3, a3, 64 1700 addi.d a6, a6, 32 1701 add.d a0, a0, a1 1702 addi.w a5, a5, -1 1703 blt zero, a5, .MASK_W32_LASX 1704 b .MASK_END_LASX 1705 1706.MASK_W64_LASX: 1707.rept 2 1708 xvld xr0, a2, 0 1709 xvld xr10, a2, 32 1710 xvld xr1, a3, 0 1711 xvld xr11, a3, 32 1712 xvld xr22, a6, 0 1713 vext2xv.hu.bu xr2, xr22 1714 xvpermi.q xr4, xr22, 0x01 1715 vext2xv.hu.bu xr12, xr4 1716 xvsub.h xr3, xr21, xr2 1717 xvsub.h xr13, xr21, xr12 1718 1719 xvmulwev.w.h xr4, xr0, xr2 1720 xvmulwod.w.h xr5, xr0, xr2 1721 xvmulwev.w.h xr14, xr10, xr12 1722 xvmulwod.w.h xr15, xr10, xr12 1723 xvmaddwev.w.h xr4, xr1, xr3 1724 xvmaddwod.w.h xr5, xr1, xr3 1725 xvmaddwev.w.h xr14, xr11, xr13 1726 xvmaddwod.w.h xr15, xr11, xr13 1727 xvssrarni.hu.w xr14, xr4, mask_sh 1728 xvssrarni.hu.w xr15, xr5, mask_sh 1729 xvssrlni.bu.h xr15, xr14, 0 1730 xvshuf4i.w xr6, xr15, 0x4E 1731 xvilvl.b xr1, xr6, xr15 1732 xvpermi.d xr0, xr1, 0xD8 1733 xvst xr0, a0, 0 1734 addi.d a2, a2, 64 1735 addi.d a3, a3, 64 1736 addi.d a6, a6, 32 1737 addi.d a0, a0, 32 1738.endr 1739 add.d t8, t8, a1 1740 add.d a0, t8, zero 1741 addi.w a5, a5, -1 1742 blt zero, a5, .MASK_W64_LASX 1743 b .MASK_END_LASX 1744 1745.MASK_W128_LASX: 1746.rept 4 1747 xvld xr0, a2, 0 1748 xvld xr10, a2, 32 1749 xvld xr1, a3, 0 1750 xvld xr11, a3, 32 1751 xvld xr22, a6, 0 1752 vext2xv.hu.bu xr2, xr22 1753 xvpermi.q xr4, xr22, 0x01 1754 vext2xv.hu.bu xr12, xr4 1755 xvsub.h xr3, xr21, xr2 1756 xvsub.h xr13, xr21, xr12 1757 1758 xvmulwev.w.h xr4, xr0, xr2 1759 xvmulwod.w.h xr5, xr0, xr2 1760 xvmulwev.w.h xr14, xr10, xr12 1761 xvmulwod.w.h xr15, xr10, xr12 1762 xvmaddwev.w.h xr4, xr1, xr3 1763 xvmaddwod.w.h xr5, xr1, xr3 1764 xvmaddwev.w.h xr14, xr11, xr13 1765 xvmaddwod.w.h xr15, xr11, xr13 1766 xvssrarni.hu.w xr14, xr4, mask_sh 1767 xvssrarni.hu.w xr15, xr5, mask_sh 1768 xvssrlni.bu.h xr15, xr14, 0 1769 
xvshuf4i.w xr6, xr15, 0x4E 1770 xvilvl.b xr1, xr6, xr15 1771 xvpermi.d xr0, xr1, 0xD8 1772 xvst xr0, a0, 0 1773 1774 addi.d a2, a2, 64 1775 addi.d a3, a3, 64 1776 addi.d a6, a6, 32 1777 addi.d a0, a0, 32 1778.endr 1779 add.d t8, t8, a1 1780 add.d a0, t8, zero 1781 addi.w a5, a5, -1 1782 blt zero, a5, .MASK_W128_LASX 1783.MASK_END_LASX: 1784endfunc 1785 1786/* 1787static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, 1788 const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 1789 uint8_t *mask, const int sign, 1790 const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) 1791*/ 1792function w_mask_420_8bpc_lsx 1793 addi.d sp, sp, -24 1794 fst.d f24, sp, 0 1795 fst.d f25, sp, 8 1796 fst.d f26, sp, 16 1797 vldi vr20, 0x440 1798 vreplgr2vr.h vr21, a7 1799 vldi vr22, 0x426 1800 1801 clz.w t0, a4 1802 li.w t1, 24 1803 sub.w t0, t0, t1 1804 la.local t1, .WMASK420_LSX_JRTABLE 1805 alsl.d t0, t0, t1, 1 1806 ld.h t8, t0, 0 1807 add.d t1, t1, t8 1808 jirl $r0, t1, 0 1809 1810 .align 3 1811.WMASK420_LSX_JRTABLE: 1812 .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE 1813 .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE 1814 .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE 1815 .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE 1816 .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE 1817 .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE 1818 1819.WMASK420_W4_LSX: 1820 vld vr0, a2, 0 1821 vld vr1, a2, 16 1822 vld vr2, a3, 0 1823 vld vr3, a3, 16 1824 addi.w a5, a5, -4 1825 1826 vabsd.h vr4, vr0, vr2 1827 vabsd.h vr5, vr1, vr3 1828 vaddi.hu vr4, vr4, 8 1829 vaddi.hu vr5, vr5, 8 1830 vsrli.h vr4, vr4, 8 1831 vsrli.h vr5, vr5, 8 1832 vadd.h vr4, vr4, vr22 1833 vadd.h vr5, vr5, vr22 1834 vmin.hu vr6, vr4, vr20 1835 vmin.hu vr7, vr5, vr20 1836 vsub.h vr8, vr20, vr6 1837 vsub.h vr9, vr20, vr7 1838 vmulwev.w.h vr4, vr6, vr0 1839 vmulwod.w.h vr5, vr6, vr0 1840 vmulwev.w.h vr10, vr7, vr1 1841 vmulwod.w.h vr11, vr7, vr1 1842 vmaddwev.w.h vr4, vr8, vr2 1843 vmaddwod.w.h vr5, vr8, vr2 1844 vmaddwev.w.h vr10, vr9, vr3 1845 vmaddwod.w.h vr11, vr9, vr3 1846 vilvl.w vr0, vr5, vr4 1847 vilvh.w vr1, vr5, vr4 1848 vilvl.w vr2, vr11, vr10 1849 vilvh.w vr3, vr11, vr10 1850 vssrarni.hu.w vr1, vr0, 10 1851 vssrarni.hu.w vr3, vr2, 10 1852 vssrlni.bu.h vr3, vr1, 0 1853 vstelm.w vr3, a0, 0, 0 1854 add.d a0, a0, a1 1855 vstelm.w vr3, a0, 0, 1 1856 add.d a0, a0, a1 1857 vstelm.w vr3, a0, 0, 2 1858 add.d a0, a0, a1 1859 vstelm.w vr3, a0, 0, 3 1860 add.d a0, a0, a1 1861 vpickev.h vr0, vr7, vr6 1862 vpickod.h vr1, vr7, vr6 1863 vadd.h vr0, vr0, vr1 1864 vshuf4i.h vr0, vr0, 0xd8 1865 vhaddw.w.h vr2, vr0, vr0 1866 vpickev.h vr2, vr2, vr2 1867 vsub.h vr2, vr2, vr21 1868 vaddi.hu vr2, vr2, 2 1869 vssrani.bu.h vr2, vr2, 2 1870 vstelm.w vr2, a6, 0, 0 1871 1872 addi.d a2, a2, 32 1873 addi.d a3, a3, 32 1874 addi.d a6, a6, 4 1875 blt zero, a5, .WMASK420_W4_LSX 1876 b .END_W420 1877 1878.WMASK420_W8_LSX: 1879 vld vr0, a2, 0 1880 vld vr1, a2, 16 1881 vld vr2, a3, 0 1882 vld vr3, a3, 16 1883 addi.w a5, a5, -2 1884 1885 vabsd.h vr4, vr0, vr2 1886 vabsd.h vr5, vr1, vr3 1887 vaddi.hu vr4, vr4, 8 1888 vaddi.hu vr5, vr5, 8 1889 vsrli.h vr4, vr4, 8 1890 vsrli.h vr5, vr5, 8 1891 vadd.h vr4, vr4, vr22 1892 vadd.h vr5, vr5, vr22 1893 vmin.hu vr6, vr4, vr20 1894 vmin.hu vr7, vr5, vr20 1895 vsub.h vr8, vr20, vr6 1896 vsub.h vr9, vr20, vr7 1897 vmulwev.w.h vr4, vr6, vr0 1898 vmulwod.w.h vr5, vr6, vr0 1899 vmulwev.w.h vr10, vr7, vr1 1900 vmulwod.w.h vr11, vr7, vr1 1901 vmaddwev.w.h vr4, vr8, vr2 1902 vmaddwod.w.h vr5, vr8, vr2 1903 vmaddwev.w.h vr10, 
vr9, vr3 1904 vmaddwod.w.h vr11, vr9, vr3 1905 vssrarni.hu.w vr10, vr4, 10 1906 vssrarni.hu.w vr11, vr5, 10 1907 vssrlni.bu.h vr11, vr10, 0 1908 vshuf4i.w vr0, vr11, 0x4E 1909 vilvl.b vr3, vr0, vr11 1910 vstelm.d vr3, a0, 0, 0 1911 add.d a0, a0, a1 1912 vstelm.d vr3, a0, 0, 1 1913 add.d a0, a0, a1 1914 vpickev.h vr0, vr7, vr6 1915 vpickod.h vr1, vr7, vr6 1916 vadd.h vr0, vr0, vr1 1917 vilvh.d vr2, vr0, vr0 1918 vadd.h vr2, vr2, vr0 1919 vsub.h vr2, vr2, vr21 1920 vaddi.hu vr2, vr2, 2 1921 vssrani.bu.h vr2, vr2, 2 1922 vstelm.w vr2, a6, 0, 0 1923 1924 addi.d a2, a2, 32 1925 addi.d a3, a3, 32 1926 addi.d a6, a6, 4 1927 blt zero, a5, .WMASK420_W8_LSX 1928 b .END_W420 1929 1930.WMASK420_W16_LSX: 1931 vld vr0, a2, 0 1932 vld vr1, a2, 16 1933 alsl.d a2, a4, a2, 1 1934 vld vr2, a2, 0 1935 vld vr3, a2, 16 1936 vld vr4, a3, 0 1937 vld vr5, a3, 16 1938 alsl.d a3, a4, a3, 1 1939 vld vr6, a3, 0 1940 vld vr7, a3, 16 1941 1942 vabsd.h vr8, vr0, vr4 1943 vabsd.h vr9, vr1, vr5 1944 vabsd.h vr10, vr2, vr6 1945 vabsd.h vr11, vr3, vr7 1946 vaddi.hu vr8, vr8, 8 1947 vaddi.hu vr9, vr9, 8 1948 vaddi.hu vr10, vr10, 8 1949 vaddi.hu vr11, vr11, 8 1950 vsrli.h vr8, vr8, 8 1951 vsrli.h vr9, vr9, 8 1952 vsrli.h vr10, vr10, 8 1953 vsrli.h vr11, vr11, 8 1954 vadd.h vr8, vr8, vr22 1955 vadd.h vr9, vr9, vr22 1956 vadd.h vr10, vr10, vr22 1957 vadd.h vr11, vr11, vr22 1958 vmin.hu vr12, vr8, vr20 1959 vmin.hu vr13, vr9, vr20 1960 vmin.hu vr14, vr10, vr20 1961 vmin.hu vr15, vr11, vr20 1962 vsub.h vr16, vr20, vr12 1963 vsub.h vr17, vr20, vr13 1964 vsub.h vr18, vr20, vr14 1965 vsub.h vr19, vr20, vr15 1966 vmulwev.w.h vr8, vr12, vr0 1967 vmulwod.w.h vr9, vr12, vr0 1968 vmulwev.w.h vr10, vr13, vr1 1969 vmulwod.w.h vr11, vr13, vr1 1970 vmulwev.w.h vr23, vr14, vr2 1971 vmulwod.w.h vr24, vr14, vr2 1972 vmulwev.w.h vr25, vr15, vr3 1973 vmulwod.w.h vr26, vr15, vr3 1974 vmaddwev.w.h vr8, vr16, vr4 1975 vmaddwod.w.h vr9, vr16, vr4 1976 vmaddwev.w.h vr10, vr17, vr5 1977 vmaddwod.w.h vr11, vr17, vr5 1978 vmaddwev.w.h vr23, vr18, vr6 1979 vmaddwod.w.h vr24, vr18, vr6 1980 vmaddwev.w.h vr25, vr19, vr7 1981 vmaddwod.w.h vr26, vr19, vr7 1982 vssrarni.hu.w vr10, vr8, 10 1983 vssrarni.hu.w vr11, vr9, 10 1984 vssrarni.hu.w vr25, vr23, 10 1985 vssrarni.hu.w vr26, vr24, 10 1986 vssrlni.bu.h vr11, vr10, 0 1987 vssrlni.bu.h vr26, vr25, 0 1988 vshuf4i.w vr0, vr11, 0x4E 1989 vshuf4i.w vr1, vr26, 0x4E 1990 vilvl.b vr3, vr0, vr11 1991 vilvl.b vr7, vr1, vr26 1992 vst vr3, a0, 0 1993 vstx vr7, a0, a1 1994 vpickev.h vr0, vr13, vr12 1995 vpickod.h vr1, vr13, vr12 1996 vpickev.h vr2, vr15, vr14 1997 vpickod.h vr3, vr15, vr14 1998 vadd.h vr4, vr0, vr1 1999 vadd.h vr5, vr2, vr3 2000 vadd.h vr4, vr4, vr5 2001 vsub.h vr4, vr4, vr21 2002 vssrarni.bu.h vr4, vr4, 2 2003 vstelm.d vr4, a6, 0, 0 2004 2005 alsl.d a2, a4, a2, 1 2006 alsl.d a3, a4, a3, 1 2007 alsl.d a0, a1, a0, 1 2008 addi.d a6, a6, 8 2009 addi.w a5, a5, -2 2010 blt zero, a5, .WMASK420_W16_LSX 2011 b .END_W420 2012 2013.WMASK420_W32_LSX: 2014.WMASK420_W64_LSX: 2015.WMASK420_W128_LSX: 2016 2017.LOOP_W32_420_LSX: 2018 add.d t1, a2, zero 2019 add.d t2, a3, zero 2020 add.d t3, a0, zero 2021 add.d t4, a6, zero 2022 alsl.d t5, a4, t1, 1 2023 alsl.d t6, a4, t2, 1 2024 or t7, a4, a4 2025 2026.W32_420_LSX: 2027 vld vr0, t1, 0 2028 vld vr1, t1, 16 2029 vld vr2, t2, 0 2030 vld vr3, t2, 16 2031 vld vr4, t5, 0 2032 vld vr5, t5, 16 2033 vld vr6, t6, 0 2034 vld vr7, t6, 16 2035 addi.d t1, t1, 32 2036 addi.d t2, t2, 32 2037 addi.d t5, t5, 32 2038 addi.d t6, t6, 32 2039 addi.w t7, t7, -16 2040 vabsd.h vr8, vr0, vr2 2041 
vabsd.h vr9, vr1, vr3 2042 vabsd.h vr10, vr4, vr6 2043 vabsd.h vr11, vr5, vr7 2044 vaddi.hu vr8, vr8, 8 2045 vaddi.hu vr9, vr9, 8 2046 vaddi.hu vr10, vr10, 8 2047 vaddi.hu vr11, vr11, 8 2048 vsrli.h vr8, vr8, 8 2049 vsrli.h vr9, vr9, 8 2050 vsrli.h vr10, vr10, 8 2051 vsrli.h vr11, vr11, 8 2052 vadd.h vr8, vr8, vr22 2053 vadd.h vr9, vr9, vr22 2054 vadd.h vr10, vr10, vr22 2055 vadd.h vr11, vr11, vr22 2056 vmin.hu vr12, vr8, vr20 2057 vmin.hu vr13, vr9, vr20 2058 vmin.hu vr14, vr10, vr20 2059 vmin.hu vr15, vr11, vr20 2060 vsub.h vr16, vr20, vr12 2061 vsub.h vr17, vr20, vr13 2062 vsub.h vr18, vr20, vr14 2063 vsub.h vr19, vr20, vr15 2064 vmulwev.w.h vr8, vr12, vr0 2065 vmulwod.w.h vr9, vr12, vr0 2066 vmulwev.w.h vr10, vr13, vr1 2067 vmulwod.w.h vr11, vr13, vr1 2068 vmulwev.w.h vr23, vr14, vr4 2069 vmulwod.w.h vr24, vr14, vr4 2070 vmulwev.w.h vr25, vr15, vr5 2071 vmulwod.w.h vr26, vr15, vr5 2072 vmaddwev.w.h vr8, vr16, vr2 2073 vmaddwod.w.h vr9, vr16, vr2 2074 vmaddwev.w.h vr10, vr17, vr3 2075 vmaddwod.w.h vr11, vr17, vr3 2076 vmaddwev.w.h vr23, vr18, vr6 2077 vmaddwod.w.h vr24, vr18, vr6 2078 vmaddwev.w.h vr25, vr19, vr7 2079 vmaddwod.w.h vr26, vr19, vr7 2080 vssrarni.hu.w vr10, vr8, 10 2081 vssrarni.hu.w vr11, vr9, 10 2082 vssrarni.hu.w vr25, vr23, 10 2083 vssrarni.hu.w vr26, vr24, 10 2084 vssrlni.bu.h vr11, vr10, 0 2085 vssrlni.bu.h vr26, vr25, 0 2086 vshuf4i.w vr8, vr11, 0x4E 2087 vshuf4i.w vr9, vr26, 0x4E 2088 vilvl.b vr3, vr8, vr11 2089 vilvl.b vr7, vr9, vr26 2090 vst vr3, t3, 0 2091 vstx vr7, a1, t3 2092 addi.d t3, t3, 16 2093 vpickev.h vr8, vr13, vr12 2094 vpickod.h vr9, vr13, vr12 2095 vpickev.h vr10, vr15, vr14 2096 vpickod.h vr11, vr15, vr14 2097 vadd.h vr8, vr8, vr9 2098 vadd.h vr10, vr10, vr11 2099 vadd.h vr12, vr8, vr10 2100 vsub.h vr12, vr12, vr21 2101 vssrarni.bu.h vr12, vr12, 2 2102 vstelm.d vr12, t4, 0, 0 2103 addi.d t4, t4, 8 2104 bne t7, zero, .W32_420_LSX 2105 2106 alsl.d a2, a4, a2, 2 2107 alsl.d a3, a4, a3, 2 2108 alsl.d a0, a1, a0, 1 2109 srai.w t8, a4, 1 2110 add.d a6, a6, t8 2111 addi.w a5, a5, -2 2112 blt zero, a5, .LOOP_W32_420_LSX 2113 2114.END_W420: 2115 fld.d f24, sp, 0 2116 fld.d f25, sp, 8 2117 fld.d f26, sp, 16 2118 addi.d sp, sp, 24 2119endfunc 2120 2121function w_mask_420_8bpc_lasx 2122 xvldi xr20, 0x440 2123 xvreplgr2vr.h xr21, a7 2124 xvldi xr22, 0x426 2125 2126 clz.w t0, a4 2127 li.w t1, 24 2128 sub.w t0, t0, t1 2129 la.local t1, .WMASK420_LASX_JRTABLE 2130 alsl.d t0, t0, t1, 1 2131 ld.h t8, t0, 0 2132 add.d t1, t1, t8 2133 jirl $r0, t1, 0 2134 2135 .align 3 2136.WMASK420_LASX_JRTABLE: 2137 .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE 2138 .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE 2139 .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE 2140 .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE 2141 .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE 2142 .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE 2143 2144.WMASK420_W4_LASX: 2145 xvld xr0, a2, 0 2146 xvld xr1, a3, 0 2147 addi.w a5, a5, -4 2148 2149 xvabsd.h xr2, xr0, xr1 2150 xvaddi.hu xr2, xr2, 8 2151 xvsrli.h xr2, xr2, 8 2152 xvadd.h xr2, xr2, xr22 2153 xvmin.hu xr3, xr2, xr20 2154 xvsub.h xr4, xr20, xr3 2155 xvmulwev.w.h xr5, xr3, xr0 2156 xvmulwod.w.h xr6, xr3, xr0 2157 xvmaddwev.w.h xr5, xr4, xr1 2158 xvmaddwod.w.h xr6, xr4, xr1 2159 xvilvl.w xr7, xr6, xr5 2160 xvilvh.w xr8, xr6, xr5 2161 xvssrarni.hu.w xr8, xr7, 10 2162 xvssrlni.bu.h xr9, xr8, 0 2163 vstelm.w vr9, a0, 0, 0 2164 add.d a0, a0, a1 2165 vstelm.w vr9, a0, 0, 1 2166 add.d a0, a0, a1 2167 xvstelm.w xr9, a0, 0, 4 2168 add.d 
a0, a0, a1 2169 xvstelm.w xr9, a0, 0, 5 2170 add.d a0, a0, a1 2171 2172 xvhaddw.w.h xr3, xr3, xr3 2173 xvpermi.d xr4, xr3, 0xb1 2174 xvadd.h xr3, xr3, xr4 2175 xvpickev.h xr3, xr3, xr3 2176 xvsub.h xr3, xr3, xr21 2177 xvssrarni.bu.h xr3, xr3, 2 2178 vstelm.h vr3, a6, 0, 0 2179 xvstelm.h xr3, a6, 2, 8 2180 2181 addi.d a2, a2, 32 2182 addi.d a3, a3, 32 2183 addi.d a6, a6, 4 2184 blt zero, a5, .WMASK420_W4_LASX 2185 b .END_W420_LASX 2186 2187.WMASK420_W8_LASX: 2188 xvld xr0, a2, 0 2189 xvld xr1, a2, 32 2190 xvld xr2, a3, 0 2191 xvld xr3, a3, 32 2192 addi.w a5, a5, -4 2193 2194 xvabsd.h xr4, xr0, xr2 2195 xvabsd.h xr5, xr1, xr3 2196 xvaddi.hu xr4, xr4, 8 2197 xvaddi.hu xr5, xr5, 8 2198 xvsrli.h xr4, xr4, 8 2199 xvsrli.h xr5, xr5, 8 2200 xvadd.h xr4, xr4, xr22 2201 xvadd.h xr5, xr5, xr22 2202 xvmin.hu xr6, xr4, xr20 2203 xvmin.hu xr7, xr5, xr20 2204 xvsub.h xr8, xr20, xr6 2205 xvsub.h xr9, xr20, xr7 2206 xvmulwev.w.h xr10, xr6, xr0 2207 xvmulwod.w.h xr11, xr6, xr0 2208 xvmulwev.w.h xr12, xr7, xr1 2209 xvmulwod.w.h xr13, xr7, xr1 2210 xvmaddwev.w.h xr10, xr8, xr2 2211 xvmaddwod.w.h xr11, xr8, xr2 2212 xvmaddwev.w.h xr12, xr9, xr3 2213 xvmaddwod.w.h xr13, xr9, xr3 2214 xvssrarni.hu.w xr12, xr10, 10 2215 xvssrarni.hu.w xr13, xr11, 10 2216 xvssrlni.bu.h xr13, xr12, 0 2217 xvshuf4i.w xr1, xr13, 0x4E 2218 xvilvl.b xr17, xr1, xr13 2219 vstelm.d vr17, a0, 0, 0 2220 add.d a0, a0, a1 2221 xvstelm.d xr17, a0, 0, 2 2222 add.d a0, a0, a1 2223 xvstelm.d xr17, a0, 0, 1 2224 add.d a0, a0, a1 2225 xvstelm.d xr17, a0, 0, 3 2226 add.d a0, a0, a1 2227 2228 xvhaddw.w.h xr6, xr6, xr6 2229 xvhaddw.w.h xr7, xr7, xr7 2230 xvpickev.h xr8, xr7, xr6 2231 xvpermi.q xr9, xr8, 0x01 2232 vadd.h vr8, vr8, vr9 2233 vsub.h vr8, vr8, vr21 2234 vssrarni.bu.h vr8, vr8, 2 2235 vstelm.d vr8, a6, 0, 0 2236 addi.d a2, a2, 64 2237 addi.d a3, a3, 64 2238 addi.d a6, a6, 8 2239 blt zero, a5, .WMASK420_W8_LASX 2240 b .END_W420_LASX 2241 2242.WMASK420_W16_LASX: 2243 xvld xr0, a2, 0 2244 xvld xr1, a2, 32 2245 xvld xr2, a3, 0 2246 xvld xr3, a3, 32 2247 addi.w a5, a5, -2 2248 2249 xvabsd.h xr4, xr0, xr2 2250 xvabsd.h xr5, xr1, xr3 2251 xvaddi.hu xr4, xr4, 8 2252 xvaddi.hu xr5, xr5, 8 2253 xvsrli.h xr4, xr4, 8 2254 xvsrli.h xr5, xr5, 8 2255 xvadd.h xr4, xr4, xr22 2256 xvadd.h xr5, xr5, xr22 2257 xvmin.hu xr4, xr4, xr20 2258 xvmin.hu xr5, xr5, xr20 2259 xvsub.h xr6, xr20, xr4 2260 xvsub.h xr7, xr20, xr5 2261 xvmulwev.w.h xr8, xr4, xr0 2262 xvmulwod.w.h xr9, xr4, xr0 2263 xvmulwev.w.h xr10, xr5, xr1 2264 xvmulwod.w.h xr11, xr5, xr1 2265 xvmaddwev.w.h xr8, xr6, xr2 2266 xvmaddwod.w.h xr9, xr6, xr2 2267 xvmaddwev.w.h xr10, xr7, xr3 2268 xvmaddwod.w.h xr11, xr7, xr3 2269 xvssrarni.hu.w xr10, xr8, 10 2270 xvssrarni.hu.w xr11, xr9, 10 2271 xvssrlni.bu.h xr11, xr10, 0 2272 xvshuf4i.w xr8, xr11, 0x4E 2273 xvilvl.b xr15, xr8, xr11 2274 xvpermi.d xr16, xr15, 0xd8 2275 vst vr16, a0, 0 2276 add.d a0, a0, a1 2277 xvpermi.q xr16, xr16, 0x01 2278 vst vr16, a0, 0 2279 add.d a0, a0, a1 2280 2281 xvhaddw.w.h xr4, xr4, xr4 2282 xvhaddw.w.h xr5, xr5, xr5 2283 xvadd.h xr4, xr5, xr4 2284 xvpickev.h xr6, xr4, xr4 2285 xvpermi.d xr7, xr6, 0x08 2286 vsub.h vr7, vr7, vr21 2287 vssrarni.bu.h vr7, vr7, 2 2288 vstelm.d vr7, a6, 0, 0 2289 2290 addi.d a2, a2, 64 2291 addi.d a3, a3, 64 2292 addi.d a6, a6, 8 2293 blt zero, a5, .WMASK420_W16_LASX 2294 b .END_W420_LASX 2295 2296.WMASK420_W32_LASX: 2297.WMASK420_W64_LASX: 2298.WMASK420_W128_LASX: 2299 2300.LOOP_W32_420_LASX: 2301 add.d t1, a2, zero 2302 add.d t2, a3, zero 2303 add.d t3, a0, zero 2304 add.d t4, a6, zero 2305 alsl.d 
t5, a4, t1, 1 2306 alsl.d t6, a4, t2, 1 2307 or t7, a4, a4 2308.W32_420_LASX: 2309 xvld xr0, t1, 0 2310 xvld xr1, t2, 0 2311 xvld xr2, t5, 0 2312 xvld xr3, t6, 0 2313 addi.d t1, t1, 32 2314 addi.d t2, t2, 32 2315 addi.d t5, t5, 32 2316 addi.d t6, t6, 32 2317 addi.w t7, t7, -16 2318 xvabsd.h xr4, xr0, xr1 2319 xvabsd.h xr5, xr2, xr3 2320 xvaddi.hu xr4, xr4, 8 2321 xvaddi.hu xr5, xr5, 8 2322 xvsrli.h xr4, xr4, 8 2323 xvsrli.h xr5, xr5, 8 2324 xvadd.h xr4, xr4, xr22 2325 xvadd.h xr5, xr5, xr22 2326 xvmin.hu xr6, xr4, xr20 2327 xvmin.hu xr7, xr5, xr20 2328 xvsub.h xr8, xr20, xr6 2329 xvsub.h xr9, xr20, xr7 2330 xvmulwev.w.h xr10, xr6, xr0 2331 xvmulwod.w.h xr11, xr6, xr0 2332 xvmulwev.w.h xr12, xr7, xr2 2333 xvmulwod.w.h xr13, xr7, xr2 2334 xvmaddwev.w.h xr10, xr8, xr1 2335 xvmaddwod.w.h xr11, xr8, xr1 2336 xvmaddwev.w.h xr12, xr9, xr3 2337 xvmaddwod.w.h xr13, xr9, xr3 2338 xvssrarni.hu.w xr12, xr10, 10 2339 xvssrarni.hu.w xr13, xr11, 10 2340 xvssrlni.bu.h xr13, xr12, 0 2341 xvshuf4i.w xr10, xr13, 0x4E 2342 xvilvl.b xr17, xr10, xr13 2343 xvpermi.d xr18, xr17, 0x08 2344 xvpermi.d xr19, xr17, 0x0d 2345 vst vr18, t3, 0 2346 vstx vr19, t3, a1 2347 addi.d t3, t3, 16 2348 2349 xvhaddw.w.h xr6, xr6, xr6 2350 xvhaddw.w.h xr7, xr7, xr7 2351 xvadd.h xr6, xr7, xr6 2352 xvpickev.h xr7, xr6, xr6 2353 xvpermi.d xr8, xr7, 0x08 2354 vsub.h vr9, vr8, vr21 2355 vssrarni.bu.h vr9, vr9, 2 2356 vstelm.d vr9, t4, 0, 0 2357 addi.d t4, t4, 8 2358 bne t7, zero, .W32_420_LASX 2359 2360 alsl.d a2, a4, a2, 2 2361 alsl.d a3, a4, a3, 2 2362 alsl.d a0, a1, a0, 1 2363 srai.w t8, a4, 1 2364 add.d a6, a6, t8 2365 addi.w a5, a5, -2 2366 blt zero, a5, .LOOP_W32_420_LASX 2367 2368.END_W420_LASX: 2369endfunc 2370 2371#undef bpc_sh 2372#undef bpcw_sh 2373 2374.macro vhaddw.d.h in0 2375 vhaddw.w.h \in0, \in0, \in0 2376 vhaddw.d.w \in0, \in0, \in0 2377.endm 2378.macro vhaddw.q.w in0 2379 vhaddw.d.w \in0, \in0, \in0 2380 vhaddw.q.d \in0, \in0, \in0 2381.endm 2382.macro PUT_H_8W in0 2383 vshuf.b vr2, \in0, \in0, vr6 2384 vshuf.b vr3, \in0, \in0, vr7 2385 vshuf.b vr4, \in0, \in0, vr8 2386 vmulwev.h.bu.b vr12, vr2, vr10 2387 vmulwev.h.bu.b vr13, vr3, vr11 2388 vmulwev.h.bu.b vr14, vr3, vr10 2389 vmulwev.h.bu.b vr15, vr4, vr11 2390 vmaddwod.h.bu.b vr12, vr2, vr10 2391 vmaddwod.h.bu.b vr13, vr3, vr11 2392 vmaddwod.h.bu.b vr14, vr3, vr10 2393 vmaddwod.h.bu.b vr15, vr4, vr11 2394 vadd.h vr12, vr12, vr13 2395 vadd.h vr14, vr14, vr15 2396 vhaddw.w.h vr12, vr12, vr12 2397 vhaddw.w.h vr14, vr14, vr14 2398 vpickev.h \in0, vr14, vr12 2399 vadd.h \in0, \in0, vr9 2400.endm 2401 2402const subpel_h_shuf0 2403.byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20 2404endconst 2405const subpel_h_shuf1 2406.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 2407endconst 2408const subpel_h_shuf2 2409.byte 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 2410.byte 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 2411endconst 2412const subpel_h_shuf3 2413.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 2414.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 2415endconst 2416 2417.macro FILTER_8TAP_8W in0 2418 vshuf.b vr13, \in0, \in0, vr7 2419 vshuf.b vr14, \in0, \in0, vr11 2420 vshuf.b vr15, \in0, \in0, vr12 2421 vmulwev.h.bu.b vr16, vr13, vr8 2422 vmulwev.h.bu.b vr17, vr14, vr10 2423 vmulwev.h.bu.b vr18, vr14, vr8 2424 vmulwev.h.bu.b vr19, vr15, vr10 2425 vmaddwod.h.bu.b vr16, vr13, vr8 2426 vmaddwod.h.bu.b vr17, vr14, vr10 2427 vmaddwod.h.bu.b vr18, vr14, vr8 2428 vmaddwod.h.bu.b vr19, vr15, vr10 2429 vadd.h vr16, vr16, vr17 2430 
vadd.h vr18, vr18, vr19 2431 vhaddw.w.h vr16, vr16, vr16 2432 vhaddw.w.h \in0, vr18, vr18 2433 vssrarni.h.w \in0, vr16, 2 2434.endm 2435 2436.macro PUT_8TAP_8BPC_LSX lable 2437 li.w t0, 4 2438 la.local t6, dav1d_mc_subpel_filters 2439 slli.d t2, a3, 1 //src_stride*2 2440 add.d t3, t2, a3 //src_stride*3 2441 slli.d t4, t2, 1 //src_stride*4 2442 2443 bnez a6, .l_\lable\()put_h //mx 2444 bnez a7, .l_\lable\()put_v //my 2445 2446 clz.w t1, a4 2447 li.w t5, 24 2448 sub.w t1, t1, t5 2449 la.local t5, .l_\lable\()put_hv0_jtable 2450 alsl.d t1, t1, t5, 3 2451 ld.d t6, t1, 0 2452 add.d t5, t5, t6 2453 jirl $r0, t5, 0 2454 2455 .align 3 2456.l_\lable\()put_hv0_jtable: 2457 .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable 2458 .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable 2459 .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable 2460 .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable 2461 .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable 2462 .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable 2463 .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable 2464 2465.l_\lable\()put_hv0_2w: 2466 vldrepl.h vr0, a2, 0 2467 add.d a2, a2, a3 2468 vldrepl.h vr1, a2, 0 2469 vstelm.h vr0, a0, 0, 0 2470 add.d a0, a0, a1 2471 vstelm.h vr1, a0, 0, 0 2472 add.d a2, a2, a3 2473 add.d a0, a0, a1 2474 addi.w a5, a5, -2 2475 bnez a5, .l_\lable\()put_hv0_2w 2476 b .l_\lable\()end_put_8tap 2477.l_\lable\()put_hv0_4w: 2478 fld.s f0, a2, 0 2479 fldx.s f1, a2, a3 2480 fst.s f0, a0, 0 2481 fstx.s f1, a0, a1 2482 alsl.d a2, a3, a2, 1 2483 alsl.d a0, a1, a0, 1 2484 addi.w a5, a5, -2 2485 bnez a5, .l_\lable\()put_hv0_4w 2486 b .l_\lable\()end_put_8tap 2487.l_\lable\()put_hv0_8w: 2488 fld.d f0, a2, 0 2489 fldx.d f1, a2, a3 2490 fst.d f0, a0, 0 2491 fstx.d f1, a0, a1 2492 alsl.d a2, a3, a2, 1 2493 alsl.d a0, a1, a0, 1 2494 addi.w a5, a5, -2 2495 bnez a5, .l_\lable\()put_hv0_8w 2496 b .l_\lable\()end_put_8tap 2497.l_\lable\()put_hv0_16w: 2498 vld vr0, a2, 0 2499 vldx vr1, a2, a3 2500 vst vr0, a0, 0 2501 vstx vr1, a0, a1 2502 alsl.d a2, a3, a2, 1 2503 alsl.d a0, a1, a0, 1 2504 addi.w a5, a5, -2 2505 bnez a5, .l_\lable\()put_hv0_16w 2506 b .l_\lable\()end_put_8tap 2507.l_\lable\()put_hv0_32w: 2508 vld vr0, a2, 0 2509 vld vr1, a2, 16 2510 add.d a2, a2, a3 2511 vld vr2, a2, 0 2512 vld vr3, a2, 16 2513 vst vr0, a0, 0 2514 vst vr1, a0, 16 2515 add.d a0, a0, a1 2516 vst vr2, a0, 0 2517 vst vr3, a0, 16 2518 add.d a2, a2, a3 2519 add.d a0, a0, a1 2520 addi.w a5, a5, -2 2521 bnez a5, .l_\lable\()put_hv0_32w 2522 b .l_\lable\()end_put_8tap 2523.l_\lable\()put_hv0_64w: 2524 vld vr0, a2, 0 2525 vld vr1, a2, 16 2526 vld vr2, a2, 32 2527 vld vr3, a2, 48 2528 add.d a2, a2, a3 2529 vld vr4, a2, 0 2530 vld vr5, a2, 16 2531 vld vr6, a2, 32 2532 vld vr7, a2, 48 2533 add.d a2, a2, a3 2534 vst vr0, a0, 0 2535 vst vr1, a0, 16 2536 vst vr2, a0, 32 2537 vst vr3, a0, 48 2538 add.d a0, a0, a1 2539 vst vr4, a0, 0 2540 vst vr5, a0, 16 2541 vst vr6, a0, 32 2542 vst vr7, a0, 48 2543 add.d a0, a0, a1 2544 addi.w a5, a5, -2 2545 bnez a5, .l_\lable\()put_hv0_64w 2546 b .l_\lable\()end_put_8tap 2547.l_\lable\()put_hv0_128w: 2548 vld vr0, a2, 0 2549 vld vr1, a2, 16 2550 vld vr2, a2, 32 2551 vld vr3, a2, 48 2552 vld vr4, a2, 64 2553 vld vr5, a2, 80 2554 vld vr6, a2, 96 2555 vld vr7, a2, 112 2556 add.d a2, a2, a3 2557 vld vr8, a2, 0 2558 vld vr9, a2, 16 2559 vld vr10, a2, 32 2560 vld vr11, a2, 48 2561 vld vr12, a2, 64 2562 vld vr13, a2, 80 2563 vld vr14, a2, 96 2564 vld vr15, a2, 112 2565 add.d a2, a2, a3 
2566 vst vr0, a0, 0 2567 vst vr1, a0, 16 2568 vst vr2, a0, 32 2569 vst vr3, a0, 48 2570 vst vr4, a0, 64 2571 vst vr5, a0, 80 2572 vst vr6, a0, 96 2573 vst vr7, a0, 112 2574 add.d a0, a0, a1 2575 vst vr8, a0, 0 2576 vst vr9, a0, 16 2577 vst vr10, a0, 32 2578 vst vr11, a0, 48 2579 vst vr12, a0, 64 2580 vst vr13, a0, 80 2581 vst vr14, a0, 96 2582 vst vr15, a0, 112 2583 add.d a0, a0, a1 2584 addi.w a5, a5, -2 2585 bnez a5, .l_\lable\()put_hv0_128w 2586 b .l_\lable\()end_put_8tap 2587 2588.l_\lable\()put_h: 2589 bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) 2590 ld.d t5, sp, 0 //filter_type 2591 andi t1, t5, 3 2592 blt t0, a4, .l_\lable\()put_h_idx_fh 2593 andi t1, t5, 1 2594 addi.w t1, t1, 3 2595 2596.l_\lable\()put_h_idx_fh: 2597 addi.w t5, zero, 120 2598 mul.w t1, t1, t5 2599 addi.w t5, a6, -1 2600 slli.w t5, t5, 3 2601 add.w t1, t1, t5 2602 add.d t7, t6, t1 //fh's offset 2603 li.w t1, 34 2604 vreplgr2vr.h vr9, t1 2605 2606 clz.w t1, a4 2607 li.w t5, 24 2608 sub.w t1, t1, t5 2609 la.local t5, .l_\lable\()put_h_jtable 2610 alsl.d t1, t1, t5, 3 2611 ld.d t6, t1, 0 2612 add.d t5, t5, t6 2613 jirl $r0, t5, 0 2614 2615 .align 3 2616.l_\lable\()put_h_jtable: 2617 .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable 2618 .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable 2619 .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable 2620 .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable 2621 .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable 2622 .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable 2623 .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable 2624 2625.l_\lable\()put_h_2w: 2626 addi.d t7, t7, 2 2627 addi.d a2, a2, -1 2628 vldrepl.w vr8, t7, 0 2629 la.local t7, subpel_h_shuf0 2630 vld vr7, t7, 0 2631.l_\lable\()put_h_2w_loop: 2632 vld vr0, a2, 0 2633 vldx vr1, a2, a3 2634 add.d a2, a2, t2 2635 2636 vshuf.b vr0, vr1, vr0, vr7 2637 vdp2.h.bu.b vr1, vr0, vr8 2638 vhaddw.w.h vr0, vr1, vr1 2639 vpickev.h vr0, vr0, vr0 2640 vadd.h vr0, vr0, vr9 2641 vssrani.bu.h vr0, vr0, 6 2642 2643 vstelm.h vr0, a0, 0, 0 2644 add.d a0, a0, a1 2645 vstelm.h vr0, a0, 0, 1 2646 add.d a0, a0, a1 2647 addi.w a5, a5, -2 2648 bnez a5, .l_\lable\()put_h_2w_loop 2649 b .l_\lable\()end_put_8tap 2650 2651.l_\lable\()put_h_4w: 2652 addi.d t7, t7, 2 2653 addi.d a2, a2, -1 2654 vldrepl.w vr8, t7, 0 2655 la.local t7, subpel_h_shuf1 2656 vld vr7, t7, 0 2657.l_\lable\()put_h_4w_loop: 2658 vld vr0, a2, 0 2659 vldx vr1, a2, a3 2660 add.d a2, a2, t2 2661 2662 vshuf.b vr0, vr0, vr0, vr7 2663 vshuf.b vr1, vr1, vr1, vr7 2664 vmulwev.h.bu.b vr2, vr0, vr8 2665 vmulwev.h.bu.b vr3, vr1, vr8 2666 vmaddwod.h.bu.b vr2, vr0, vr8 2667 vmaddwod.h.bu.b vr3, vr1, vr8 2668 vhaddw.w.h vr0, vr2, vr2 2669 vhaddw.w.h vr1, vr3, vr3 2670 vpickev.h vr0, vr1, vr0 2671 vadd.h vr0, vr0, vr9 2672 vssrani.bu.h vr0, vr0, 6 2673 2674 vstelm.w vr0, a0, 0, 0 2675 add.d a0, a0, a1 2676 vstelm.w vr0, a0, 0, 1 2677 add.d a0, a0, a1 2678 addi.d a5, a5, -2 2679 bnez a5, .l_\lable\()put_h_4w_loop 2680 b .l_\lable\()end_put_8tap 2681 2682.l_\lable\()put_h_8w: 2683 fld.d f10, t7, 0 2684 vreplvei.w vr11, vr10, 1 2685 vreplvei.w vr10, vr10, 0 2686 la.local t7, subpel_h_shuf1 2687 vld vr6, t7, 0 2688 vaddi.bu vr7, vr6, 4 2689 vaddi.bu vr8, vr6, 8 2690 addi.d a2, a2, -3 2691.l_\lable\()put_h_8w_loop: 2692 vld vr0, a2, 0 2693 vldx vr1, a2, a3 2694 add.d a2, a2, t2 2695 PUT_H_8W vr0 2696 PUT_H_8W vr1 2697 vssrani.bu.h vr1, vr0, 6 2698 vstelm.d vr1, a0, 0, 0 2699 add.d a0, a0, a1 2700 vstelm.d vr1, a0, 0, 1 2701 add.d a0, a0, a1 2702 addi.w a5, a5, 
-2 2703 bnez a5, .l_\lable\()put_h_8w_loop 2704 b .l_\lable\()end_put_8tap 2705 2706.l_\lable\()put_h_16w: 2707.l_\lable\()put_h_32w: 2708.l_\lable\()put_h_64w: 2709.l_\lable\()put_h_128w: 2710 fld.d f10, t7, 0 2711 vreplvei.w vr11, vr10, 1 2712 vreplvei.w vr10, vr10, 0 2713 la.local t7, subpel_h_shuf1 2714 vld vr6, t7, 0 2715 vaddi.bu vr7, vr6, 4 2716 vaddi.bu vr8, vr6, 8 2717 addi.d a2, a2, -3 2718 addi.d t0, a2, 0 //src 2719 addi.w t5, a5, 0 //h 2720 addi.d t8, a0, 0 //dst 2721.l_\lable\()put_h_16w_loop: 2722 vld vr0, a2, 0 2723 vld vr1, a2, 8 2724 add.d a2, a2, a3 2725 PUT_H_8W vr0 2726 PUT_H_8W vr1 2727 vssrani.bu.h vr1, vr0, 6 2728 vst vr1, a0, 0 2729 add.d a0, a0, a1 2730 addi.d a5, a5, -1 2731 bnez a5, .l_\lable\()put_h_16w_loop 2732 addi.d a2, t0, 16 2733 addi.d t0, t0, 16 2734 addi.d a0, t8, 16 2735 addi.d t8, t8, 16 2736 addi.w a5, t5, 0 2737 addi.w a4, a4, -16 2738 bnez a4, .l_\lable\()put_h_16w_loop 2739 b .l_\lable\()end_put_8tap 2740 2741.l_\lable\()put_v: 2742 ld.d t1, sp, 0 //filter_type 2743 srli.w t1, t1, 2 2744 blt t0, a5, .l_\lable\()put_v_idx_fv 2745 andi t1, t1, 1 2746 addi.w t1, t1, 3 2747 2748.l_\lable\()put_v_idx_fv: 2749 addi.w t5, zero, 120 2750 mul.w t1, t1, t5 2751 addi.w t5, a7, -1 2752 slli.w t5, t5, 3 2753 add.w t1, t1, t5 2754 add.d t1, t6, t1 //fv's offset 2755 vldrepl.d vr8, t1, 0 2756 sub.d a2, a2, t3 2757 2758 vilvl.h vr8, vr8, vr8 2759 vreplvei.w vr9, vr8, 1 2760 vreplvei.w vr10, vr8, 2 2761 vreplvei.w vr11, vr8, 3 2762 vreplvei.w vr8, vr8, 0 2763 2764 clz.w t1, a4 2765 li.w t5, 24 2766 sub.w t1, t1, t5 2767 la.local t5, .l_\lable\()put_v_jtable 2768 alsl.d t1, t1, t5, 3 2769 ld.d t6, t1, 0 2770 add.d t5, t5, t6 2771 jirl $r0, t5, 0 2772 2773 .align 3 2774.l_\lable\()put_v_jtable: 2775 .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable 2776 .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable 2777 .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable 2778 .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable 2779 .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable 2780 .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable 2781 .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable 2782 2783.l_\lable\()put_v_2w: 2784 fld.s f0, a2, 0 2785 fldx.s f1, a2, a3 2786 fldx.s f2, a2, t2 2787 add.d a2, a2, t3 2788 fld.s f3, a2, 0 2789 fldx.s f4, a2, a3 2790 fldx.s f5, a2, t2 2791 fldx.s f6, a2, t3 2792 add.d a2, a2, t4 2793 2794 vilvl.h vr0, vr1, vr0 //0 1 2795 vilvl.h vr1, vr2, vr1 //1 2 2796 vilvl.b vr0, vr1, vr0 //01 12 2797 vilvl.h vr2, vr3, vr2 //2 3 2798 vilvl.h vr3, vr4, vr3 //3 4 2799 vilvl.b vr1, vr3, vr2 //23 34 2800 vilvl.h vr2, vr5, vr4 //4 5 2801 vilvl.h vr3, vr6, vr5 //5 6 2802 vilvl.b vr2, vr3, vr2 //45 56 2803.l_\lable\()put_v_2w_loop: 2804 fld.s f7, a2, 0 2805 vilvl.h vr3, vr7, vr6 //6 7 2806 fldx.s f6, a2, a3 2807 add.d a2, a2, t2 2808 vilvl.h vr4, vr6, vr7 //7 8 2809 vilvl.b vr3, vr4, vr3 //67 78 2810 2811 vmulwev.h.bu.b vr12, vr0, vr8 2812 vmulwev.h.bu.b vr13, vr1, vr9 2813 vmulwev.h.bu.b vr14, vr2, vr10 2814 vmulwev.h.bu.b vr15, vr3, vr11 2815 vmaddwod.h.bu.b vr12, vr0, vr8 2816 vmaddwod.h.bu.b vr13, vr1, vr9 2817 vmaddwod.h.bu.b vr14, vr2, vr10 2818 vmaddwod.h.bu.b vr15, vr3, vr11 2819 vaddi.hu vr0, vr1, 0 2820 vaddi.hu vr1, vr2, 0 2821 vaddi.hu vr2, vr3, 0 2822 vadd.h vr12, vr12, vr13 2823 vadd.h vr12, vr12, vr14 2824 vadd.h vr12, vr12, vr15 2825 2826 vssrarni.bu.h vr12, vr12, 6 2827 vstelm.h vr12, a0, 0, 0 2828 add.d a0, a0, a1 2829 vstelm.h vr12, a0, 0, 1 2830 add.d a0, a0, a1 2831 addi.w a5, a5, -2 2832 bnez a5, 
.l_\lable\()put_v_2w_loop 2833 b .l_\lable\()end_put_8tap 2834 2835.l_\lable\()put_v_4w: 2836 fld.s f0, a2, 0 2837 fldx.s f1, a2, a3 2838 fldx.s f2, a2, t2 2839 add.d a2, a2, t3 2840 fld.s f3, a2, 0 2841 fldx.s f4, a2, a3 2842 fldx.s f5, a2, t2 2843 fldx.s f6, a2, t3 2844 add.d a2, a2, t4 2845 2846 vilvl.w vr0, vr1, vr0 2847 vilvl.w vr1, vr2, vr1 2848 vilvl.b vr0, vr1, vr0 2849 vilvl.w vr1, vr3, vr2 2850 vilvl.w vr2, vr4, vr3 2851 vilvl.b vr1, vr2, vr1 2852 vilvl.w vr2, vr5, vr4 2853 vilvl.w vr3, vr6, vr5 2854 vilvl.b vr2, vr3, vr2 2855.l_\lable\()put_v_4w_loop: 2856 fld.s f7, a2, 0 2857 2858 vilvl.w vr3, vr7, vr6 2859 fldx.s f6, a2, a3 2860 add.d a2, a2, t2 2861 vilvl.w vr4, vr6, vr7 2862 vilvl.b vr3, vr4, vr3 2863 2864 vmulwev.h.bu.b vr12, vr0, vr8 2865 vmulwev.h.bu.b vr13, vr1, vr9 2866 vmulwev.h.bu.b vr14, vr2, vr10 2867 vmulwev.h.bu.b vr15, vr3, vr11 2868 vmaddwod.h.bu.b vr12, vr0, vr8 2869 vmaddwod.h.bu.b vr13, vr1, vr9 2870 vmaddwod.h.bu.b vr14, vr2, vr10 2871 vmaddwod.h.bu.b vr15, vr3, vr11 2872 vaddi.hu vr0, vr1, 0 2873 vaddi.hu vr1, vr2, 0 2874 vaddi.hu vr2, vr3, 0 2875 vadd.h vr12, vr12, vr13 2876 vadd.h vr12, vr12, vr14 2877 vadd.h vr12, vr12, vr15 2878 2879 vssrarni.bu.h vr12, vr12, 6 2880 vstelm.w vr12, a0, 0, 0 2881 add.d a0, a0, a1 2882 vstelm.w vr12, a0, 0, 1 2883 add.d a0, a0, a1 2884 addi.w a5, a5, -2 2885 bnez a5, .l_\lable\()put_v_4w_loop 2886 b .l_\lable\()end_put_8tap 2887 2888.l_\lable\()put_v_8w: 2889.l_\lable\()put_v_16w: 2890.l_\lable\()put_v_32w: 2891.l_\lable\()put_v_64w: 2892.l_\lable\()put_v_128w: 2893 addi.d t0, a2, 0 //src 2894 addi.d t5, a5, 0 //h 2895 addi.d t8, a0, 0 //dst 2896.l_\lable\()put_v_8w_loop0: 2897 fld.d f0, a2, 0 2898 fldx.d f1, a2, a3 2899 fldx.d f2, a2, t2 2900 add.d a2, a2, t3 2901 fld.d f3, a2, 0 2902 fldx.d f4, a2, a3 2903 fldx.d f5, a2, t2 2904 fldx.d f6, a2, t3 2905 add.d a2, a2, t4 2906 2907 vilvl.b vr0, vr1, vr0 //0 1 2908 vilvl.b vr1, vr2, vr1 //1 2 2909 vilvl.b vr2, vr3, vr2 //2 3 2910 vilvl.b vr3, vr4, vr3 //3 4 2911 vilvl.b vr4, vr5, vr4 //4 5 2912 vilvl.b vr5, vr6, vr5 //5 6 2913.l_\lable\()put_v_8w_loop: 2914 fld.d f7, a2, 0 2915 vilvl.b vr12, vr7, vr6 //6 7 2916 fldx.d f6, a2, a3 2917 add.d a2, a2, t2 2918 vilvl.b vr13, vr6, vr7 //7 8 2919 2920 vmulwev.h.bu.b vr14, vr0, vr8 2921 vmulwev.h.bu.b vr15, vr1, vr8 2922 vmulwev.h.bu.b vr16, vr2, vr9 2923 vmulwev.h.bu.b vr17, vr3, vr9 2924 vmulwev.h.bu.b vr18, vr4, vr10 2925 vmulwev.h.bu.b vr19, vr5, vr10 2926 vmulwev.h.bu.b vr20, vr12, vr11 2927 vmulwev.h.bu.b vr21, vr13, vr11 2928 vmaddwod.h.bu.b vr14, vr0, vr8 2929 vmaddwod.h.bu.b vr15, vr1, vr8 2930 vmaddwod.h.bu.b vr16, vr2, vr9 2931 vmaddwod.h.bu.b vr17, vr3, vr9 2932 vmaddwod.h.bu.b vr18, vr4, vr10 2933 vmaddwod.h.bu.b vr19, vr5, vr10 2934 vmaddwod.h.bu.b vr20, vr12, vr11 2935 vmaddwod.h.bu.b vr21, vr13, vr11 2936 2937 vaddi.hu vr0, vr2, 0 2938 vaddi.hu vr1, vr3, 0 2939 vaddi.hu vr2, vr4, 0 2940 vaddi.hu vr3, vr5, 0 2941 vaddi.hu vr4, vr12, 0 2942 vaddi.hu vr5, vr13, 0 2943 vadd.h vr14, vr14, vr16 2944 vadd.h vr14, vr14, vr18 2945 vadd.h vr14, vr14, vr20 2946 vadd.h vr15, vr15, vr17 2947 vadd.h vr15, vr15, vr19 2948 vadd.h vr15, vr15, vr21 2949 2950 vssrarni.bu.h vr15, vr14, 6 2951 vstelm.d vr15, a0, 0, 0 2952 add.d a0, a0, a1 2953 vstelm.d vr15, a0, 0, 1 2954 add.d a0, a0, a1 2955 addi.w a5, a5, -2 2956 bnez a5, .l_\lable\()put_v_8w_loop 2957 addi.d a2, t0, 8 2958 addi.d t0, t0, 8 2959 addi.d a0, t8, 8 2960 addi.d t8, t8, 8 2961 addi.d a5, t5, 0 2962 addi.w a4, a4, -8 2963 bnez a4, .l_\lable\()put_v_8w_loop0 2964 b 
.l_\lable\()end_put_8tap 2965 2966.l_\lable\()put_hv: 2967 ld.d t5, sp, 0 //filter_type 2968 andi t1, t5, 3 2969 blt t0, a4, .l_\lable\()put_hv_idx_fh 2970 andi t1, t5, 1 2971 addi.w t1, t1, 3 2972.l_\lable\()put_hv_idx_fh: 2973 addi.w t5, zero, 120 2974 mul.w t1, t1, t5 2975 addi.w t5, a6, -1 2976 slli.w t5, t5, 3 2977 add.w t1, t1, t5 2978 add.d t1, t6, t1 //fh's offset 2979 vldrepl.d vr8, t1, 0 2980 ld.d t1, sp, 0 //filter_type 2981 srli.w t1, t1, 2 2982 blt t0, a5, .l_\lable\()put_hv_idx_fv 2983 andi t1, t1, 1 2984 addi.w t1, t1, 3 2985.l_\lable\()put_hv_idx_fv: 2986 addi.w t5, zero, 120 2987 mul.w t1, t1, t5 2988 addi.w t5, a7, -1 2989 slli.w t5, t5, 3 2990 add.w t1, t1, t5 2991 add.d t1, t6, t1 //fv's offset 2992 vldrepl.d vr9, t1, 0 2993 vexth.h.b vr9, vr9 2994 2995 sub.d a2, a2, t3 2996 addi.d a2, a2, -3 2997 2998 clz.w t1, a4 2999 li.w t5, 24 3000 sub.w t1, t1, t5 3001 la.local t5, .l_\lable\()put_hv_jtable 3002 alsl.d t1, t1, t5, 3 3003 ld.d t6, t1, 0 3004 add.d t5, t5, t6 3005 jirl $r0, t5, 0 3006 3007 .align 3 3008.l_\lable\()put_hv_jtable: 3009 .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable 3010 .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable 3011 .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable 3012 .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable 3013 .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable 3014 .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable 3015 .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable 3016 3017.l_\lable\()put_hv_2w: 3018 addi.d a2, a2, 2 3019 vld vr0, a2, 0 3020 vldx vr1, a2, a3 3021 vldx vr2, a2, t2 3022 add.d a2, a2, t3 3023 vld vr3, a2, 0 3024 vldx vr4, a2, a3 3025 vldx vr5, a2, t2 3026 vldx vr6, a2, t3 3027 add.d a2, a2, t4 3028 3029 la.local t1, subpel_h_shuf0 3030 vld vr7, t1, 0 3031 vbsrl.v vr8, vr8, 2 3032 vreplvei.w vr8, vr8, 0 3033 3034 //fv 3035 vreplvei.w vr14, vr9, 1 3036 vreplvei.w vr15, vr9, 2 3037 vreplvei.w vr16, vr9, 3 3038 vreplvei.w vr9, vr9, 0 3039 3040 vshuf.b vr0, vr1, vr0, vr7 3041 vshuf.b vr1, vr3, vr2, vr7 3042 vshuf.b vr2, vr5, vr4, vr7 3043 vshuf.b vr3, vr6, vr6, vr7 3044 vmulwev.h.bu.b vr10, vr0, vr8 3045 vmulwev.h.bu.b vr11, vr1, vr8 3046 vmulwev.h.bu.b vr12, vr2, vr8 3047 vmulwev.h.bu.b vr13, vr3, vr8 3048 vmaddwod.h.bu.b vr10, vr0, vr8 3049 vmaddwod.h.bu.b vr11, vr1, vr8 3050 vmaddwod.h.bu.b vr12, vr2, vr8 3051 vmaddwod.h.bu.b vr13, vr3, vr8 3052 vhaddw.w.h vr0, vr10, vr10 3053 vhaddw.w.h vr1, vr11, vr11 3054 vssrarni.h.w vr1, vr0, 2 //h0 h1 h2 h3 3055 vhaddw.w.h vr2, vr12, vr12 3056 vhaddw.w.h vr3, vr13, vr13 3057 vssrarni.h.w vr3, vr2, 2 //h4 h5 h6 ~ 3058 vbsrl.v vr2, vr1, 4 3059 vextrins.w vr2, vr3, 0x30 //h1 h2 h3 h4 3060 vilvl.h vr4, vr2, vr1 //h0 h1 h1 h2 -- 3061 vilvh.h vr5, vr2, vr1 //h2 h3 h3 h4 -- 3062 vbsrl.v vr6, vr3, 4 3063 vilvl.h vr6, vr6, vr3 //h4 h5 h5 h6 -- 3064 vbsrl.v vr3, vr3, 8 //h6 ~ 3065.l_\lable\()put_hv_2w_loop: 3066 vld vr0, a2, 0 3067 vldx vr2, a2, a3 3068 add.d a2, a2, t2 3069 vshuf.b vr0, vr2, vr0, vr7 3070 vdp2.h.bu.b vr17, vr0, vr8 3071 vhaddw.w.h vr17, vr17, vr17 3072 vssrarni.h.w vr17, vr17, 2 //h7 h8 3073 vextrins.w vr3, vr17, 0x10 //h6 h7 3074 vilvl.h vr3, vr17, vr3 //h6 h7 h7 h8 -- 3075 3076 vmulwev.w.h vr18, vr4, vr9 3077 vmulwev.w.h vr19, vr5, vr14 3078 vmulwev.w.h vr20, vr6, vr15 3079 vmulwev.w.h vr21, vr3, vr16 3080 vmaddwod.w.h vr18, vr4, vr9 3081 vmaddwod.w.h vr19, vr5, vr14 3082 vmaddwod.w.h vr20, vr6, vr15 3083 vmaddwod.w.h vr21, vr3, vr16 3084 vaddi.hu vr4, vr5, 0 3085 vaddi.hu vr5, vr6, 0 3086 vaddi.hu vr6, vr3, 0 
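    // slide the window of paired horizontal results down two rows: vr4/vr5/vr6
    // now hold the 23/34, 45/56 and 67/78 pairs, and vr3 is refilled with the
    // newest intermediate (h8) by the next instruction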
3087 vbsrl.v vr3, vr17, 4 //h8 ~ 3088 vadd.w vr18, vr18, vr19 3089 vadd.w vr18, vr18, vr20 3090 vadd.w vr18, vr18, vr21 3091 3092 vssrarni.hu.w vr0, vr18, 10 3093 vssrani.bu.h vr0, vr0, 0 3094 vstelm.h vr0, a0, 0, 0 3095 add.d a0, a0, a1 3096 vstelm.h vr0, a0, 0, 1 3097 add.d a0, a0, a1 3098 addi.d a5, a5, -2 3099 bnez a5, .l_\lable\()put_hv_2w_loop 3100 b .l_\lable\()end_put_8tap 3101 3102.l_\lable\()put_hv_4w: 3103 addi.d a2, a2, 2 //ignore leading 0 3104 vld vr0, a2, 0 3105 vldx vr1, a2, a3 3106 vldx vr2, a2, t2 3107 add.d a2, a2, t3 3108 vld vr3, a2, 0 3109 vldx vr4, a2, a3 3110 vldx vr5, a2, t2 3111 vldx vr6, a2, t3 3112 add.d a2, a2, t4 3113 3114 la.local t1, subpel_h_shuf1 3115 vld vr7, t1, 0 3116 vbsrl.v vr8, vr8, 2 3117 vreplvei.w vr8, vr8, 0 3118 3119 //fv 3120 vreplvei.w vr17, vr9, 0 3121 vreplvei.w vr18, vr9, 1 3122 vreplvei.w vr19, vr9, 2 3123 vreplvei.w vr20, vr9, 3 3124 3125 //DAV1D_FILTER_8TAP_RND 3126 vshuf.b vr0, vr0, vr0, vr7 3127 vshuf.b vr1, vr1, vr1, vr7 3128 vshuf.b vr2, vr2, vr2, vr7 3129 vshuf.b vr3, vr3, vr3, vr7 3130 vshuf.b vr4, vr4, vr4, vr7 3131 vshuf.b vr5, vr5, vr5, vr7 3132 vshuf.b vr6, vr6, vr6, vr7 3133 3134 vmulwev.h.bu.b vr10, vr0, vr8 3135 vmulwev.h.bu.b vr11, vr1, vr8 3136 vmulwev.h.bu.b vr12, vr2, vr8 3137 vmulwev.h.bu.b vr13, vr3, vr8 3138 vmulwev.h.bu.b vr14, vr4, vr8 3139 vmulwev.h.bu.b vr15, vr5, vr8 3140 vmulwev.h.bu.b vr16, vr6, vr8 3141 vmaddwod.h.bu.b vr10, vr0, vr8 3142 vmaddwod.h.bu.b vr11, vr1, vr8 3143 vmaddwod.h.bu.b vr12, vr2, vr8 3144 vmaddwod.h.bu.b vr13, vr3, vr8 3145 vmaddwod.h.bu.b vr14, vr4, vr8 3146 vmaddwod.h.bu.b vr15, vr5, vr8 3147 vmaddwod.h.bu.b vr16, vr6, vr8 3148 3149 vhaddw.w.h vr10, vr10, vr10 3150 vhaddw.w.h vr11, vr11, vr11 3151 vhaddw.w.h vr12, vr12, vr12 3152 vhaddw.w.h vr13, vr13, vr13 3153 vhaddw.w.h vr14, vr14, vr14 3154 vhaddw.w.h vr15, vr15, vr15 3155 vhaddw.w.h vr16, vr16, vr16 3156 3157 vssrarni.h.w vr10, vr10, 2 //h0 3158 vssrarni.h.w vr11, vr11, 2 //h1 3159 vssrarni.h.w vr12, vr12, 2 //h2 3160 vssrarni.h.w vr13, vr13, 2 //h3 3161 vssrarni.h.w vr14, vr14, 2 //h4 3162 vssrarni.h.w vr15, vr15, 2 //h5 3163 vssrarni.h.w vr16, vr16, 2 //h6 3164 3165 //h0 3166 vilvl.h vr0, vr11, vr10 //01 3167 vilvl.h vr1, vr13, vr12 //23 3168 vilvl.h vr2, vr15, vr14 //45 3169 //h1 3170 vilvl.h vr4, vr12, vr11 //12 3171 vilvl.h vr5, vr14, vr13 //34 3172 vilvl.h vr6, vr16, vr15 //56 3173 3174.l_\lable\()put_hv_4w_loop: 3175 vld vr9, a2, 0 3176 vldx vr10, a2, a3 3177 add.d a2, a2, t2 3178 3179 //DAV1D_FILTER_8TAP_CLIP 3180 vshuf.b vr9, vr9, vr9, vr7 3181 vshuf.b vr10, vr10, vr10, vr7 3182 vmulwev.h.bu.b vr11, vr9, vr8 3183 vmulwev.h.bu.b vr12, vr10, vr8 3184 vmaddwod.h.bu.b vr11, vr9, vr8 3185 vmaddwod.h.bu.b vr12, vr10, vr8 3186 vhaddw.w.h vr11, vr11, vr11 3187 vhaddw.w.h vr12, vr12, vr12 3188 vssrarni.h.w vr11, vr11, 2 //h7 3189 vssrarni.h.w vr12, vr12, 2 //h8 3190 vilvl.h vr3, vr11, vr16 //67 3191 vilvl.h vr13, vr12, vr11 //78 3192 3193 vmulwev.w.h vr9, vr0, vr17 3194 vmulwev.w.h vr10, vr1, vr18 3195 vmulwev.w.h vr14, vr2, vr19 3196 vmulwev.w.h vr15, vr3, vr20 3197 vmaddwod.w.h vr9, vr0, vr17 3198 vmaddwod.w.h vr10, vr1, vr18 3199 vmaddwod.w.h vr14, vr2, vr19 3200 vmaddwod.w.h vr15, vr3, vr20 3201 vadd.w vr16, vr9, vr10 3202 vadd.w vr16, vr16, vr14 3203 vadd.w vr16, vr16, vr15 3204 3205 vmulwev.w.h vr9, vr4, vr17 3206 vmulwev.w.h vr10, vr5, vr18 3207 vmulwev.w.h vr14, vr6, vr19 3208 vmulwev.w.h vr15, vr13, vr20 3209 vmaddwod.w.h vr9, vr4, vr17 3210 vmaddwod.w.h vr10, vr5, vr18 3211 vmaddwod.w.h vr14, vr6, vr19 3212 vmaddwod.w.h 
vr15, vr13, vr20 3213 vadd.w vr21, vr9, vr10 3214 vadd.w vr21, vr21, vr14 3215 vadd.w vr21, vr21, vr15 3216 3217 vssrarni.hu.w vr21, vr16, 10 3218 vssrani.bu.h vr21, vr21, 0 3219 //cache 3220 vaddi.hu vr0, vr1, 0 3221 vaddi.hu vr1, vr2, 0 3222 vaddi.hu vr2, vr3, 0 3223 vaddi.hu vr4, vr5, 0 3224 vaddi.hu vr5, vr6, 0 3225 vaddi.hu vr6, vr13, 0 3226 vaddi.hu vr16, vr12, 0 3227 3228 vstelm.w vr21, a0, 0, 0 3229 add.d a0, a0, a1 3230 vstelm.w vr21, a0, 0, 1 3231 add.d a0, a0, a1 3232 addi.w a5, a5, -2 3233 bnez a5, .l_\lable\()put_hv_4w_loop 3234 b .l_\lable\()end_put_8tap 3235 3236.l_\lable\()put_hv_8w: 3237.l_\lable\()put_hv_16w: 3238.l_\lable\()put_hv_32w: 3239.l_\lable\()put_hv_64w: 3240.l_\lable\()put_hv_128w: 3241 addi.d sp, sp, -8*8 3242 fst.d f24, sp, 0 3243 fst.d f25, sp, 8 3244 fst.d f26, sp, 16 3245 fst.d f27, sp, 24 3246 fst.d f28, sp, 32 3247 fst.d f29, sp, 40 3248 fst.d f30, sp, 48 3249 fst.d f31, sp, 56 3250 addi.d t0, a2, 0 //src 3251 addi.d t5, a5, 0 //h 3252 addi.d t8, a0, 0 //dst 3253 la.local t1, subpel_h_shuf1 3254 vld vr7, t1, 0 3255 vaddi.bu vr11, vr7, 4 3256 vaddi.bu vr12, vr7, 8 3257 vreplvei.w vr10, vr8, 1 3258 vreplvei.w vr8, vr8, 0 3259 vreplvei.w vr20, vr9, 1 3260 vreplvei.w vr21, vr9, 2 3261 vreplvei.w vr22, vr9, 3 3262 vreplvei.w vr9, vr9, 0 3263.l_\lable\()put_hv_8w_loop0: 3264 vld vr0, a2, 0 3265 vldx vr1, a2, a3 3266 vldx vr2, a2, t2 3267 add.d a2, a2, t3 3268 vld vr3, a2, 0 3269 vldx vr4, a2, a3 3270 vldx vr5, a2, t2 3271 vldx vr6, a2, t3 3272 add.d a2, a2, t4 3273 3274 FILTER_8TAP_8W vr0 //h0 3275 FILTER_8TAP_8W vr1 //h1 3276 FILTER_8TAP_8W vr2 //h2 3277 FILTER_8TAP_8W vr3 //h3 3278 FILTER_8TAP_8W vr4 //h4 3279 FILTER_8TAP_8W vr5 //h5 3280 FILTER_8TAP_8W vr6 //h6 3281 3282 //h0' low part 3283 vilvl.h vr23, vr1, vr0 //01 3284 vilvl.h vr24, vr3, vr2 //23 3285 vilvl.h vr25, vr5, vr4 //45 3286 //h0' high part 3287 vilvh.h vr26, vr1, vr0 //01 3288 vilvh.h vr27, vr3, vr2 //23 3289 vilvh.h vr28, vr5, vr4 //45 3290 3291 //h1' low part 3292 vilvl.h vr29, vr2, vr1 //12 3293 vilvl.h vr30, vr4, vr3 //34 3294 vilvl.h vr31, vr6, vr5 //56 3295 //h1' high part 3296 vilvh.h vr0, vr2, vr1 //12 3297 vilvh.h vr1, vr4, vr3 //34 3298 vilvh.h vr2, vr6, vr5 //56 3299 3300.l_\lable\()put_hv_8w_loop: 3301 vld vr3, a2, 0 3302 vldx vr4, a2, a3 3303 add.d a2, a2, t2 3304 3305 FILTER_8TAP_8W vr3 //h7 3306 FILTER_8TAP_8W vr4 //h8 3307 3308 //h0' low part 3309 vilvl.h vr16, vr3, vr6 //67 ~low 3310 vmulwev.w.h vr13, vr23, vr9 3311 vmulwev.w.h vr14, vr24, vr20 3312 vmulwev.w.h vr15, vr25, vr21 3313 vmulwev.w.h vr17, vr16, vr22 3314 vmaddwod.w.h vr13, vr23, vr9 3315 vmaddwod.w.h vr14, vr24, vr20 3316 vmaddwod.w.h vr15, vr25, vr21 3317 vmaddwod.w.h vr17, vr16, vr22 3318 vadd.w vr13, vr13, vr14 3319 vadd.w vr13, vr13, vr15 3320 vadd.w vr13, vr13, vr17 3321 //cache 3322 vaddi.hu vr23, vr24, 0 3323 vaddi.hu vr24, vr25, 0 3324 vaddi.hu vr25, vr16, 0 3325 3326 //h0' high part 3327 vilvh.h vr17, vr3, vr6 //67 ~high 3328 vmulwev.w.h vr14, vr26, vr9 3329 vmulwev.w.h vr15, vr27, vr20 3330 vmulwev.w.h vr16, vr28, vr21 3331 vmulwev.w.h vr18, vr17, vr22 3332 vmaddwod.w.h vr14, vr26, vr9 3333 vmaddwod.w.h vr15, vr27, vr20 3334 vmaddwod.w.h vr16, vr28, vr21 3335 vmaddwod.w.h vr18, vr17, vr22 3336 vadd.w vr14, vr14, vr15 3337 vadd.w vr14, vr14, vr16 3338 vadd.w vr14, vr14, vr18 3339 vssrarni.hu.w vr14, vr13, 10 3340 vssrarni.bu.h vr5, vr14, 0 3341 vstelm.d vr5, a0, 0, 0 3342 add.d a0, a0, a1 3343 //cache 3344 vaddi.hu vr26, vr27, 0 3345 vaddi.hu vr27, vr28, 0 3346 vaddi.hu vr28, vr17, 0 3347 vaddi.hu vr6, 
vr4, 0

    vilvl.h vr5, vr4, vr3 //78 ~low
    vilvh.h vr4, vr4, vr3 //78 ~high

    //h1' low part
    vmulwev.w.h vr13, vr29, vr9
    vmulwev.w.h vr14, vr30, vr20
    vmulwev.w.h vr15, vr31, vr21
    vmulwev.w.h vr16, vr5, vr22
    vmaddwod.w.h vr13, vr29, vr9
    vmaddwod.w.h vr14, vr30, vr20
    vmaddwod.w.h vr15, vr31, vr21
    vmaddwod.w.h vr16, vr5, vr22
    vadd.w vr13, vr13, vr14
    vadd.w vr13, vr13, vr15
    vadd.w vr13, vr13, vr16
    //cache
    vaddi.hu vr29, vr30, 0
    vaddi.hu vr30, vr31, 0
    vaddi.hu vr31, vr5, 0

    //h1' high part
    vmulwev.w.h vr14, vr0, vr9
    vmulwev.w.h vr15, vr1, vr20
    vmulwev.w.h vr16, vr2, vr21
    vmulwev.w.h vr17, vr4, vr22
    vmaddwod.w.h vr14, vr0, vr9
    vmaddwod.w.h vr15, vr1, vr20
    vmaddwod.w.h vr16, vr2, vr21
    vmaddwod.w.h vr17, vr4, vr22
    vadd.w vr14, vr14, vr15
    vadd.w vr14, vr14, vr16
    vadd.w vr14, vr14, vr17
    vssrarni.hu.w vr14, vr13, 10
    vssrarni.bu.h vr5, vr14, 0
    vstelm.d vr5, a0, 0, 0
    add.d a0, a0, a1
    //cache
    vaddi.hu vr0, vr1, 0
    vaddi.hu vr1, vr2, 0
    vaddi.hu vr2, vr4, 0

    addi.w a5, a5, -2
    bnez a5, .l_\lable\()put_hv_8w_loop
    addi.d a2, t0, 8
    addi.d t0, t0, 8
    addi.d a0, t8, 8
    addi.d t8, t8, 8
    addi.d a5, t5, 0
    addi.w a4, a4, -8
    bnez a4, .l_\lable\()put_hv_8w_loop0
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 8*8
.l_\lable\()end_put_8tap:
.endm

// The nine entry points below differ only in the filter-type word they spill
// to the stack: bits [1:0] select the horizontal 8-tap filter set
// (0 = regular, 1 = smooth, 2 = sharp) and bits [3:2] the vertical one,
// which is what the andi/srli pairs inside PUT_8TAP_8BPC_LSX decode.
function put_8tap_regular_8bpc_lsx
    addi.d sp, sp, -16
    st.d zero, sp, 0
    PUT_8TAP_8BPC_LSX 0
    addi.d sp, sp, 16
endfunc

function put_8tap_smooth_regular_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 1
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 1
    addi.d sp, sp, 16
endfunc

function put_8tap_sharp_regular_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 2
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 2
    addi.d sp, sp, 16
endfunc

function put_8tap_regular_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 4
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 4
    addi.d sp, sp, 16
endfunc

function put_8tap_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 5
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 5
    addi.d sp, sp, 16
endfunc

function put_8tap_sharp_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 6
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 6
    addi.d sp, sp, 16
endfunc

function put_8tap_regular_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 8
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 8
    addi.d sp, sp, 16
endfunc

function put_8tap_smooth_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 9
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 9
    addi.d sp, sp, 16
endfunc

function put_8tap_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 10
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 10
    addi.d sp, sp, 16
endfunc

const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst

.macro PREP_H_8W in0
    vshuf.b vr2, \in0, \in0, vr6
    vshuf.b vr3, \in0, \in0, vr7
    vshuf.b vr4, \in0, \in0, vr8
    vmulwev.h.bu.b vr12, vr2, vr22
    vmulwev.h.bu.b vr13, vr3, vr23
    vmulwev.h.bu.b vr14, vr3, vr22
    vmulwev.h.bu.b vr15, vr4, vr23
    vmaddwod.h.bu.b vr12, vr2, vr22
    vmaddwod.h.bu.b
vr13, vr3, vr23 3496 vmaddwod.h.bu.b vr14, vr3, vr22 3497 vmaddwod.h.bu.b vr15, vr4, vr23 3498 vadd.h vr12, vr12, vr13 3499 vadd.h vr14, vr14, vr15 3500 vhaddw.w.h vr12, vr12, vr12 3501 vhaddw.w.h \in0, vr14, vr14 3502 vssrarni.h.w \in0, vr12, 2 3503.endm 3504 3505.macro PREP_HV_8W_LASX in0 3506 xvshuf.b xr4, \in0, \in0, xr19 3507 xvshuf.b xr5, \in0, \in0, xr20 3508 xvshuf.b xr6, \in0, \in0, xr21 3509 xvmulwev.h.bu.b xr7, xr4, xr22 3510 xvmulwev.h.bu.b xr9, xr5, xr23 3511 xvmulwev.h.bu.b xr10, xr5, xr22 3512 xvmulwev.h.bu.b xr11, xr6, xr23 3513 xvmaddwod.h.bu.b xr7, xr4, xr22 3514 xvmaddwod.h.bu.b xr9, xr5, xr23 3515 xvmaddwod.h.bu.b xr10, xr5, xr22 3516 xvmaddwod.h.bu.b xr11, xr6, xr23 3517 xvadd.h xr7, xr7, xr9 3518 xvadd.h xr9, xr10, xr11 3519 xvhaddw.w.h xr7, xr7, xr7 3520 xvhaddw.w.h \in0, xr9, xr9 3521 xvssrarni.h.w \in0, xr7, 2 3522.endm 3523 3524.macro PREP_8TAP_8BPC_LASX lable 3525 li.w t0, 4 3526 la.local t6, dav1d_mc_subpel_filters 3527 slli.d t2, a2, 1 //src_stride*2 3528 add.d t3, t2, a2 //src_stride*3 3529 slli.d t4, t2, 1 3530 3531 bnez a5, .l_\lable\()h_lasx //mx 3532 bnez a6, .l_\lable\()v_lasx 3533 3534 clz.w t1, a3 3535 li.w t5, 24 3536 sub.w t1, t1, t5 3537 la.local t5, .l_\lable\()prep_hv0_jtable_lasx 3538 alsl.d t1, t1, t5, 1 3539 ld.h t8, t1, 0 3540 add.d t5, t5, t8 3541 jirl $r0, t5, 0 3542 3543 .align 3 3544.l_\lable\()prep_hv0_jtable_lasx: 3545 .hword .l_\lable\()hv0_128w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3546 .hword .l_\lable\()hv0_64w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3547 .hword .l_\lable\()hv0_32w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3548 .hword .l_\lable\()hv0_16w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3549 .hword .l_\lable\()hv0_8w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3550 .hword .l_\lable\()hv0_4w_lasx - .l_\lable\()prep_hv0_jtable_lasx 3551 3552.l_\lable\()hv0_4w_lasx: 3553 fld.s f0, a1, 0 3554 fldx.s f1, a1, a2 3555 fldx.s f2, a1, t2 3556 fldx.s f3, a1, t3 3557 add.d a1, a1, t4 3558 xvpackev.w xr0, xr1, xr0 3559 xvpackev.w xr1, xr3, xr2 3560 xvpermi.q xr0, xr1, 0x02 3561 xvsllwil.hu.bu xr0, xr0, 4 3562 xvst xr0, a0, 0 3563 addi.d a0, a0, 32 3564 addi.d a4, a4, -4 3565 bnez a4, .l_\lable\()hv0_4w_lasx 3566 b .l_\lable\()end_pre_8tap_lasx 3567.l_\lable\()hv0_8w_lasx: 3568 fld.d f0, a1, 0 3569 fldx.d f1, a1, a2 3570 fldx.d f2, a1, t2 3571 fldx.d f3, a1, t3 3572 add.d a1, a1, t4 3573 xvpermi.q xr0, xr1, 0x02 3574 xvpermi.q xr2, xr3, 0x02 3575 xvsllwil.hu.bu xr0, xr0, 4 3576 xvsllwil.hu.bu xr2, xr2, 4 3577 xvst xr0, a0, 0 3578 xvst xr2, a0, 32 3579 addi.d a0, a0, 64 3580 addi.d a4, a4, -4 3581 bnez a4, .l_\lable\()hv0_8w_lasx 3582 b .l_\lable\()end_pre_8tap_lasx 3583.l_\lable\()hv0_16w_lasx: 3584 vld vr0, a1, 0 3585 vldx vr1, a1, a2 3586 vldx vr2, a1, t2 3587 vldx vr3, a1, t3 3588 add.d a1, a1, t4 3589 vext2xv.hu.bu xr0, xr0 3590 vext2xv.hu.bu xr1, xr1 3591 vext2xv.hu.bu xr2, xr2 3592 vext2xv.hu.bu xr3, xr3 3593 xvslli.h xr0, xr0, 4 3594 xvslli.h xr1, xr1, 4 3595 xvslli.h xr2, xr2, 4 3596 xvslli.h xr3, xr3, 4 3597 xvst xr0, a0, 0 3598 xvst xr1, a0, 32 3599 xvst xr2, a0, 64 3600 xvst xr3, a0, 96 3601 addi.d a0, a0, 128 3602 addi.d a4, a4, -4 3603 bnez a4, .l_\lable\()hv0_16w_lasx 3604 b .l_\lable\()end_pre_8tap_lasx 3605.l_\lable\()hv0_32w_lasx: 3606 xvld xr0, a1, 0 3607 xvldx xr1, a1, a2 3608 xvldx xr2, a1, t2 3609 xvldx xr3, a1, t3 3610 add.d a1, a1, t4 3611 xvpermi.d xr4, xr0, 0xD8 3612 xvpermi.d xr5, xr1, 0xD8 3613 xvpermi.d xr6, xr2, 0xD8 3614 xvpermi.d xr7, xr3, 0xD8 3615 xvpermi.d xr10, xr0, 0x32 3616 xvpermi.d xr11, xr1, 0x32 3617 
xvpermi.d xr12, xr2, 0x32 3618 xvpermi.d xr13, xr3, 0x32 3619 xvsllwil.hu.bu xr0, xr4, 4 3620 xvsllwil.hu.bu xr1, xr5, 4 3621 xvsllwil.hu.bu xr2, xr6, 4 3622 xvsllwil.hu.bu xr3, xr7, 4 3623 xvsllwil.hu.bu xr4, xr10, 4 3624 xvsllwil.hu.bu xr5, xr11, 4 3625 xvsllwil.hu.bu xr6, xr12, 4 3626 xvsllwil.hu.bu xr7, xr13, 4 3627 xvst xr0, a0, 0 3628 xvst xr4, a0, 32 3629 xvst xr1, a0, 64 3630 xvst xr5, a0, 96 3631 xvst xr2, a0, 128 3632 xvst xr6, a0, 160 3633 xvst xr3, a0, 192 3634 xvst xr7, a0, 224 3635 addi.d a0, a0, 256 3636 addi.d a4, a4, -4 3637 bnez a4, .l_\lable\()hv0_32w_lasx 3638 b .l_\lable\()end_pre_8tap_lasx 3639.l_\lable\()hv0_64w_lasx: 3640.l_\lable\()hv0_128w_lasx: 3641 addi.d t0, a1, 0 3642 addi.d t5, a4, 0 3643 srli.w t7, a3, 5 3644 slli.w t7, t7, 6 3645 addi.d t8, a0, 0 3646.l_\lable\()hv0_32_loop_lasx: 3647 xvld xr0, a1, 0 3648 xvldx xr1, a1, a2 3649 xvldx xr2, a1, t2 3650 xvldx xr3, a1, t3 3651 add.d a1, a1, t4 3652 xvpermi.d xr4, xr0, 0xD8 3653 xvpermi.d xr5, xr1, 0xD8 3654 xvpermi.d xr6, xr2, 0xD8 3655 xvpermi.d xr7, xr3, 0xD8 3656 xvpermi.d xr10, xr0, 0x32 3657 xvpermi.d xr11, xr1, 0x32 3658 xvpermi.d xr12, xr2, 0x32 3659 xvpermi.d xr13, xr3, 0x32 3660 xvsllwil.hu.bu xr0, xr4, 4 3661 xvsllwil.hu.bu xr1, xr5, 4 3662 xvsllwil.hu.bu xr2, xr6, 4 3663 xvsllwil.hu.bu xr3, xr7, 4 3664 xvsllwil.hu.bu xr4, xr10, 4 3665 xvsllwil.hu.bu xr5, xr11, 4 3666 xvsllwil.hu.bu xr6, xr12, 4 3667 xvsllwil.hu.bu xr7, xr13, 4 3668 xvst xr0, a0, 0 3669 xvst xr4, a0, 32 3670 add.d t1, a0, t7 3671 xvst xr1, t1, 0 3672 xvst xr5, t1, 32 3673 add.d t1, t1, t7 3674 xvst xr2, t1, 0 3675 xvst xr6, t1, 32 3676 add.d t1, t1, t7 3677 xvst xr3, t1, 0 3678 xvst xr7, t1, 32 3679 add.d a0, t1, t7 3680 addi.d a4, a4, -4 3681 bnez a4, .l_\lable\()hv0_32_loop_lasx 3682 addi.d a1, t0, 32 3683 addi.d t0, t0, 32 3684 addi.d a0, t8, 64 3685 addi.d t8, t8, 64 3686 addi.d a4, t5, 0 3687 addi.d a3, a3, -32 3688 bnez a3, .l_\lable\()hv0_32_loop_lasx 3689 b .l_\lable\()end_pre_8tap_lasx 3690 3691.l_\lable\()h_lasx: 3692 bnez a6, .l_\lable\()hv_lasx //if(fh) && if (fv) 3693 3694 andi t1, a7, 3 3695 blt t0, a3, .l_\lable\()h_idx_fh_lasx 3696 andi t1, a7, 1 3697 addi.w t1, t1, 3 3698.l_\lable\()h_idx_fh_lasx: 3699 addi.w t5, zero, 120 3700 mul.w t1, t1, t5 3701 addi.w t5, a5, -1 3702 slli.w t5, t5, 3 3703 add.w t1, t1, t5 3704 add.d t1, t6, t1 //fh's offset 3705 xvldrepl.d xr22, t1, 0 3706 3707 addi.d a1, a1, -3 3708 clz.w t1, a3 3709 li.w t5, 24 3710 sub.w t1, t1, t5 3711 la.local t5, .l_\lable\()prep_h_jtable_lasx 3712 alsl.d t1, t1, t5, 1 3713 ld.h t8, t1, 0 3714 add.d t5, t5, t8 3715 jirl $r0, t5, 0 3716 3717 .align 3 3718.l_\lable\()prep_h_jtable_lasx: 3719 .hword .l_\lable\()h_128w_lasx - .l_\lable\()prep_h_jtable_lasx 3720 .hword .l_\lable\()h_64w_lasx - .l_\lable\()prep_h_jtable_lasx 3721 .hword .l_\lable\()h_32w_lasx - .l_\lable\()prep_h_jtable_lasx 3722 .hword .l_\lable\()h_16w_lasx - .l_\lable\()prep_h_jtable_lasx 3723 .hword .l_\lable\()h_8w_lasx - .l_\lable\()prep_h_jtable_lasx 3724 .hword .l_\lable\()h_4w_lasx - .l_\lable\()prep_h_jtable_lasx 3725 3726.l_\lable\()h_4w_lasx: 3727 addi.d a1, a1, 2 3728 la.local t7, subpel_h_shuf1 3729 vld vr7, t7, 0 3730 xvreplve0.q xr7, xr7 3731 xvbsrl.v xr22, xr22, 2 3732 xvreplve0.w xr22, xr22 3733.l_\lable\()h_4w_loop_lasx: 3734 vld vr0, a1, 0 3735 vldx vr1, a1, a2 3736 vldx vr2, a1, t2 3737 vldx vr3, a1, t3 3738 add.d a1, a1, t4 3739 xvpermi.q xr1, xr0, 0x20 3740 xvpermi.q xr3, xr2, 0x20 3741 xvshuf.b xr1, xr1, xr1, xr7 3742 xvshuf.b xr3, xr3, xr3, xr7 3743 xvmulwev.h.bu.b xr0, 
xr1, xr22 3744 xvmulwev.h.bu.b xr2, xr3, xr22 3745 xvmaddwod.h.bu.b xr0, xr1, xr22 3746 xvmaddwod.h.bu.b xr2, xr3, xr22 3747 xvhaddw.w.h xr0, xr0, xr0 3748 xvhaddw.w.h xr2, xr2, xr2 3749 xvssrarni.h.w xr2, xr0, 2 3750 xvpermi.d xr2, xr2, 0xd8 3751 xvst xr2, a0, 0 3752 addi.d a0, a0, 32 3753 addi.w a4, a4, -4 3754 bnez a4, .l_\lable\()h_4w_loop_lasx 3755 b .l_\lable\()end_pre_8tap_lasx 3756 3757.l_\lable\()h_8w_lasx: 3758 la.local t7, subpel_h_shuf1 3759 vld vr6, t7, 0 3760 vbsrl.v vr23, vr22, 4 //fh 3761 xvreplve0.w xr23, xr23 3762 xvreplve0.w xr22, xr22 3763 xvreplve0.q xr19, xr6 3764 xvaddi.bu xr20, xr19, 4 3765 xvaddi.bu xr21, xr19, 8 3766.l_\lable\()h_8w_loop_lasx: 3767 xvld xr0, a1, 0 3768 xvldx xr1, a1, a2 3769 add.d a1, a1, t2 3770 xvpermi.q xr0, xr1, 0x02 3771 PREP_HV_8W_LASX xr0 3772 xvst xr0, a0, 0 3773 addi.d a0, a0, 32 3774 addi.d a4, a4, -2 3775 bnez a4, .l_\lable\()h_8w_loop_lasx 3776 b .l_\lable\()end_pre_8tap_lasx 3777 3778.l_\lable\()h_16w_lasx: 3779 la.local t7, subpel_h_shuf1 3780 vld vr6, t7, 0 3781 vbsrl.v vr23, vr22, 4 //fh 3782 xvreplve0.w xr23, xr23 3783 xvreplve0.w xr22, xr22 3784 xvreplve0.q xr19, xr6 3785 xvaddi.bu xr20, xr19, 4 3786 xvaddi.bu xr21, xr19, 8 3787.l_\lable\()h_16w_loop_lasx: 3788 xvld xr0, a1, 0 3789 xvld xr1, a1, 8 3790 add.d a1, a1, a2 3791 xvpermi.q xr0, xr1, 0x02 3792 PREP_HV_8W_LASX xr0 3793 xvst xr0, a0, 0 3794 xvld xr0, a1, 0 3795 xvld xr1, a1, 8 3796 add.d a1, a1, a2 3797 xvpermi.q xr0, xr1, 0x02 3798 PREP_HV_8W_LASX xr0 3799 xvst xr0, a0, 32 3800 addi.d a0, a0, 64 3801 addi.w a4, a4, -2 3802 bnez a4, .l_\lable\()h_16w_loop_lasx 3803 b .l_\lable\()end_pre_8tap_lasx 3804 3805.l_\lable\()h_32w_lasx: 3806.l_\lable\()h_64w_lasx: 3807.l_\lable\()h_128w_lasx: 3808 la.local t7, subpel_h_shuf1 3809 vld vr6, t7, 0 3810 vbsrl.v vr23, vr22, 4 //fh 3811 xvreplve0.w xr23, xr23 3812 xvreplve0.w xr22, xr22 3813 xvreplve0.q xr19, xr6 3814 xvaddi.bu xr20, xr19, 4 3815 xvaddi.bu xr21, xr19, 8 3816 addi.d t5, a1, 0 //src 3817 addi.d t6, a3, 0 //w 3818 slli.w t7, a3, 1 //store offset 3819 addi.d t8, a0, 0 //dst 3820.l_\lable\()h_16_loop_lasx: 3821 xvld xr0, a1, 0 3822 xvld xr1, a1, 8 3823 xvpermi.q xr0, xr1, 0x02 3824 PREP_HV_8W_LASX xr0 3825 xvst xr0, a0, 0 3826 xvld xr0, a1, 16 3827 xvld xr1, a1, 24 3828 xvpermi.q xr0, xr1, 0x02 3829 PREP_HV_8W_LASX xr0 3830 xvst xr0, a0, 32 3831 addi.d a0, a0, 64 3832 addi.d a1, a1, 32 3833 addi.d a3, a3, -32 3834 bnez a3, .l_\lable\()h_16_loop_lasx 3835 add.d a1, t5, a2 3836 add.d t5, t5, a2 3837 add.d a0, t8, t7 3838 add.d t8, t8, t7 3839 addi.d a3, t6, 0 3840 addi.d a4, a4, -1 3841 bnez a4, .l_\lable\()h_16_loop_lasx 3842 b .l_\lable\()end_pre_8tap_lasx 3843 3844.l_\lable\()hv_lasx: 3845 andi t1, a7, 3 3846 blt t0, a3, .l_\lable\()hv_idx_fh_lasx 3847 andi t1, a7, 1 3848 addi.w t1, t1, 3 3849.l_\lable\()hv_idx_fh_lasx: 3850 addi.w t5, zero, 120 3851 mul.w t1, t1, t5 3852 addi.w t5, a5, -1 3853 slli.w t5, t5, 3 3854 add.w t1, t1, t5 3855 add.d t1, t6, t1 //fh's offset 3856 xvldrepl.d xr22, t1, 0 3857 srli.w a7, a7, 2 3858 blt t0, a4, .l_\lable\()hv_idx_fv_lasx 3859 andi a7, a7, 1 3860 addi.w a7, a7, 3 3861.l_\lable\()hv_idx_fv_lasx: 3862 addi.w t5, zero, 120 3863 mul.w a7, a7, t5 3864 addi.w t5, a6, -1 3865 slli.w t5, t5, 3 3866 add.w a7, a7, t5 3867 add.d a7, t6, a7 //fv's offset 3868 xvldrepl.d xr8, a7, 0 3869 xvsllwil.h.b xr8, xr8, 0 3870 sub.d a1, a1, t3 3871 addi.d a1, a1, -1 //ignore leading 0s 3872 beq a3, t0, .l_\lable\()hv_4w_lasx 3873 addi.d a1, a1, -2 3874 b .l_\lable\()hv_8w_lasx 3875.l_\lable\()hv_4w_lasx: 
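/* 4-wide horizontal+vertical prep path. A rough C sketch of what the vector
 * code below computes for 8 bpc (fh/fv are the 8-tap filters selected above;
 * filter_8tap() is only shorthand for the sum of the eight tap*sample
 * products at the given element stride, not a helper from the C sources):
 *
 *     int16_t mid[h + 7][4];
 *     for (int y = 0; y < h + 7; y++)
 *         for (int x = 0; x < 4; x++)
 *             mid[y][x] = (filter_8tap(&src[y * src_stride + x], 1, fh) + 2) >> 2;
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 4; x++)
 *             tmp[y * 4 + x] = (filter_8tap(&mid[y][x], 4, fv) + 32) >> 6;
 *
 * Instead of a mid[] buffer, the rows are kept as interleaved pairs in
 * xr5/xr6/xr11 and the window is rotated as new rows are filtered. */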
3876 xvld xr0, a1, 0 3877 xvldx xr1, a1, a2 3878 xvldx xr2, a1, t2 3879 xvldx xr3, a1, t3 3880 add.d a1, a1, t4 3881 xvld xr4, a1, 0 3882 xvldx xr5, a1, a2 3883 xvldx xr6, a1, t2 3884 la.local t1, subpel_h_shuf2 3885 xvld xr7, t1, 0 3886 vbsrl.v vr22, vr22, 2 3887 xvreplve0.w xr22, xr22 3888 xvreplve0.q xr8, xr8 3889 xvrepl128vei.w xr12, xr8, 0 3890 xvrepl128vei.w xr13, xr8, 1 3891 xvrepl128vei.w xr14, xr8, 2 3892 xvrepl128vei.w xr15, xr8, 3 3893 xvilvl.d xr0, xr1, xr0 3894 xvilvl.d xr2, xr3, xr2 3895 xvilvl.d xr4, xr5, xr4 3896 xvreplve0.q xr0, xr0 3897 xvreplve0.q xr2, xr2 3898 xvreplve0.q xr4, xr4 3899 xvreplve0.q xr6, xr6 3900 xvshuf.b xr0, xr0, xr0, xr7 3901 xvshuf.b xr2, xr2, xr2, xr7 3902 xvshuf.b xr4, xr4, xr4, xr7 3903 xvshuf.b xr6, xr6, xr6, xr7 3904 xvmulwev.h.bu.b xr1, xr0, xr22 3905 xvmulwev.h.bu.b xr3, xr2, xr22 3906 xvmulwev.h.bu.b xr5, xr4, xr22 3907 xvmulwev.h.bu.b xr9, xr6, xr22 3908 xvmaddwod.h.bu.b xr1, xr0, xr22 3909 xvmaddwod.h.bu.b xr3, xr2, xr22 3910 xvmaddwod.h.bu.b xr5, xr4, xr22 3911 xvmaddwod.h.bu.b xr9, xr6, xr22 3912 xvhaddw.w.h xr1, xr1, xr1 // a0 b0 a1 b1 c0 d0 c1 d1 3913 xvhaddw.w.h xr3, xr3, xr3 // a2 b2 a3 b3 c2 d2 c3 d3 3914 xvhaddw.w.h xr5, xr5, xr5 // a4 b4 a5 b5 c4 d4 c5 d5 3915 xvhaddw.w.h xr9, xr9, xr9 // a6 b6 - - c6 d6 - - 3916 xvssrarni.h.w xr3, xr1, 2 // a0 b0 a1 b1 a2 b2 a3 b3 c0 d0 c1 d1 c2 d2 c3 d3 3917 xvssrarni.h.w xr9, xr5, 2 // a4 b4 a5 b5 a6 b6 - - c4 d4 c5 d5 c6 d6 - - 3918 xvbsrl.v xr4, xr3, 4 3919 xvextrins.w xr4, xr9, 0x30 // a1 b1 a2 b2 a3 b3 a4 b4 c1 d1 c2 d2 c3 d3 c4 d4 3920 xvilvl.h xr5, xr4, xr3 // a0 a1 b0 b1 a1 a2 b1 b2 c0 c1 d0 d1 c1 c2 d1 d2 3921 xvilvh.h xr6, xr4, xr3 // a2 a3 b2 b3 a3 a4 b3 b4 c2 c3 d2 d3 c3 c4 d3 d4 3922 xvbsrl.v xr10, xr9, 4 // a5 b5 a6 b6 - - - - c5 d5 c6 d6 - - - - 3923 xvilvl.h xr11, xr10, xr9 // a4 a5 b4 b5 a5 a6 b5 b6 c4 c5 d4 d5 c5 c6 d5 d6 3924.l_\lable\()hv_w4_loop_lasx: 3925 xvmulwev.w.h xr16, xr5, xr12 //a0 a1 (h0) 3926 xvmulwev.w.h xr17, xr6, xr12 //a2 a3 (h1) 3927 xvmulwev.w.h xr18, xr6, xr13 //a2 a3 (h0) 3928 xvmulwev.w.h xr19, xr11, xr13 //a4 a5 (h1) 3929 xvmulwev.w.h xr20, xr11, xr14 //a4 a5 (h0) 3930 xvmaddwod.w.h xr16, xr5, xr12 // 3931 xvmaddwod.w.h xr17, xr6, xr12 // 3932 xvmaddwod.w.h xr18, xr6, xr13 // 3933 xvmaddwod.w.h xr19, xr11, xr13 // 3934 xvmaddwod.w.h xr20, xr11, xr14 // 3935 xvaddi.wu xr5, xr11, 0 3936 xvadd.w xr16, xr16, xr18 //a0 a1 + a2 a3 3937 xvldx xr18, a1, t3 //a7 b7 c7 d7 3938 add.d a1, a1, t4 3939 xvadd.w xr17, xr17, xr19 //a2 a3 + a4 a5 3940 xvld xr19, a1, 0 //a8 b8 c8 d8 3941 xvadd.w xr16, xr16, xr20 //a0 a1 + a2 a3 + a4 a5 3942 xvldx xr20, a1, a2 //a9 b9 c9 d9 3943 xvilvl.d xr18, xr19, xr18 3944 xvreplve0.q xr18, xr18 3945 xvldx xr19, a1, t2 //aa ba ca da 3946 xvilvl.d xr20, xr19, xr20 3947 xvreplve0.q xr20, xr20 3948 xvshuf.b xr18, xr18, xr18, xr7 3949 xvshuf.b xr20, xr20, xr20, xr7 3950 xvmulwev.h.bu.b xr21, xr18, xr22 3951 xvmulwev.h.bu.b xr23, xr20, xr22 3952 xvmaddwod.h.bu.b xr21, xr18, xr22 3953 xvmaddwod.h.bu.b xr23, xr20, xr22 3954 xvhaddw.w.h xr21, xr21, xr21 //a7 b7 a8 b8 c7 d7 c8 d8 3955 xvhaddw.w.h xr23, xr23, xr23 //a9 b9 aa ba c9 d9 ca da 3956 xvssrarni.h.w xr23, xr21, 2 //a7 b7 a8 b8 a9 b9 aa ba c7 d7 c8 d8 c9 d9 ca da 3957 xvbsll.v xr0, xr23, 4 3958 xvextrins.w xr0, xr9, 0x02 //a6 b6 a7 b7 a8 b8 a9 b9 c6 d6 c7 d7 c8 d8 c9 d9 3959 xvilvl.h xr6, xr23, xr0 //a6 a7 b6 b7 a7 a8 b7 b8 c6 c7 d6 d7 c7 c8 d7 d8 3960 xvilvh.h xr11, xr23, xr0 //a8 a9 b8 b9 a9 aa b9 ba c8 c9 d8 d9 c9 ca d9 da 3961 xvbsrl.v xr9, xr23, 4 3962 xvmulwev.w.h xr1 , xr6, xr14 //a6 
a7 (h0) 3963 xvmulwev.w.h xr2 , xr6, xr15 //a6 a7 (h1) 3964 xvmulwev.w.h xr3 , xr11, xr15 //a8 a9 (h1) 3965 xvmaddwod.w.h xr1 , xr6, xr14 3966 xvmaddwod.w.h xr2 , xr6, xr15 3967 xvmaddwod.w.h xr3 , xr11, xr15 3968 xvadd.w xr17, xr17, xr1 //a2 a3 + a4 a5 + a6 a7 3969 xvadd.w xr16, xr16, xr2 //a0 a1 + a2 a3 + a4 a5 + a6 a7 3970 xvadd.w xr17, xr17, xr3 //a2 a3 + a4 a5 + a6 a7 + a8 a9 3971 xvssrarni.h.w xr17, xr16, 6 //a01 b01 a12 b12 a23 b23 a34 b34 c01 d01 c12 d12 c23 d23 c34 d34 3972 xvpermi.d xr17, xr17, 0xd8 //a01 b01 a12 b12 c01 d01 c12 d12 a23 b23 a34 b34 c23 d23 c34 d34 3973 xvshuf4i.w xr17, xr17, 0xd8 3974 xvst xr17, a0, 0 3975 addi.d a0, a0, 32 3976 addi.d a4, a4, -4 3977 bnez a4, .l_\lable\()hv_w4_loop_lasx 3978 b .l_\lable\()end_pre_8tap_lasx 3979 3980.l_\lable\()hv_8w_lasx: 3981 addi.d sp, sp, -4*8 3982 fst.d f24, sp, 0 3983 fst.d f25, sp, 8 3984 fst.d f26, sp, 16 3985 fst.d f27, sp, 24 3986 la.local t1, subpel_h_shuf1 3987 vld vr19, t1, 0 3988 addi.d t0, a1, 0 3989 addi.d t5, a4, 0 3990 slli.w t7, a3, 1 // store offset 3991 addi.d t8, a0, 0 3992 xvreplve0.q xr19, xr19 3993 xvaddi.bu xr20, xr19, 4 3994 xvaddi.bu xr21, xr19, 8 3995 vbsrl.v vr23, vr22, 4 3996 xvreplve0.w xr22, xr22 //f0f1f2f3 3997 xvreplve0.w xr23, xr23 //f4f5f6f7 3998 xvreplve0.q xr8, xr8 3999 xvrepl128vei.w xr24, xr8, 0 4000 xvrepl128vei.w xr25, xr8, 1 4001 xvrepl128vei.w xr26, xr8, 2 4002 xvrepl128vei.w xr27, xr8, 3 4003.l_\lable\()hv_8w_loop0_lasx: 4004 xvld xr0, a1, 0 4005 xvldx xr1, a1, a2 4006 xvldx xr2, a1, t2 4007 add.d a1, a1, t3 4008 xvld xr3, a1, 0 4009 xvldx xr4, a1, a2 4010 xvldx xr5, a1, t2 4011 xvldx xr6, a1, t3 4012 add.d a1, a1, t4 4013 xvpermi.q xr0, xr3, 0x02 //0 3 4014 xvpermi.q xr1, xr4, 0x02 //1 4 4015 xvpermi.q xr2, xr5, 0x02 //2 5 4016 xvpermi.q xr3, xr6, 0x02 //3 6 4017 PREP_HV_8W_LASX xr0 //a0b0c0d0 e0f0g0h0 a3b3c3d3 e3f3g3h3 4018 PREP_HV_8W_LASX xr1 //a1b1c1d1 e1f1g1h1 a4b4c4d4 e4f4g4h4 4019 PREP_HV_8W_LASX xr2 //a2b2c2d2 e2f2g2h2 a5b5c5d5 e5f5g5h5 4020 PREP_HV_8W_LASX xr3 //a3b3c3d3 e3f3g3h3 a6b6c6d6 e6f6g6h6 4021 xvpermi.d xr0, xr0, 0xd8 4022 xvpermi.d xr1, xr1, 0xd8 4023 xvpermi.d xr2, xr2, 0xd8 4024 xvpermi.d xr18, xr3, 0xd8 4025 xvilvl.h xr12, xr1, xr0 //a0a1b0b1c0c1d0d1 e0e1f0f1g0g1h0h1 4026 xvilvh.h xr13, xr1, xr0 //a3a4b3b4c3c4d3d4 e3e4f3f4g3g4h3h4 4027 xvilvl.h xr14, xr2, xr1 //a1a2b1b2c1c2d1d2 e1e2f1f2g1g2h1h2 4028 xvilvh.h xr15, xr2, xr1 //a4a5b4b5c4c5d4d5 e4e5f4f5g4g5h4h5 4029 xvilvl.h xr16, xr18, xr2 //a2a3b2b3c2c3d2d3 e2e3f2f3g2g3h2h3 4030 xvilvh.h xr17, xr18, xr2 //a5a6b5b6c5c6d5d6 e5e6f5f6g5g6h5h6 4031.l_\lable\()hv_8w_loop_lasx: 4032 xvld xr0, a1, 0 4033 xvldx xr1, a1, a2 4034 add.d a1, a1, t2 4035 xvpermi.q xr0, xr1, 0x02 //7 8 4036 PREP_HV_8W_LASX xr0 //a7b7c7d7e7f7g7h7 a8b8c8d8e8f8g8h8 4037 xvpermi.q xr3, xr0, 0x03 //a6b6c6d6e6f6g6h6 a7b7c7d7e7f7g7h7 4038 xvpermi.d xr3, xr3, 0xd8 //a6b6c6d6a7b7c7d7 e6f6g6h6e7f7g7h7 4039 xvpermi.d xr1, xr0, 0xd8 //a7b7c7d7a8b8c8d8 e7f7g7h7e8f8g8h8 4040 xvilvl.h xr18, xr1, xr3 //a6a7b6b7c6c7d6d7 e6e7f6f7g6g7h6h7 4041 xvilvh.h xr2, xr1, xr3 //a7a8b7b8c7c8d7d8 e7e8f7f8g7g8h7h8 4042 xvaddi.hu xr3, xr0, 0 4043 xvmulwev.w.h xr4, xr12, xr24 //01 4044 xvmulwev.w.h xr5, xr14, xr24 //12 4045 xvmulwev.w.h xr6, xr16, xr25 //23 4046 xvmulwev.w.h xr7, xr13, xr25 //34 4047 xvmulwev.w.h xr8, xr15, xr26 //45 4048 xvmulwev.w.h xr9, xr17, xr26 //56 4049 xvmulwev.w.h xr10, xr18, xr27 //67 4050 xvmulwev.w.h xr11, xr2, xr27 //78 4051 xvmaddwod.w.h xr4, xr12, xr24 //01 4052 xvmaddwod.w.h xr5, xr14, xr24 //12 4053 xvmaddwod.w.h xr6, xr16, xr25 //23 4054 
xvmaddwod.w.h xr7, xr13, xr25 //34 4055 xvmaddwod.w.h xr8, xr15, xr26 //45 4056 xvmaddwod.w.h xr9, xr17, xr26 //56 4057 xvmaddwod.w.h xr10, xr18, xr27 //67 4058 xvmaddwod.w.h xr11, xr2, xr27 //78 4059 xvadd.w xr4, xr4, xr6 4060 xvadd.w xr5, xr5, xr7 4061 xvadd.w xr4, xr4, xr8 4062 xvadd.w xr5, xr5, xr9 4063 xvadd.w xr4, xr4, xr10 4064 xvadd.w xr5, xr5, xr11 4065 xvaddi.hu xr12, xr16, 0 //01 <-- 23 4066 xvaddi.hu xr14, xr13, 0 //12 <-- 34 4067 xvaddi.hu xr16, xr15, 0 //23 <-- 45 4068 xvaddi.hu xr13, xr17, 0 //34 <-- 56 4069 xvaddi.hu xr15, xr18, 0 //45 <-- 67 4070 xvaddi.hu xr17, xr2, 0 //56 <-- 78 4071 xvssrarni.h.w xr5, xr4, 6 4072 xvpermi.d xr5, xr5, 0xd8 4073 vst vr5, a0, 0 4074 xvpermi.q xr5, xr5, 0x11 4075 vstx vr5, a0, t7 4076 alsl.d a0, t7, a0, 1 4077 addi.d a4, a4, -2 4078 bnez a4, .l_\lable\()hv_8w_loop_lasx 4079 addi.d a1, t0, 8 4080 addi.d t0, t0, 8 4081 addi.d a0, t8, 16 4082 addi.d t8, t8, 16 4083 addi.d a4, t5, 0 4084 addi.d a3, a3, -8 4085 bnez a3, .l_\lable\()hv_8w_loop0_lasx 4086 fld.d f24, sp, 0 4087 fld.d f25, sp, 8 4088 fld.d f26, sp, 16 4089 fld.d f27, sp, 24 4090 addi.d sp, sp, 4*8 4091 b .l_\lable\()end_pre_8tap_lasx 4092 4093.l_\lable\()v_lasx: 4094 srli.w a7, a7, 2 4095 blt t0, a4, .l_\lable\()v_idx_fv_lasx 4096 andi a7, a7, 1 4097 addi.w a7, a7, 3 4098.l_\lable\()v_idx_fv_lasx: 4099 addi.w t5, zero, 120 4100 mul.w a7, a7, t5 4101 addi.w t5, a6, -1 4102 slli.w t5, t5, 3 4103 add.w a7, a7, t5 4104 add.d a7, t6, a7 //fv's offset 4105 xvldrepl.d xr8, a7, 0 4106 xvrepl128vei.h xr12, xr8, 0 4107 xvrepl128vei.h xr13, xr8, 1 4108 xvrepl128vei.h xr14, xr8, 2 4109 xvrepl128vei.h xr15, xr8, 3 4110 sub.d a1, a1, t3 4111 beq a3, t0, .l_\lable\()v_4w_lasx 4112 addi.w t0, t0, 4 4113 beq a3, t0, .l_\lable\()v_8w_lasx 4114 blt t0, a3, .l_\lable\()v_16w_lasx 4115.l_\lable\()v_4w_lasx: 4116 la.local t6, subpel_h_shuf3 4117 xvld xr11, t6, 0 4118 fld.s f0, a1, 0 //a0b0c0d0 4119 fldx.s f1, a1, a2 //a1b1c1d1 4120 fldx.s f2, a1, t2 //a2b2c2d2 4121 add.d a1, a1, t3 4122 fld.s f3, a1, 0 //a3b3c3d3 4123 fldx.s f4, a1, a2 //a4b4c4d4 4124 fldx.s f5, a1, t2 //a5b5c5d5 4125 fldx.s f6, a1, t3 //a6b6c6d6 4126 vilvl.w vr0, vr1, vr0 //01 4127 vilvl.w vr1, vr3, vr2 //23 4128 vilvl.d vr0, vr1, vr0 //0123 4129 vilvl.w vr2, vr5, vr4 //45 4130 vilvl.d vr1, vr2, vr1 //2345 4131 xvpermi.q xr0, xr1, 0x02 //0123 2345 4132 xvbsrl.v xr1, xr0, 4 //123- 345- 4133 xvpermi.q xr4, xr6, 0x02 4134 xvextrins.w xr1, xr4, 0x30 //1234 3456 4135 xvilvl.b xr2, xr1, xr0 //0112 2334 //a0a1b0b1c0c1d0d1 a1a2b1b2c1c2d1d2 a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 4136 xvilvh.b xr3, xr1, xr0 //2334 4556 //a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 4137.l_\lable\()v_4w_loop_lasx: 4138 add.d a1, a1, t4 4139 fld.s f0, a1, 0 //a7b7c7d7 4140 fldx.s f1, a1, a2 //a8b8c8d8 4141 fldx.s f4, a1, t2 //a9b9c9d9 4142 fldx.s f5, a1, t3 //aabacada 4143 vilvl.w vr7, vr0, vr6 //67 4144 vilvl.w vr10, vr4, vr1 //89 4145 vextrins.w vr7, vr1, 0x20//678- 4146 vextrins.w vr10, vr5, 0x20//89a- 4147 xvpermi.q xr7, xr10, 0x02//678- 89a- 4148 xvshuf.b xr4, xr7, xr7, xr11 //67 78 89 9a //a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 a8a9b8b9c8c9d8d9 a9aab9bac9cad9da 4149 xvpermi.q xr7, xr3, 0x11 //4556 4150 xvpermi.q xr7, xr4, 0x02 //45 56 67 78 //a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 4151 xvmulwev.h.bu.b xr16, xr2, xr12 4152 xvmulwev.h.bu.b xr17, xr3, xr13 4153 xvmulwev.h.bu.b xr18, xr7, xr14 4154 xvmulwev.h.bu.b xr19, xr4, xr15 4155 xvmaddwod.h.bu.b xr16, xr2, xr12 4156 xvmaddwod.h.bu.b xr17, xr3, xr13 4157 
xvmaddwod.h.bu.b xr18, xr7, xr14 4158 xvmaddwod.h.bu.b xr19, xr4, xr15 4159 xvadd.h xr16, xr16, xr17 4160 xvadd.h xr16, xr16, xr18 4161 xvadd.h xr16, xr16, xr19 4162 xvsrari.h xr16, xr16, 2 4163 xvaddi.bu xr2, xr7, 0 4164 xvaddi.bu xr3, xr4, 0 4165 xvaddi.bu xr6, xr5, 0 4166 xvst xr16, a0, 0 4167 addi.d a0, a0, 32 4168 addi.w a4, a4, -4 4169 bnez a4, .l_\lable\()v_4w_loop_lasx 4170 b .l_\lable\()end_pre_8tap_lasx 4171 4172.l_\lable\()v_8w_lasx: 4173 fld.d f0, a1, 0 4174 fldx.d f1, a1, a2 4175 fldx.d f2, a1, t2 4176 add.d a1, a1, t3 4177 fld.d f3, a1, 0 4178 fldx.d f4, a1, a2 4179 fldx.d f5, a1, t2 4180 fldx.d f6, a1, t3 4181 xvpermi.q xr0, xr1, 0x02 4182 xvpermi.q xr1, xr2, 0x02 4183 xvilvl.b xr0, xr1, xr0 //01 12 4184 xvpermi.q xr2, xr3, 0x02 4185 xvpermi.q xr3, xr4, 0x02 4186 xvilvl.b xr2, xr3, xr2 //23 34 4187 xvpermi.q xr4, xr5, 0x02 4188 xvpermi.q xr5, xr6, 0x02 4189 xvilvl.b xr4, xr5, xr4 //45 56 4190.l_\lable\()v_8w_loop_lasx: 4191 add.d a1, a1, t4 4192 fld.d f7, a1, 0 //7 4193 fldx.d f10, a1, a2 //8 4194 fldx.d f11, a1, t2 //9 4195 fldx.d f18, a1, t3 //a 4196 xvpermi.q xr6, xr7, 0x02 4197 xvpermi.q xr7, xr10, 0x02 4198 xvilvl.b xr6, xr7, xr6 //67 78 4199 xvpermi.q xr10, xr11, 0x02 4200 xvpermi.q xr11, xr18, 0x02 4201 xvilvl.b xr10, xr11, xr10 //89 9a 4202 xvmulwev.h.bu.b xr1, xr0, xr12 4203 xvmulwev.h.bu.b xr3, xr2, xr13 4204 xvmulwev.h.bu.b xr5, xr4, xr14 4205 xvmulwev.h.bu.b xr7, xr6, xr15 4206 xvmulwev.h.bu.b xr9, xr2, xr12 4207 xvmulwev.h.bu.b xr11, xr4, xr13 4208 xvmulwev.h.bu.b xr16, xr6, xr14 4209 xvmulwev.h.bu.b xr17, xr10, xr15 4210 xvmaddwod.h.bu.b xr1, xr0, xr12 4211 xvmaddwod.h.bu.b xr3, xr2, xr13 4212 xvmaddwod.h.bu.b xr5, xr4, xr14 4213 xvmaddwod.h.bu.b xr7, xr6, xr15 4214 xvmaddwod.h.bu.b xr9, xr2, xr12 4215 xvmaddwod.h.bu.b xr11, xr4, xr13 4216 xvmaddwod.h.bu.b xr16, xr6, xr14 4217 xvmaddwod.h.bu.b xr17, xr10, xr15 4218 xvadd.h xr1, xr1, xr3 4219 xvadd.h xr1, xr1, xr5 4220 xvadd.h xr1, xr1, xr7 4221 xvadd.h xr9, xr9, xr11 4222 xvadd.h xr9, xr9, xr16 4223 xvadd.h xr9, xr9, xr17 4224 xvaddi.bu xr0, xr4, 0 4225 xvaddi.bu xr2, xr6, 0 4226 xvaddi.bu xr4, xr10, 0 4227 xvaddi.bu xr6, xr18, 0 4228 xvsrari.h xr1, xr1, 2 4229 xvsrari.h xr9, xr9, 2 4230 xvst xr1, a0, 0 4231 xvst xr9, a0, 32 4232 addi.d a0, a0, 64 4233 addi.w a4, a4, -4 4234 bnez a4, .l_\lable\()v_8w_loop_lasx 4235 b .l_\lable\()end_pre_8tap_lasx 4236 4237.l_\lable\()v_16w_lasx: 4238 addi.d t0, a0, 0 //dst 4239 addi.d t5, a1, 0 //src 4240 slli.w t7, a3, 1 //w 4241 addi.d t8, a4, 0 //h 4242.l_\lable\()v_16w_loop0_lasx: 4243 vld vr0, a1, 0 4244 vldx vr1, a1, a2 4245 vldx vr2, a1, t2 4246 add.d a1, a1, t3 4247 vld vr3, a1, 0 4248 vldx vr4, a1, a2 4249 vldx vr5, a1, t2 4250 vldx vr6, a1, t3 4251 add.d a1, a1, t4 4252 xvpermi.d xr0, xr0, 0xd8 4253 xvpermi.d xr1, xr1, 0xd8 4254 xvpermi.d xr2, xr2, 0xd8 4255 xvpermi.d xr3, xr3, 0xd8 4256 xvpermi.d xr4, xr4, 0xd8 4257 xvpermi.d xr5, xr5, 0xd8 4258 xvpermi.d xr6, xr6, 0xd8 4259 xvilvl.b xr0, xr1, xr0 //01 4260 xvilvl.b xr1, xr2, xr1 //12 4261 xvilvl.b xr2, xr3, xr2 //23 4262 xvilvl.b xr3, xr4, xr3 //34 4263 xvilvl.b xr4, xr5, xr4 //45 4264 xvilvl.b xr5, xr6, xr5 //56 4265.l_\lable\()v_16w_loop_lasx: 4266 vld vr7, a1, 0 //7 4267 vldx vr10, a1, a2 //8 4268 add.d a1, a1, t2 4269 xvpermi.d xr7, xr7, 0xd8 4270 xvpermi.d xr10, xr10, 0xd8 4271 xvilvl.b xr6, xr7, xr6 //67 4272 xvilvl.b xr7, xr10, xr7 //78 4273 xvmulwev.h.bu.b xr9, xr0, xr12 4274 xvmulwev.h.bu.b xr11, xr2, xr13 4275 xvmulwev.h.bu.b xr16, xr4, xr14 4276 xvmulwev.h.bu.b xr17, xr6, xr15 4277 xvmulwev.h.bu.b xr18, 
xr1, xr12
    xvmulwev.h.bu.b xr19, xr3, xr13
    xvmulwev.h.bu.b xr20, xr5, xr14
    xvmulwev.h.bu.b xr21, xr7, xr15
    xvmaddwod.h.bu.b xr9, xr0, xr12
    xvmaddwod.h.bu.b xr11, xr2, xr13
    xvmaddwod.h.bu.b xr16, xr4, xr14
    xvmaddwod.h.bu.b xr17, xr6, xr15
    xvmaddwod.h.bu.b xr18, xr1, xr12
    xvmaddwod.h.bu.b xr19, xr3, xr13
    xvmaddwod.h.bu.b xr20, xr5, xr14
    xvmaddwod.h.bu.b xr21, xr7, xr15
    xvadd.h xr9, xr9, xr11
    xvadd.h xr9, xr9, xr16
    xvadd.h xr9, xr9, xr17
    xvadd.h xr11, xr18, xr19
    xvadd.h xr11, xr11, xr20
    xvadd.h xr11, xr11, xr21
    xvsrari.h xr9, xr9, 2
    xvsrari.h xr11, xr11, 2
    xvaddi.bu xr0, xr2, 0
    xvaddi.bu xr1, xr3, 0
    xvaddi.bu xr2, xr4, 0
    xvaddi.bu xr3, xr5, 0
    xvaddi.bu xr4, xr6, 0
    xvaddi.bu xr5, xr7, 0
    xvaddi.bu xr6, xr10, 0
    xvst xr9, a0, 0
    xvstx xr11, a0, t7
    alsl.d a0, t7, a0, 1
    addi.d a4, a4, -2
    bnez a4, .l_\lable\()v_16w_loop_lasx
    addi.d a3, a3, -16
    addi.d a0, t0, 32
    addi.d t0, t0, 32
    addi.d a1, t5, 16
    addi.d t5, t5, 16
    addi.d a4, t8, 0
    bnez a3, .l_\lable\()v_16w_loop0_lasx
.l_\lable\()end_pre_8tap_lasx:
.endm

function prep_8tap_regular_8bpc_lasx
    addi.w a7, zero, 0
    PREP_8TAP_8BPC_LASX 0
endfunc

function prep_8tap_smooth_regular_8bpc_lasx
    addi.w a7, zero, 1
    PREP_8TAP_8BPC_LASX 1
endfunc

function prep_8tap_sharp_regular_8bpc_lasx
    addi.w a7, zero, 2
    PREP_8TAP_8BPC_LASX 2
endfunc

function prep_8tap_regular_smooth_8bpc_lasx
    addi.w a7, zero, 4
    PREP_8TAP_8BPC_LASX 4
endfunc

function prep_8tap_smooth_8bpc_lasx
    addi.w a7, zero, 5
    PREP_8TAP_8BPC_LASX 5
endfunc

function prep_8tap_sharp_smooth_8bpc_lasx
    addi.w a7, zero, 6
    PREP_8TAP_8BPC_LASX 6
endfunc

function prep_8tap_regular_sharp_8bpc_lasx
    addi.w a7, zero, 8
    PREP_8TAP_8BPC_LASX 8
endfunc

function prep_8tap_smooth_sharp_8bpc_lasx
    addi.w a7, zero, 9
    PREP_8TAP_8BPC_LASX 9
endfunc

function prep_8tap_sharp_8bpc_lasx
    addi.w a7, zero, 10
    PREP_8TAP_8BPC_LASX 10
endfunc

.macro PREP_8TAP_8BPC_LSX lable
    li.w t0, 4
    la.local t6, dav1d_mc_subpel_filters
    la.local t7, shufb1
    vld vr23, t7, 0
    slli.d t2, a2, 1 //src_stride*2
    add.d t3, t2, a2 //src_stride*3
    slli.d t4, t2, 1

    bnez a5, .l_\lable\()h_lsx //mx
    bnez a6, .l_\lable\()v_lsx

    clz.w t1, a3
    li.w t5, 24
    sub.w t1, t1, t5
    la.local t5, .l_\lable\()prep_hv0_jtable_lsx
    alsl.d t1, t1, t5, 1
    ld.h t8, t1, 0
    add.d t5, t5, t8
    jirl $r0, t5, 0
    .align 3
.l_\lable\()prep_hv0_jtable_lsx:
    .hword .l_\lable\()hv0_128w_lsx - .l_\lable\()prep_hv0_jtable_lsx
    .hword .l_\lable\()hv0_64w_lsx - .l_\lable\()prep_hv0_jtable_lsx
    .hword .l_\lable\()hv0_32w_lsx - .l_\lable\()prep_hv0_jtable_lsx
    .hword .l_\lable\()hv0_16w_lsx - .l_\lable\()prep_hv0_jtable_lsx
    .hword .l_\lable\()hv0_8w_lsx - .l_\lable\()prep_hv0_jtable_lsx
    .hword .l_\lable\()hv0_4w_lsx - .l_\lable\()prep_hv0_jtable_lsx

.l_\lable\()hv0_4w_lsx:
    fld.s f0, a1, 0
    fldx.s f1, a1, a2
    add.d a1, a1, t2
    vilvl.w vr0, vr1, vr0
    vsllwil.hu.bu vr0, vr0, 4
    vst vr0, a0, 0
    addi.d a0, a0, 16
    addi.d a4, a4, -2
    bnez a4, .l_\lable\()hv0_4w_lsx
    b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv0_8w_lsx:
    fld.d f0, a1, 0
4406 fldx.d f1, a1, a2 4407 add.d a1, a1, t2 4408 vsllwil.hu.bu vr0, vr0, 4 4409 vsllwil.hu.bu vr1, vr1, 4 4410 vst vr0, a0, 0 4411 vst vr1, a0, 16 4412 addi.d a0, a0, 32 4413 addi.d a4, a4, -2 4414 bnez a4, .l_\lable\()hv0_8w_lsx 4415 b .l_\lable\()end_pre_8tap_lsx 4416.l_\lable\()hv0_16w_lsx: 4417 vld vr0, a1, 0 4418 vldx vr1, a1, a2 4419 add.d a1, a1, t2 4420 vsllwil.hu.bu vr2, vr0, 4 4421 vsllwil.hu.bu vr4, vr1, 4 4422 vexth.hu.bu vr3, vr0 4423 vexth.hu.bu vr5, vr1 4424 vslli.h vr3, vr3, 4 4425 vslli.h vr5, vr5, 4 4426 vst vr2, a0, 0 4427 vst vr3, a0, 16 4428 vst vr4, a0, 32 4429 vst vr5, a0, 48 4430 addi.d a0, a0, 64 4431 addi.d a4, a4, -2 4432 bnez a4, .l_\lable\()hv0_16w_lsx 4433 b .l_\lable\()end_pre_8tap_lsx 4434.l_\lable\()hv0_32w_lsx: 4435.l_\lable\()hv0_64w_lsx: 4436.l_\lable\()hv0_128w_lsx: 4437 addi.d t0, a1, 0 4438 addi.d t5, a4, 0 4439 srli.w t7, a3, 4 4440 slli.w t7, t7, 5 4441 addi.d t8, a0, 0 4442.l_\lable\()hv0_16_loop_lsx: 4443 vld vr0, a1, 0 4444 vldx vr1, a1, a2 4445 add.d a1, a1, t2 4446 vsllwil.hu.bu vr2, vr0, 4 4447 vsllwil.hu.bu vr3, vr1, 4 4448 vexth.hu.bu vr0, vr0 4449 vexth.hu.bu vr1, vr1 4450 vslli.h vr0, vr0, 4 4451 vslli.h vr1, vr1, 4 4452 vst vr2, a0, 0 4453 vst vr0, a0, 16 4454 add.d a0, a0, t7 4455 vst vr3, a0, 0 4456 vst vr1, a0, 16 4457 add.d a0, a0, t7 4458 addi.d a4, a4, -2 4459 bnez a4, .l_\lable\()hv0_16_loop_lsx 4460 addi.d a1, t0, 16 4461 addi.d t0, t0, 16 4462 addi.d a0, t8, 32 4463 addi.d t8, t8, 32 4464 addi.d a4, t5, 0 4465 addi.d a3, a3, -16 4466 bnez a3, .l_\lable\()hv0_16_loop_lsx 4467 b .l_\lable\()end_pre_8tap_lsx 4468.l_\lable\()h_lsx: 4469 bnez a6, .l_\lable\()hv_lsx //if(fh) && if (fv) 4470 4471 andi t1, a7, 3 4472 blt t0, a3, .l_\lable\()h_idx_fh_lsx 4473 andi t1, a7, 1 4474 addi.w t1, t1, 3 4475.l_\lable\()h_idx_fh_lsx: 4476 addi.w t5, zero, 120 4477 mul.w t1, t1, t5 4478 addi.w t5, a5, -1 4479 slli.w t5, t5, 3 4480 add.w t1, t1, t5 4481 add.d t1, t6, t1 //fh's offset 4482 vldrepl.d vr23, t1, 0 4483 4484 addi.d a1, a1, -3 4485 clz.w t1, a3 4486 li.w t5, 24 4487 sub.w t1, t1, t5 4488 la.local t5, .l_\lable\()prep_h_jtable_lsx 4489 alsl.d t1, t1, t5, 1 4490 ld.h t8, t1, 0 4491 add.d t5, t5, t8 4492 jirl $r0, t5, 0 4493 4494 .align 3 4495.l_\lable\()prep_h_jtable_lsx: 4496 .hword .l_\lable\()h_128w_lsx - .l_\lable\()prep_h_jtable_lsx 4497 .hword .l_\lable\()h_64w_lsx - .l_\lable\()prep_h_jtable_lsx 4498 .hword .l_\lable\()h_32w_lsx - .l_\lable\()prep_h_jtable_lsx 4499 .hword .l_\lable\()h_16w_lsx - .l_\lable\()prep_h_jtable_lsx 4500 .hword .l_\lable\()h_8w_lsx - .l_\lable\()prep_h_jtable_lsx 4501 .hword .l_\lable\()h_4w_lsx - .l_\lable\()prep_h_jtable_lsx 4502 4503.l_\lable\()h_4w_lsx: 4504 addi.d a1, a1, 2 4505 la.local t7, subpel_h_shuf1 4506 vld vr7, t7, 0 4507 vbsrl.v vr23, vr23, 2 4508 vreplvei.w vr23, vr23, 0 4509.l_\lable\()h_4w_loop_lsx: 4510 vld vr0, a1, 0 4511 vldx vr1, a1, a2 4512 add.d a1, a1, t2 4513 vshuf.b vr0, vr0, vr0, vr7 4514 vshuf.b vr1, vr1, vr1, vr7 4515 vmulwev.h.bu.b vr2, vr0, vr23 4516 vmulwev.h.bu.b vr3, vr1, vr23 4517 vmaddwod.h.bu.b vr2, vr0, vr23 4518 vmaddwod.h.bu.b vr3, vr1, vr23 4519 vhaddw.w.h vr0, vr2, vr2 4520 vhaddw.w.h vr1, vr3, vr3 4521 vssrarni.h.w vr1, vr0, 2 4522 vst vr1, a0, 0 4523 addi.d a0, a0, 16 4524 addi.w a4, a4, -2 4525 bnez a4, .l_\lable\()h_4w_loop_lsx 4526 b .l_\lable\()end_pre_8tap_lsx 4527 4528.l_\lable\()h_8w_lsx: 4529 vreplvei.w vr22, vr23, 0 //fh 4530 vreplvei.w vr23, vr23, 1 4531 la.local t7, subpel_h_shuf1 4532 vld vr6, t7, 0 4533 vaddi.bu vr7, vr6, 4 4534 vaddi.bu vr8, vr6, 8 
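/*
 * A minimal C sketch of what the horizontal prep loops below compute
 * (illustrative only; fh[] stands for the 8 signed taps selected above, and
 * the rounding shift of 2 matches the vssrarni/vsrari #2 used in these
 * paths, i.e. 6 - intermediate_bits for 8 bpc):
 *
 *   for (int y = 0; y < h; y++) {
 *       for (int x = 0; x < w; x++) {
 *           int sum = 0;
 *           for (int k = 0; k < 8; k++)
 *               sum += fh[k] * src[x + k - 3];
 *           tmp[x] = (int16_t)((sum + 2) >> 2);
 *       }
 *       src += src_stride;
 *       tmp += w;
 *   }
 */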
4535.l_\lable\()h_8w_loop_lsx: 4536 vld vr0, a1, 0 4537 vldx vr1, a1, a2 4538 add.d a1, a1, t2 4539 PREP_H_8W vr0 4540 PREP_H_8W vr1 4541 vst vr0, a0, 0 4542 vst vr1, a0, 16 4543 addi.d a0, a0, 32 4544 addi.d a4, a4, -2 4545 bnez a4, .l_\lable\()h_8w_loop_lsx 4546 b .l_\lable\()end_pre_8tap_lsx 4547 4548.l_\lable\()h_16w_lsx: 4549.l_\lable\()h_32w_lsx: 4550.l_\lable\()h_64w_lsx: 4551.l_\lable\()h_128w_lsx: 4552 vreplvei.w vr22, vr23, 0 //fh 4553 vreplvei.w vr23, vr23, 1 4554 la.local t7, subpel_h_shuf1 4555 vld vr6, t7, 0 4556 vaddi.bu vr7, vr6, 4 4557 vaddi.bu vr8, vr6, 8 4558 srli.w t7, a3, 4 4559 slli.w t6, t7, 5 4560.l_\lable\()h_16w_loop0_lsx: 4561 addi.d t0, a1, 0 //src 4562 addi.d t5, a4, 0 //h 4563 addi.d t8, a0, 0 //dst 4564.l_\lable\()h_16w_loop_lsx: 4565 vld vr0, a1, 0 4566 vld vr1, a1, 8 4567 add.d a1, a1, a2 4568 PREP_H_8W vr0 4569 PREP_H_8W vr1 4570 vst vr0, a0, 0 4571 vst vr1, a0, 16 4572 add.d a0, a0, t6 4573 addi.d t5, t5, -1 4574 bnez t5, .l_\lable\()h_16w_loop_lsx 4575 addi.d a1, t0, 16 4576 addi.d a0, t8, 32 4577 addi.w t7, t7, -1 4578 bnez t7, .l_\lable\()h_16w_loop0_lsx 4579 b .l_\lable\()end_pre_8tap_lsx 4580 4581.l_\lable\()hv_lsx: 4582 andi t1, a7, 3 4583 blt t0, a3, .l_\lable\()hv_idx_fh_lsx 4584 andi t1, a7, 1 4585 addi.w t1, t1, 3 4586.l_\lable\()hv_idx_fh_lsx: 4587 addi.w t5, zero, 120 4588 mul.w t1, t1, t5 4589 addi.w t5, a5, -1 4590 slli.w t5, t5, 3 4591 add.w t1, t1, t5 4592 add.d t1, t6, t1 //fh's offset 4593 vldrepl.d vr8, t1, 0 4594 srli.w a7, a7, 2 4595 blt t0, a4, .l_\lable\()hv_idx_fv_lsx 4596 andi a7, a7, 1 4597 addi.w a7, a7, 3 4598.l_\lable\()hv_idx_fv_lsx: 4599 addi.w t5, zero, 120 4600 mul.w a7, a7, t5 4601 addi.w t5, a6, -1 4602 slli.w t5, t5, 3 4603 add.w a7, a7, t5 4604 add.d a7, t6, a7 //fv's offset 4605 vldrepl.d vr9, a7, 0 4606 vsllwil.h.b vr9, vr9, 0 4607 4608 sub.d a1, a1, t3 4609 addi.d a1, a1, -3 4610 beq a3, t0, .l_\lable\()hv_4w_lsx 4611 b .l_\lable\()hv_8w_lsx 4612.l_\lable\()hv_4w_lsx: 4613 addi.d a1, a1, 2 //ignore leading 0s 4614 vld vr0, a1, 0 4615 vldx vr1, a1, a2 4616 vldx vr2, a1, t2 4617 add.d a1, a1, t3 4618 vld vr3, a1, 0 4619 vldx vr4, a1, a2 4620 vldx vr5, a1, t2 4621 vldx vr6, a1, t3 4622 add.d a1, a1, t4 4623 4624 la.local t1, subpel_h_shuf1 4625 vld vr7, t1, 0 4626 vbsrl.v vr8, vr8, 2 4627 vreplvei.w vr8, vr8, 0 4628 4629 //fv 4630 vreplvei.w vr17, vr9, 0 4631 vreplvei.w vr18, vr9, 1 4632 vreplvei.w vr19, vr9, 2 4633 vreplvei.w vr20, vr9, 3 4634 4635 //DAV1D_FILTER_8TAP_RND 4636 vshuf.b vr0, vr0, vr0, vr7 4637 vshuf.b vr1, vr1, vr1, vr7 4638 vshuf.b vr2, vr2, vr2, vr7 4639 vshuf.b vr3, vr3, vr3, vr7 4640 vshuf.b vr4, vr4, vr4, vr7 4641 vshuf.b vr5, vr5, vr5, vr7 4642 vshuf.b vr6, vr6, vr6, vr7 4643 4644 vmulwev.h.bu.b vr10, vr0, vr8 4645 vmulwev.h.bu.b vr11, vr1, vr8 4646 vmulwev.h.bu.b vr12, vr2, vr8 4647 vmulwev.h.bu.b vr13, vr3, vr8 4648 vmulwev.h.bu.b vr14, vr4, vr8 4649 vmulwev.h.bu.b vr15, vr5, vr8 4650 vmulwev.h.bu.b vr16, vr6, vr8 4651 vmaddwod.h.bu.b vr10, vr0, vr8 4652 vmaddwod.h.bu.b vr11, vr1, vr8 4653 vmaddwod.h.bu.b vr12, vr2, vr8 4654 vmaddwod.h.bu.b vr13, vr3, vr8 4655 vmaddwod.h.bu.b vr14, vr4, vr8 4656 vmaddwod.h.bu.b vr15, vr5, vr8 4657 vmaddwod.h.bu.b vr16, vr6, vr8 4658 4659 vhaddw.w.h vr10, vr10, vr10 4660 vhaddw.w.h vr11, vr11, vr11 4661 vhaddw.w.h vr12, vr12, vr12 4662 vhaddw.w.h vr13, vr13, vr13 4663 vhaddw.w.h vr14, vr14, vr14 4664 vhaddw.w.h vr15, vr15, vr15 4665 vhaddw.w.h vr16, vr16, vr16 4666 4667 vssrarni.h.w vr10, vr10, 2 //h0 4668 vssrarni.h.w vr11, vr11, 2 //h1 4669 vssrarni.h.w vr12, 
vr12, 2 //h2 4670 vssrarni.h.w vr13, vr13, 2 //h3 4671 vssrarni.h.w vr14, vr14, 2 //h4 4672 vssrarni.h.w vr15, vr15, 2 //h5 4673 vssrarni.h.w vr16, vr16, 2 //h6 4674 4675 //h0 4676 vilvl.h vr0, vr11, vr10 //01 4677 vilvl.h vr1, vr13, vr12 //23 4678 vilvl.h vr2, vr15, vr14 //45 4679 //h1 4680 vilvl.h vr4, vr12, vr11 //12 4681 vilvl.h vr5, vr14, vr13 //34 4682 vilvl.h vr6, vr16, vr15 //56 4683 4684.l_\lable\()hv_w4_loop_lsx: 4685 vld vr9, a1, 0 4686 vldx vr10, a1, a2 4687 add.d a1, a1, t2 4688 4689 //DAV1D_FILTER_8TAP_CLIP 4690 vshuf.b vr9, vr9, vr9, vr7 4691 vshuf.b vr10, vr10, vr10, vr7 4692 vmulwev.h.bu.b vr11, vr9, vr8 4693 vmulwev.h.bu.b vr12, vr10, vr8 4694 vmaddwod.h.bu.b vr11, vr9, vr8 4695 vmaddwod.h.bu.b vr12, vr10, vr8 4696 vhaddw.w.h vr11, vr11, vr11 4697 vhaddw.w.h vr12, vr12, vr12 4698 vssrarni.h.w vr11, vr11, 2 //7h 4699 vssrarni.h.w vr12, vr12, 2 //h8 4700 vilvl.h vr3, vr11, vr16 //67 4701 vilvl.h vr13, vr12, vr11 //78 4702 4703 vmulwev.w.h vr9, vr0, vr17 4704 vmulwev.w.h vr10, vr1, vr18 4705 vmulwev.w.h vr14, vr2, vr19 4706 vmulwev.w.h vr15, vr3, vr20 4707 vmaddwod.w.h vr9, vr0, vr17 4708 vmaddwod.w.h vr10, vr1, vr18 4709 vmaddwod.w.h vr14, vr2, vr19 4710 vmaddwod.w.h vr15, vr3, vr20 4711 vadd.w vr16, vr9, vr10 4712 vadd.w vr16, vr16, vr14 4713 vadd.w vr16, vr16, vr15 4714 4715 vmulwev.w.h vr9, vr4, vr17 4716 vmulwev.w.h vr10, vr5, vr18 4717 vmulwev.w.h vr14, vr6, vr19 4718 vmulwev.w.h vr15, vr13, vr20 4719 vmaddwod.w.h vr9, vr4, vr17 4720 vmaddwod.w.h vr10, vr5, vr18 4721 vmaddwod.w.h vr14, vr6, vr19 4722 vmaddwod.w.h vr15, vr13, vr20 4723 vadd.w vr21, vr9, vr10 4724 vadd.w vr21, vr21, vr14 4725 vadd.w vr21, vr21, vr15 4726 4727 vssrarni.h.w vr21, vr16, 6 4728 //cache 4729 vaddi.hu vr0, vr1, 0 4730 vaddi.hu vr1, vr2, 0 4731 vaddi.hu vr2, vr3, 0 4732 vaddi.hu vr4, vr5, 0 4733 vaddi.hu vr5, vr6, 0 4734 vaddi.hu vr6, vr13, 0 4735 vaddi.hu vr16, vr12, 0 4736 4737 vst vr21, a0, 0 4738 addi.d a0, a0, 16 4739 addi.d a4, a4, -2 4740 bnez a4, .l_\lable\()hv_w4_loop_lsx 4741 b .l_\lable\()end_pre_8tap_lsx 4742 4743.l_\lable\()hv_8w_lsx: 4744.l_\lable\()hv_16w_lsx: 4745.l_\lable\()hv_32w_lsx: 4746.l_\lable\()hv_64w_lsx: 4747.l_\lable\()hv_128w_lsx: 4748 addi.d sp, sp, -8*8 4749 fst.d f24, sp, 0 4750 fst.d f25, sp, 8 4751 fst.d f26, sp, 16 4752 fst.d f27, sp, 24 4753 fst.d f28, sp, 32 4754 fst.d f29, sp, 40 4755 fst.d f30, sp, 48 4756 fst.d f31, sp, 56 4757 addi.d t0, a1, 0 //src 4758 addi.d t5, a4, 0 //h 4759 addi.d t8, a0, 0 //dst 4760 slli.w t6, a3, 1 4761 la.local t1, subpel_h_shuf1 4762 vld vr7, t1, 0 4763 vaddi.bu vr11, vr7, 4 4764 vaddi.bu vr12, vr7, 8 4765 vreplvei.w vr10, vr8, 1 4766 vreplvei.w vr8, vr8, 0 4767 vreplvei.w vr20, vr9, 1 4768 vreplvei.w vr21, vr9, 2 4769 vreplvei.w vr22, vr9, 3 4770 vreplvei.w vr9, vr9, 0 4771.l_\lable\()prep_hv_8w_loop0_lsx: 4772 vld vr0, a1, 0 4773 vldx vr1, a1, a2 4774 vldx vr2, a1, t2 4775 add.d a1, a1, t3 4776 vld vr3, a1, 0 4777 vldx vr4, a1, a2 4778 vldx vr5, a1, t2 4779 vldx vr6, a1, t3 4780 add.d a1, a1, t4 4781 4782 FILTER_8TAP_8W vr0 //h0 4783 FILTER_8TAP_8W vr1 //h1 4784 FILTER_8TAP_8W vr2 //h2 4785 FILTER_8TAP_8W vr3 //h3 4786 FILTER_8TAP_8W vr4 //h4 4787 FILTER_8TAP_8W vr5 //h5 4788 FILTER_8TAP_8W vr6 //h6 4789 4790 //h0' low part 4791 vilvl.h vr23, vr1, vr0 //01 4792 vilvl.h vr24, vr3, vr2 //23 4793 vilvl.h vr25, vr5, vr4 //45 4794 //h0' high part 4795 vilvh.h vr26, vr1, vr0 //01 4796 vilvh.h vr27, vr3, vr2 //23 4797 vilvh.h vr28, vr5, vr4 //45 4798 4799 //h1' low part 4800 vilvl.h vr29, vr2, vr1 //12 4801 vilvl.h vr30, vr4, vr3 
//34 4802 vilvl.h vr31, vr6, vr5 //56 4803 //h1' high part 4804 vilvh.h vr0, vr2, vr1 //12 4805 vilvh.h vr1, vr4, vr3 //34 4806 vilvh.h vr2, vr6, vr5 //56 4807 4808.l_\lable\()prep_hv_8w_loop_lsx: 4809 vld vr3, a1, 0 4810 vldx vr4, a1, a2 4811 add.d a1, a1, t2 4812 4813 FILTER_8TAP_8W vr3 //h7 4814 FILTER_8TAP_8W vr4 //h8 4815 4816 //h0' low part 4817 vilvl.h vr16, vr3, vr6 //67 ~low 4818 vmulwev.w.h vr13, vr23, vr9 4819 vmulwev.w.h vr14, vr24, vr20 4820 vmulwev.w.h vr15, vr25, vr21 4821 vmulwev.w.h vr17, vr16, vr22 4822 vmaddwod.w.h vr13, vr23, vr9 4823 vmaddwod.w.h vr14, vr24, vr20 4824 vmaddwod.w.h vr15, vr25, vr21 4825 vmaddwod.w.h vr17, vr16, vr22 4826 vadd.w vr13, vr13, vr14 4827 vadd.w vr13, vr13, vr15 4828 vadd.w vr13, vr13, vr17 4829 //cache 4830 vaddi.hu vr23, vr24, 0 4831 vaddi.hu vr24, vr25, 0 4832 vaddi.hu vr25, vr16, 0 4833 4834 //h0' high part 4835 vilvh.h vr17, vr3, vr6 //67 ~high 4836 vmulwev.w.h vr14, vr26, vr9 4837 vmulwev.w.h vr15, vr27, vr20 4838 vmulwev.w.h vr16, vr28, vr21 4839 vmulwev.w.h vr18, vr17, vr22 4840 vmaddwod.w.h vr14, vr26, vr9 4841 vmaddwod.w.h vr15, vr27, vr20 4842 vmaddwod.w.h vr16, vr28, vr21 4843 vmaddwod.w.h vr18, vr17, vr22 4844 vadd.w vr14, vr14, vr15 4845 vadd.w vr14, vr14, vr16 4846 vadd.w vr14, vr14, vr18 4847 vssrarni.h.w vr14, vr13, 6 4848 vst vr14, a0, 0 4849 add.d a0, a0, t6 4850 //cache 4851 vaddi.hu vr26, vr27, 0 4852 vaddi.hu vr27, vr28, 0 4853 vaddi.hu vr28, vr17, 0 4854 vaddi.hu vr6, vr4, 0 4855 4856 vilvl.h vr5, vr4, vr3 //78 ~low 4857 vilvh.h vr4, vr4, vr3 //78 ~high 4858 4859 //h1' low part 4860 vmulwev.w.h vr13, vr29, vr9 4861 vmulwev.w.h vr14, vr30, vr20 4862 vmulwev.w.h vr15, vr31, vr21 4863 vmulwev.w.h vr16, vr5, vr22 4864 vmaddwod.w.h vr13, vr29, vr9 4865 vmaddwod.w.h vr14, vr30, vr20 4866 vmaddwod.w.h vr15, vr31, vr21 4867 vmaddwod.w.h vr16, vr5, vr22 4868 vadd.w vr13, vr13, vr14 4869 vadd.w vr13, vr13, vr15 4870 vadd.w vr13, vr13, vr16 4871 //cache 4872 vaddi.hu vr29, vr30, 0 4873 vaddi.hu vr30, vr31, 0 4874 vaddi.hu vr31, vr5, 0 4875 4876 //h1' high part 4877 vmulwev.w.h vr14, vr0, vr9 4878 vmulwev.w.h vr15, vr1, vr20 4879 vmulwev.w.h vr16, vr2, vr21 4880 vmulwev.w.h vr17, vr4, vr22 4881 vmaddwod.w.h vr14, vr0, vr9 4882 vmaddwod.w.h vr15, vr1, vr20 4883 vmaddwod.w.h vr16, vr2, vr21 4884 vmaddwod.w.h vr17, vr4, vr22 4885 vadd.w vr14, vr14, vr15 4886 vadd.w vr14, vr14, vr16 4887 vadd.w vr14, vr14, vr17 4888 vssrarni.h.w vr14, vr13, 6 4889 vst vr14, a0, 0 4890 add.d a0, a0, t6 4891 //cache 4892 vaddi.hu vr0, vr1, 0 4893 vaddi.hu vr1, vr2, 0 4894 vaddi.hu vr2, vr4, 0 4895 addi.w a4, a4, -2 4896 bnez a4, .l_\lable\()prep_hv_8w_loop_lsx 4897 addi.d a1, t0, 8 4898 addi.d t0, t0, 8 4899 addi.d a0, t8, 16 4900 addi.d t8, t8, 16 4901 addi.d a4, t5, 0 4902 addi.w a3, a3, -8 4903 bnez a3, .l_\lable\()prep_hv_8w_loop0_lsx 4904 fld.d f24, sp, 0 4905 fld.d f25, sp, 8 4906 fld.d f26, sp, 16 4907 fld.d f27, sp, 24 4908 fld.d f28, sp, 32 4909 fld.d f29, sp, 40 4910 fld.d f30, sp, 48 4911 fld.d f31, sp, 56 4912 addi.d sp, sp, 8*8 4913 b .l_\lable\()end_pre_8tap_lsx 4914 4915.l_\lable\()v_lsx: 4916 srli.w a7, a7, 2 4917 blt t0, a4, .l_\lable\()v_idx_fv_lsx 4918 andi a7, a7, 1 4919 addi.w a7, a7, 3 4920.l_\lable\()v_idx_fv_lsx: 4921 addi.w t5, zero, 120 4922 mul.w a7, a7, t5 4923 addi.w t5, a6, -1 4924 slli.w t5, t5, 3 4925 add.w a7, a7, t5 4926 add.d a7, t6, a7 //fv's offset 4927 vldrepl.d vr8, a7, 0 4928 4929 vilvl.h vr8, vr8, vr8 4930 vreplvei.w vr9, vr8, 1 4931 vreplvei.w vr10, vr8, 2 4932 vreplvei.w vr11, vr8, 3 4933 vreplvei.w vr8, vr8, 0 
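/*
 * Vertical-only prep, sketched in C (illustrative; fv[] are the 8 signed
 * taps loaded above, and the >> 2 matches the vsrari #2 rounding below):
 *
 *   for (int y = 0; y < h; y++) {
 *       for (int x = 0; x < w; x++) {
 *           int sum = 0;
 *           for (int k = 0; k < 8; k++)
 *               sum += fv[k] * src[(y + k - 3) * src_stride + x];
 *           tmp[y * w + x] = (int16_t)((sum + 2) >> 2);
 *       }
 *   }
 *
 * The asm keeps the taps as four duplicated (fv[2k], fv[2k+1]) pairs in
 * vr8..vr11 and feeds them byte-interleaved row pairs, so each widening
 * multiply-add consumes two taps at once.
 */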
4934 4935 sub.d a1, a1, t3 4936 beq a3, t0, .l_\lable\()v_4w_lsx 4937 blt t0, a3, .l_\lable\()v_8w_lsx 4938.l_\lable\()v_4w_lsx: 4939 fld.s f0, a1, 0 4940 fldx.s f1, a1, a2 4941 fldx.s f2, a1, t2 4942 add.d a1, a1, t3 4943 fld.s f3, a1, 0 4944 fldx.s f4, a1, a2 4945 fldx.s f5, a1, t2 4946 fldx.s f6, a1, t3 4947 add.d a1, a1, t4 4948 4949 vilvl.w vr0, vr1, vr0 4950 vilvl.w vr1, vr2, vr1 4951 vilvl.b vr0, vr1, vr0 //0 1 1 2 4952 vilvl.w vr1, vr3, vr2 4953 vilvl.w vr2, vr4, vr3 4954 vilvl.b vr1, vr2, vr1 //2 3 3 4 4955 vilvl.w vr2, vr5, vr4 4956 vilvl.w vr3, vr6, vr5 4957 vilvl.b vr2, vr3, vr2 //4 5 5 6 4958.l_\lable\()v_4w_loop_lsx: 4959 fld.s f7, a1, 0 4960 4961 vilvl.w vr3, vr7, vr6 4962 fldx.s f6, a1, a2 4963 add.d a1, a1, t2 4964 vilvl.w vr4, vr6, vr7 4965 vilvl.b vr3, vr4, vr3 //6 7 7 8 4966 4967 vmulwev.h.bu.b vr12, vr0, vr8 4968 vmulwev.h.bu.b vr13, vr1, vr9 4969 vmulwev.h.bu.b vr14, vr2, vr10 4970 vmulwev.h.bu.b vr15, vr3, vr11 4971 vmaddwod.h.bu.b vr12, vr0, vr8 4972 vmaddwod.h.bu.b vr13, vr1, vr9 4973 vmaddwod.h.bu.b vr14, vr2, vr10 4974 vmaddwod.h.bu.b vr15, vr3, vr11 4975 vaddi.hu vr0, vr1, 0 4976 vaddi.hu vr1, vr2, 0 4977 vaddi.hu vr2, vr3, 0 4978 vadd.h vr12, vr12, vr13 4979 vadd.h vr12, vr12, vr14 4980 vadd.h vr12, vr12, vr15 4981 4982 vsrari.h vr12, vr12, 2 4983 vst vr12, a0, 0 4984 addi.d a0, a0, 16 4985 addi.w a4, a4, -2 4986 bnez a4, .l_\lable\()v_4w_loop_lsx 4987 b .l_\lable\()end_pre_8tap_lsx 4988 4989.l_\lable\()v_8w_lsx: 4990 addi.d t0, a1, 0 4991 addi.d t5, a4, 0 4992 addi.d t8, a0, 0 4993 slli.w t6, a3, 1 4994.l_\lable\()v_8w_loop0_lsx: 4995 fld.d f0, a1, 0 4996 fldx.d f1, a1, a2 4997 fldx.d f2, a1, t2 4998 add.d a1, a1, t3 4999 fld.d f3, a1, 0 5000 fldx.d f4, a1, a2 5001 fldx.d f5, a1, t2 5002 fldx.d f6, a1, t3 5003 add.d a1, a1, t4 5004 5005 vilvl.b vr0, vr1, vr0 //0 1 5006 vilvl.b vr1, vr2, vr1 //1 2 5007 vilvl.b vr2, vr3, vr2 //2 3 5008 vilvl.b vr3, vr4, vr3 //3 4 5009 vilvl.b vr4, vr5, vr4 //4 5 5010 vilvl.b vr5, vr6, vr5 //5 6 5011.l_\lable\()v_8w_loop_lsx: 5012 fld.d f7, a1, 0 5013 vilvl.b vr12, vr7, vr6 //6 7 5014 fldx.d f6, a1, a2 5015 add.d a1, a1, t2 5016 vilvl.b vr13, vr6, vr7 //7 8 5017 5018 vmulwev.h.bu.b vr14, vr0, vr8 5019 vmulwev.h.bu.b vr15, vr1, vr8 5020 vmulwev.h.bu.b vr16, vr2, vr9 5021 vmulwev.h.bu.b vr17, vr3, vr9 5022 vmulwev.h.bu.b vr18, vr4, vr10 5023 vmulwev.h.bu.b vr19, vr5, vr10 5024 vmulwev.h.bu.b vr20, vr12, vr11 5025 vmulwev.h.bu.b vr21, vr13, vr11 5026 vmaddwod.h.bu.b vr14, vr0, vr8 5027 vmaddwod.h.bu.b vr15, vr1, vr8 5028 vmaddwod.h.bu.b vr16, vr2, vr9 5029 vmaddwod.h.bu.b vr17, vr3, vr9 5030 vmaddwod.h.bu.b vr18, vr4, vr10 5031 vmaddwod.h.bu.b vr19, vr5, vr10 5032 vmaddwod.h.bu.b vr20, vr12, vr11 5033 vmaddwod.h.bu.b vr21, vr13, vr11 5034 5035 vaddi.hu vr0, vr2, 0 5036 vaddi.hu vr1, vr3, 0 5037 vaddi.hu vr2, vr4, 0 5038 vaddi.hu vr3, vr5, 0 5039 vaddi.hu vr4, vr12, 0 5040 vaddi.hu vr5, vr13, 0 5041 vadd.h vr14, vr14, vr16 5042 vadd.h vr14, vr14, vr18 5043 vadd.h vr14, vr14, vr20 5044 vadd.h vr15, vr15, vr17 5045 vadd.h vr15, vr15, vr19 5046 vadd.h vr15, vr15, vr21 5047 5048 vsrari.h vr14, vr14, 2 5049 vsrari.h vr15, vr15, 2 5050 vst vr14, a0, 0 5051 add.d a0, a0, t6 5052 vst vr15, a0, 0 5053 add.d a0, a0, t6 5054 addi.w a4, a4, -2 5055 bnez a4, .l_\lable\()v_8w_loop_lsx 5056 addi.d a1, t0, 8 5057 addi.d t0, t0, 8 5058 addi.d a0, t8, 16 5059 addi.d t8, t8, 16 5060 addi.d a4, t5, 0 5061 addi.d a3, a3, -8 5062 bnez a3, .l_\lable\()v_8w_loop0_lsx 5063.l_\lable\()end_pre_8tap_lsx: 5064.endm 5065 5066function prep_8tap_regular_8bpc_lsx 
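/*
 * This and the following wrappers only load a7 with the filter-type pair
 * ((fv_idx << 2) | fh_idx) and expand PREP_8TAP_8BPC_LSX. A sketch of the
 * tap lookup the macro performs from that value when mx/my are non-zero
 * (names illustrative; the offsets mirror the idx * 120 + (m - 1) * 8
 * arithmetic in the macro):
 *
 *   int fh_idx = w > 4 ? (type & 3) : 3 + (type & 1);
 *   int fv_idx = h > 4 ? (type >> 2) : 3 + ((type >> 2) & 1);
 *   const int8_t *fh = (const int8_t *)dav1d_mc_subpel_filters
 *                      + fh_idx * 120 + (mx - 1) * 8;   // 8 horizontal taps
 *   const int8_t *fv = (const int8_t *)dav1d_mc_subpel_filters
 *                      + fv_idx * 120 + (my - 1) * 8;   // 8 vertical taps
 */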
5067 addi.w a7, zero, 0 5068 PREP_8TAP_8BPC_LSX 0 5069endfunc 5070 5071function prep_8tap_smooth_regular_8bpc_lsx 5072 addi.w a7, zero, 1 5073 PREP_8TAP_8BPC_LSX 1 5074endfunc 5075 5076function prep_8tap_sharp_regular_8bpc_lsx 5077 addi.w a7, zero, 2 5078 PREP_8TAP_8BPC_LSX 2 5079endfunc 5080 5081function prep_8tap_regular_smooth_8bpc_lsx 5082 addi.w a7, zero, 4 5083 PREP_8TAP_8BPC_LSX 4 5084endfunc 5085 5086function prep_8tap_smooth_8bpc_lsx 5087 addi.w a7, zero, 5 5088 PREP_8TAP_8BPC_LSX 5 5089endfunc 5090 5091function prep_8tap_sharp_smooth_8bpc_lsx 5092 addi.w a7, zero, 6 5093 PREP_8TAP_8BPC_LSX 6 5094endfunc 5095 5096function prep_8tap_regular_sharp_8bpc_lsx 5097 addi.w a7, zero, 8 5098 PREP_8TAP_8BPC_LSX 8 5099endfunc 5100 5101function prep_8tap_smooth_sharp_8bpc_lsx 5102 addi.w a7, zero, 9 5103 PREP_8TAP_8BPC_LSX 9 5104endfunc 5105 5106function prep_8tap_sharp_8bpc_lsx 5107 addi.w a7, zero, 10 5108 PREP_8TAP_8BPC_LSX 10 5109endfunc 5110 5111/* 5112 * static void blend_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 5113 const int w, int h, const uint8_t *mask) 5114 */ 5115function blend_8bpc_lsx 5116 addi.d t8, zero, 64 5117 vreplgr2vr.b vr23, t8 5118 5119 clz.w t0, a3 5120 li.w t1, 26 5121 sub.w t0, t0, t1 5122 la.local t1, .BLEND_LSX_JRTABLE 5123 alsl.d t0, t0, t1, 1 5124 ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE 5125 add.d t1, t1, t2 // Get absolute address 5126 jirl $r0, t1, 0 5127 5128 .align 3 5129.BLEND_LSX_JRTABLE: 5130 .hword .BLEND_W32_LSX - .BLEND_LSX_JRTABLE 5131 .hword .BLEND_W16_LSX - .BLEND_LSX_JRTABLE 5132 .hword .BLEND_W8_LSX - .BLEND_LSX_JRTABLE 5133 .hword .BLEND_W4_LSX - .BLEND_LSX_JRTABLE 5134 5135.BLEND_W4_LSX: 5136 vld vr0, a0, 0 5137 vld vr1, a2, 0 5138 vld vr2, a5, 0 5139 5140 vsllwil.hu.bu vr1, vr1, 0 5141 vsllwil.hu.bu vr4, vr2, 0 5142 vmul.h vr1, vr1, vr4 //b*m 5143 vsub.b vr3, vr23, vr2 5144 vsllwil.hu.bu vr0, vr0, 0 5145 vsllwil.hu.bu vr3, vr3, 0 5146 vmadd.h vr1, vr0, vr3 5147 vssrarni.bu.h vr1, vr1, 6 5148 5149 vstelm.w vr1, a0, 0, 0 5150 addi.w a4, a4, -1 5151 add.d a0, a0, a1 5152 addi.d a2, a2, 4 5153 addi.d a5, a5, 4 5154 5155 blt zero, a4, .BLEND_W4_LSX 5156 b .BLEND_END_LSX 5157.BLEND_W8_LSX: 5158 vld vr0, a0, 0 5159 vld vr1, a2, 0 5160 vld vr2, a5, 0 5161 5162 vsllwil.hu.bu vr1, vr1, 0 5163 vsllwil.hu.bu vr4, vr2, 0 5164 vmul.h vr1, vr1, vr4 //b*m 5165 vsub.b vr3, vr23, vr2 5166 vsllwil.hu.bu vr0, vr0, 0 5167 vsllwil.hu.bu vr3, vr3, 0 5168 vmadd.h vr1, vr0, vr3 5169 vssrarni.bu.h vr1, vr1, 6 5170 5171 vstelm.d vr1, a0, 0, 0 5172 addi.w a4, a4, -1 5173 add.d a0, a0, a1 5174 addi.d a2, a2, 8 5175 addi.d a5, a5, 8 5176 5177 blt zero, a4, .BLEND_W8_LSX 5178 b .BLEND_END_LSX 5179.BLEND_W16_LSX: 5180 vld vr0, a0, 0 5181 vld vr1, a2, 0 5182 vld vr2, a5, 0 5183 5184 vexth.hu.bu vr5, vr1 5185 vsllwil.hu.bu vr1, vr1, 0 5186 vexth.hu.bu vr6, vr2 5187 vsllwil.hu.bu vr4, vr2, 0 5188 vmul.h vr1, vr1, vr4 //b*m 5189 vmul.h vr5, vr5, vr6 //b*m 5190 vsub.b vr3, vr23, vr2 5191 vexth.hu.bu vr7, vr0 5192 vexth.hu.bu vr8, vr3 5193 vmadd.h vr5, vr7, vr8 5194 vsllwil.hu.bu vr0, vr0, 0 5195 vsllwil.hu.bu vr3, vr3, 0 5196 vmadd.h vr1, vr0, vr3 5197 vssrarni.bu.h vr5, vr1, 6 5198 5199 vst vr5, a0, 0 5200 addi.w a4, a4, -1 5201 add.d a0, a0, a1 5202 addi.d a2, a2, 16 5203 addi.d a5, a5, 16 5204 5205 blt zero, a4, .BLEND_W16_LSX 5206 b .BLEND_END_LSX 5207.BLEND_W32_LSX: 5208 vld vr0, a0, 0 5209 vld vr1, a2, 0 5210 vld vr2, a5, 0 5211 5212 vexth.hu.bu vr5, vr1 5213 vsllwil.hu.bu vr1, vr1, 0 5214 vexth.hu.bu vr6, vr2 5215 vsllwil.hu.bu vr4, 
vr2, 0 5216 vmul.h vr1, vr1, vr4 //b*m 5217 vmul.h vr5, vr5, vr6 //b*m 5218 vsub.b vr3, vr23, vr2 5219 vexth.hu.bu vr7, vr0 5220 vexth.hu.bu vr8, vr3 5221 vmadd.h vr5, vr7, vr8 5222 vsllwil.hu.bu vr0, vr0, 0 5223 vsllwil.hu.bu vr3, vr3, 0 5224 vmadd.h vr1, vr0, vr3 5225 vssrarni.bu.h vr5, vr1, 6 5226 5227 vst vr5, a0, 0 5228 5229 /* sencond */ 5230 vld vr0, a0, 16 5231 vld vr1, a2, 16 5232 vld vr2, a5, 16 5233 5234 vexth.hu.bu vr5, vr1 5235 vsllwil.hu.bu vr1, vr1, 0 5236 vexth.hu.bu vr6, vr2 5237 vsllwil.hu.bu vr4, vr2, 0 5238 vmul.h vr1, vr1, vr4 //b*m 5239 vmul.h vr5, vr5, vr6 //b*m 5240 vsub.b vr3, vr23, vr2 5241 vexth.hu.bu vr7, vr0 5242 vexth.hu.bu vr8, vr3 5243 vmadd.h vr5, vr7, vr8 5244 vsllwil.hu.bu vr0, vr0, 0 5245 vsllwil.hu.bu vr3, vr3, 0 5246 vmadd.h vr1, vr0, vr3 5247 vssrarni.bu.h vr5, vr1, 6 5248 5249 vst vr5, a0, 16 5250 addi.w a4, a4, -1 5251 add.d a0, a0, a1 5252 addi.d a2, a2, 32 5253 addi.d a5, a5, 32 5254 5255 blt zero, a4, .BLEND_W32_LSX 5256.BLEND_END_LSX: 5257 5258endfunc 5259 5260const obmc_masks_la 5261/* Unused */ 5262.byte 0, 0, 0, 0 5263/* 2 */ 5264.byte 45, 19, 64, 0 5265/* 4 */ 5266.byte 39, 25, 50, 14, 59, 5, 64, 0 5267/* 8 */ 5268.byte 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 5269/* 16 */ 5270.byte 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 5271.byte 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 5272/* 32 */ 5273.byte 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 5274.byte 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 5275.byte 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 5276endconst 5277 5278/* 5279 * static void blend_v_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 5280 const int w, int h) 5281 */ 5282function blend_v_8bpc_lsx 5283 la.local t8, obmc_masks_la 5284 5285 clz.w t0, a3 5286 li.w t1, 26 5287 sub.w t0, t0, t1 5288 la.local t1, .BLEND_V_LSX_JRTABLE 5289 alsl.d t0, t0, t1, 1 5290 ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE 5291 add.d t1, t1, t2 // Get absolute address 5292 jirl $r0, t1, 0 5293 5294 .align 3 5295.BLEND_V_LSX_JRTABLE: 5296 .hword .BLEND_V_W32_LSX - .BLEND_V_LSX_JRTABLE 5297 .hword .BLEND_V_W16_LSX - .BLEND_V_LSX_JRTABLE 5298 .hword .BLEND_V_W8_LSX - .BLEND_V_LSX_JRTABLE 5299 .hword .BLEND_V_W4_LSX - .BLEND_V_LSX_JRTABLE 5300 .hword .BLEND_V_W2_LSX - .BLEND_V_LSX_JRTABLE 5301 .hword .BLEND_V_W2_LSX_1 - .BLEND_V_LSX_JRTABLE //Instructions must be 4-byte aligned 5302 5303.BLEND_V_W2_LSX: 5304 ld.bu t6, t8, 4 5305 ld.bu t7, t8, 5 5306 5307.BLEND_V_W2_LSX_1: 5308 ld.bu t0, a0, 0 5309 ld.bu t1, a2, 0 5310 mul.d t0, t0, t6 5311 mul.d t1, t1, t7 5312 addi.d t0, t0, 32 5313 add.d t0, t0, t1 5314 srli.d t0, t0, 6 5315 st.b t0, a0, 0 5316 5317 addi.w a4, a4, -1 5318 add.d a0, a0, a1 5319 addi.d a2, a2, 2 5320 addi.d a5, a5, 2 5321 5322 blt zero, a4, .BLEND_V_W2_LSX_1 5323 b .BLEND_V_END_LSX 5324 5325.BLEND_V_W4_LSX: 5326 vld vr20, t8, 8 5327 5328.BLEND_V_W4_LSX_1: 5329 vld vr0, a0, 0 5330 vld vr1, a2, 0 5331 5332 vilvl.b vr0, vr1, vr0 5333 vdp2.h.bu vr1, vr0, vr20 5334 vssrarni.bu.h vr1, vr1, 6 5335 5336 vstelm.h vr1, a0, 0, 0 5337 vstelm.b vr1, a0, 2, 2 5338 addi.w a4, a4, -1 5339 add.d a0, a0, a1 5340 addi.d a2, a2, 4 5341 5342 blt zero, a4, .BLEND_V_W4_LSX_1 5343 b .BLEND_V_END_LSX 5344 5345.BLEND_V_W8_LSX: 5346 vld vr20, t8, 16 5347 5348.BLEND_V_W8_LSX_1: 5349 vld vr0, a0, 0 5350 vld vr1, a2, 0 5351 5352 vilvl.b vr0, vr1, vr0 5353 vdp2.h.bu vr1, vr0, vr20 5354 vssrarni.bu.h vr1, vr1, 6 5355 5356 vstelm.w vr1, a0, 0, 0 
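    // blend_v only writes the left (w * 3) >> 2 columns: for w == 8 the
    // 4-byte store above plus the 2-byte store below cover 6 pixels, and the
    // two rightmost pixels keep their original dst values.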
5357 vstelm.h vr1, a0, 4, 2 5358 addi.w a4, a4, -1 5359 add.d a0, a0, a1 5360 addi.d a2, a2, 8 5361 5362 blt zero, a4, .BLEND_V_W8_LSX_1 5363 b .BLEND_V_END_LSX 5364 5365.BLEND_V_W16_LSX: 5366 vld vr20, t8, 32 5367 vld vr21, t8, 48 5368 5369.BLEND_V_W16_LSX_1: 5370 vld vr0, a0, 0 5371 vld vr1, a2, 0 5372 5373 vilvl.b vr2, vr1, vr0 5374 vilvh.b vr3, vr1, vr0 5375 vmulwev.h.bu vr4, vr2, vr20 5376 vmulwev.h.bu vr5, vr3, vr21 5377 vmaddwod.h.bu vr4, vr2, vr20 5378 vmaddwod.h.bu vr5, vr3, vr21 5379 vssrarni.bu.h vr5, vr4, 6 5380 5381 vstelm.d vr5, a0, 0, 0 5382 vstelm.w vr5, a0, 8, 2 5383 addi.w a4, a4, -1 5384 add.d a0, a0, a1 5385 addi.d a2, a2, 16 5386 5387 blt zero, a4, .BLEND_V_W16_LSX_1 5388 b .BLEND_V_END_LSX 5389 5390.BLEND_V_W32_LSX: 5391 vld vr20, t8, 64 5392 vld vr21, t8, 80 5393 vld vr22, t8, 96 5394 5395.BLEND_V_W32_LSX_1: 5396 vld vr0, a0, 0 5397 vld vr1, a0, 16 5398 vld vr2, a2, 0 5399 vld vr3, a2, 16 5400 5401 vilvl.b vr4, vr2, vr0 5402 vmulwev.h.bu vr7, vr4, vr20 5403 vilvh.b vr5, vr2, vr0 5404 vmulwev.h.bu vr8, vr5, vr21 5405 vilvl.b vr6, vr3, vr1 5406 vmulwev.h.bu vr9, vr6, vr22 5407 vmaddwod.h.bu vr7, vr4, vr20 5408 vmaddwod.h.bu vr8, vr5, vr21 5409 vmaddwod.h.bu vr9, vr6, vr22 5410 vssrarni.bu.h vr8, vr7, 6 5411 vssrarni.bu.h vr9, vr9, 6 5412 5413 vst vr8, a0, 0 5414 vstelm.d vr9, a0, 16, 0 5415 addi.w a4, a4, -1 5416 add.d a0, a0, a1 5417 addi.d a2, a2, 32 5418 5419 blt zero, a4, .BLEND_V_W32_LSX_1 5420 5421.BLEND_V_END_LSX: 5422 5423endfunc 5424 5425/* 5426 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 5427 const int w, int h) 5428 */ 5429function blend_h_8bpc_lsx 5430 la.local t8, obmc_masks_la 5431 alsl.d t8, a4, t8, 1 5432 srli.d t0, a4, 1 5433 srli.d t1, a4, 2 5434 add.d a4, t0, t1 // h = (h * 3) >> 2; 5435 slli.d a4, a4, 1 5436 add.d a4, a4, t8 5437 5438 clz.w t0, a3 5439 li.w t1, 24 5440 sub.w t0, t0, t1 5441 la.local t1, .BLEND_H_LSX_JRTABLE 5442 alsl.d t0, t0, t1, 1 5443 ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE 5444 add.d t1, t1, t2 // Get absolute address 5445 jirl $r0, t1, 0 5446 5447 .align 3 5448.BLEND_H_LSX_JRTABLE: 5449 .hword .BLEND_H_W128_LSX - .BLEND_H_LSX_JRTABLE 5450 .hword .BLEND_H_W64_LSX - .BLEND_H_LSX_JRTABLE 5451 .hword .BLEND_H_W32_LSX - .BLEND_H_LSX_JRTABLE 5452 .hword .BLEND_H_W16_LSX - .BLEND_H_LSX_JRTABLE 5453 .hword .BLEND_H_W8_LSX - .BLEND_H_LSX_JRTABLE 5454 .hword .BLEND_H_W4_LSX - .BLEND_H_LSX_JRTABLE 5455 .hword .BLEND_H_W2_LSX - .BLEND_H_LSX_JRTABLE 5456 .hword .BLEND_H_END_LSX - .BLEND_H_LSX_JRTABLE //Instructions must be 4-byte aligned 5457 5458.BLEND_H_W2_LSX: 5459 vldrepl.h vr20, t8, 0 5460 vld vr0, a0, 0 5461 vld vr1, a2, 0 5462 5463 vilvl.b vr0, vr1, vr0 5464 vdp2.h.bu vr1, vr0, vr20 5465 vssrarni.bu.h vr1, vr1, 6 5466 5467 vstelm.h vr1, a0, 0, 0 5468 addi.d t8, t8, 2 5469 add.d a0, a0, a1 5470 addi.d a2, a2, 2 5471 5472 blt t8, a4, .BLEND_H_W2_LSX 5473 b .BLEND_H_END_LSX 5474 5475.BLEND_H_W4_LSX: 5476 vldrepl.h vr20, t8, 0 5477 vld vr0, a0, 0 5478 vld vr1, a2, 0 5479 5480 vilvl.b vr0, vr1, vr0 5481 vdp2.h.bu vr1, vr0, vr20 5482 vssrarni.bu.h vr1, vr1, 6 5483 5484 vstelm.w vr1, a0, 0, 0 5485 addi.d t8, t8, 2 5486 add.d a0, a0, a1 5487 addi.d a2, a2, 4 5488 5489 blt t8, a4, .BLEND_H_W4_LSX 5490 b .BLEND_H_END_LSX 5491 5492.BLEND_H_W8_LSX: 5493 vldrepl.h vr20, t8, 0 5494 vld vr0, a0, 0 5495 vld vr1, a2, 0 5496 5497 vilvl.b vr0, vr1, vr0 5498 vdp2.h.bu vr1, vr0, vr20 5499 vssrarni.bu.h vr1, vr1, 6 5500 5501 vstelm.d vr1, a0, 0, 0 5502 addi.d t8, t8, 2 5503 add.d a0, a0, a1 5504 
addi.d a2, a2, 8 5505 5506 blt t8, a4, .BLEND_H_W8_LSX 5507 b .BLEND_H_END_LSX 5508 5509.BLEND_H_W16_LSX: 5510 vldrepl.h vr20, t8, 0 5511 vld vr0, a0, 0 5512 vld vr1, a2, 0 5513 5514 vilvl.b vr2, vr1, vr0 5515 vilvh.b vr3, vr1, vr0 5516 vmulwev.h.bu vr4, vr2, vr20 5517 vmulwev.h.bu vr5, vr3, vr20 5518 vmaddwod.h.bu vr4, vr2, vr20 5519 vmaddwod.h.bu vr5, vr3, vr20 5520 vssrarni.bu.h vr5, vr4, 6 5521 5522 vst vr5, a0, 0 5523 addi.d t8, t8, 2 5524 add.d a0, a0, a1 5525 addi.d a2, a2, 16 5526 5527 blt t8, a4, .BLEND_H_W16_LSX 5528 b .BLEND_H_END_LSX 5529 5530.BLEND_H_W32_LSX: 5531 vldrepl.h vr20, t8, 0 5532 5533 vld vr0, a0, 0 5534 vld vr1, a0, 16 5535 vld vr2, a2, 0 5536 vld vr3, a2, 16 5537 5538 vilvl.b vr4, vr2, vr0 5539 vilvh.b vr5, vr2, vr0 5540 vilvl.b vr6, vr3, vr1 5541 vilvh.b vr3, vr3, vr1 5542 vmulwev.h.bu vr7, vr4, vr20 5543 vmulwev.h.bu vr8, vr5, vr20 5544 vmulwev.h.bu vr9, vr6, vr20 5545 vmulwev.h.bu vr0, vr3, vr20 5546 vmaddwod.h.bu vr7, vr4, vr20 5547 vmaddwod.h.bu vr8, vr5, vr20 5548 vmaddwod.h.bu vr9, vr6, vr20 5549 vmaddwod.h.bu vr0, vr3, vr20 5550 vssrarni.bu.h vr8, vr7, 6 5551 vssrarni.bu.h vr0, vr9, 6 5552 5553 vst vr8, a0, 0 5554 vst vr0, a0, 16 5555 addi.d t8, t8, 2 5556 add.d a0, a0, a1 5557 addi.d a2, a2, 32 5558 5559 blt t8, a4, .BLEND_H_W32_LSX 5560 b .BLEND_H_END_LSX 5561 5562.BLEND_H_W64_LSX: 5563 vldrepl.h vr20, t8, 0 5564 5565 vld vr0, a0, 0 5566 vld vr1, a0, 16 5567 vld vr2, a0, 32 5568 vld vr3, a0, 48 5569 vld vr4, a2, 0 5570 vld vr5, a2, 16 5571 vld vr6, a2, 32 5572 vld vr7, a2, 48 5573 5574 vilvl.b vr8, vr4, vr0 5575 vilvh.b vr9, vr4, vr0 5576 vilvl.b vr10, vr5, vr1 5577 vilvh.b vr11, vr5, vr1 5578 vilvl.b vr12, vr6, vr2 5579 vilvh.b vr13, vr6, vr2 5580 vilvl.b vr14, vr7, vr3 5581 vilvh.b vr15, vr7, vr3 5582 vmulwev.h.bu vr0, vr8, vr20 5583 vmulwev.h.bu vr1, vr9, vr20 5584 vmulwev.h.bu vr2, vr10, vr20 5585 vmulwev.h.bu vr3, vr11, vr20 5586 vmulwev.h.bu vr4, vr12, vr20 5587 vmulwev.h.bu vr5, vr13, vr20 5588 vmulwev.h.bu vr6, vr14, vr20 5589 vmulwev.h.bu vr7, vr15, vr20 5590 5591 vmaddwod.h.bu vr0, vr8, vr20 5592 vmaddwod.h.bu vr1, vr9, vr20 5593 vmaddwod.h.bu vr2, vr10, vr20 5594 vmaddwod.h.bu vr3, vr11, vr20 5595 vmaddwod.h.bu vr4, vr12, vr20 5596 vmaddwod.h.bu vr5, vr13, vr20 5597 vmaddwod.h.bu vr6, vr14, vr20 5598 vmaddwod.h.bu vr7, vr15, vr20 5599 5600 vssrarni.bu.h vr1, vr0, 6 5601 vssrarni.bu.h vr3, vr2, 6 5602 vssrarni.bu.h vr5, vr4, 6 5603 vssrarni.bu.h vr7, vr6, 6 5604 5605 vst vr1, a0, 0 5606 vst vr3, a0, 16 5607 vst vr5, a0, 32 5608 vst vr7, a0, 48 5609 addi.d t8, t8, 2 5610 add.d a0, a0, a1 5611 addi.d a2, a2, 64 5612 5613 blt t8, a4, .BLEND_H_W64_LSX 5614 b .BLEND_H_END_LSX 5615 5616.BLEND_H_W128_LSX: 5617 vldrepl.h vr20, t8, 0 5618 5619 vld vr0, a0, 0 5620 vld vr1, a0, 16 5621 vld vr2, a0, 32 5622 vld vr3, a0, 48 5623 vld vr4, a2, 0 5624 vld vr5, a2, 16 5625 vld vr6, a2, 32 5626 vld vr7, a2, 48 5627 5628 vilvl.b vr8, vr4, vr0 5629 vilvh.b vr9, vr4, vr0 5630 vilvl.b vr10, vr5, vr1 5631 vilvh.b vr11, vr5, vr1 5632 vilvl.b vr12, vr6, vr2 5633 vilvh.b vr13, vr6, vr2 5634 vilvl.b vr14, vr7, vr3 5635 vilvh.b vr15, vr7, vr3 5636 vmulwev.h.bu vr0, vr8, vr20 5637 vmulwev.h.bu vr1, vr9, vr20 5638 vmulwev.h.bu vr2, vr10, vr20 5639 vmulwev.h.bu vr3, vr11, vr20 5640 vmulwev.h.bu vr4, vr12, vr20 5641 vmulwev.h.bu vr5, vr13, vr20 5642 vmulwev.h.bu vr6, vr14, vr20 5643 vmulwev.h.bu vr7, vr15, vr20 5644 5645 vmaddwod.h.bu vr0, vr8, vr20 5646 vmaddwod.h.bu vr1, vr9, vr20 5647 vmaddwod.h.bu vr2, vr10, vr20 5648 vmaddwod.h.bu vr3, vr11, vr20 5649 vmaddwod.h.bu vr4, 
vr12, vr20 5650 vmaddwod.h.bu vr5, vr13, vr20 5651 vmaddwod.h.bu vr6, vr14, vr20 5652 vmaddwod.h.bu vr7, vr15, vr20 5653 5654 vssrarni.bu.h vr1, vr0, 6 5655 vssrarni.bu.h vr3, vr2, 6 5656 vssrarni.bu.h vr5, vr4, 6 5657 vssrarni.bu.h vr7, vr6, 6 5658 5659 vst vr1, a0, 0 5660 vst vr3, a0, 16 5661 vst vr5, a0, 32 5662 vst vr7, a0, 48 5663 5664 /* second */ 5665 vld vr0, a0, 64 5666 vld vr1, a0, 80 5667 vld vr2, a0, 96 5668 vld vr3, a0, 112 5669 vld vr4, a2, 64 5670 vld vr5, a2, 80 5671 vld vr6, a2, 96 5672 vld vr7, a2, 112 5673 5674 vilvl.b vr8, vr4, vr0 5675 vilvh.b vr9, vr4, vr0 5676 vilvl.b vr10, vr5, vr1 5677 vilvh.b vr11, vr5, vr1 5678 vilvl.b vr12, vr6, vr2 5679 vilvh.b vr13, vr6, vr2 5680 vilvl.b vr14, vr7, vr3 5681 vilvh.b vr15, vr7, vr3 5682 vmulwev.h.bu vr0, vr8, vr20 5683 vmulwev.h.bu vr1, vr9, vr20 5684 vmulwev.h.bu vr2, vr10, vr20 5685 vmulwev.h.bu vr3, vr11, vr20 5686 vmulwev.h.bu vr4, vr12, vr20 5687 vmulwev.h.bu vr5, vr13, vr20 5688 vmulwev.h.bu vr6, vr14, vr20 5689 vmulwev.h.bu vr7, vr15, vr20 5690 5691 vmaddwod.h.bu vr0, vr8, vr20 5692 vmaddwod.h.bu vr1, vr9, vr20 5693 vmaddwod.h.bu vr2, vr10, vr20 5694 vmaddwod.h.bu vr3, vr11, vr20 5695 vmaddwod.h.bu vr4, vr12, vr20 5696 vmaddwod.h.bu vr5, vr13, vr20 5697 vmaddwod.h.bu vr6, vr14, vr20 5698 vmaddwod.h.bu vr7, vr15, vr20 5699 5700 vssrarni.bu.h vr1, vr0, 6 5701 vssrarni.bu.h vr3, vr2, 6 5702 vssrarni.bu.h vr5, vr4, 6 5703 vssrarni.bu.h vr7, vr6, 6 5704 5705 vst vr1, a0, 64 5706 vst vr3, a0, 80 5707 vst vr5, a0, 96 5708 vst vr7, a0, 112 5709 5710 addi.d t8, t8, 2 5711 add.d a0, a0, a1 5712 addi.d a2, a2, 128 5713 5714 blt t8, a4, .BLEND_H_W128_LSX 5715 b .BLEND_H_END_LSX 5716 5717.BLEND_H_END_LSX: 5718 5719endfunc 5720 5721/* 5722 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 5723 const int w, int h) 5724 */ 5725function blend_h_8bpc_lasx 5726 la.local t8, obmc_masks_la 5727 alsl.d t8, a4, t8, 1 5728 srli.d t0, a4, 1 5729 srli.d t1, a4, 2 5730 add.d a4, t0, t1 // h = (h * 3) >> 2; 5731 slli.d a4, a4, 1 5732 add.d a4, a4, t8 5733 5734 clz.w t0, a3 5735 li.w t1, 24 5736 sub.w t0, t0, t1 5737 la.local t1, .BLEND_H_LASX_JRTABLE 5738 alsl.d t0, t0, t1, 1 5739 ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE 5740 add.d t1, t1, t2 // Get absolute address 5741 jirl $r0, t1, 0 5742 5743 .align 3 5744.BLEND_H_LASX_JRTABLE: 5745 .hword .BLEND_H_W128_LASX - .BLEND_H_LASX_JRTABLE 5746 .hword .BLEND_H_W64_LASX - .BLEND_H_LASX_JRTABLE 5747 .hword .BLEND_H_W32_LASX - .BLEND_H_LASX_JRTABLE 5748 .hword .BLEND_H_W16_LASX - .BLEND_H_LASX_JRTABLE 5749 .hword .BLEND_H_W8_LASX - .BLEND_H_LASX_JRTABLE 5750 .hword .BLEND_H_W4_LASX - .BLEND_H_LASX_JRTABLE 5751 .hword .BLEND_H_W2_LASX - .BLEND_H_LASX_JRTABLE 5752 .hword .BLEND_H_END_LASX - .BLEND_H_LASX_JRTABLE //Instructions must be 4-byte aligned 5753 5754.BLEND_H_W2_LASX: 5755 vldrepl.h vr20, t8, 0 5756 vld vr0, a0, 0 5757 vld vr1, a2, 0 5758 5759 vilvl.b vr0, vr1, vr0 5760 vdp2.h.bu vr1, vr0, vr20 5761 vssrarni.bu.h vr1, vr1, 6 5762 5763 vstelm.h vr1, a0, 0, 0 5764 addi.d t8, t8, 2 5765 add.d a0, a0, a1 5766 addi.d a2, a2, 2 5767 5768 blt t8, a4, .BLEND_H_W2_LASX 5769 b .BLEND_H_END_LASX 5770 5771.BLEND_H_W4_LASX: 5772 vldrepl.h vr20, t8, 0 5773 vld vr0, a0, 0 5774 vld vr1, a2, 0 5775 5776 vilvl.b vr0, vr1, vr0 5777 vdp2.h.bu vr1, vr0, vr20 5778 vssrarni.bu.h vr1, vr1, 6 5779 5780 vstelm.w vr1, a0, 0, 0 5781 addi.d t8, t8, 2 5782 add.d a0, a0, a1 5783 addi.d a2, a2, 4 5784 5785 blt t8, a4, .BLEND_H_W4_LASX 5786 b .BLEND_H_END_LASX 5787 
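/*
 * Every width case in blend_h (LSX and LASX alike) applies the same per-row
 * blend; a C sketch, with the (64 - m, m) byte pairs taken from
 * obmc_masks_la starting at offset 2 * h, and only the first (h * 3) >> 2
 * rows blended:
 *
 *   const uint8_t *p = &obmc_masks_la[2 * h];
 *   for (int y = 0; y < (h * 3) >> 2; y++, p += 2) {
 *       for (int x = 0; x < w; x++)
 *           dst[x] = (dst[x] * p[0] + tmp[x] * p[1] + 32) >> 6;
 *       dst += dst_stride;
 *       tmp += w;
 *   }
 */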
5788.BLEND_H_W8_LASX: 5789 vldrepl.h vr20, t8, 0 5790 vld vr0, a0, 0 5791 vld vr1, a2, 0 5792 5793 vilvl.b vr0, vr1, vr0 5794 vdp2.h.bu vr1, vr0, vr20 5795 vssrarni.bu.h vr1, vr1, 6 5796 5797 vstelm.d vr1, a0, 0, 0 5798 addi.d t8, t8, 2 5799 add.d a0, a0, a1 5800 addi.d a2, a2, 8 5801 5802 blt t8, a4, .BLEND_H_W8_LASX 5803 b .BLEND_H_END_LASX 5804 5805.BLEND_H_W16_LASX: 5806 vldrepl.h vr20, t8, 0 5807 vld vr0, a0, 0 5808 vld vr1, a2, 0 5809 5810 vilvl.b vr2, vr1, vr0 5811 vilvh.b vr3, vr1, vr0 5812 vmulwev.h.bu vr4, vr2, vr20 5813 vmulwev.h.bu vr5, vr3, vr20 5814 vmaddwod.h.bu vr4, vr2, vr20 5815 vmaddwod.h.bu vr5, vr3, vr20 5816 vssrarni.bu.h vr5, vr4, 6 5817 5818 vst vr5, a0, 0 5819 addi.d t8, t8, 2 5820 add.d a0, a0, a1 5821 addi.d a2, a2, 16 5822 5823 blt t8, a4, .BLEND_H_W16_LSX 5824 b .BLEND_H_END_LSX 5825 5826.BLEND_H_W32_LASX: 5827 xvldrepl.h xr20, t8, 0 5828 5829 xvld xr0, a0, 0 5830 xvld xr1, a2, 0 5831 5832 xvilvl.b xr2, xr1, xr0 5833 xvilvh.b xr3, xr1, xr0 5834 5835 xvmulwev.h.bu xr4, xr2, xr20 5836 xvmulwev.h.bu xr5, xr3, xr20 5837 xvmaddwod.h.bu xr4, xr2, xr20 5838 xvmaddwod.h.bu xr5, xr3, xr20 5839 xvssrarni.bu.h xr5, xr4, 6 5840 5841 xvst xr5, a0, 0 5842 addi.d t8, t8, 2 5843 add.d a0, a0, a1 5844 addi.d a2, a2, 32 5845 5846 blt t8, a4, .BLEND_H_W32_LASX 5847 b .BLEND_H_END_LASX 5848 5849.BLEND_H_W64_LASX: 5850 xvldrepl.h xr20, t8, 0 5851 5852 xvld xr0, a0, 0 5853 xvld xr1, a0, 32 5854 xvld xr2, a2, 0 5855 xvld xr3, a2, 32 5856 5857 xvilvl.b xr4, xr2, xr0 5858 xvilvh.b xr5, xr2, xr0 5859 xvilvl.b xr6, xr3, xr1 5860 xvilvh.b xr7, xr3, xr1 5861 5862 xvmulwev.h.bu xr0, xr4, xr20 5863 xvmulwev.h.bu xr1, xr5, xr20 5864 xvmulwev.h.bu xr2, xr6, xr20 5865 xvmulwev.h.bu xr3, xr7, xr20 5866 xvmaddwod.h.bu xr0, xr4, xr20 5867 xvmaddwod.h.bu xr1, xr5, xr20 5868 xvmaddwod.h.bu xr2, xr6, xr20 5869 xvmaddwod.h.bu xr3, xr7, xr20 5870 xvssrarni.bu.h xr1, xr0, 6 5871 xvssrarni.bu.h xr3, xr2, 6 5872 5873 xvst xr1, a0, 0 5874 xvst xr3, a0, 32 5875 addi.d t8, t8, 2 5876 add.d a0, a0, a1 5877 addi.d a2, a2, 64 5878 5879 blt t8, a4, .BLEND_H_W64_LASX 5880 b .BLEND_H_END_LASX 5881 5882.BLEND_H_W128_LASX: 5883 xvldrepl.h xr20, t8, 0 5884 5885 xvld xr0, a0, 0 5886 xvld xr1, a0, 32 5887 xvld xr2, a0, 64 5888 xvld xr3, a0, 96 5889 xvld xr4, a2, 0 5890 xvld xr5, a2, 32 5891 xvld xr6, a2, 64 5892 xvld xr7, a2, 96 5893 5894 xvilvl.b xr8, xr4, xr0 5895 xvilvh.b xr9, xr4, xr0 5896 xvilvl.b xr10, xr5, xr1 5897 xvilvh.b xr11, xr5, xr1 5898 xvilvl.b xr12, xr6, xr2 5899 xvilvh.b xr13, xr6, xr2 5900 xvilvl.b xr14, xr7, xr3 5901 xvilvh.b xr15, xr7, xr3 5902 5903 xvmulwev.h.bu xr0, xr8, xr20 5904 xvmulwev.h.bu xr1, xr9, xr20 5905 xvmulwev.h.bu xr2, xr10, xr20 5906 xvmulwev.h.bu xr3, xr11, xr20 5907 xvmulwev.h.bu xr4, xr12, xr20 5908 xvmulwev.h.bu xr5, xr13, xr20 5909 xvmulwev.h.bu xr6, xr14, xr20 5910 xvmulwev.h.bu xr7, xr15, xr20 5911 xvmaddwod.h.bu xr0, xr8, xr20 5912 xvmaddwod.h.bu xr1, xr9, xr20 5913 xvmaddwod.h.bu xr2, xr10, xr20 5914 xvmaddwod.h.bu xr3, xr11, xr20 5915 xvmaddwod.h.bu xr4, xr12, xr20 5916 xvmaddwod.h.bu xr5, xr13, xr20 5917 xvmaddwod.h.bu xr6, xr14, xr20 5918 xvmaddwod.h.bu xr7, xr15, xr20 5919 xvssrarni.bu.h xr1, xr0, 6 5920 xvssrarni.bu.h xr3, xr2, 6 5921 xvssrarni.bu.h xr5, xr4, 6 5922 xvssrarni.bu.h xr7, xr6, 6 5923 5924 xvst xr1, a0, 0 5925 xvst xr3, a0, 32 5926 xvst xr5, a0, 64 5927 xvst xr7, a0, 96 5928 addi.d t8, t8, 2 5929 add.d a0, a0, a1 5930 addi.d a2, a2, 128 5931 5932 blt t8, a4, .BLEND_H_W128_LASX 5933 b .BLEND_H_END_LASX 5934 5935.BLEND_H_END_LASX: 5936 5937endfunc 5938 
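/*
 * C sketches of the two row helpers expanded below for emu_edge
 * (illustrative only; the asm versions chop the length into 16/8/4/1-byte
 * chunks and expect a1/a2/a3 to hold the constants 16/8/4):
 *
 *   static void pixel_copy(uint8_t *dst, const uint8_t *src, int n) {
 *       while (n--) *dst++ = *src++;
 *   }
 *   static void pixel_set(uint8_t *dst, const uint8_t v, int n) {
 *       while (n--) *dst++ = v;   // v is pre-splatted into a vector register
 *   }
 */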
5939/* 5940 * a1=16 | a2=8 | a3=4 5941 * temp reg: a4 5942 */ 5943.macro PIXEL_COPY_LSX _dst, _src, _size 5944 blt \_size, a1, 8f 594516: 5946 vld vr0, \_src, 0 5947 vst vr0, \_dst, 0 5948 addi.d \_size, \_size, -16 5949 addi.d \_dst, \_dst, 16 5950 addi.d \_src, \_src, 16 5951 blt a1, \_size, 16b 59528: 5953 blt \_size, a2, 14f 5954 ld.d a4, \_src, 0 5955 st.d a4, \_dst, 0 5956 addi.d \_size, \_size, -8 5957 addi.d \_dst, \_dst, 8 5958 addi.d \_src, \_src, 8 595914: 5960 blt \_size, a3, 11f 5961 ld.w a4, \_src, 0 5962 st.w a4, \_dst, 0 5963 addi.d \_size, \_size, -4 5964 addi.d \_dst, \_dst, 4 5965 addi.d \_src, \_src, 4 596611: 5967 beqz \_size, 110f 5968111: 5969 ld.b a4, \_src, 0 5970 st.b a4, \_dst, 0 5971 addi.d \_size, \_size, -1 5972 addi.d \_dst, \_dst, 1 5973 addi.d \_src, \_src, 1 5974 bnez \_size, 111b 5975110: 5976.endm 5977 5978/* 5979 * a1=16 | a2=8 | a3=4 5980 */ 5981.macro PIXEL_SET_LSX _dst, _vsrc, _size 5982 blt \_size, a1, 8f 598316: 5984 vst \_vsrc, \_dst, 0 5985 addi.d \_size, \_size, -16 5986 addi.d \_dst, \_dst, 16 5987 blt a1, \_size, 16b 59888: 5989 blt \_size, a2, 14f 5990 vstelm.d \_vsrc, \_dst, 0, 0 5991 addi.d \_size, \_size, -8 5992 addi.d \_dst, \_dst, 8 599314: 5994 blt \_size, a3, 11f 5995 vstelm.w \_vsrc, \_dst, 0, 0 5996 addi.d \_size, \_size, -4 5997 addi.d \_dst, \_dst, 4 599811: 5999 beqz \_size, 110f 6000111: 6001 vstelm.b \_vsrc, \_dst, 0, 0 6002 addi.d \_size, \_size, -1 6003 addi.d \_dst, \_dst, 1 6004 bnez \_size, 111b 6005110: 6006.endm 6007 6008/* 6009 * temp reg: a4 a5 t2 t3 vr0 6010 */ 6011.macro DEGE_LOOP need_left, need_right 60120: 6013 addi.d t2, t6, 0 // dst 6014 addi.d t3, t7, 0 // src 6015.if \need_left 6016 vldrepl.b vr0, t3, 0 6017 addi.d a5, t0, 0 6018 PIXEL_SET_LSX t2, vr0, a5 6019.endif 6020 6021 addi.d a5, t4, 0 6022 PIXEL_COPY_LSX t2, t3, a5 6023 6024.if \need_right 6025 vldrepl.b vr0, t3, -1 6026 addi.d a5, t1, 0 6027 PIXEL_SET_LSX t2, vr0, a5 6028.endif 6029 6030 addi.d t5, t5, -1 6031 add.d t7, t7, t8 6032 add.d t6, t6, a7 6033 bnez t5, 0b 6034.endm 6035 6036/* 6037 * static void emu_edge_c(const intptr_t bw, const intptr_t bh, 6038 * const intptr_t iw, const intptr_t ih, 6039 * const intptr_t x, const intptr_t y, 6040 * pixel *dst, const ptrdiff_t dst_stride, 6041 * const pixel *ref, const ptrdiff_t ref_stride) 6042 */ 6043function emu_edge_8bpc_lsx 6044 vxor.v vr23, vr23, vr23 // zero 6045 addi.d t0, a3, -1 // ih - 1 6046 addi.d t1, a2, -1 // iw - 1 6047 vreplgr2vr.w vr22, t0 6048 vinsgr2vr.w vr22, t1, 1 6049 vreplgr2vr.w vr0, a5 6050 vinsgr2vr.w vr0, a4, 1 // [0] - h | [1] - w 6051 6052 vclip.w vr2, vr0, vr23, vr22 6053 vpickve2gr.w t0, vr2, 0 6054 ld.d t2, sp, 0 6055 ld.d t8, sp, 8 // ref_stride 6056 mul.w t0, t0, t8 6057 vpickve2gr.w t1, vr2, 1 6058 add.d t2, t2, t1 6059 add.d t7, t0, t2 // ref 6060 6061 addi.d t0, a0, -1 // bw - 1 6062 addi.d t1, a1, -1 // bh - 1 6063 vreplgr2vr.w vr21, t0 6064 vreplgr2vr.w vr22, t1 6065 vilvl.d vr21, vr22, vr21 6066 sub.d t2, zero, a4 // -x 6067 add.d t3, a0, a4 6068 sub.d t3, t3, a2 // x + bw - iw 6069 sub.d t4, zero, a5 // -y 6070 add.d t5, a1, a5 6071 sub.d t5, t5, a3 // y + bh - ih 6072 vreplgr2vr.w vr0, t2 6073 vinsgr2vr.w vr0, t3, 1 6074 vinsgr2vr.w vr0, t4, 2 6075 vinsgr2vr.w vr0, t5, 3 6076 vclip.w vr2, vr0, vr23, vr21 6077 vpickve2gr.w t0, vr2, 0 // left_ext 6078 vpickve2gr.w t1, vr2, 1 // right_ext 6079 vpickve2gr.w t2, vr2, 2 // top_ext 6080 vpickve2gr.w t3, vr2, 3 // bottom_ext 6081 6082 mul.w t6, t2, a7 6083 add.d t4, t0, t1 6084 add.d t5, t2, t3 6085 sub.d t4, a0, t4 // 
center_w 6086 sub.d t5, a1, t5 // center_h 6087 6088 addi.d a1, zero, 16 6089 addi.d a2, zero, 8 6090 addi.d a3, zero, 4 6091 add.d t6, t6, a6 // blk 6092 6093 beqz t0, 2f 6094 // need_left 6095 beqz t1, 3f 6096 // need_left + need_right 6097 DEGE_LOOP 1, 1 6098 b 5f 6099 61002: 6101 // !need_left 6102 beqz t1, 4f 6103 // !need_left + need_right 6104 DEGE_LOOP 0, 1 6105 b 5f 6106 61073: 6108 // need_left + !need_right 6109 DEGE_LOOP 1, 0 6110 b 5f 6111 61124: 6113 // !need_left + !need_right 6114 DEGE_LOOP 0, 0 6115 61165: 6117 vpickve2gr.w t2, vr2, 2 // top_ext 6118 vpickve2gr.w t3, vr2, 3 // bottom_ext 6119 sub.d t7, a7, a0 // dst_stride - bw 6120 mul.w t8, t2, a7 6121 6122 beqz t3, 2f 6123 // need_bottom 6124 sub.d t0, t6, a7 // &dst[-PXSTRIDE(dst_stride)] 61251: 6126 addi.d t1, t0, 0 6127 addi.d a5, a0, 0 6128 PIXEL_COPY_LSX t6, t1, a5 6129 add.d t6, t6, t7 6130 addi.d t3, t3, -1 6131 bnez t3, 1b 61322: 6133 beqz t2, 3f 6134 // need_top 6135 add.d t8, t8, a6 // blk 61361: 6137 addi.d t1, t8, 0 6138 addi.d a5, a0, 0 6139 PIXEL_COPY_LSX a6, t1, a5 6140 add.d a6, a6, t7 6141 addi.d t2, t2, -1 6142 bnez t2, 1b 61433: 6144 6145endfunc 6146