1/* 2 * Copyright © 2024, VideoLAN and dav1d authors 3 * Copyright © 2024, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/loongarch/loongson_asm.S"

// Compute the DC prediction value for an 8bpc block.
// In:   \topleft = pointer to the topleft pixel (top row at +1,
//                  left column at -1 growing downwards), \width, \height
// Out:  t0 = dc value (consumed by ipred_splat_dc / ipred_cfl_pred)
// Clobbers: t1-t4, vr0.  Width/height are AV1 block sides (powers of two),
// so the 16-wide loop and the &8 / &4 tails are mutually exclusive.
.macro ipred_dc_gen topleft, width, height
    add.d         t0, \width, \height    // rounding bias = (w + h) >> 1
    srai.d        t0, t0, 1
    addi.d        t3, \topleft, 1        // top row starts at topleft + 1

    or            t1, zero, zero         // byte index into the top row
    srai.d        t2, \width, 4          // number of 16-byte groups
    beqz          t2, 2f

1:  // sum top row, 16 pixels at a time
    vldx          vr0, t3, t1
    vhaddw.hu.bu  vr0, vr0, vr0          // widening pairwise adds collapse
    vhaddw.wu.hu  vr0, vr0, vr0          // 16 u8 lanes into one u64 sum
    vhaddw.du.wu  vr0, vr0, vr0
    vhaddw.qu.du  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4

    addi.d        t1, t1, 16
    addi.d        t2, t2, -1
    bnez          t2, 1b
    b             4f

2:  // width & 8
    andi          t2, \width, 8
    beqz          t2, 3f

    vxor.v        vr0, vr0, vr0          // clear high lanes before partial load
    fldx.d        f0, t3, t1

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4
    addi.d        t1, t1, 8
    b             4f

3:  // width & 4
    andi          t2, \width, 4
    beqz          t2, 4f

    vxor.v        vr0, vr0, vr0
    fldx.s        f0, t3, t1

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0

    vpickve2gr.wu t4, vr0, 0
    add.d         t0, t0, t4
    addi.d        t1, t1, 4

4:  // now accumulate the left column, walking downwards from topleft
    addi.d        t3, \topleft, 0
    srai.d        t2, \height, 4         // number of 16-byte groups
    beqz          t2, 8f

7:  // height / 16
    addi.d        t3, t3, -16
    vld           vr0, t3, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0
    vhaddw.qu.du  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4

    addi.d        t2, t2, -1
    bnez          t2, 7b
    b             10f

8:  // height & 8
    andi          t2, \height, 8
    beqz          t2, 9f

    addi.d        t3, t3, -8
    vxor.v        vr0, vr0, vr0
    fld.d         f0, t3, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4
    b             10f

9:  // height & 4
    andi          t2, \height, 4
    beqz          t2, 10f

    addi.d        t3, t3, -4
    vxor.v        vr0, vr0, vr0
    fld.s         f0, t3, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0

    vpickve2gr.wu t4, vr0, 0
    add.d         t0, t0, t4

10: // dc = sum >> log2(w + h); exact when w == h (w + h is a power of two)
    add.d         t1, \width, \height
    ctz.w         t1, t1
    sra.w         t0, t0, t1

    // w != h: the shift above over-divided; correct with a fixed-point
    // multiply — 0x5556 ~= 2/3 and 0x3334 ~= 2/5 in Q16, selected by
    // whether the long side is >2x the short side.
    beq           \width, \height, 16f
    add.d         t2, \height, \height
    add.d         t3, \width, \width
    slt           t2, t2, \width         // 2h < w ?
    slt           t3, t3, \height        // 2w < h ?
    or            t2, t2, t3
    li.w          t3, 0x3334
    maskeqz       t1, t3, t2
    li.w          t3, 0x5556
    masknez       t2, t3, t2
    or            t1, t1, t2
    mul.w         t0, t0, t1
    srai.w        t0, t0, 16

16:
.endm

// Fill a \width x \height block at \dst with the single byte value \dc.
// In:   \dst, \stride, \width, \height (register args), \dc = byte value
// Clobbers: t1, t2, t4, t5, vr0; \height is consumed (counts down to 0).
.macro ipred_splat_dc dst, stride, width, height, dc
    li.w          t1, 4
    blt           t1, \width, 2f

    // width <= 4: broadcast dc into one 32-bit word, store per row
    li.w          t1, 0x01010101
    mulw.d.wu     t1, \dc, t1
    beqz          \height, 7f
    or            t2, \dst, \dst
1:  // width <= 4
    st.w          t1, t2, 0
    add.d         t2, t2, \stride
    addi.d        \height, \height, -1
    bnez          \height, 1b
    b             7f

2:  // width > 4: broadcast dc into a full 128-bit vector
    li.d          t1, 0x0101010101010101
    mul.d         t1, \dc, t1
    vreplgr2vr.d  vr0, t1
    or            t4, \dst, \dst
    beqz          \height, 7f

3:  // per-row store, sized by the (power-of-two) width
    andi          t5, \width, 64
    beqz          t5, 4f
    vst           vr0, t4, 0
    vst           vr0, t4, 16
    vst           vr0, t4, 32
    vst           vr0, t4, 48
    b             6f

4:
    andi          t5, \width, 32
    beqz          t5, 41f
    vst           vr0, t4, 0
    vst           vr0, t4, 16
    b             6f

41:
    andi          t5, \width, 16
    beqz          t5, 5f
    vst           vr0, t4, 0
    b             6f

5:  // remaining case on this path is width == 8
    fst.d         f0, t4, 0

6:
    add.d         t4, t4, \stride
    addi.d        \height, \height, -1
    bnez          \height, 3b

7:
.endm

// DC from the top row only: t0 = (sum(top[0..w-1]) + w/2) >> log2(w).
// Clobbers: t1-t3, vr0.
.macro ipred_dc_gen_top topleft, width
    srai.d        t0, \width, 1          // rounding bias = w >> 1
    addi.d        t1, \topleft, 1

    srai.d        t2, \width, 4
    beqz          t2, 2f
1:  // width / 16
    vld           vr0, t1, 0
    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0
    vhaddw.qu.du  vr0, vr0, vr0

    vpickve2gr.du t3, vr0, 0
    add.d         t0, t0, t3

    addi.d        t1, t1, 16
    addi.d        t2, t2, -1
    bnez          t2, 1b
    b             4f

2:  // width & 8
    andi          t2, \width, 8
    beqz          t2, 3f

    vxor.v        vr0, vr0, vr0
    fld.d         f0, t1, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0

    vpickve2gr.du t2, vr0, 0
    add.d         t0, t0, t2

    addi.d        t1, t1, 8
    b             4f

3:  // width & 4
    andi          t2, \width, 4
    beqz          t2, 4f

    vxor.v        vr0, vr0, vr0
    fld.s         f0, t1, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0

    // upper lanes were zeroed above, so a 64-bit extract is still the sum
    vpickve2gr.du t2, vr0, 0
    add.d         t0, t0, t2
    addi.d        t1, t1, 4

4:
    ctz.w         t1, \width             // log2(width): width is a power of two
    sra.w         t0, t0, t1
.endm

// DC from the left column only: t0 = (sum(left[0..h-1]) + h/2) >> log2(h).
// NOTE: \topleft is decremented in place while walking the column.
// Clobbers: t1, t2, t4, vr0.
.macro ipred_dc_gen_left topleft, height
    srai.d        t0, \height, 1         // rounding bias = h >> 1
    srai.d        t2, \height, 4         // number of 16-byte groups
    beqz          t2, 8f

7:  // height / 16
    addi.d        \topleft, \topleft, -16
    vld           vr0, \topleft, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0
    vhaddw.qu.du  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4

    addi.d        t2, t2, -1
    bnez          t2, 7b
    b             10f

8:  // height & 8
    andi          t2, \height, 8
    beqz          t2, 9f

    addi.d        \topleft, \topleft, -8
    vxor.v        vr0, vr0, vr0
    fld.d         f0, \topleft, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0
    vhaddw.du.wu  vr0, vr0, vr0

    vpickve2gr.du t4, vr0, 0
    add.d         t0, t0, t4
    b             10f

9:  // height & 4
    andi          t2, \height, 4
    beqz          t2, 10f

    addi.d        \topleft, \topleft, -4
    vxor.v        vr0, vr0, vr0
    fld.s         f0, \topleft, 0

    vhaddw.hu.bu  vr0, vr0, vr0
    vhaddw.wu.hu  vr0, vr0, vr0

    vpickve2gr.wu t4, vr0, 0
    add.d         t0, t0, t4

10:
    ctz.w         t1, \height            // log2(height)
    sra.w         t0, t0, t1
.endm

// void ipred_dc_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height
//                   HIGHBD_DECL_SUFFIX)
// a0=dst a1=stride a2=topleft a3=width a4=height
function ipred_dc_8bpc_lsx
    ipred_dc_gen    a2, a3, a4           // t0 = dc from top row + left column
    ipred_splat_dc  a0, a1, a3, a4, t0

endfunc

// void ipred_dc_128_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
function ipred_dc_128_8bpc_lsx
    li.w            t0, 128              // neutral dc: half of the 8bpc range
    ipred_splat_dc  a0, a1, a3, a4, t0

endfunc

// void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
//                     const pixel *const topleft,
//                     const int width, const int height, const int a,
//                     const int max_width, const int max_height
//                     HIGHBD_DECL_SUFFIX)
function ipred_dc_top_8bpc_lsx
    ipred_dc_gen_top a2, a3              // t0 = dc from the top row only
    ipred_splat_dc   a0, a1, a3, a4, t0

endfunc

// void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const topleft,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
function ipred_dc_left_8bpc_lsx
    ipred_dc_gen_left a2, a4             // t0 = dc from the left column only
    ipred_splat_dc    a0, a1, a3, a4, t0

endfunc

// Fill one row of \width pixels at \dst_ptr with the byte at \src_ptr.
// Clobbers a5, vr0.  Width is a power of two (4..64).
.macro pixel_set_8bpc dst_ptr, src_ptr, width
    vldrepl.b     vr0, \src_ptr, 0       // broadcast source byte to all lanes
1:
    andi          a5, \width, 64
    beqz          a5, 2f

    vst           vr0, \dst_ptr, 0
    vst           vr0, \dst_ptr, 16
    vst           vr0, \dst_ptr, 32
    vst           vr0, \dst_ptr, 48
    b             6f
2:
    andi          a5, \width, 32
    beqz          a5, 3f

    vst           vr0, \dst_ptr, 0
    vst           vr0, \dst_ptr, 16
    b             6f
3:
    andi          a5, \width, 16
    beqz          a5, 4f

    vst           vr0, \dst_ptr, 0
    b             6f
4:
    andi          a5, \width, 8
    beqz          a5, 5f

    fst.d         f0, \dst_ptr, 0
    b             6f
5:
    andi          a5, \width, 4
    beqz          a5, 6f

    fst.s         f0, \dst_ptr, 0
6:
.endm

// void ipred_h_c(pixel *dst, const ptrdiff_t stride,
//                const pixel *const topleft,
//                const int width, const int height, const int a,
//                const int max_width, const int max_height
//                HIGHBD_DECL_SUFFIX)
// Horizontal prediction: each row is the left-neighbour pixel repeated.
function ipred_h_8bpc_lsx
    beqz          a4, .IPRED_H_END
.IPRED_H_LOOP:
    addi.d        a2, a2, -1             // walk the left column downwards

    pixel_set_8bpc a0, a2, a3

    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_H_LOOP

.IPRED_H_END:
endfunc

// Copy one row of \width pixels from \src_ptr to \dst_ptr.
// Clobbers a5, vr0-vr3.  Width is a power of two (4..64).
.macro pixel_copy_8bpc dst_ptr, src_ptr, width
1:
    andi          a5, \width, 64
    beqz          a5, 2f

    vld           vr0, \src_ptr, 0
    vld           vr1, \src_ptr, 16
    vld           vr2, \src_ptr, 32
    vld           vr3, \src_ptr, 48

    vst           vr0, \dst_ptr, 0
    vst           vr1, \dst_ptr, 16
    vst           vr2, \dst_ptr, 32
    vst           vr3, \dst_ptr, 48

    b             6f
2:
    andi          a5, \width, 32
    beqz          a5, 3f

    vld           vr0, \src_ptr, 0
    vld           vr1, \src_ptr, 16

    vst           vr0, \dst_ptr, 0
    vst           vr1, \dst_ptr, 16

    b             6f
3:
    andi          a5, \width, 16
    beqz          a5, 4f

    vld           vr0, \src_ptr, 0
    vst           vr0, \dst_ptr, 0

    b             6f
4:
    andi          a5, \width, 8
    beqz          a5, 5f

    fld.d         f0, \src_ptr, 0
    fst.d         f0, \dst_ptr, 0

    b             6f
5:
    andi          a5, \width, 4
    beqz          a5, 6f

    fld.s         f0, \src_ptr, 0
    fst.s         f0, \dst_ptr, 0
6:
.endm

// void ipred_v_lsx(pixel *dst, const ptrdiff_t stride,
//                  const pixel *const topleft,
//                  const int width, const int height, const int a,
//                  const int max_width, const int max_height
//                  HIGHBD_DECL_SUFFIX)
// Vertical prediction: every row is a copy of the top-neighbour row.
function ipred_v_8bpc_lsx
    beqz          a4, .IPRED_V_END
    addi.d        a2, a2, 1              // top row starts at topleft + 1
.IPRED_V_LOOP:
    pixel_copy_8bpc a0, a2, a3

    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_V_LOOP

.IPRED_V_END:
endfunc

// void ipred_paeth_lsx(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const tl_ptr,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
// Paeth predictor: for each pixel pick left, top or topleft, whichever is
// closest to base = left + top - topleft.  Work is done on u16 lanes
// (vsllwil widens u8 -> u16), 8 output pixels per vector pass:
//   vr0 = topleft, vr1 = left (both broadcast), vr2/vr9 = top lo/hi halves,
//   vr5 = |topleft-left| (tdiff), vr4/vr10 = |topleft-top| (ldiff),
//   vr6/vr11 = |2*topleft - left - top| (tldiff).
function ipred_paeth_8bpc_lsx
    vldrepl.b     vr0, a2, 0             // topleft
    vsllwil.hu.bu vr0, vr0, 0
    or            a6, a2, a2             // left-column walker
    addi.d        a7, a2, 1              // top row

.IPRED_PAETH_H_LOOP:
    addi.d        a6, a6, -1
    vldrepl.b     vr1, a6, 0             // left
    vsllwil.hu.bu vr1, vr1, 0

.IPRED_PAETH_W_LOOP64:
    andi          a5, a3, 64
    beqz          a5, .IPRED_PAETH_W_LOOP32

    // pixels 0..15
    vld           vr2, a7, 0             // top
    vpermi.w      vr9, vr2, 0x0e         // high 8 bytes into low half
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3     // top if tdiff <= tldiff else topleft
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3     // left wins if ldiff is smallest
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44        // merge the two 8-pixel halves

    vst           vr12, a0, 0

    // pixels 16..31
    vld           vr2, a7, 16            // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 16

    // pixels 32..47
    vld           vr2, a7, 32            // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 32

    // pixels 48..63
    vld           vr2, a7, 48            // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 48

    b             .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP32:
    andi          a5, a3, 32
    beqz          a5, .IPRED_PAETH_W_LOOP16

    vld           vr2, a7, 0             // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 0

    vld           vr2, a7, 16            // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 16

    b             .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP16:
    andi          a5, a3, 16
    beqz          a5, .IPRED_PAETH_W_LOOP8

    vld           vr2, a7, 0             // top
    vpermi.w      vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff
    vabsd.hu      vr10, vr0, vr9

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vadd.h        vr11, vr1, vr9
    vabsd.hu      vr6, vr3, vr6          // tldiff
    vabsd.hu      vr11, vr3, vr11        // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0

    vsle.hu       vr12, vr5, vr11
    vbitsel.v     vr7, vr0, vr9, vr12
    vsle.hu       vr12, vr10, vr5
    vsle.hu       vr8, vr10, vr11
    vand.v        vr12, vr12, vr8
    vbitsel.v     vr12, vr7, vr1, vr12
    vsrlni.b.h    vr12, vr12, 0

    vpermi.w      vr12, vr3, 0x44

    vst           vr12, a0, 0

    b             .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP8:
    andi          a5, a3, 8
    beqz          a5, .IPRED_PAETH_W_LOOP4

    fld.d         f2, a7, 0              // top
    vsllwil.hu.bu vr2, vr2, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vabsd.hu      vr6, vr3, vr6          // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0
    fst.d         f3, a0, 0

    b             .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP4:
    andi          a5, a3, 4
    beqz          a5, .IPRED_PAETH_W_LOOPEND

    fld.s         f2, a7, 0              // top
    vsllwil.hu.bu vr2, vr2, 0

    vabsd.hu      vr5, vr0, vr1          // tdiff
    vabsd.hu      vr4, vr0, vr2          // ldiff

    vadd.h        vr3, vr0, vr0
    vadd.h        vr6, vr1, vr2
    vabsd.hu      vr6, vr3, vr6          // tldiff

    vsle.hu       vr3, vr5, vr6
    vbitsel.v     vr7, vr0, vr2, vr3
    vsle.hu       vr3, vr4, vr5
    vsle.hu       vr8, vr4, vr6
    vand.v        vr3, vr3, vr8
    vbitsel.v     vr3, vr7, vr1, vr3
    vsrlni.b.h    vr3, vr3, 0
    fst.s         f3, a0, 0

    b             .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOPEND:
    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_PAETH_H_LOOP
endfunc

// Smooth-prediction weight table, indexed by block size; entry for size n
// starts at offset n (sizes are powers of two, so ranges don't collide).
const dav1d_sm_weights
        .byte 0, 0
        // bs = 2
        .byte 255, 128
        // bs = 4
        .byte 255, 149, 85, 64
        // bs = 8
        .byte 255, 197, 146, 105, 73, 50, 37, 32
        // bs = 16
        .byte 255, 225, 196, 170, 145, 123, 102, 84
        .byte 68, 54, 43, 33, 26, 20, 17, 16
        // bs = 32
        .byte 255, 240, 225, 210, 196, 182, 169, 157
        .byte 145, 133, 122, 111, 101, 92, 83, 74
        .byte 66, 59, 52, 45, 39, 34, 29, 25
        .byte 21, 17, 14, 12, 10, 9, 8, 8
        // bs = 64
        .byte 255, 248, 240, 233, 225, 218, 210, 203
        .byte 196, 189, 182, 176, 169, 163, 156, 150
        .byte 144, 138, 133, 127, 121, 116, 111, 106
        .byte 101, 96, 91, 86, 82, 77, 73, 69
        .byte 65, 61, 57, 54, 50, 47, 44, 41
        .byte 38, 35, 32, 29, 27, 25, 22, 20
        .byte 18, 16, 15, 13, 12, 10, 9, 8
        .byte 7, 6, 6, 5, 5, 4, 4, 4
endconst
// void ipred_smooth_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
// 2-D smooth prediction, 4 pixels per pass on u32 lanes:
//   pred = ((256-hor[x])*right + hor[x]*topleft[y]
//         + (256-ver[y])*bottom + ver[y]*top[x] + 256) >> 9
function ipred_smooth_8bpc_lsx
    la.local      a5, dav1d_sm_weights
    add.d         a6, a5, a3             // hor = weights + width
    add.d         a5, a5, a4             // ver = weights + height

    add.d         a7, a2, a3
    sub.d         t0, a2, a4

    vldrepl.b     vr0, a7, 0             // right  = topleft[width]
    vldrepl.b     vr1, t0, 0             // bottom = topleft[-height]

    vsllwil.hu.bu vr0, vr0, 0            // widen u8 -> u32
    vsllwil.wu.hu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.wu.hu vr1, vr1, 0

    li.w          t0, 256
    vreplgr2vr.w  vr6, t0

    addi.d        t0, a2, 1              // ptr topleft[x] (top row)
    addi.d        t3, a2, -1             // ptr topleft[y] (left column)

.IPRED_SMOOTH_H_LOOP:
    vldrepl.b     vr2, a5, 0             // ver[y]
    vldrepl.b     vr3, t3, 0             // topleft[y]

    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.wu.hu vr3, vr3, 0

    vsub.w        vr7, vr6, vr2          // 256 - ver[y]

    or            t1, zero, zero         // x offset
    srai.d        t2, a3, 2              // width / 4 iterations

.IPRED_SMOOTH_W_LOOP:
    fldx.s        f4, t0, t1             // topleft[x] (4 top pixels)
    fldx.s        f5, a6, t1             // hor[x] (4 weights)

    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.wu.hu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.wu.hu vr5, vr5, 0

    vsub.w        vr8, vr6, vr5          // 256 - hor[x]

    vmul.w        vr9, vr8, vr0
    vmadd.w       vr9, vr5, vr3
    vmadd.w       vr9, vr7, vr1
    vmadd.w       vr9, vr2, vr4          // pred (Q9)

    vadd.w        vr9, vr9, vr6          // + 256: round to nearest
    vsrlni.h.w    vr9, vr9, 9
    vsrlni.b.h    vr9, vr9, 0

    fstx.s        f9, a0, t1

    addi.d        t1, t1, 4
    addi.d        t2, t2, -1
    bnez          t2, .IPRED_SMOOTH_W_LOOP

.IPRED_SMOOTH_W_LOOP_END:
    addi.d        t3, t3, -1
    addi.d        a5, a5, 1
    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_SMOOTH_H_LOOP

endfunc

// void ipred_smooth_v_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
// Vertical-only smooth: pred = ((256-ver[y])*bottom + ver[y]*top[x] + 128) >> 8,
// on u16 lanes, 8 (or 4) pixels per pass.
function ipred_smooth_v_8bpc_lsx
    la.local      a5, dav1d_sm_weights
    add.d         a5, a5, a4             // ver = weights + height

    sub.d         t0, a2, a4
    vldrepl.b     vr0, t0, 0             // bottom = topleft[-height]
    vsllwil.hu.bu vr0, vr0, 0

    li.w          t0, 256
    vreplgr2vr.h  vr2, t0
    li.w          t0, 128                // rounding bias
    vreplgr2vr.h  vr3, t0

    addi.d        t0, a2, 1              // ptr topleft[x]

.IPRED_SMOOTH_V_H_LOOP:
    vldrepl.b     vr1, a5, 0             // ver[y]
    vsllwil.hu.bu vr1, vr1, 0
    vsub.h        vr5, vr2, vr1          // 256 - ver[y]

    or            t1, zero, zero         // x offset
    srai.d        t2, a3, 3              // width / 8 iterations
    beqz          t2, .IPRED_SMOOTH_V_W_LOOP4

.IPRED_SMOOTH_V_W_LOOP8:
    fldx.d        f4, t0, t1             // topleft[x]
    vsllwil.hu.bu vr4, vr4, 0

    vmul.h        vr6, vr5, vr0
    vmadd.h       vr6, vr1, vr4          // pred (Q8)
    vadd.h        vr6, vr6, vr3
    vsrlni.b.h    vr6, vr6, 8            // >>8 and narrow in one step

    fstx.d        f6, a0, t1

    addi.d        t1, t1, 8
    addi.d        t2, t2, -1
    bnez          t2, .IPRED_SMOOTH_V_W_LOOP8
    b             .IPRED_SMOOTH_V_W_LOOP_END

.IPRED_SMOOTH_V_W_LOOP4:
    fldx.s        f4, t0, t1             // topleft[x]
    vsllwil.hu.bu vr4, vr4, 0

    vmul.h        vr6, vr5, vr0
    vmadd.h       vr6, vr1, vr4          // pred (Q8)
    vadd.h        vr6, vr6, vr3
    vsrai.h       vr6, vr6, 8
    vsrlni.b.h    vr6, vr6, 0

    fstx.s        f6, a0, t1

    addi.d        t1, t1, 4

.IPRED_SMOOTH_V_W_LOOP_END:
    addi.d        a5, a5, 1
    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_SMOOTH_V_H_LOOP

endfunc

// void ipred_smooth_h_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
// Horizontal-only smooth:
//   pred = ((256-hor[x])*right + hor[x]*topleft[y] + 128) >> 8
function ipred_smooth_h_8bpc_lsx
    la.local      a5, dav1d_sm_weights
    add.d         a6, a5, a3             // hor = weights + width

    add.d         a7, a2, a3
    vldrepl.b     vr0, a7, 0             // right = topleft[width]
    vsllwil.hu.bu vr0, vr0, 0

    li.w          t0, 256
    vreplgr2vr.h  vr1, t0
    li.w          t0, 128                // rounding bias
    vreplgr2vr.h  vr2, t0

    addi.d        t3, a2, -1             // ptr topleft[y]

.IPRED_SMOOTH_H_H_LOOP:
    vldrepl.b     vr3, t3, 0             // topleft[y]
    vsllwil.hu.bu vr3, vr3, 0

    or            t1, zero, zero         // x offset
    srai.d        t2, a3, 3              // width / 8 iterations
    beqz          t2, .IPRED_SMOOTH_H_W_LOOP4

.IPRED_SMOOTH_H_W_LOOP8:
    fldx.d        f5, a6, t1             // hor[x]
    vsllwil.hu.bu vr5, vr5, 0
    vsub.h        vr4, vr1, vr5          // 256 - hor[x]

    vmul.h        vr6, vr4, vr0
    vmadd.h       vr6, vr5, vr3          // pred (Q8)
    vadd.h        vr6, vr6, vr2
    vsrlni.b.h    vr6, vr6, 8

    fstx.d        f6, a0, t1

    addi.d        t1, t1, 8
    addi.d        t2, t2, -1
    bnez          t2, .IPRED_SMOOTH_H_W_LOOP8
    b             .IPRED_SMOOTH_W_H_LOOP_END

.IPRED_SMOOTH_H_W_LOOP4:
    fldx.s        f5, a6, t1             // hor[x]
    vsllwil.hu.bu vr5, vr5, 0
    vsub.h        vr4, vr1, vr5          // 256 - hor[x]

    vmul.h        vr6, vr4, vr0
    vmadd.h       vr6, vr5, vr3          // pred (Q8)
    vadd.h        vr6, vr6, vr2
    vsrai.h       vr6, vr6, 8
    vsrlni.b.h    vr6, vr6, 0

    fstx.s        f6, a0, t1

    addi.d        t1, t1, 4

.IPRED_SMOOTH_W_H_LOOP_END:
    addi.d        t3, t3, -1
    add.d         a0, a0, a1
    addi.d        a4, a4, -1
    bnez          a4, .IPRED_SMOOTH_H_H_LOOP

endfunc

// void pal_pred_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const pal, const uint8_t *idx,
//                   const int w, const int h)
// Palette prediction: idx packs two 3-bit palette indices per byte
// (low nibble first); unpack with srli/andi + interleave, then vshuf.b
// translates indices through the 8-entry palette loaded from a2.
// a0=dst a1=stride a2=pal a3=idx a4=w a5=h
function pal_pred_8bpc_lsx
    srai.d        a7, a5, 2              // h/4: rows handled per iteration below

.PAL_PRED_WLOOP4:
    andi          a6, a4, 4
    beqz          a6, .PAL_PRED_WLOOP8
    fld.d         f0, a3, 0              // 8 idx bytes = 4 rows of 4 pixels
    vsrli.b       vr1, vr0, 4            // high nibbles
    vandi.b       vr2, vr0, 7            // low nibbles (indices are 3-bit)
    vilvl.b       vr0, vr1, vr2          // restore pixel order
    fld.d         f1, a2, 0              // palette (8 entries)
    vshuf.b       vr2, vr1, vr1, vr0

    vstelm.w      vr2, a0, 0, 0
    add.d         a0, a0, a1
    vstelm.w      vr2, a0, 0, 1
    add.d         a0, a0, a1
    vstelm.w      vr2, a0, 0, 2
    add.d         a0, a0, a1
    vstelm.w      vr2, a0, 0, 3
    add.d         a0, a0, a1

    addi.d        a3, a3, 8
    addi.d        a7, a7, -1
    bnez          a7, .PAL_PRED_WLOOP4
    b             .PAL_PRED_END

.PAL_PRED_WLOOP8:
    andi          a6, a4, 8
    beqz          a6, .PAL_PRED_WLOOP16

    vld           vr0, a3, 0             // 16 idx bytes = 4 rows of 8 pixels
    vsrli.b       vr1, vr0, 4
    vandi.b       vr2, vr0, 7
    vilvl.b       vr0, vr1, vr2
    vilvh.b       vr3, vr1, vr2
    fld.d         f1, a2, 0              // palette
    vshuf.b       vr0, vr1, vr1, vr0
    vshuf.b       vr3, vr1, vr1, vr3

    vstelm.d      vr0, a0, 0, 0
    add.d         a0, a0, a1
    vstelm.d      vr0, a0, 0, 1
    add.d         a0, a0, a1

    vstelm.d      vr3, a0, 0, 0
    add.d         a0, a0, a1
    vstelm.d      vr3, a0, 0, 1
    add.d         a0, a0, a1

    addi.d        a3, a3, 16
    addi.d        a7, a7, -1
    bnez          a7, .PAL_PRED_WLOOP8
    b             .PAL_PRED_END

.PAL_PRED_WLOOP16:
    andi          a6, a4, 16
    beqz          a6, .PAL_PRED_WLOOP32

    vld           vr0, a3, 0             // 32 idx bytes = 4 rows of 16 pixels
    vld           vr1, a3, 16
    fld.d         f6, a2, 0              // palette
    vsrli.b       vr2, vr0, 4
    vandi.b       vr3, vr0, 7
    vsrli.b       vr4, vr1, 4
    vandi.b       vr5, vr1, 7
    vilvl.b       vr0, vr2, vr3
    vilvh.b       vr1, vr2, vr3
    vilvl.b       vr2, vr4, vr5
    vilvh.b       vr3, vr4, vr5
    vshuf.b       vr0, vr6, vr6, vr0
    vshuf.b       vr1, vr6, vr6, vr1
    vshuf.b       vr2, vr6, vr6, vr2
    vshuf.b       vr3, vr6, vr6, vr3

    vst           vr0, a0, 0
    add.d         a0, a0, a1
    vst           vr1, a0, 0
    add.d         a0, a0, a1
    vst           vr2, a0, 0
    add.d         a0, a0, a1
    vst           vr3, a0, 0
    add.d         a0, a0, a1

    addi.d        a3, a3, 32
    addi.d        a7, a7, -1
    bnez          a7, .PAL_PRED_WLOOP16
    b             .PAL_PRED_END

.PAL_PRED_WLOOP32:
    andi          a6, a4, 32
    beqz          a6, .PAL_PRED_WLOOP64

    vld           vr0, a3, 0             // 64 idx bytes = 4 rows of 32 pixels
    vld           vr1, a3, 16
    vld           vr2, a3, 32
    vld           vr3, a3, 48
    fld.d         f4, a2, 0              // palette
    vsrli.b       vr5, vr0, 4
    vandi.b       vr6, vr0, 7
    vsrli.b       vr7, vr1, 4
    vandi.b       vr8, vr1, 7
    vsrli.b       vr9, vr2, 4
    vandi.b       vr10, vr2, 7
    vsrli.b       vr11, vr3, 4
    vandi.b       vr12, vr3, 7
    vilvl.b       vr0, vr5, vr6
    vilvh.b       vr1, vr5, vr6
    vilvl.b       vr2, vr7, vr8
    vilvh.b       vr3, vr7, vr8
    vilvl.b       vr5, vr9, vr10
    vilvh.b       vr6, vr9, vr10
    vilvl.b       vr7, vr11, vr12
    vilvh.b       vr8, vr11, vr12
    vshuf.b       vr0, vr4, vr4, vr0
    vshuf.b       vr1, vr4, vr4, vr1
    vshuf.b       vr2, vr4, vr4, vr2
    vshuf.b       vr3, vr4, vr4, vr3
    vshuf.b       vr5, vr4, vr4, vr5
    vshuf.b       vr6, vr4, vr4, vr6
    vshuf.b       vr7, vr4, vr4, vr7
    vshuf.b       vr8, vr4, vr4, vr8

    vst           vr0, a0, 0
    vst           vr1, a0, 16
    add.d         a0, a0, a1
    vst           vr2, a0, 0
    vst           vr3, a0, 16
    add.d         a0, a0, a1
    vst           vr5, a0, 0
    vst           vr6, a0, 16
    add.d         a0, a0, a1
    vst           vr7, a0, 0
    vst           vr8, a0, 16
    add.d         a0, a0, a1

    addi.d        a3, a3, 64
    addi.d        a7, a7, -1
    bnez          a7, .PAL_PRED_WLOOP32
    b             .PAL_PRED_END

.PAL_PRED_WLOOP64:
    // width 64: one row (64 pixels = 32 idx bytes) per iteration, counts a5 (h)
    vld           vr0, a3, 0
    vld           vr1, a3, 16
    fld.d         f2, a2, 0              // palette
    vsrli.b       vr3, vr0, 4
    vandi.b       vr4, vr0, 7
    vsrli.b       vr5, vr1, 4
    vandi.b       vr6, vr1, 7
    vilvl.b       vr0, vr3, vr4
    vilvh.b       vr1, vr3, vr4
    vilvl.b       vr3, vr5, vr6
    vilvh.b       vr4, vr5, vr6
    vshuf.b       vr0, vr2, vr2, vr0
    vshuf.b       vr1, vr2, vr2, vr1
    vshuf.b       vr3, vr2, vr2, vr3
    vshuf.b       vr4, vr2, vr2, vr4

    vst           vr0, a0, 0
    vst           vr1, a0, 16
    vst           vr3, a0, 32
    vst           vr4, a0, 48

    add.d         a0, a0, a1
    addi.d        a3, a3, 32
    addi.d        a5, a5, -1
    bnez          a5, .PAL_PRED_WLOOP64

.PAL_PRED_END:
endfunc

// \out = \v with the sign of \s applied per i16 lane (\s < 0 negates \v,
// \s == 0 zeroes it).  \s is destroyed; \vrt0 is scratch; \vrzero holds 0.
.macro apply_sign_vrh v, s, vrzero, vrt0, out
    vslt.h        \vrt0, \s, \vrzero
    vandn.v       \s, \vrt0, \v
    vsigncov.h    \v, \vrt0, \v
    vor.v         \out, \s, \v
.endm

// Clamp each i16 lane of \in0 into [\in1, \in2] (used as [0, 255]).
// \in0 and \tmp0/\tmp1 are destroyed.
.macro iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
    vmin.h        \tmp0, \in2, \in0
    vslt.h        \in0, \in0, \in1
    vand.v        \tmp1, \in0, \in1
    vandn.v       \tmp0, \in0, \tmp0
    vor.v         \out, \tmp1, \tmp0
.endm

// Chroma-from-luma: dst = clip(dc + ((alpha * ac + 32) >> 6 with sign), 0, 255)
// In:   \dst, \stride, \w, \h, \dc (scalar), \ac = i16 coeffs, \alpha (scalar)
// \ac rows are 2*\w bytes apart.  8 pixels per vector pass; the tail path
// handles w == 4.  Clobbers t1-t4, vr0-vr8; \h and \ac/\dst are consumed.
.macro ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
    vreplgr2vr.h  vr2, \alpha
    vreplgr2vr.h  vr7, \dc
    li.w          t1, 32                 // rounding bias for >>6
    vreplgr2vr.h  vr3, t1
    vxor.v        vr4, vr4, vr4
    li.w          t1, 255
    vreplgr2vr.h  vr6, t1
    add.d         t4, \w, \w             // ac row pitch in bytes (i16 * w)

1:
    or            t1, zero, zero         // ac byte offset
    or            t2, zero, zero         // dst byte offset
    srai.d        t3, \w, 3
    beqz          t3, 3f

2:  // 8 pixels per iteration
    vldx          vr0, \ac, t1
    vmul.h        vr1, vr2, vr0
    vadda.h       vr0, vr1, vr3          // |alpha*ac| + 32
    vsrai.h       vr0, vr0, 6
    apply_sign_vrh  vr0, vr1, vr4, vr5, vr0 // restore the sign of alpha*ac
    vadd.h        vr1, vr0, vr7          // + dc
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0 // clamp to [0, 255]
    vsrlni.b.h    vr0, vr0, 0
    fstx.d        f0, \dst, t2

    addi.d        t1, t1, 16
    addi.d        t2, t2, 8
    addi.d        t3, t3, -1
    bnez          t3, 2b
    b             4f

3:  // w == 4 tail
    fld.d         f0, \ac, 0
    vmul.h        vr1, vr2, vr0
    vadda.h       vr0, vr1, vr3
    vsrai.h       vr0, vr0, 6
    apply_sign_vrh  vr0, vr1, vr4, vr5, vr0
    vadd.h        vr1, vr0, vr7
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
    vsrlni.b.h    vr0, vr0, 0
    fst.s         f0, \dst, 0

4:
    add.d         \ac, \ac, t4
    add.d         \dst, \dst, \stride
    addi.d        \h, \h, -1
    bnez          \h, 1b
.endm

// CFL entry points: compute dc (t0) the same way as the plain DC modes,
// then blend the ac coefficients on top.
// a0=dst a1=stride a2=topleft a3=width a4=height a5=ac a6=alpha
function ipred_cfl_8bpc_lsx
    ipred_dc_gen   a2, a3, a4
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_top_8bpc_lsx
    ipred_dc_gen_top a2, a3
    ipred_cfl_pred   a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_left_8bpc_lsx
    ipred_dc_gen_left a2, a4
    ipred_cfl_pred    a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_128_8bpc_lsx
    li.w           t0, 128
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

// Filter-intra taps: 5 filter sets of 56 bytes (7 coefficient vectors of
// 8 lanes each), selected by filt_idx * 56 in ipred_filter_8bpc_lsx.
const dav1d_filter_intra_taps_lsx
        //arr0 8*7
.byte -6, -5, -3, -3, -4, -3, -3, -3
.byte 10, 2, 1, 1, 6, 2, 2, 1
.byte 0, 10, 1, 1, 0, 6, 2, 2
.byte 0, 0, 10, 2, 0, 0, 6, 2
.byte 0, 0, 0, 10, 0, 0, 0, 6
.byte 12, 9, 7, 5, 2, 2, 2, 3
.byte 0, 0, 0, 0, 12, 9, 7, 5
        //arr1
.byte -10, -6, -4, -2, -10, -6, -4, -2
.byte 16, 0, 0, 0, 16, 0, 0, 0
.byte 0, 16, 0, 0, 0, 16, 0, 0
.byte 0, 0, 16, 0, 0, 0, 16, 0
.byte 0, 0, 0, 16, 0, 0, 0, 16
.byte 10, 6, 4, 2, 0, 0, 0, 0
.byte 0, 0, 0, 0, 10, 6, 4, 2
        //arr2
.byte -8, -8, -8, -8, -4, -4, -4, -4
.byte 8, 0, 0, 0, 4, 0, 0, 0
.byte 0, 8, 0, 0, 0, 4, 0, 0
.byte 0, 0, 8, 0, 0, 0, 4, 0
.byte 0, 0, 0, 8, 0, 0, 0, 4
.byte 16, 16, 16, 16, 0, 0, 0, 0
.byte 0, 0, 0, 0, 16, 16, 16, 16
        //arr3
.byte -2, -1, -1, 0, -1, -1, -1, -1
.byte 8, 3, 2, 1, 4, 3, 2, 2
.byte 0, 8, 3, 2, 0, 4, 3, 2
.byte 0, 0, 8, 3, 0, 0, 4, 3
.byte 0, 0, 0, 8, 0, 0, 0, 4
.byte 10, 6, 4, 2, 3, 4, 4, 3
.byte 0, 0, 0, 0, 10, 6, 4, 3
        //arr4
.byte -12, -10, -9, -8, -10, -9, -8, -7
.byte 14, 0, 0, 0, 12, 1, 0, 0
.byte 0, 14, 0, 0, 0, 12, 0, 0
.byte 0, 0, 14, 0, 0, 0, 12, 1
.byte 0, 0, 0, 14, 0, 0, 0, 12
.byte 14, 12, 11, 10, 0, 0, 1, 1
.byte 0, 0, 0, 0, 14, 12, 11, 9
endconst

// Broadcast the 7 filter inputs (p0..p6) as widened u16 vectors vr0-vr6:
// p0 = topleft (t0), p1-p4 = top row (a7 + 0..3), p5-p6 = left column (t1).
.macro ipred_filter_load_p
    vldrepl.b     vr0, t0, 0
    vldrepl.b     vr1, a7, 0
    vldrepl.b     vr2, a7, 1
    vldrepl.b     vr3, a7, 2
    vldrepl.b     vr4, a7, 3
    vldrepl.b     vr5, t1, 0
    vldrepl.b     vr6, t1, -1

    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
.endm

// Same as ipred_filter_load_p, but p6 comes from already-predicted output
// one stride below t1 (ldx.bu with a1) instead of t1 - 1.
.macro ipred_filter_loadx_p
    vldrepl.b     vr0, t0, 0
    vldrepl.b     vr1, a7, 0
    vldrepl.b     vr2, a7, 1
    vldrepl.b     vr3, a7, 2
    vldrepl.b     vr4, a7, 3
    vldrepl.b     vr5, t1, 0
    ldx.bu        t3, t1, a1
    vreplgr2vr.b  vr6, t3

    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
.endm

// Load the 7 signed coefficient rows of the selected filter (a6) into
// vr7-vr13, widened i8 -> i16.
.macro ipred_filter_load_fltptr
    fld.d         f7, a6, 0
    fld.d         f8, a6, 8
    fld.d         f9, a6, 16
    fld.d         f10, a6, 24
    fld.d         f11, a6, 32
    fld.d         f12, a6, 40
    fld.d         f13, a6, 48

    vsllwil.h.b   vr7, vr7, 0
    vsllwil.h.b   vr8, vr8, 0
    vsllwil.h.b   vr9, vr9, 0
    vsllwil.h.b   vr10, vr10, 0
    vsllwil.h.b   vr11, vr11, 0
    vsllwil.h.b   vr12, vr12, 0
    vsllwil.h.b   vr13, vr13, 0
.endm

// acc = clip((sum(flt[i] * p[i]) + 8) >> 4, 0, 255), narrowed to bytes in
// vr8.  Uses vr14 (zero) and vr15 (255) prepared by the caller.
.macro ipred_filter_calc_acc
    vmul.h        vr7, vr7, vr0
    vmadd.h       vr7, vr8, vr1
    vmadd.h       vr7, vr9, vr2
    vmadd.h       vr7, vr10, vr3
    vmadd.h       vr7, vr11, vr4
    vmadd.h       vr7, vr12, vr5
    vmadd.h       vr7, vr13, vr6
    vaddi.hu      vr7, vr7, 8
    vsrai.h       vr7, vr7, 4
    iclip_pixel_vrh vr7, vr14, vr15, vr9, vr10, vr8
    vsrlni.b.h    vr8, vr8, 0
.endm

// void ipred_filter_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft_in,
//                       const int width, const int height, int filt_idx,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
// NOTE(review): the remainder of this function lies beyond this chunk;
// the visible prefix is reproduced unchanged.
function ipred_filter_8bpc_lsx
    andi          a5, a5, 511
    la.local      a6, dav1d_filter_intra_taps_lsx
    li.w          a7, 56
    mul.w         a7, a7, a5
    add.d         a6, a6, a7             // *filter
    addi.d        a7, a2, 1              // *top
    or            a5, zero, zero         // y
    vxor.v        vr14, vr14, vr14
    li.w          t0, 255
    vreplgr2vr.h  vr15, t0

.FILTER_LOOP_H:
    sub.d         t0, a2, a5             // *topleft
    addi.d        t1, t0, -1             // left

    ctz.w         t2, a3
    addi.d        t3, t2, -2
    beqz          t3, .FILTER_LOOP_W4
    addi.d        t3, t2, -3
    beqz          t3, .FILTER_LOOP_W8
    addi.d        t3, t2, -4
    beqz          t3, .FILTER_LOOP_W16
    addi.d        t3, t2, -5
    beqz          t3, .FILTER_LOOP_W32

.FILTER_LOOP_W4:
    ipred_filter_load_p

    or            t3, a0, a0             // *ptr

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s         f8, t3, 0
    add.d         t3, t3, a1
    vstelm.w      vr8, t3, 0, 1
    add.d         t3, t3, a1

    b             .FILTER_LOOP_W_END

.FILTER_LOOP_W8:
    ipred_filter_load_p

    or            t3, a0, a0

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s         f8, t3, 0
    add.d         t3, t3, a1
    vstelm.w      vr8, t3, 0, 1
    add.d         t3, t3, a1

    addi.d        t1, a0, 3
    addi.d        a7, a7, 4
    addi.d        t0, a7, -1

    ipred_filter_loadx_p

    addi.d        t3, a0, 4

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s         f8, t3, 0
    add.d         t3, t3, a1
    vstelm.w      vr8, t3, 0, 1
    add.d         t3, t3, a1

    b             .FILTER_LOOP_W_END

.FILTER_LOOP_W16:
    ipred_filter_load_p

    or            t3, a0, a0

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s         f8, t3, 0
    add.d         t3,
t3, a1 1530 vstelm.w vr8, t3, 0, 1 1531 add.d t3, t3, a1 1532 1533 addi.d t1, a0, 3 1534 addi.d a7, a7, 4 1535 addi.d t0, a7, -1 1536 1537 ipred_filter_loadx_p 1538 1539 addi.d t3, a0, 4 1540 1541 ipred_filter_load_fltptr 1542 ipred_filter_calc_acc 1543 1544 fst.s f8, t3, 0 1545 add.d t3, t3, a1 1546 vstelm.w vr8, t3, 0, 1 1547 add.d t3, t3, a1 1548 1549 addi.d t1, a0, 7 1550 addi.d a7, a7, 4 1551 addi.d t0, a7, -1 1552 1553 ipred_filter_loadx_p 1554 1555 addi.d t3, a0, 8 1556 1557 ipred_filter_load_fltptr 1558 ipred_filter_calc_acc 1559 1560 fst.s f8, t3, 0 1561 add.d t3, t3, a1 1562 vstelm.w vr8, t3, 0, 1 1563 add.d t3, t3, a1 1564 1565 addi.d t1, a0, 11 1566 addi.d a7, a7, 4 1567 addi.d t0, a7, -1 1568 1569 ipred_filter_loadx_p 1570 1571 addi.d t3, a0, 12 1572 1573 ipred_filter_load_fltptr 1574 ipred_filter_calc_acc 1575 1576 fst.s f8, t3, 0 1577 add.d t3, t3, a1 1578 vstelm.w vr8, t3, 0, 1 1579 add.d t3, t3, a1 1580 1581 b .FILTER_LOOP_W_END 1582 1583.FILTER_LOOP_W32: 1584 ipred_filter_load_p 1585 1586 or t3, a0, a0 1587 1588 ipred_filter_load_fltptr 1589 ipred_filter_calc_acc 1590 1591 fst.s f8, t3, 0 1592 add.d t3, t3, a1 1593 vstelm.w vr8, t3, 0, 1 1594 add.d t3, t3, a1 1595 1596 addi.d t1, a0, 3 1597 addi.d a7, a7, 4 1598 addi.d t0, a7, -1 1599 1600 ipred_filter_loadx_p 1601 1602 addi.d t3, a0, 4 1603 1604 ipred_filter_load_fltptr 1605 ipred_filter_calc_acc 1606 1607 fst.s f8, t3, 0 1608 add.d t3, t3, a1 1609 vstelm.w vr8, t3, 0, 1 1610 add.d t3, t3, a1 1611 1612 addi.d t1, a0, 7 1613 addi.d a7, a7, 4 1614 addi.d t0, a7, -1 1615 1616 ipred_filter_loadx_p 1617 1618 addi.d t3, a0, 8 1619 1620 ipred_filter_load_fltptr 1621 ipred_filter_calc_acc 1622 1623 fst.s f8, t3, 0 1624 add.d t3, t3, a1 1625 vstelm.w vr8, t3, 0, 1 1626 add.d t3, t3, a1 1627 1628 addi.d t1, a0, 11 1629 addi.d a7, a7, 4 1630 addi.d t0, a7, -1 1631 1632 ipred_filter_loadx_p 1633 1634 addi.d t3, a0, 12 1635 1636 ipred_filter_load_fltptr 1637 ipred_filter_calc_acc 1638 1639 fst.s f8, t3, 0 
1640 add.d t3, t3, a1 1641 vstelm.w vr8, t3, 0, 1 1642 add.d t3, t3, a1 1643 1644 addi.d t1, a0, 15 1645 addi.d a7, a7, 4 1646 addi.d t0, a7, -1 1647 1648 ipred_filter_loadx_p 1649 1650 addi.d t3, a0, 16 1651 1652 ipred_filter_load_fltptr 1653 ipred_filter_calc_acc 1654 1655 fst.s f8, t3, 0 1656 add.d t3, t3, a1 1657 vstelm.w vr8, t3, 0, 1 1658 add.d t3, t3, a1 1659 1660 addi.d t1, a0, 19 1661 addi.d a7, a7, 4 1662 addi.d t0, a7, -1 1663 1664 ipred_filter_loadx_p 1665 1666 addi.d t3, a0, 20 1667 1668 ipred_filter_load_fltptr 1669 ipred_filter_calc_acc 1670 1671 fst.s f8, t3, 0 1672 add.d t3, t3, a1 1673 vstelm.w vr8, t3, 0, 1 1674 add.d t3, t3, a1 1675 1676 addi.d t1, a0, 23 1677 addi.d a7, a7, 4 1678 addi.d t0, a7, -1 1679 1680 ipred_filter_loadx_p 1681 1682 addi.d t3, a0, 24 1683 1684 ipred_filter_load_fltptr 1685 ipred_filter_calc_acc 1686 1687 fst.s f8, t3, 0 1688 add.d t3, t3, a1 1689 vstelm.w vr8, t3, 0, 1 1690 add.d t3, t3, a1 1691 1692 addi.d t1, a0, 27 1693 addi.d a7, a7, 4 1694 addi.d t0, a7, -1 1695 1696 ipred_filter_loadx_p 1697 1698 addi.d t3, a0, 28 1699 1700 ipred_filter_load_fltptr 1701 ipred_filter_calc_acc 1702 1703 fst.s f8, t3, 0 1704 add.d t3, t3, a1 1705 vstelm.w vr8, t3, 0, 1 1706 add.d t3, t3, a1 1707 1708.FILTER_LOOP_W_END: 1709 add.d a7, a0, a1 1710 add.d t2, a1, a1 1711 add.d a0, a0, t2 1712 addi.d a5, a5, 2 1713 blt a5, a4, .FILTER_LOOP_H 1714endfunc 1715 1716const dav1d_dr_intra_derivative 1717 // Values that are 0 will never be used 1718 .short 0 // Angles: 1719 .short 1023, 0 // 3, 93, 183 1720 .short 547 // 6, 96, 186 1721 .short 372, 0, 0 // 9, 99, 189 1722 .short 273 // 14, 104, 194 1723 .short 215, 0 // 17, 107, 197 1724 .short 178 // 20, 110, 200 1725 .short 151, 0 // 23, 113, 203 (113 & 203 are base angles) 1726 .short 132 // 26, 116, 206 1727 .short 116, 0 // 29, 119, 209 1728 .short 102, 0 // 32, 122, 212 1729 .short 90 // 36, 126, 216 1730 .short 80, 0 // 39, 129, 219 1731 .short 71 // 42, 132, 222 1732 .short 64, 0 // 45, 
135, 225 (45 & 135 are base angles) 1733 .short 57 // 48, 138, 228 1734 .short 51, 0 // 51, 141, 231 1735 .short 45, 0 // 54, 144, 234 1736 .short 40 // 58, 148, 238 1737 .short 35, 0 // 61, 151, 241 1738 .short 31 // 64, 154, 244 1739 .short 27, 0 // 67, 157, 247 (67 & 157 are base angles) 1740 .short 23 // 70, 160, 250 1741 .short 19, 0 // 73, 163, 253 1742 .short 15, 0 // 76, 166, 256 1743 .short 11, 0 // 81, 171, 261 1744 .short 7 // 84, 174, 264 1745 .short 3 // 87, 177, 267 1746endconst 1747 1748const z1_upsample_edge_kernel 1749 .short -1, 9, 9, -1, -1, 9, 9, -1 1750endconst 1751 1752const ipred_filter_edge_kernel1 1753 .short 0, 4, 8, 4, 0, 4, 8, 4 1754 .short 0, 5, 6, 5, 0, 5, 6, 5 1755 .short 2, 4, 4, 4, 2, 4, 4, 4 1756endconst 1757 1758const ipred_filter_edge_kernel2 1759 .short 0, 0, 0, 0, 0, 0, 0, 0 1760 .short 0, 0, 0, 0, 0, 0, 0, 0 1761 .short 2, 2, 2, 2, 2, 2, 2, 2 1762endconst 1763 1764.macro z1_upsample_edge_calc_loop 1765 vsllwil.hu.bu vr10, vr7, 0 1766 vsllwil.hu.bu vr11, vr11, 0 1767 vsllwil.hu.bu vr12, vr12, 0 1768 vsllwil.hu.bu vr13, vr13, 0 1769 1770 vmul.h vr10, vr10, vr0 1771 vmul.h vr11, vr11, vr0 1772 vmul.h vr12, vr12, vr0 1773 vmul.h vr13, vr13, vr0 1774 1775 vhaddw.w.h vr10, vr10, vr10 1776 vhaddw.w.h vr11, vr11, vr11 1777 vhaddw.w.h vr12, vr12, vr12 1778 vhaddw.w.h vr13, vr13, vr13 1779 vhaddw.d.w vr10, vr10, vr10 1780 vhaddw.d.w vr11, vr11, vr11 1781 vhaddw.d.w vr12, vr12, vr12 1782 vhaddw.d.w vr13, vr13, vr13 1783 1784 vpackev.h vr10, vr11, vr10 1785 vpackev.h vr11, vr13, vr12 1786 vpackev.w vr12, vr11, vr10 //s:01234567 1787 vsrari.h vr12, vr12, 4 1788 iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12 1789 vsrlni.b.h vr12, vr12, 0 //out: 13579... 1790 vbsrl.v vr11, vr7, 1 //out:02468... 
1791 vilvl.b vr13, vr12, vr11 1792.endm 1793 1794.macro z1_upsample_edge_data_init1 1795 vbsrl.v vr11, vr7, 1 1796 vbsrl.v vr12, vr7, 2 1797 vbsrl.v vr13, vr7, 3 1798 z1_upsample_edge_calc_loop 1799.endm 1800 1801.macro z1_upsample_edge_data_init2 1802 vbsrl.v vr11, vr7, 1 1803 vbsrl.v vr12, vr7, 2 1804 vextrins.b vr12, vr12, 0x76 1805 vbsrl.v vr13, vr7, 3 1806 vextrins.b vr13, vr13, 0x65 1807 vextrins.b vr13, vr13, 0x75 1808 z1_upsample_edge_calc_loop 1809.endm 1810 1811.macro z1_upsample_edge_calc_other 1812 vsllwil.hu.bu vr10, vr7, 0 1813 vmul.h vr10, vr10, vr0 1814 vhaddw.w.h vr10, vr10, vr10 1815 vhaddw.d.w vr10, vr10, vr10 1816 vreplvei.h vr12, vr10, 0 //s0-s7 1817 vsrari.h vr12, vr12, 4 1818 1819 iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12 1820 vsrlni.b.h vr12, vr12, 0 1821 vilvl.b vr13, vr12, vr7 1822.endm 1823 1824.macro z1_filter_edge_calc_loop1 1825 vmul.h vr10, vr10, vr1 1826 vmul.h vr11, vr11, vr1 1827 vmul.h vr12, vr12, vr1 1828 vmul.h vr13, vr13, vr1 1829 1830 vhaddw.w.h vr10, vr10, vr10 1831 vhaddw.w.h vr11, vr11, vr11 1832 vhaddw.w.h vr12, vr12, vr12 1833 vhaddw.w.h vr13, vr13, vr13 1834 vhaddw.d.w vr10, vr10, vr10 1835 vhaddw.d.w vr11, vr11, vr11 1836 vhaddw.d.w vr12, vr12, vr12 1837 vhaddw.d.w vr13, vr13, vr13 1838 1839 vpackev.h vr10, vr11, vr10 1840 vpackev.h vr11, vr13, vr12 1841 vpackev.w vr10, vr11, vr10 //s:01234567 1842.endm 1843 1844.macro z1_filter_edge_calc_loop2 1845 vsllwil.hu.bu vr13, vr13, 0 1846 vmadd.h vr10, vr13, vr6 1847 vsrari.h vr12, vr10, 4 1848 vsrlni.b.h vr12, vr12, 0 //out: 0-7 1849.endm 1850 1851.macro z1_filter_edge_calc_other 1852 vsllwil.hu.bu vr10, vr10, 0 1853 vmul.h vr11, vr10, vr1 1854 vhaddw.w.h vr11, vr11, vr11 1855 vhaddw.d.w vr11, vr11, vr11 1856 vreplvei.h vr12, vr11, 4 1857 vextrins.h vr12, vr11, 0x00 1858 1859 vreplvei.h vr13, vr10, 1 1860 vmadd.h vr12, vr13, vr6 1861 vsrari.h vr12, vr12, 4 1862 vsrlni.b.h vr12, vr12, 0 //out: 0-7 1863.endm 1864 1865.macro z1_filter_edge_data_init1 1866 vbsll.v vr10, 
vr7, 1 1867 vextrins.b vr10, vr10, 0x01 1868 vbsrl.v vr12, vr7, 1 1869 vbsrl.v vr13, vr7, 2 1870 vsllwil.hu.bu vr10, vr10, 0 1871 vsllwil.hu.bu vr11, vr7, 0 1872 vsllwil.hu.bu vr12, vr12, 0 1873 vsllwil.hu.bu vr13, vr13, 0 1874 z1_filter_edge_calc_loop1 1875.endm 1876 1877.macro z1_filter_edge_data_init2 1878 vbsrl.v vr11, vr7, 1 1879 vbsrl.v vr12, vr7, 2 1880 vbsrl.v vr13, vr7, 3 1881 vsllwil.hu.bu vr10, vr7, 0 1882 vsllwil.hu.bu vr11, vr11, 0 1883 vsllwil.hu.bu vr12, vr12, 0 1884 vsllwil.hu.bu vr13, vr13, 0 1885 z1_filter_edge_calc_loop1 1886.endm 1887 1888.macro z1_filter_edge_data_init3 1889 vbsrl.v vr11, vr7, 1 1890 vbsrl.v vr12, vr7, 2 1891 vbsrl.v vr13, vr7, 3 1892 vextrins.b vr13, vr13, 0x76 1893 vsllwil.hu.bu vr10, vr7, 0 1894 vsllwil.hu.bu vr11, vr11, 0 1895 vsllwil.hu.bu vr12, vr12, 0 1896 vsllwil.hu.bu vr13, vr13, 0 1897 z1_filter_edge_calc_loop1 1898.endm 1899 1900.macro z1_filter_edge_data_init4 1901 vbsll.v vr10, vr7, 1 1902 vextrins.b vr10, vr10, 0x01 1903 vbsrl.v vr12, vr7, 1 1904 vbsrl.v vr13, vr7, 2 1905 vextrins.b vr13, vr13, 0x76 1906 vsllwil.hu.bu vr10, vr10, 0 1907 vsllwil.hu.bu vr11, vr7, 0 1908 vsllwil.hu.bu vr12, vr12, 0 1909 vsllwil.hu.bu vr13, vr13, 0 1910 z1_filter_edge_calc_loop1 1911.endm 1912 1913.macro pixel_set_8bpc_allw dst_ptr, src_ptr, width, tmp0, tmp1 1914 vldrepl.b vr10, \src_ptr, 0 1915 or \tmp1, zero, zero 1916 srai.d \tmp0, \width, 4 1917 beqz \tmp0, 2f 19181: 1919 vstx vr10, \dst_ptr, \tmp1 1920 addi.d \tmp1, \tmp1, 16 1921 addi.d \tmp0, \tmp0, -1 1922 bnez \tmp0, 1b 19232: 1924 andi \tmp0, \width, 8 1925 beqz \tmp0, 3f 1926 fstx.d f10, \dst_ptr, \tmp1 1927 addi.d \tmp1, \tmp1, 8 19283: 1929 andi \tmp0, \width, 4 1930 beqz \tmp0, 4f 1931 fstx.s f10, \dst_ptr, \tmp1 1932 addi.d \tmp1, \tmp1, 4 19334: 1934 andi \tmp0, \width, 2 1935 beqz \tmp0, 5f 1936 ldx.bu \tmp0, \src_ptr, zero 1937 stx.b \tmp0, \dst_ptr, \tmp1 1938 addi.d \tmp1, \tmp1, 1 1939 stx.b \tmp0, \dst_ptr, \tmp1 1940 addi.d \tmp1, \tmp1, 1 19415: 1942 andi 
\tmp0, \width, 1 1943 beqz \tmp0, 6f 1944 ldx.bu \tmp0, \src_ptr, zero 1945 stx.b \tmp0, \dst_ptr, \tmp1 19466: 1947.endm 1948 1949// void ipred_z1_lsx(pixel *dst, const ptrdiff_t stride, 1950// const pixel *const topleft_in, 1951// const int width, const int height, int angle, 1952// const int max_width, const int max_height 1953// HIGHBD_DECL_SUFFIX) 1954function ipred_z1_8bpc_lsx 1955 addi.d a2, a2, 1 //&topleft_in[1] 1956 addi.d sp, sp, -128 1957 or t2, sp, sp //top_out 1958 srai.d a6, a5, 9 1959 andi a6, a6, 1 //is_sum 1960 srai.d a7, a5, 10 //enable_intra_edge_filter 1961 andi a5, a5, 511 1962 1963 la.local t0, dav1d_dr_intra_derivative 1964 andi t1, a5, 0xFFE 1965 ldx.hu t1, t0, t1 //dx 1966 1967 beqz a7, .IPRED_Z1_NOTUA 1968 add.d t3, a3, a4 1969 li.w t4, 90 1970 sub.w t4, t4, a5 1971 // ipred_get_upsample t5:upsample_above 1972 li.w t6, 16 1973 sra.d t6, t6, a6 1974 bge t6, t3, .Z1_GETUS1 1975 addi.d t5, zero, 0 1976 b .Z1_GETUS2 1977.Z1_GETUS1: 1978 addi.d t5, zero, 1 1979.Z1_GETUS2: 1980 li.w t6, 40 1981 blt t4, t6, .Z1_GETUS3 1982 addi.d t6, zero, 0 1983 b .Z1_GETUS4 1984.Z1_GETUS3: 1985 addi.d t6, zero, 1 1986.Z1_GETUS4: 1987 and t5, t5, t6 1988 1989 beqz t5, .IPRED_Z1_NOTUA 1990 1991 la.local t0, z1_upsample_edge_kernel 1992 vld vr0, t0, 0 //kernel 1993 vxor.v vr15, vr15, vr15 1994 li.w t0, 255 1995 vreplgr2vr.h vr16, t0 1996 1997.Z1_UEDGE_W4: 1998 andi t6, a3, 4 1999 beqz t6, .Z1_UEDGE_W8 2000.Z1_UEDGE_W4_H4: 2001 andi t6, a4, 4 2002 beqz t6, .Z1_UEDGE_W4_H8 2003 2004 //0-6 2005 vld vr7, a2, -1 2006 vbsrl.v vr11, vr7, 1 2007 vbsrl.v vr12, vr7, 2 2008 vextrins.b vr12, vr12, 0x76 2009 vbsrl.v vr13, vr7, 3 2010 z1_upsample_edge_calc_loop 2011 2012 fst.d f13, t2, 0 2013 vstelm.w vr13, t2, 8, 2 2014 vstelm.h vr13, t2, 12, 6 2015 2016 ld.bu t7, a2, 7 2017 st.b t7, t2, 14 2018 2019 b .Z1_UEDGE_END 2020 2021.Z1_UEDGE_W4_H8: 2022 andi t6, a4, 8 2023 beqz t6, .Z1_UEDGE_W4_H16 2024 2025 //0-7 2026 vld vr7, a2, -1 2027 z1_upsample_edge_data_init2 2028 vst vr13, 
t2, 0 2029 2030 //8-10 2031 vldrepl.b vr7, a2, 7 2032 z1_upsample_edge_calc_other 2033 2034 vstelm.w vr13, t2, 16, 0 2035 vstelm.h vr13, t2, 20, 2 2036 2037 ld.bu t7, a2, 7 2038 st.b t7, t2, 22 2039 2040 b .Z1_UEDGE_END 2041 2042.Z1_UEDGE_W4_H16: 2043 andi t6, a4, 16 2044 beqz t6, .Z1_UEDGE_W4_H32 2045 2046 //0-7 2047 vld vr7, a2, -1 2048 z1_upsample_edge_data_init2 2049 vst vr13, t2, 0 2050 2051 //8-15 2052 vldrepl.b vr7, a2, 7 2053 z1_upsample_edge_calc_other 2054 vst vr13, t2, 16 2055 2056 //16-18 2057 vstelm.w vr13, t2, 32, 0 2058 vstelm.h vr13, t2, 36, 2 2059 2060 ld.bu t7, a2, 7 2061 st.b t7, t2, 38 2062 2063 b .Z1_UEDGE_END 2064 2065.Z1_UEDGE_W4_H32: 2066 andi t6, a4, 32 2067 beqz t6, .Z1_UEDGE_W4_H64 2068 2069 //0-7 2070 vld vr7, a2, -1 2071 z1_upsample_edge_data_init2 2072 vst vr13, t2, 0 2073 2074 //8-15 2075 vldrepl.b vr7, a2, 7 2076 z1_upsample_edge_calc_other 2077 vst vr13, t2, 16 2078 2079 vst vr13, t2, 32 //16-23 2080 vst vr13, t2, 48 //24-31 2081 2082 //32-34 2083 vstelm.w vr13, t2, 64, 0 2084 vstelm.h vr13, t2, 68, 2 2085 2086 ld.bu t7, a2, 7 2087 st.b t7, t2, 70 2088 2089 b .Z1_UEDGE_END 2090 2091.Z1_UEDGE_W4_H64: 2092 //0-7 2093 vld vr7, a2, -1 2094 z1_upsample_edge_data_init2 2095 vst vr13, t2, 0 2096 2097 //8-15 2098 vldrepl.b vr7, a2, 7 2099 z1_upsample_edge_calc_other 2100 vst vr13, t2, 16 2101 2102 vst vr13, t2, 32 //16-23 2103 vst vr13, t2, 48 //24-31 2104 vst vr13, t2, 64 //32-39 2105 vst vr13, t2, 80 //40-47 2106 vst vr13, t2, 96 //48-55 2107 vst vr13, t2, 112 //56-63 2108 2109 //64-66 2110 vstelm.w vr13, t2, 128, 0 2111 vstelm.h vr13, t2, 132, 2 2112 2113 ld.bu t7, a2, 7 2114 st.b t7, t2, 134 2115 2116 b .Z1_UEDGE_END 2117 2118.Z1_UEDGE_W8: 2119 andi t6, a3, 8 2120 beqz t6, .Z1_UEDGE_W16 2121.Z1_UEDGE_W8_H4: 2122 andi t6, a4, 4 2123 beqz t6, .Z1_UEDGE_W8_H8 2124 2125 //0-7 2126 vld vr7, a2, -1 2127 z1_upsample_edge_data_init1 2128 vst vr13, t2, 0 2129 2130 //8-15 2131 vld vr7, a2, 7 2132 vbsrl.v vr11, vr7, 1 2133 vbsrl.v vr12, vr7, 2 
2134 vextrins.b vr12, vr12, 0x32 2135 vbsrl.v vr13, vr7, 3 2136 vextrins.b vr13, vr13, 0x21 2137 vextrins.b vr13, vr13, 0x31 2138 z1_upsample_edge_calc_loop 2139 vstelm.w vr13, t2, 16, 0 2140 vstelm.h vr13, t2, 20, 2 2141 2142 ld.bu t7, a2, 11 2143 st.b t7, t2, 22 2144 b .Z1_UEDGE_END 2145 2146.Z1_UEDGE_W8_H8: 2147 andi t6, a4, 8 2148 beqz t6, .Z1_UEDGE_W8_H16 2149 2150 //0-7 2151 vld vr7, a2, -1 2152 z1_upsample_edge_data_init1 2153 vst vr13, t2, 0 2154 2155 //8-14 2156 vld vr7, a2, 7 2157 vbsrl.v vr11, vr7, 1 2158 vbsrl.v vr12, vr7, 2 2159 vextrins.b vr12, vr12, 0x76 2160 vbsrl.v vr13, vr7, 3 2161 z1_upsample_edge_calc_loop 2162 fst.d f13, t2, 16 2163 vstelm.w vr13, t2, 24, 2 2164 vstelm.h vr13, t2, 28, 6 2165 2166 ld.bu t7, a2, 15 2167 st.b t7, t2, 30 2168 b .Z1_UEDGE_END 2169 2170.Z1_UEDGE_W8_H16: 2171 andi t6, a4, 16 2172 beqz t6, .Z1_UEDGE_W8_H32 2173 2174 //0-7 2175 vld vr7, a2, -1 2176 z1_upsample_edge_data_init1 2177 vst vr13, t2, 0 2178 2179 //8-15 2180 vld vr7, a2, 7 2181 z1_upsample_edge_data_init2 2182 vst vr13, t2, 16 2183 2184 //16-22 2185 vldrepl.b vr7, a2, 15 2186 z1_upsample_edge_calc_other 2187 fst.d f13, t2, 32 2188 vstelm.w vr13, t2, 40, 2 2189 vstelm.h vr13, t2, 44, 6 2190 2191 ld.bu t7, a2, 15 2192 st.b t7, t2, 46 2193 b .Z1_UEDGE_END 2194 2195.Z1_UEDGE_W8_H32: 2196 andi t6, a4, 32 2197 beqz t6, .Z1_UEDGE_W8_H64 2198 2199 //0-7 2200 vld vr7, a2, -1 2201 z1_upsample_edge_data_init1 2202 vst vr13, t2, 0 2203 2204 //8-15 2205 vld vr7, a2, 7 2206 z1_upsample_edge_data_init2 2207 vst vr13, t2, 16 2208 2209 //16-23 2210 vldrepl.b vr7, a2, 15 2211 z1_upsample_edge_calc_other 2212 vst vr13, t2, 32 2213 2214 vst vr13, t2, 48 //24-31 2215 2216 //32-38 2217 fst.d f13, t2, 64 2218 vstelm.w vr13, t2, 72, 2 2219 vstelm.h vr13, t2, 76, 6 2220 2221 ld.bu t7, a2, 15 2222 st.b t7, t2, 78 2223 b .Z1_UEDGE_END 2224 2225.Z1_UEDGE_W8_H64: 2226 //0-7 2227 vld vr7, a2, -1 2228 z1_upsample_edge_data_init1 2229 vst vr13, t2, 0 2230 2231 //8-15 2232 vld vr7, a2, 7 2233 
z1_upsample_edge_data_init2 2234 vst vr13, t2, 16 2235 2236 //16-23 2237 vldrepl.b vr7, a2, 15 2238 z1_upsample_edge_calc_other 2239 vst vr13, t2, 32 2240 2241 vst vr13, t2, 48 //24-31 2242 vst vr13, t2, 64 //32-39 2243 vst vr13, t2, 80 //40-47 2244 vst vr13, t2, 96 //48-55 2245 vst vr13, t2, 112 //56-63 2246 2247 //64-70 2248 fst.d f13, t2, 128 2249 vstelm.w vr13, t2, 136, 2 2250 vstelm.h vr13, t2, 140, 6 2251 2252 ld.bu t7, a2, 15 2253 st.b t7, t2, 142 2254 b .Z1_UEDGE_END 2255 2256.Z1_UEDGE_W16: 2257 andi t6, a3, 16 2258 beqz t6, .Z1_UEDGE_W32 2259.Z1_UEDGE_W16_H4: 2260 andi t6, a4, 4 2261 beqz t6, .Z1_UEDGE_W16_H8 2262 2263 //0-7 2264 vld vr7, a2, -1 2265 z1_upsample_edge_data_init1 2266 vst vr13, t2, 0 2267 2268 //8-15 2269 vld vr7, a2, 7 2270 z1_upsample_edge_data_init1 2271 vst vr13, t2, 16 2272 2273 //16-18 2274 vld vr7, a2, 15 2275 z1_upsample_edge_data_init1 2276 vstelm.w vr13, t2, 32, 0 2277 vstelm.h vr13, t2, 36, 2 2278 2279 ld.bu t7, a2, 19 2280 st.b t7, t2, 38 2281 b .Z1_UEDGE_END 2282 2283.Z1_UEDGE_W16_H8: 2284 andi t6, a4, 8 2285 beqz t6, .Z1_UEDGE_W16_H16 2286 2287 //0-7 2288 vld vr7, a2, -1 2289 z1_upsample_edge_data_init1 2290 vst vr13, t2, 0 2291 2292 //8-15 2293 vld vr7, a2, 7 2294 z1_upsample_edge_data_init1 2295 vst vr13, t2, 16 2296 2297 //16-22 2298 vld vr7, a2, 15 2299 vbsrl.v vr11, vr7, 1 2300 vbsrl.v vr12, vr7, 2 2301 vextrins.b vr12, vr12, 0x76 2302 vbsrl.v vr13, vr7, 3 2303 z1_upsample_edge_calc_loop 2304 fst.d f13, t2, 32 2305 vstelm.w vr13, t2, 40, 2 2306 vstelm.h vr13, t2, 44, 6 2307 2308 ld.bu t7, a2, 23 2309 st.b t7, t2, 46 2310 b .Z1_UEDGE_END 2311 2312.Z1_UEDGE_W16_H16: 2313 andi t6, a4, 16 2314 beqz t6, .Z1_UEDGE_W16_H32 2315 2316 //0-7 2317 vld vr7, a2, -1 2318 z1_upsample_edge_data_init1 2319 vst vr13, t2, 0 2320 2321 //8-15 2322 vld vr7, a2, 7 2323 z1_upsample_edge_data_init1 2324 vst vr13, t2, 16 2325 2326 //16-23 2327 vld vr7, a2, 15 2328 z1_upsample_edge_data_init1 2329 vst vr13, t2, 32 2330 2331 //24-30 2332 vld vr7, a2, 
23 2333 vbsrl.v vr11, vr7, 1 2334 vbsrl.v vr12, vr7, 2 2335 vextrins.b vr12, vr12, 0x76 2336 vbsrl.v vr13, vr7, 3 2337 z1_upsample_edge_calc_loop 2338 fst.d f13, t2, 48 2339 vstelm.w vr13, t2, 56, 2 2340 vstelm.h vr13, t2, 60, 6 2341 2342 ld.bu t7, a2, 31 2343 st.b t7, t2, 62 2344 b .Z1_UEDGE_END 2345 2346.Z1_UEDGE_W16_H32: 2347 andi t6, a4, 32 2348 beqz t6, .Z1_UEDGE_W16_H64 2349 2350 //0-7 2351 vld vr7, a2, -1 2352 z1_upsample_edge_data_init1 2353 vst vr13, t2, 0 2354 2355 //8-15 2356 vld vr7, a2, 7 2357 z1_upsample_edge_data_init1 2358 vst vr13, t2, 16 2359 2360 //16-23 2361 vld vr7, a2, 15 2362 z1_upsample_edge_data_init1 2363 vst vr13, t2, 32 2364 2365 //24-31 2366 vld vr7, a2, 23 2367 z1_upsample_edge_data_init2 2368 vst vr13, t2, 48 2369 2370 //32-39 2371 vldrepl.b vr7, a2, 31 2372 z1_upsample_edge_calc_other 2373 vst vr13, t2, 64 2374 2375 //40-46 2376 fst.d f13, t2, 80 2377 vstelm.w vr13, t2, 88, 2 2378 vstelm.h vr13, t2, 92, 6 2379 2380 ld.bu t7, a2, 31 2381 st.b t7, t2, 94 2382 b .Z1_UEDGE_END 2383 2384.Z1_UEDGE_W16_H64: 2385 //0-7 2386 vld vr7, a2, -1 2387 z1_upsample_edge_data_init1 2388 vst vr13, t2, 0 2389 2390 //8-15 2391 vld vr7, a2, 7 2392 z1_upsample_edge_data_init1 2393 vst vr13, t2, 16 2394 2395 //16-23 2396 vld vr7, a2, 15 2397 z1_upsample_edge_data_init1 2398 vst vr13, t2, 32 2399 2400 //24-31 2401 vld vr7, a2, 23 2402 z1_upsample_edge_data_init2 2403 vst vr13, t2, 48 2404 2405 //32-39 2406 vldrepl.b vr7, a2, 31 2407 z1_upsample_edge_calc_other 2408 vst vr13, t2, 64 2409 2410 vst vr13, t2, 80 //40-47 2411 vst vr13, t2, 96 //48-55 2412 vst vr13, t2, 112 //56-63 2413 vst vr13, t2, 128 //64-71 2414 2415 //72-78 2416 fst.d f13, t2, 144 2417 vstelm.w vr13, t2, 152, 2 2418 vstelm.h vr13, t2, 156, 6 2419 2420 ld.bu t7, a2, 31 2421 st.b t7, t2, 158 2422 b .Z1_UEDGE_END 2423 2424.Z1_UEDGE_W32: 2425 andi t6, a3, 32 2426 beqz t6, .Z1_UEDGE_W64 2427.Z1_UEDGE_W32_H8: 2428 andi t6, a4, 8 2429 beqz t6, .Z1_UEDGE_W32_H16 2430 2431 //0-7 2432 vld vr7, a2, -1 
2433 z1_upsample_edge_data_init1 2434 vst vr13, t2, 0 2435 2436 //8-15 2437 vld vr7, a2, 7 2438 z1_upsample_edge_data_init1 2439 vst vr13, t2, 16 2440 2441 //16-23 2442 vld vr7, a2, 15 2443 z1_upsample_edge_data_init1 2444 vst vr13, t2, 32 2445 2446 //24-31 2447 vld vr7, a2, 23 2448 z1_upsample_edge_data_init1 2449 vst vr13, t2, 48 2450 2451 //32-38 2452 vld vr7, a2, 31 2453 vbsrl.v vr11, vr7, 1 2454 vbsrl.v vr12, vr7, 2 2455 vextrins.b vr12, vr12, 0x76 2456 vbsrl.v vr13, vr7, 3 2457 z1_upsample_edge_calc_loop 2458 fst.d f13, t2, 64 2459 vstelm.w vr13, t2, 72, 2 2460 vstelm.h vr13, t2, 76, 6 2461 2462 ld.bu t7, a2, 39 2463 st.b t7, t2, 78 2464 b .Z1_UEDGE_END 2465 2466.Z1_UEDGE_W32_H16: 2467 andi t6, a4, 16 2468 beqz t6, .Z1_UEDGE_W32_H32 2469 2470 //0-7 2471 vld vr7, a2, -1 2472 z1_upsample_edge_data_init1 2473 vst vr13, t2, 0 2474 2475 //8-15 2476 vld vr7, a2, 7 2477 z1_upsample_edge_data_init1 2478 vst vr13, t2, 16 2479 2480 //16-23 2481 vld vr7, a2, 15 2482 z1_upsample_edge_data_init1 2483 vst vr13, t2, 32 2484 2485 //24-31 2486 vld vr7, a2, 23 2487 z1_upsample_edge_data_init1 2488 vst vr13, t2, 48 2489 2490 //32-39 2491 vld vr7, a2, 31 2492 z1_upsample_edge_data_init1 2493 vst vr13, t2, 64 2494 2495 //40-46 2496 vld vr7, a2, 39 2497 vbsrl.v vr11, vr7, 1 2498 vbsrl.v vr12, vr7, 2 2499 vextrins.b vr12, vr12, 0x76 2500 vbsrl.v vr13, vr7, 3 2501 z1_upsample_edge_calc_loop 2502 fst.d f13, t2, 80 2503 vstelm.w vr13, t2, 88, 2 2504 vstelm.h vr13, t2, 92, 6 2505 2506 ld.bu t7, a2, 47 2507 st.b t7, t2, 94 2508 b .Z1_UEDGE_END 2509 2510.Z1_UEDGE_W32_H32: 2511 andi t6, a4, 32 2512 beqz t6, .Z1_UEDGE_W32_H64 2513 2514 //0-7 2515 vld vr7, a2, -1 2516 z1_upsample_edge_data_init1 2517 vst vr13, t2, 0 2518 2519 //8-15 2520 vld vr7, a2, 7 2521 z1_upsample_edge_data_init1 2522 vst vr13, t2, 16 2523 2524 //16-23 2525 vld vr7, a2, 15 2526 z1_upsample_edge_data_init1 2527 vst vr13, t2, 32 2528 2529 //24-31 2530 vld vr7, a2, 23 2531 z1_upsample_edge_data_init1 2532 vst vr13, t2, 48 
2533 2534 //32-39 2535 vld vr7, a2, 31 2536 z1_upsample_edge_data_init1 2537 vst vr13, t2, 64 2538 2539 //40-47 2540 vld vr7, a2, 39 2541 z1_upsample_edge_data_init1 2542 vst vr13, t2, 80 2543 2544 //48-55 2545 vld vr7, a2, 47 2546 z1_upsample_edge_data_init1 2547 vst vr13, t2, 96 2548 2549 //56-62 2550 vld vr7, a2, 55 2551 vbsrl.v vr11, vr7, 1 2552 vbsrl.v vr12, vr7, 2 2553 vextrins.b vr12, vr12, 0x76 2554 vbsrl.v vr13, vr7, 3 2555 z1_upsample_edge_calc_loop 2556 fst.d f13, t2, 112 2557 vstelm.w vr13, t2, 120, 2 2558 vstelm.h vr13, t2, 124, 6 2559 2560 ld.bu t7, a2, 63 2561 st.b t7, t2, 126 2562 b .Z1_UEDGE_END 2563 2564.Z1_UEDGE_W32_H64: 2565 //0-7 2566 vld vr7, a2, -1 2567 z1_upsample_edge_data_init1 2568 vst vr13, t2, 0 2569 2570 //8-15 2571 vld vr7, a2, 7 2572 z1_upsample_edge_data_init1 2573 vst vr13, t2, 16 2574 2575 //16-23 2576 vld vr7, a2, 15 2577 z1_upsample_edge_data_init1 2578 vst vr13, t2, 32 2579 2580 //24-31 2581 vld vr7, a2, 23 2582 z1_upsample_edge_data_init1 2583 vst vr13, t2, 48 2584 2585 //32-39 2586 vld vr7, a2, 31 2587 z1_upsample_edge_data_init1 2588 vst vr13, t2, 64 2589 2590 //40-47 2591 vld vr7, a2, 39 2592 z1_upsample_edge_data_init1 2593 vst vr13, t2, 80 2594 2595 //48-55 2596 vld vr7, a2, 47 2597 z1_upsample_edge_data_init1 2598 vst vr13, t2, 96 2599 2600 //56-63 2601 vld vr7, a2, 55 2602 z1_upsample_edge_data_init2 2603 vst vr13, t2, 112 2604 2605 //64-71 2606 vldrepl.b vr7, a2, 63 2607 z1_upsample_edge_calc_other 2608 vst vr13, t2, 128 2609 2610 vst vr13, t2, 144 //72-79 2611 vst vr13, t2, 160 //80-87 2612 2613 //88-94 2614 fst.d f13, t2, 176 2615 vstelm.w vr13, t2, 184, 2 2616 vstelm.h vr13, t2, 188, 6 2617 2618 ld.bu t7, a2, 63 2619 st.b t7, t2, 190 2620 b .Z1_UEDGE_END 2621 2622.Z1_UEDGE_W64: 2623.Z1_UEDGE_W64_H16: 2624 andi t6, a4, 16 2625 beqz t6, .Z1_UEDGE_W64_H32 2626 2627 //0-7 2628 vld vr7, a2, -1 2629 z1_upsample_edge_data_init1 2630 vst vr13, t2, 0 2631 2632 //8-15 2633 vld vr7, a2, 7 2634 z1_upsample_edge_data_init1 2635 
vst vr13, t2, 16 2636 2637 //16-23 2638 vld vr7, a2, 15 2639 z1_upsample_edge_data_init1 2640 vst vr13, t2, 32 2641 2642 //24-31 2643 vld vr7, a2, 23 2644 z1_upsample_edge_data_init1 2645 vst vr13, t2, 48 2646 2647 //32-39 2648 vld vr7, a2, 31 2649 z1_upsample_edge_data_init1 2650 vst vr13, t2, 64 2651 2652 //40-47 2653 vld vr7, a2, 39 2654 z1_upsample_edge_data_init1 2655 vst vr13, t2, 80 2656 2657 //48-55 2658 vld vr7, a2, 47 2659 z1_upsample_edge_data_init1 2660 vst vr13, t2, 96 2661 2662 //56-63 2663 vld vr7, a2, 55 2664 z1_upsample_edge_data_init1 2665 vst vr13, t2, 112 2666 2667 //64-71 2668 vld vr7, a2, 63 2669 z1_upsample_edge_data_init1 2670 vst vr13, t2, 128 2671 2672 //72-78 2673 vld vr7, a2, 71 2674 z1_upsample_edge_data_init2 2675 fst.d f13, t2, 144 2676 vstelm.w vr13, t2, 152, 2 2677 vstelm.h vr13, t2, 156, 6 2678 2679 ld.bu t7, a2, 79 2680 st.b t7, t2, 158 2681 b .Z1_UEDGE_END 2682 2683.Z1_UEDGE_W64_H32: 2684 andi t6, a4, 32 2685 beqz t6, .Z1_UEDGE_W64_H64 2686 2687 //0-7 2688 vld vr7, a2, -1 2689 z1_upsample_edge_data_init1 2690 vst vr13, t2, 0 2691 2692 //8-15 2693 vld vr7, a2, 7 2694 z1_upsample_edge_data_init1 2695 vst vr13, t2, 16 2696 2697 //16-23 2698 vld vr7, a2, 15 2699 z1_upsample_edge_data_init1 2700 vst vr13, t2, 32 2701 2702 //24-31 2703 vld vr7, a2, 23 2704 z1_upsample_edge_data_init1 2705 vst vr13, t2, 48 2706 2707 //32-39 2708 vld vr7, a2, 31 2709 z1_upsample_edge_data_init1 2710 vst vr13, t2, 64 2711 2712 //40-47 2713 vld vr7, a2, 39 2714 z1_upsample_edge_data_init1 2715 vst vr13, t2, 80 2716 2717 //48-55 2718 vld vr7, a2, 47 2719 z1_upsample_edge_data_init1 2720 vst vr13, t2, 96 2721 2722 //56-63 2723 vld vr7, a2, 55 2724 z1_upsample_edge_data_init1 2725 vst vr13, t2, 112 2726 2727 //64-71 2728 vld vr7, a2, 63 2729 z1_upsample_edge_data_init1 2730 vst vr13, t2, 128 2731 2732 //72-79 2733 vld vr7, a2, 71 2734 z1_upsample_edge_data_init1 2735 vst vr13, t2, 144 2736 2737 //80-87 2738 vld vr7, a2, 79 2739 z1_upsample_edge_data_init1 
2740 vst vr13, t2, 160 2741 2742 //88-94 2743 vld vr7, a2, 87 2744 z1_upsample_edge_data_init2 2745 fst.d f13, t2, 176 2746 vstelm.w vr13, t2, 184, 2 2747 vstelm.h vr13, t2, 188, 6 2748 2749 ld.bu t7, a2, 95 2750 st.b t7, t2, 190 2751 b .Z1_UEDGE_END 2752 2753.Z1_UEDGE_W64_H64: 2754 //0-7 2755 vld vr7, a2, -1 2756 z1_upsample_edge_data_init1 2757 vst vr13, t2, 0 2758 2759 //8-15 2760 vld vr7, a2, 7 2761 z1_upsample_edge_data_init1 2762 vst vr13, t2, 16 2763 2764 //16-23 2765 vld vr7, a2, 15 2766 z1_upsample_edge_data_init1 2767 vst vr13, t2, 32 2768 2769 //24-31 2770 vld vr7, a2, 23 2771 z1_upsample_edge_data_init1 2772 vst vr13, t2, 48 2773 2774 //32-39 2775 vld vr7, a2, 31 2776 z1_upsample_edge_data_init1 2777 vst vr13, t2, 64 2778 2779 //40-47 2780 vld vr7, a2, 39 2781 z1_upsample_edge_data_init1 2782 vst vr13, t2, 80 2783 2784 //48-55 2785 vld vr7, a2, 47 2786 z1_upsample_edge_data_init1 2787 vst vr13, t2, 96 2788 2789 //56-63 2790 vld vr7, a2, 55 2791 z1_upsample_edge_data_init1 2792 vst vr13, t2, 112 2793 2794 //64-71 2795 vld vr7, a2, 63 2796 z1_upsample_edge_data_init1 2797 vst vr13, t2, 128 2798 2799 //72-79 2800 vld vr7, a2, 71 2801 z1_upsample_edge_data_init1 2802 vst vr13, t2, 144 2803 2804 //80-87 2805 vld vr7, a2, 79 2806 z1_upsample_edge_data_init1 2807 vst vr13, t2, 160 2808 2809 //88-95 2810 vld vr7, a2, 87 2811 z1_upsample_edge_data_init1 2812 vst vr13, t2, 176 2813 2814 //96-103 2815 vld vr7, a2, 95 2816 z1_upsample_edge_data_init1 2817 vst vr13, t2, 192 2818 2819 //104-111 2820 vld vr7, a2, 103 2821 z1_upsample_edge_data_init1 2822 vst vr13, t2, 208 2823 2824 //112-119 2825 vld vr7, a2, 111 2826 z1_upsample_edge_data_init1 2827 vst vr13, t2, 224 2828 2829 //120-126 2830 vld vr7, a2, 119 2831 z1_upsample_edge_data_init2 2832 fst.d f13, t2, 240 2833 vstelm.w vr13, t2, 248, 2 2834 vstelm.h vr13, t2, 252, 6 2835 2836 ld.bu t7, a2, 127 2837 st.b t7, t2, 254 2838 b .Z1_UEDGE_END 2839 2840.Z1_UEDGE_END: 2841 //upsample_edge end 2842 2843 or a7, t2, t2 
//top 2844 add.d t0, a3, a4 2845 slli.d t0, t0, 1 2846 addi.d t0, t0, -2 //max_base_x 2847 slli.d t1, t1, 1 2848 b .IPRED_Z1_UA_END 2849 2850.IPRED_Z1_NOTUA: 2851 or t5, zero, zero //upsample_above=0 2852 beqz a7, .IPRED_Z1_NOTFS 2853 add.d a7, a3, a4 //w+h 2854 li.w t4, 90 2855 sub.d t4, t4, a5 2856 // ipred_get_filter_strength a6:filter_strength 2857 beqz a6, .Z1_GETFS20 2858.Z1_GETFS10: //wh<=8 2859 addi.d t6, a7, -8 2860 blt zero, t6, .Z1_GETFS11 2861 addi.d t6, t4, -64 2862 blt t6, zero, .Z1_GETFS101 2863 ori a6, zero, 2 2864 b .Z1_GETFS40 2865.Z1_GETFS101: 2866 addi.d t6, t4, -40 2867 blt t6, zero, .Z1_GETFS30 2868 ori a6, zero, 1 2869 b .Z1_GETFS40 2870.Z1_GETFS11: //wh<=16 2871 addi.d t6, a7, -16 2872 blt zero, t6, .Z1_GETFS12 2873 addi.d t6, t4, -48 2874 blt t6, zero, .Z1_GETFS111 2875 ori a6, zero, 2 2876 b .Z1_GETFS40 2877.Z1_GETFS111: 2878 addi.d t6, t4, -20 2879 blt t6, zero, .Z1_GETFS30 2880 ori a6, zero, 1 2881 b .Z1_GETFS40 2882.Z1_GETFS12: //wh<=24 2883 addi.d t6, a7, -24 2884 blt zero, t6, .Z1_GETFS13 2885 addi.d t6, t4, -4 2886 blt t6, zero, .Z1_GETFS30 2887 ori a6, zero, 3 2888 b .Z1_GETFS40 2889.Z1_GETFS13: 2890 ori a6, zero, 3 2891 b .Z1_GETFS40 2892 2893.Z1_GETFS20: //wh<=8 2894 addi.d t6, a7, -8 2895 blt zero, t6, .Z1_GETFS21 2896 addi.d t6, t4, -56 2897 blt t6, zero, .Z1_GETFS30 2898 ori a6, zero, 1 2899 b .Z1_GETFS40 2900.Z1_GETFS21: //wh<=16 2901 addi.d t6, a7, -16 2902 blt zero, t6, .Z1_GETFS22 2903 addi.d t6, t4, -40 2904 blt t6, zero, .Z1_GETFS30 2905 ori a6, zero, 1 2906 b .Z1_GETFS40 2907.Z1_GETFS22: //wh<=24 2908 addi.d t6, a7, -24 2909 blt zero, t6, .Z1_GETFS23 2910 addi.d t6, t4, -32 2911 blt t6, zero, .Z1_GETFS221 2912 ori a6, zero, 3 2913 b .Z1_GETFS40 2914.Z1_GETFS221: 2915 addi.d t6, t4, -16 2916 blt t6, zero, .Z1_GETFS222 2917 ori a6, zero, 2 2918 b .Z1_GETFS40 2919.Z1_GETFS222: 2920 addi.d t6, t4, -8 2921 blt t6, zero, .Z1_GETFS30 2922 ori a6, zero, 1 2923 b .Z1_GETFS40 2924.Z1_GETFS23: //wh<=32 2925 addi.d t6, a7, -32 2926 
blt zero, t6, .Z1_GETFS24 2927 addi.d t6, t4, -32 2928 blt t6, zero, .Z1_GETFS231 2929 ori a6, zero, 3 2930 b .Z1_GETFS40 2931.Z1_GETFS231: 2932 addi.d t6, t4, -4 2933 blt t6, zero, .Z1_GETFS232 2934 ori a6, zero, 2 2935 b .Z1_GETFS40 2936.Z1_GETFS232: 2937 ori a6, zero, 1 2938 b .Z1_GETFS40 2939.Z1_GETFS24: 2940 ori a6, zero, 3 2941 b .Z1_GETFS40 2942.Z1_GETFS30: 2943 or a6, zero, zero 2944.Z1_GETFS40: 2945 2946 beqz a6, .IPRED_Z1_NOTFS 2947 2948.IPRED_Z1_IFFS: 2949 // filter_edge 2950 addi.d a6, a6, -1 2951 slli.d a6, a6, 4 2952 la.local t0, ipred_filter_edge_kernel1 2953 vldx vr1, t0, a6 //kernel[0-3] 2954 2955 la.local t0, ipred_filter_edge_kernel2 2956 vldx vr6, t0, a6 //kernel[4] 2957 2958.IPRED_Z1_FS_W4: 2959 andi t0, a3, 4 2960 beqz t0, .IPRED_Z1_FS_W8 2961.IPRED_Z1_FS_W4_H4: 2962 andi t0, a4, 4 2963 beqz t0, .IPRED_Z1_FS_W4_H8 2964 2965 //0-7 2966 vld vr7, a2, -1 2967 z1_filter_edge_data_init4 2968 vbsrl.v vr13, vr7, 3 2969 vextrins.b vr13, vr13, 0x65 2970 vextrins.b vr13, vr13, 0x75 2971 z1_filter_edge_calc_loop2 2972 fst.d f12, t2, 0 2973 b .IPRED_Z1_FS_END 2974 2975.IPRED_Z1_FS_W4_H8: 2976 andi t0, a4, 8 2977 beqz t0, .IPRED_Z1_FS_W4_H16 2978 2979 //0-7 2980 vld vr7, a2, -1 2981 z1_filter_edge_data_init4 2982 vbsrl.v vr13, vr7, 3 2983 vextrins.b vr13, vr13, 0x65 2984 vextrins.b vr13, vr13, 0x75 2985 z1_filter_edge_calc_loop2 2986 fst.d f12, t2, 0 2987 2988 //8-11 2989 vreplvei.b vr10, vr7, 8 2990 vextrins.b vr10, vr7, 0x07 2991 z1_filter_edge_calc_other 2992 fst.s f12, t2, 8 2993 2994 b .IPRED_Z1_FS_END 2995 2996.IPRED_Z1_FS_W4_H16: 2997 andi t0, a4, 16 2998 beqz t0, .IPRED_Z1_FS_W4_H32 2999 3000 //0-7 3001 vld vr7, a2, -1 3002 z1_filter_edge_data_init4 3003 vbsrl.v vr13, vr7, 3 3004 vextrins.b vr13, vr13, 0x65 3005 vextrins.b vr13, vr13, 0x75 3006 z1_filter_edge_calc_loop2 3007 fst.d f12, t2, 0 3008 3009 //8-15 3010 vreplvei.b vr10, vr7, 8 3011 vextrins.b vr10, vr7, 0x07 3012 z1_filter_edge_calc_other 3013 fst.d f12, t2, 8 3014 3015 //16-19 3016 
vreplvei.b vr12, vr12, 1 3017 fst.s f12, t2, 16 3018 3019 b .IPRED_Z1_FS_END 3020 3021.IPRED_Z1_FS_W4_H32: 3022 andi t0, a4, 32 3023 beqz t0, .IPRED_Z1_FS_W4_H64 3024 3025 //0-7 3026 vld vr7, a2, -1 3027 z1_filter_edge_data_init4 3028 vbsrl.v vr13, vr7, 3 3029 vextrins.b vr13, vr13, 0x65 3030 vextrins.b vr13, vr13, 0x75 3031 z1_filter_edge_calc_loop2 3032 fst.d f12, t2, 0 3033 3034 //8-15 3035 vreplvei.b vr10, vr7, 8 3036 vextrins.b vr10, vr7, 0x07 3037 z1_filter_edge_calc_other 3038 fst.d f12, t2, 8 3039 3040 //16-23 3041 vreplvei.b vr12, vr12, 1 3042 fst.d f12, t2, 16 3043 3044 fst.d f12, t2, 24 //24-31 3045 fst.s f12, t2, 32 //32-35 3046 3047 b .IPRED_Z1_FS_END 3048 3049.IPRED_Z1_FS_W4_H64: 3050 //0-7 3051 vld vr7, a2, -1 3052 z1_filter_edge_data_init4 3053 vbsrl.v vr13, vr7, 3 3054 vextrins.b vr13, vr13, 0x65 3055 vextrins.b vr13, vr13, 0x75 3056 z1_filter_edge_calc_loop2 3057 fst.d f12, t2, 0 3058 3059 //8-15 3060 vreplvei.b vr10, vr7, 8 3061 vextrins.b vr10, vr7, 0x07 3062 z1_filter_edge_calc_other 3063 fst.d f12, t2, 8 3064 3065 //16-23 3066 vreplvei.b vr12, vr12, 1 3067 fst.d f12, t2, 16 3068 3069 fst.d f12, t2, 24 //24-31 3070 fst.d f12, t2, 32 //32-39 3071 fst.d f12, t2, 40 //40-47 3072 fst.d f12, t2, 48 //48-55 3073 fst.d f12, t2, 56 //56-63 3074 fst.s f12, t2, 64 //64-67 3075 3076 b .IPRED_Z1_FS_END 3077 3078.IPRED_Z1_FS_W8: 3079 andi t0, a3, 8 3080 beqz t0, .IPRED_Z1_FS_W16 3081.IPRED_Z1_FS_W8_H4: 3082 andi t0, a4, 4 3083 beqz t0, .IPRED_Z1_FS_W8_H8 3084 3085 //0-7 3086 vld vr7, a2, -1 3087 z1_filter_edge_data_init1 3088 vbsrl.v vr13, vr7, 3 3089 z1_filter_edge_calc_loop2 3090 fst.d f12, t2, 0 3091 3092 //8-11 3093 vld vr7, a2, 6 3094 vbsrl.v vr11, vr7, 1 3095 vbsrl.v vr12, vr7, 2 3096 vbsrl.v vr13, vr7, 3 3097 vextrins.b vr13, vr13, 0x32 3098 vsllwil.hu.bu vr10, vr7, 0 3099 vsllwil.hu.bu vr11, vr11, 0 3100 vsllwil.hu.bu vr12, vr12, 0 3101 vsllwil.hu.bu vr13, vr13, 0 3102 z1_filter_edge_calc_loop1 3103 3104 vbsrl.v vr13, vr7, 4 3105 vextrins.b vr13, 
vr13, 0x21 3106 vextrins.b vr13, vr13, 0x31 3107 z1_filter_edge_calc_loop2 3108 fst.s f12, t2, 8 3109 b .IPRED_Z1_FS_END 3110 3111.IPRED_Z1_FS_W8_H8: 3112 andi t0, a4, 8 3113 beqz t0, .IPRED_Z1_FS_W8_H16 3114 3115 //0-7 3116 vld vr7, a2, -1 3117 z1_filter_edge_data_init1 3118 vbsrl.v vr13, vr7, 3 3119 z1_filter_edge_calc_loop2 3120 fst.d f12, t2, 0 3121 3122 //8-15 3123 vld vr7, a2, 6 3124 z1_filter_edge_data_init3 3125 vbsrl.v vr13, vr7, 4 3126 vextrins.b vr13, vr13, 0x65 3127 vextrins.b vr13, vr13, 0x75 3128 z1_filter_edge_calc_loop2 3129 fst.d f12, t2, 8 3130 b .IPRED_Z1_FS_END 3131 3132.IPRED_Z1_FS_W8_H16: 3133 andi t0, a4, 16 3134 beqz t0, .IPRED_Z1_FS_W8_H32 3135 3136 //0-7 3137 vld vr7, a2, -1 3138 z1_filter_edge_data_init1 3139 vbsrl.v vr13, vr7, 3 3140 z1_filter_edge_calc_loop2 3141 fst.d f12, t2, 0 3142 3143 //8-15 3144 vld vr7, a2, 6 3145 z1_filter_edge_data_init3 3146 vbsrl.v vr13, vr7, 4 3147 vextrins.b vr13, vr13, 0x65 3148 vextrins.b vr13, vr13, 0x75 3149 z1_filter_edge_calc_loop2 3150 fst.d f12, t2, 8 3151 3152 //16-23 3153 vreplvei.b vr10, vr7, 9 3154 vextrins.b vr10, vr7, 0x08 3155 z1_filter_edge_calc_other 3156 fst.d f12, t2, 16 3157 3158 b .IPRED_Z1_FS_END 3159 3160.IPRED_Z1_FS_W8_H32: 3161 andi t0, a4, 32 3162 beqz t0, .IPRED_Z1_FS_W8_H64 3163 3164 //0-7 3165 vld vr7, a2, -1 3166 z1_filter_edge_data_init1 3167 vbsrl.v vr13, vr7, 3 3168 z1_filter_edge_calc_loop2 3169 fst.d f12, t2, 0 3170 3171 //8-15 3172 vld vr7, a2, 6 3173 z1_filter_edge_data_init3 3174 vbsrl.v vr13, vr7, 4 3175 vextrins.b vr13, vr13, 0x65 3176 vextrins.b vr13, vr13, 0x75 3177 z1_filter_edge_calc_loop2 3178 fst.d f12, t2, 8 3179 3180 //16-23 3181 vreplvei.b vr10, vr7, 9 3182 vextrins.b vr10, vr7, 0x08 3183 z1_filter_edge_calc_other 3184 fst.d f12, t2, 16 3185 3186 //24-31 3187 vreplvei.b vr12, vr12, 1 3188 fst.d f12, t2, 24 3189 3190 //32-39 3191 fst.d f12, t2, 32 3192 3193 b .IPRED_Z1_FS_END 3194 3195.IPRED_Z1_FS_W8_H64: 3196 //0-7 3197 vld vr7, a2, -1 3198 
z1_filter_edge_data_init1 3199 vbsrl.v vr13, vr7, 3 3200 z1_filter_edge_calc_loop2 3201 fst.d f12, t2, 0 3202 3203 //8-15 3204 vld vr7, a2, 6 3205 z1_filter_edge_data_init3 3206 vbsrl.v vr13, vr7, 4 3207 vextrins.b vr13, vr13, 0x65 3208 vextrins.b vr13, vr13, 0x75 3209 z1_filter_edge_calc_loop2 3210 fst.d f12, t2, 8 3211 3212 //16-23 3213 vreplvei.b vr10, vr7, 9 3214 vextrins.b vr10, vr7, 0x08 3215 z1_filter_edge_calc_other 3216 fst.d f12, t2, 16 3217 3218 //24-31 3219 vreplvei.b vr12, vr12, 1 3220 fst.d f12, t2, 24 3221 3222 fst.d f12, t2, 32 //32-39 3223 fst.d f12, t2, 40 //40-47 3224 fst.d f12, t2, 48 //48-55 3225 fst.d f12, t2, 56 //56-63 3226 fst.d f12, t2, 64 //64-71 3227 3228 b .IPRED_Z1_FS_END 3229 3230.IPRED_Z1_FS_W16: 3231 andi t0, a3, 16 3232 beqz t0, .IPRED_Z1_FS_W32 3233.IPRED_Z1_FS_W16_H4: 3234 andi t0, a4, 4 3235 beqz t0, .IPRED_Z1_FS_W16_H8 3236 3237 //0-7 3238 vld vr7, a2, -1 3239 z1_filter_edge_data_init1 3240 vbsrl.v vr13, vr7, 3 3241 z1_filter_edge_calc_loop2 3242 fst.d f12, t2, 0 3243 3244 //8-15 3245 vld vr7, a2, 6 3246 z1_filter_edge_data_init2 3247 vbsrl.v vr13, vr7, 4 3248 z1_filter_edge_calc_loop2 3249 fst.d f12, t2, 8 3250 3251 //16-19 3252 vld vr7, a2, 14 3253 vbsrl.v vr11, vr7, 1 3254 vbsrl.v vr12, vr7, 2 3255 vbsrl.v vr13, vr7, 3 3256 vextrins.b vr13, vr13, 0x32 3257 vsllwil.hu.bu vr10, vr7, 0 3258 vsllwil.hu.bu vr11, vr11, 0 3259 vsllwil.hu.bu vr12, vr12, 0 3260 vsllwil.hu.bu vr13, vr13, 0 3261 z1_filter_edge_calc_loop1 3262 3263 vbsrl.v vr13, vr7, 4 3264 vextrins.b vr13, vr13, 0x21 3265 vextrins.b vr13, vr13, 0x31 3266 z1_filter_edge_calc_loop2 3267 fst.s f12, t2, 16 3268 b .IPRED_Z1_FS_END 3269 3270.IPRED_Z1_FS_W16_H8: 3271 andi t0, a4, 8 3272 beqz t0, .IPRED_Z1_FS_W16_H16 3273 3274 //0-7 3275 vld vr7, a2, -1 3276 z1_filter_edge_data_init1 3277 vbsrl.v vr13, vr7, 3 3278 z1_filter_edge_calc_loop2 3279 fst.d f12, t2, 0 3280 3281 //8-15 3282 vld vr7, a2, 6 3283 z1_filter_edge_data_init2 3284 vbsrl.v vr13, vr7, 4 3285 
z1_filter_edge_calc_loop2 3286 fst.d f12, t2, 8 3287 3288 //16-23 3289 vld vr7, a2, 14 3290 z1_filter_edge_data_init3 3291 vbsrl.v vr13, vr7, 4 3292 vextrins.b vr13, vr13, 0x65 3293 vextrins.b vr13, vr13, 0x75 3294 z1_filter_edge_calc_loop2 3295 fst.d f12, t2, 16 3296 b .IPRED_Z1_FS_END 3297 3298.IPRED_Z1_FS_W16_H16: 3299 andi t0, a4, 16 3300 beqz t0, .IPRED_Z1_FS_W16_H32 3301 3302 //0-7 3303 vld vr7, a2, -1 3304 z1_filter_edge_data_init1 3305 vbsrl.v vr13, vr7, 3 3306 z1_filter_edge_calc_loop2 3307 fst.d f12, t2, 0 3308 3309 //8-15 3310 vld vr7, a2, 6 3311 z1_filter_edge_data_init2 3312 vbsrl.v vr13, vr7, 4 3313 z1_filter_edge_calc_loop2 3314 fst.d f12, t2, 8 3315 3316 //16-23 3317 vld vr7, a2, 14 3318 z1_filter_edge_data_init2 3319 vbsrl.v vr13, vr7, 4 3320 z1_filter_edge_calc_loop2 3321 fst.d f12, t2, 16 3322 3323 //24-31 3324 vld vr7, a2, 22 3325 z1_filter_edge_data_init3 3326 vbsrl.v vr13, vr7, 4 3327 vextrins.b vr13, vr13, 0x65 3328 vextrins.b vr13, vr13, 0x75 3329 z1_filter_edge_calc_loop2 3330 fst.d f12, t2, 24 3331 b .IPRED_Z1_FS_END 3332 3333.IPRED_Z1_FS_W16_H32: 3334 andi t0, a4, 32 3335 beqz t0, .IPRED_Z1_FS_W16_H64 3336 3337 //0-7 3338 vld vr7, a2, -1 3339 z1_filter_edge_data_init1 3340 vbsrl.v vr13, vr7, 3 3341 z1_filter_edge_calc_loop2 3342 fst.d f12, t2, 0 3343 3344 //8-15 3345 vld vr7, a2, 6 3346 z1_filter_edge_data_init2 3347 vbsrl.v vr13, vr7, 4 3348 z1_filter_edge_calc_loop2 3349 fst.d f12, t2, 8 3350 3351 //16-23 3352 vld vr7, a2, 14 3353 z1_filter_edge_data_init2 3354 vbsrl.v vr13, vr7, 4 3355 z1_filter_edge_calc_loop2 3356 fst.d f12, t2, 16 3357 3358 //24-31 3359 vld vr7, a2, 22 3360 z1_filter_edge_data_init3 3361 vbsrl.v vr13, vr7, 4 3362 vextrins.b vr13, vr13, 0x65 3363 vextrins.b vr13, vr13, 0x75 3364 z1_filter_edge_calc_loop2 3365 fst.d f12, t2, 24 3366 3367 //32-39 3368 vreplvei.b vr10, vr7, 9 3369 vextrins.b vr10, vr7, 0x08 3370 z1_filter_edge_calc_other 3371 fst.d f12, t2, 32 3372 3373 //40-47 3374 vreplvei.b vr12, vr12, 1 3375 fst.d 
f12, t2, 40 3376 3377 b .IPRED_Z1_FS_END 3378 3379.IPRED_Z1_FS_W16_H64: 3380 //0-7 3381 vld vr7, a2, -1 3382 z1_filter_edge_data_init1 3383 vbsrl.v vr13, vr7, 3 3384 z1_filter_edge_calc_loop2 3385 fst.d f12, t2, 0 3386 3387 //8-15 3388 vld vr7, a2, 6 3389 z1_filter_edge_data_init2 3390 vbsrl.v vr13, vr7, 4 3391 z1_filter_edge_calc_loop2 3392 fst.d f12, t2, 8 3393 3394 //16-23 3395 vld vr7, a2, 14 3396 z1_filter_edge_data_init2 3397 vbsrl.v vr13, vr7, 4 3398 z1_filter_edge_calc_loop2 3399 fst.d f12, t2, 16 3400 3401 //24-31 3402 vld vr7, a2, 22 3403 z1_filter_edge_data_init3 3404 vbsrl.v vr13, vr7, 4 3405 vextrins.b vr13, vr13, 0x65 3406 vextrins.b vr13, vr13, 0x75 3407 z1_filter_edge_calc_loop2 3408 fst.d f12, t2, 24 3409 3410 //32-39 3411 vreplvei.b vr10, vr7, 9 3412 vextrins.b vr10, vr7, 0x08 3413 z1_filter_edge_calc_other 3414 fst.d f12, t2, 32 3415 3416 //40-47 3417 vreplvei.b vr12, vr12, 1 3418 fst.d f12, t2, 40 3419 3420 fst.d f12, t2, 48 //48-55 3421 fst.d f12, t2, 56 //56-63 3422 fst.d f12, t2, 64 //64-71 3423 fst.d f12, t2, 72 //72-81 3424 3425 b .IPRED_Z1_FS_END 3426 3427.IPRED_Z1_FS_W32: 3428 andi t0, a3, 32 3429 beqz t0, .IPRED_Z1_FS_W64 3430.IPRED_Z1_FS_W32_H8: 3431 andi t0, a4, 8 3432 beqz t0, .IPRED_Z1_FS_W32_H16 3433 3434 //0-7 3435 vld vr7, a2, -1 3436 z1_filter_edge_data_init1 3437 vbsrl.v vr13, vr7, 3 3438 z1_filter_edge_calc_loop2 3439 fst.d f12, t2, 0 3440 3441 //8-15 3442 vld vr7, a2, 6 3443 z1_filter_edge_data_init2 3444 vbsrl.v vr13, vr7, 4 3445 z1_filter_edge_calc_loop2 3446 fst.d f12, t2, 8 3447 3448 //16-23 3449 vld vr7, a2, 14 3450 z1_filter_edge_data_init2 3451 vbsrl.v vr13, vr7, 4 3452 z1_filter_edge_calc_loop2 3453 fst.d f12, t2, 16 3454 3455 //24-31 3456 vld vr7, a2, 22 3457 z1_filter_edge_data_init2 3458 vbsrl.v vr13, vr7, 4 3459 z1_filter_edge_calc_loop2 3460 fst.d f12, t2, 24 3461 3462 //32-39 3463 vld vr7, a2, 30 3464 z1_filter_edge_data_init3 3465 vbsrl.v vr13, vr7, 4 3466 vextrins.b vr13, vr13, 0x65 3467 vextrins.b vr13, vr13, 
0x75 3468 z1_filter_edge_calc_loop2 3469 fst.d f12, t2, 32 3470 3471 b .IPRED_Z1_FS_END 3472 3473.IPRED_Z1_FS_W32_H16: 3474 andi t0, a4, 16 3475 beqz t0, .IPRED_Z1_FS_W32_H32 3476 3477 //0-7 3478 vld vr7, a2, -1 3479 z1_filter_edge_data_init1 3480 vbsrl.v vr13, vr7, 3 3481 z1_filter_edge_calc_loop2 3482 fst.d f12, t2, 0 3483 3484 //8-15 3485 vld vr7, a2, 6 3486 z1_filter_edge_data_init2 3487 vbsrl.v vr13, vr7, 4 3488 z1_filter_edge_calc_loop2 3489 fst.d f12, t2, 8 3490 3491 //16-23 3492 vld vr7, a2, 14 3493 z1_filter_edge_data_init2 3494 3495 vbsrl.v vr13, vr7, 4 3496 z1_filter_edge_calc_loop2 3497 fst.d f12, t2, 16 3498 3499 //24-31 3500 vld vr7, a2, 22 3501 z1_filter_edge_data_init2 3502 vbsrl.v vr13, vr7, 4 3503 z1_filter_edge_calc_loop2 3504 fst.d f12, t2, 24 3505 3506 //32-39 3507 vld vr7, a2, 30 3508 z1_filter_edge_data_init2 3509 vbsrl.v vr13, vr7, 4 3510 z1_filter_edge_calc_loop2 3511 fst.d f12, t2, 32 3512 3513 //40-47 3514 vld vr7, a2, 38 3515 z1_filter_edge_data_init3 3516 vbsrl.v vr13, vr7, 4 3517 vextrins.b vr13, vr13, 0x65 3518 vextrins.b vr13, vr13, 0x75 3519 z1_filter_edge_calc_loop2 3520 fst.d f12, t2, 40 3521 3522 b .IPRED_Z1_FS_END 3523 3524.IPRED_Z1_FS_W32_H32: 3525 andi t0, a4, 32 3526 beqz t0, .IPRED_Z1_FS_W32_H64 3527 3528 //0-7 3529 vld vr7, a2, -1 3530 z1_filter_edge_data_init1 3531 vbsrl.v vr13, vr7, 3 3532 z1_filter_edge_calc_loop2 3533 fst.d f12, t2, 0 3534 3535 //8-15 3536 vld vr7, a2, 6 3537 z1_filter_edge_data_init2 3538 vbsrl.v vr13, vr7, 4 3539 z1_filter_edge_calc_loop2 3540 fst.d f12, t2, 8 3541 3542 //16-23 3543 vld vr7, a2, 14 3544 z1_filter_edge_data_init2 3545 vbsrl.v vr13, vr7, 4 3546 z1_filter_edge_calc_loop2 3547 fst.d f12, t2, 16 3548 3549 //24-31 3550 vld vr7, a2, 22 3551 z1_filter_edge_data_init2 3552 vbsrl.v vr13, vr7, 4 3553 z1_filter_edge_calc_loop2 3554 fst.d f12, t2, 24 3555 3556 //32-39 3557 vld vr7, a2, 30 3558 z1_filter_edge_data_init2 3559 vbsrl.v vr13, vr7, 4 3560 z1_filter_edge_calc_loop2 3561 fst.d f12, t2, 32 
3562 3563 //40-47 3564 vld vr7, a2, 38 3565 z1_filter_edge_data_init2 3566 vbsrl.v vr13, vr7, 4 3567 z1_filter_edge_calc_loop2 3568 fst.d f12, t2, 40 3569 3570 //48-55 3571 vld vr7, a2, 46 3572 z1_filter_edge_data_init2 3573 vbsrl.v vr13, vr7, 4 3574 z1_filter_edge_calc_loop2 3575 fst.d f12, t2, 48 3576 3577 //56-63 3578 vld vr7, a2, 54 3579 z1_filter_edge_data_init3 3580 vbsrl.v vr13, vr7, 4 3581 vextrins.b vr13, vr13, 0x65 3582 vextrins.b vr13, vr13, 0x75 3583 z1_filter_edge_calc_loop2 3584 fst.d f12, t2, 56 3585 3586 b .IPRED_Z1_FS_END 3587 3588.IPRED_Z1_FS_W32_H64: 3589 //0-7 3590 vld vr7, a2, -1 3591 z1_filter_edge_data_init1 3592 vbsrl.v vr13, vr7, 3 3593 z1_filter_edge_calc_loop2 3594 fst.d f12, t2, 0 3595 3596 //8-15 3597 vld vr7, a2, 6 3598 z1_filter_edge_data_init2 3599 vbsrl.v vr13, vr7, 4 3600 z1_filter_edge_calc_loop2 3601 fst.d f12, t2, 8 3602 3603 //16-23 3604 vld vr7, a2, 14 3605 z1_filter_edge_data_init2 3606 vbsrl.v vr13, vr7, 4 3607 z1_filter_edge_calc_loop2 3608 fst.d f12, t2, 16 3609 3610 //24-31 3611 vld vr7, a2, 22 3612 z1_filter_edge_data_init2 3613 vbsrl.v vr13, vr7, 4 3614 z1_filter_edge_calc_loop2 3615 fst.d f12, t2, 24 3616 3617 //32-39 3618 vld vr7, a2, 30 3619 z1_filter_edge_data_init2 3620 vbsrl.v vr13, vr7, 4 3621 z1_filter_edge_calc_loop2 3622 fst.d f12, t2, 32 3623 3624 //40-47 3625 vld vr7, a2, 38 3626 z1_filter_edge_data_init2 3627 vbsrl.v vr13, vr7, 4 3628 z1_filter_edge_calc_loop2 3629 fst.d f12, t2, 40 3630 3631 //48-55 3632 vld vr7, a2, 46 3633 z1_filter_edge_data_init2 3634 vbsrl.v vr13, vr7, 4 3635 z1_filter_edge_calc_loop2 3636 fst.d f12, t2, 48 3637 3638 //56-63 3639 vld vr7, a2, 54 3640 z1_filter_edge_data_init3 3641 vbsrl.v vr13, vr7, 4 3642 vextrins.b vr13, vr13, 0x65 3643 vextrins.b vr13, vr13, 0x75 3644 z1_filter_edge_calc_loop2 3645 fst.d f12, t2, 56 3646 3647 //64-71 3648 vreplvei.b vr10, vr7, 9 3649 vextrins.b vr10, vr7, 0x08 3650 z1_filter_edge_calc_other 3651 fst.d f12, t2, 64 3652 3653 //72-89 3654 vreplvei.b 
vr12, vr12, 1 3655 fst.d f12, t2, 72 3656 3657 fst.d f12, t2, 80 //80-87 3658 fst.d f12, t2, 88 //88-95 3659 3660 b .IPRED_Z1_FS_END 3661 3662.IPRED_Z1_FS_W64: 3663.IPRED_Z1_FS_W64_H16: 3664 andi t0, a4, 16 3665 beqz t0, .IPRED_Z1_FS_W64_H32 3666 3667 //0-7 3668 vld vr7, a2, -1 3669 z1_filter_edge_data_init1 3670 vbsrl.v vr13, vr7, 3 3671 z1_filter_edge_calc_loop2 3672 fst.d f12, t2, 0 3673 3674 //8-15 3675 vld vr7, a2, 6 3676 z1_filter_edge_data_init2 3677 vbsrl.v vr13, vr7, 4 3678 z1_filter_edge_calc_loop2 3679 fst.d f12, t2, 8 3680 3681 //16-23 3682 vld vr7, a2, 14 3683 z1_filter_edge_data_init2 3684 vbsrl.v vr13, vr7, 4 3685 z1_filter_edge_calc_loop2 3686 fst.d f12, t2, 16 3687 3688 //24-31 3689 vld vr7, a2, 22 3690 z1_filter_edge_data_init2 3691 vbsrl.v vr13, vr7, 4 3692 z1_filter_edge_calc_loop2 3693 fst.d f12, t2, 24 3694 3695 //32-39 3696 vld vr7, a2, 30 3697 z1_filter_edge_data_init2 3698 vbsrl.v vr13, vr7, 4 3699 z1_filter_edge_calc_loop2 3700 fst.d f12, t2, 32 3701 3702 //40-47 3703 vld vr7, a2, 38 3704 z1_filter_edge_data_init2 3705 vbsrl.v vr13, vr7, 4 3706 z1_filter_edge_calc_loop2 3707 fst.d f12, t2, 40 3708 3709 //48-55 3710 vld vr7, a2, 46 3711 z1_filter_edge_data_init2 3712 vbsrl.v vr13, vr7, 4 3713 z1_filter_edge_calc_loop2 3714 fst.d f12, t2, 48 3715 3716 //56-63 3717 vld vr7, a2, 54 3718 z1_filter_edge_data_init2 3719 vbsrl.v vr13, vr7, 4 3720 z1_filter_edge_calc_loop2 3721 fst.d f12, t2, 56 3722 3723 //64-71 3724 vld vr7, a2, 62 3725 z1_filter_edge_data_init2 3726 vbsrl.v vr13, vr7, 4 3727 z1_filter_edge_calc_loop2 3728 fst.d f12, t2, 64 3729 3730 //72-79 3731 vld vr7, a2, 70 3732 z1_filter_edge_data_init3 3733 vbsrl.v vr13, vr7, 4 3734 vextrins.b vr13, vr13, 0x65 3735 vextrins.b vr13, vr13, 0x75 3736 z1_filter_edge_calc_loop2 3737 fst.d f12, t2, 72 3738 3739 b .IPRED_Z1_FS_END 3740 3741.IPRED_Z1_FS_W64_H32: 3742 andi t0, a4, 32 3743 beqz t0, .IPRED_Z1_FS_W64_H64 3744 3745 //0-7 3746 vld vr7, a2, -1 3747 z1_filter_edge_data_init1 3748 vbsrl.v 
vr13, vr7, 3 3749 z1_filter_edge_calc_loop2 3750 fst.d f12, t2, 0 3751 3752 //8-15 3753 vld vr7, a2, 6 3754 z1_filter_edge_data_init2 3755 vbsrl.v vr13, vr7, 4 3756 z1_filter_edge_calc_loop2 3757 fst.d f12, t2, 8 3758 3759 //16-23 3760 vld vr7, a2, 14 3761 z1_filter_edge_data_init2 3762 vbsrl.v vr13, vr7, 4 3763 z1_filter_edge_calc_loop2 3764 fst.d f12, t2, 16 3765 3766 //24-31 3767 vld vr7, a2, 22 3768 z1_filter_edge_data_init2 3769 vbsrl.v vr13, vr7, 4 3770 z1_filter_edge_calc_loop2 3771 fst.d f12, t2, 24 3772 3773 //32-39 3774 vld vr7, a2, 30 3775 z1_filter_edge_data_init2 3776 vbsrl.v vr13, vr7, 4 3777 z1_filter_edge_calc_loop2 3778 fst.d f12, t2, 32 3779 3780 //40-47 3781 vld vr7, a2, 38 3782 z1_filter_edge_data_init2 3783 vbsrl.v vr13, vr7, 4 3784 z1_filter_edge_calc_loop2 3785 fst.d f12, t2, 40 3786 3787 //48-55 3788 vld vr7, a2, 46 3789 z1_filter_edge_data_init2 3790 vbsrl.v vr13, vr7, 4 3791 z1_filter_edge_calc_loop2 3792 fst.d f12, t2, 48 3793 3794 //56-63 3795 vld vr7, a2, 54 3796 z1_filter_edge_data_init2 3797 vbsrl.v vr13, vr7, 4 3798 z1_filter_edge_calc_loop2 3799 fst.d f12, t2, 56 3800 3801 //64-71 3802 vld vr7, a2, 62 3803 z1_filter_edge_data_init2 3804 vbsrl.v vr13, vr7, 4 3805 z1_filter_edge_calc_loop2 3806 fst.d f12, t2, 64 3807 3808 //72-79 3809 vld vr7, a2, 70 3810 z1_filter_edge_data_init2 3811 vbsrl.v vr13, vr7, 4 3812 z1_filter_edge_calc_loop2 3813 fst.d f12, t2, 72 3814 3815 //80-87 3816 vld vr7, a2, 78 3817 z1_filter_edge_data_init2 3818 vbsrl.v vr13, vr7, 4 3819 z1_filter_edge_calc_loop2 3820 fst.d f12, t2, 80 3821 3822 //88-95 3823 vld vr7, a2, 86 3824 z1_filter_edge_data_init3 3825 vbsrl.v vr13, vr7, 4 3826 vextrins.b vr13, vr13, 0x65 3827 vextrins.b vr13, vr13, 0x75 3828 z1_filter_edge_calc_loop2 3829 fst.d f12, t2, 88 3830 3831 b .IPRED_Z1_FS_END 3832 3833.IPRED_Z1_FS_W64_H64: 3834 //0-7 3835 vld vr7, a2, -1 3836 z1_filter_edge_data_init1 3837 vbsrl.v vr13, vr7, 3 3838 z1_filter_edge_calc_loop2 3839 fst.d f12, t2, 0 3840 3841 //8-15 
3842 vld vr7, a2, 6 3843 z1_filter_edge_data_init2 3844 vbsrl.v vr13, vr7, 4 3845 z1_filter_edge_calc_loop2 3846 fst.d f12, t2, 8 3847 3848 //16-23 3849 vld vr7, a2, 14 3850 z1_filter_edge_data_init2 3851 vbsrl.v vr13, vr7, 4 3852 z1_filter_edge_calc_loop2 3853 fst.d f12, t2, 16 3854 3855 //24-31 3856 vld vr7, a2, 22 3857 z1_filter_edge_data_init2 3858 vbsrl.v vr13, vr7, 4 3859 z1_filter_edge_calc_loop2 3860 fst.d f12, t2, 24 3861 3862 //32-39 3863 vld vr7, a2, 30 3864 z1_filter_edge_data_init2 3865 vbsrl.v vr13, vr7, 4 3866 z1_filter_edge_calc_loop2 3867 fst.d f12, t2, 32 3868 3869 //40-47 3870 vld vr7, a2, 38 3871 z1_filter_edge_data_init2 3872 vbsrl.v vr13, vr7, 4 3873 z1_filter_edge_calc_loop2 3874 fst.d f12, t2, 40 3875 3876 //48-55 3877 vld vr7, a2, 46 3878 z1_filter_edge_data_init2 3879 vbsrl.v vr13, vr7, 4 3880 z1_filter_edge_calc_loop2 3881 fst.d f12, t2, 48 3882 3883 //56-63 3884 vld vr7, a2, 54 3885 z1_filter_edge_data_init2 3886 vbsrl.v vr13, vr7, 4 3887 z1_filter_edge_calc_loop2 3888 fst.d f12, t2, 56 3889 3890 //64-71 3891 vld vr7, a2, 62 3892 z1_filter_edge_data_init2 3893 vbsrl.v vr13, vr7, 4 3894 z1_filter_edge_calc_loop2 3895 fst.d f12, t2, 64 3896 3897 //72-79 3898 vld vr7, a2, 70 3899 z1_filter_edge_data_init2 3900 vbsrl.v vr13, vr7, 4 3901 z1_filter_edge_calc_loop2 3902 fst.d f12, t2, 72 3903 3904 //80-87 3905 vld vr7, a2, 78 3906 z1_filter_edge_data_init2 3907 vbsrl.v vr13, vr7, 4 3908 z1_filter_edge_calc_loop2 3909 fst.d f12, t2, 80 3910 3911 //88-95 3912 vld vr7, a2, 86 3913 z1_filter_edge_data_init2 3914 vbsrl.v vr13, vr7, 4 3915 z1_filter_edge_calc_loop2 3916 fst.d f12, t2, 88 3917 3918 //96-103 3919 vld vr7, a2, 94 3920 z1_filter_edge_data_init2 3921 vbsrl.v vr13, vr7, 4 3922 z1_filter_edge_calc_loop2 3923 fst.d f12, t2, 96 3924 3925 //104-111 3926 vld vr7, a2, 102 3927 z1_filter_edge_data_init2 3928 vbsrl.v vr13, vr7, 4 3929 z1_filter_edge_calc_loop2 3930 fst.d f12, t2, 104 3931 3932 //112-119 3933 vld vr7, a2, 110 3934 
z1_filter_edge_data_init2 3935 vbsrl.v vr13, vr7, 4 3936 z1_filter_edge_calc_loop2 3937 fst.d f12, t2, 112 3938 3939 //120-127 3940 vld vr7, a2, 118 3941 z1_filter_edge_data_init3 3942 vbsrl.v vr13, vr7, 4 3943 vextrins.b vr13, vr13, 0x65 3944 vextrins.b vr13, vr13, 0x75 3945 z1_filter_edge_calc_loop2 3946 fst.d f12, t2, 120 3947 3948.IPRED_Z1_FS_END: 3949 addi.d t0, a7, -1 //max_base_x 3950 or a7, t2, t2 //top 3951 b .IPRED_Z1_UA_END 3952 3953.IPRED_Z1_NOTFS: 3954 or a7, a2, a2 //top 3955 // imin_gr 3956 blt a3, a4, .Z1_IMIN1 3957 or t0, a4, a4 3958 b .Z1_IMIN2 3959.Z1_IMIN1: 3960 or t0, a3, a3 3961.Z1_IMIN2: 3962 3963 add.d t0, a3, t0 3964 addi.d t0, t0, -1 //max_base_x 3965 3966.IPRED_Z1_UA_END: 3967 //st dst, t1:dx a2 a6 t6 t7 3968 beqz t5, .Z1_UA0 3969 3970 li.w a5, 64 3971 vreplgr2vr.h vr0, a5 3972 vsrai.h vr7, vr0, 1 3973 or t2, zero, zero //y 3974 or t3, t1, t1 //xpos 3975.Z1_LOOPY: 3976 andi t4, t3, 0x3e //frac 3977 vreplgr2vr.h vr1, t4 3978 vsub.h vr2, vr0, vr1 3979 or a6, zero, zero //x 3980 or a2, zero, zero //base_num 3981 srai.d t6, t3, 6 //base 3982 3983 or t7, t6, t6 3984 bge t7, t0, .Z1_LOOPX 3985.Z1_BASENUM: 3986 addi.d a2, a2, 1 3987 addi.d t7, t7, 2 3988 blt t7, t0, .Z1_BASENUM 3989 3990.Z1_LOOPX: 3991 blt a2, a3, .Z1_LOOPX_BASEMAX 3992 3993 srai.d t8, a3, 3 //loop param 3994 beqz t8, .Z1_LOOPX_W4 3995.Z1_LOOPX_W8: 3996 add.d t5, a7, t6 3997 vld vr3, t5, 0 3998 vpickev.b vr5, vr3, vr3 //0 2 4 6... 3999 vpickod.b vr6, vr3, vr3 //1 3 5 7... 4000 vsllwil.hu.bu vr5, vr5, 0 4001 vsllwil.hu.bu vr6, vr6, 0 4002 4003 vmul.h vr3, vr5, vr2 4004 vmadd.h vr3, vr6, vr1 4005 vadd.h vr3, vr3, vr7 4006 vsrai.h vr3, vr3, 6 4007 vsrlni.b.h vr3, vr3, 0 4008 fstx.d f3, a0, a6 4009 4010 addi.d a6, a6, 8 4011 addi.d t6, t6, 16 4012 addi.d t8, t8, -1 4013 bnez t8, .Z1_LOOPX_W8 4014 b .Z1_LOOPY_END 4015.Z1_LOOPX_W4: 4016 vldx vr3, a7, t6 4017 vsllwil.hu.bu vr3, vr3, 0 4018 vpickev.h vr5, vr3, vr3 //0 2 4 6... 4019 vpickod.h vr6, vr3, vr3 //1 3 5 7... 
4020 4021 vmul.h vr3, vr5, vr2 4022 vmadd.h vr3, vr6, vr1 4023 vadd.h vr3, vr3, vr7 4024 vsrai.h vr3, vr3, 6 4025 vsrlni.b.h vr3, vr3, 0 4026 fstx.s f3, a0, a6 4027 b .Z1_LOOPY_END 4028.Z1_LOOPX_BASEMAX: 4029 srai.d t8, a2, 3 //loop param 4030 beqz t8, .Z1_LOOPX_BASEMAX4 4031.Z1_LOOPX_BASEMAX8: 4032 add.d t5, a7, t6 4033 vld vr3, t5, 0 4034 vpickev.b vr5, vr3, vr3 //0 2 4 6... 4035 vpickod.b vr6, vr3, vr3 //1 3 5 7... 4036 vsllwil.hu.bu vr5, vr5, 0 4037 vsllwil.hu.bu vr6, vr6, 0 4038 4039 vmul.h vr3, vr5, vr2 4040 vmadd.h vr3, vr6, vr1 4041 vadd.h vr3, vr3, vr7 4042 vsrai.h vr3, vr3, 6 4043 vsrlni.b.h vr3, vr3, 0 4044 fstx.d f3, a0, a6 4045 4046 addi.d a6, a6, 8 4047 addi.d t6, t6, 16 4048 addi.d t8, t8, -1 4049 bnez t8, .Z1_LOOPX_BASEMAX8 4050.Z1_LOOPX_BASEMAX4: 4051 andi t8, a2, 4 4052 beqz t8, .Z1_LOOPX_BASEMAX2 4053 4054 vldx vr3, a7, t6 4055 vsllwil.hu.bu vr3, vr3, 0 4056 vpickev.h vr5, vr3, vr3 //0 2 4 6... 4057 vpickod.h vr6, vr3, vr3 //1 3 5 7... 4058 4059 vmul.h vr3, vr5, vr2 4060 vmadd.h vr3, vr6, vr1 4061 vadd.h vr3, vr3, vr7 4062 vsrai.h vr3, vr3, 6 4063 vsrlni.b.h vr3, vr3, 0 4064 fstx.s f3, a0, a6 4065 4066 addi.d a6, a6, 4 4067 addi.d t6, t6, 8 4068.Z1_LOOPX_BASEMAX2: 4069 andi t8, a2, 2 4070 beqz t8, .Z1_LOOPX_BASEMAX1 4071 4072 vldx vr3, a7, t6 4073 vsllwil.hu.bu vr3, vr3, 0 4074 vpickev.h vr5, vr3, vr3 //0 2 4 6... 4075 vpickod.h vr6, vr3, vr3 //1 3 5 7... 
4076 4077 vmul.h vr3, vr5, vr2 4078 vmadd.h vr3, vr6, vr1 4079 vadd.h vr3, vr3, vr7 4080 vsrai.h vr3, vr3, 6 4081 vsrlni.b.h vr3, vr3, 0 4082 vpickve2gr.bu t7, vr3, 0 4083 vpickve2gr.bu t8, vr3, 1 4084 stx.b t7, a0, a6 4085 addi.d a6, a6, 1 4086 stx.b t8, a0, a6 4087 addi.d a6, a6, 1 4088 addi.d t6, t6, 4 4089.Z1_LOOPX_BASEMAX1: 4090 andi t8, a2, 1 4091 beqz t8, .Z1_LOOPX_BASEMAX_MSET 4092 4093 add.d a2, a7, t6 4094 sub.d t7, a5, t4 4095 ld.bu t8, a2, 0 4096 mul.w t7, t7, t8 4097 ld.bu t8, a2, 1 4098 mul.w t8, t8, t4 4099 add.d t7, t7, t8 4100 addi.d t7, t7, 32 4101 srai.d t7, t7, 6 4102 stx.b t7, a0, a6 4103 4104 addi.d a6, a6, 1 4105.Z1_LOOPX_BASEMAX_MSET: //memset 4106 add.d t6, a0, a6 //dst 4107 add.d t7, a7, t0 //src 4108 sub.d a2, a3, a6 //size 4109 pixel_set_8bpc_allw t6, t7, a2, t8, t4 4110.Z1_LOOPY_END: 4111 addi.d t2, t2, 1 4112 add.d a0, a0, a1 4113 add.d t3, t3, t1 4114 blt t2, a4, .Z1_LOOPY 4115 b .Z1_END 4116 4117.Z1_UA0: 4118 li.w a5, 64 4119 vreplgr2vr.h vr0, a5 4120 vsrai.h vr7, vr0, 1 4121 or t2, zero, zero //y 4122 or t3, t1, t1 //xpos 4123.Z1_UA0_LOOPY: 4124 andi t4, t3, 0x3e //frac 4125 vreplgr2vr.h vr1, t4 4126 vsub.h vr2, vr0, vr1 4127 or a6, zero, zero //x 4128 srai.d t6, t3, 6 //base 4129 4130 sub.d a2, t0, t6 //a2:base_num 4131 blt a2, zero, .Z1_UA0_BASENUM 4132 b .Z1_UA0_LOOPX 4133.Z1_UA0_BASENUM: 4134 or a2, zero, zero 4135 4136.Z1_UA0_LOOPX: 4137 blt a2, a3, .Z1_UA0_LOOPX_BASEMAX 4138 4139 srai.d t8, a3, 3 //loop param 4140 beqz t8, .Z1_UA0_LOOPX_W4 4141.Z1_UA0_LOOPX_W8: 4142 add.d t5, a7, t6 4143 vld vr5, t5, 0 4144 vld vr6, t5, 1 4145 vsllwil.hu.bu vr5, vr5, 0 4146 vsllwil.hu.bu vr6, vr6, 0 4147 4148 vmul.h vr3, vr5, vr2 4149 vmadd.h vr3, vr6, vr1 4150 vadd.h vr3, vr3, vr7 4151 vsrai.h vr3, vr3, 6 4152 vsrlni.b.h vr3, vr3, 0 4153 fstx.d f3, a0, a6 4154 4155 addi.d a6, a6, 8 4156 addi.d t6, t6, 8 4157 addi.d t8, t8, -1 4158 bnez t8, .Z1_UA0_LOOPX_W8 4159 b .Z1_UA0_LOOPY_END 4160.Z1_UA0_LOOPX_W4: 4161 vldx vr5, a7, t6 4162 vsllwil.hu.bu 
vr5, vr5, 0 4163 vbsrl.v vr6, vr5, 2 4164 4165 vmul.h vr3, vr5, vr2 4166 vmadd.h vr3, vr6, vr1 4167 vadd.h vr3, vr3, vr7 4168 vsrai.h vr3, vr3, 6 4169 vsrlni.b.h vr3, vr3, 0 4170 fstx.s f3, a0, a6 4171 b .Z1_UA0_LOOPY_END 4172.Z1_UA0_LOOPX_BASEMAX: 4173 srai.d t8, a2, 3 //loop param 4174 beqz t8, .Z1_UA0_LOOPX_BASEMAX4 4175.Z1_UA0_LOOPX_BASEMAX8: 4176 add.d t5, a7, t6 4177 vld vr5, t5, 0 4178 vld vr6, t5, 1 4179 vsllwil.hu.bu vr5, vr5, 0 4180 vsllwil.hu.bu vr6, vr6, 0 4181 4182 vmul.h vr3, vr5, vr2 4183 vmadd.h vr3, vr6, vr1 4184 vadd.h vr3, vr3, vr7 4185 vsrai.h vr3, vr3, 6 4186 vsrlni.b.h vr3, vr3, 0 4187 fstx.d f3, a0, a6 4188 4189 addi.d a6, a6, 8 4190 addi.d t6, t6, 8 4191 addi.d t8, t8, -1 4192 bnez t8, .Z1_UA0_LOOPX_BASEMAX8 4193.Z1_UA0_LOOPX_BASEMAX4: 4194 andi t8, a2, 4 4195 beqz t8, .Z1_UA0_LOOPX_BASEMAX2 4196 4197 vldx vr5, a7, t6 4198 vsllwil.hu.bu vr5, vr5, 0 4199 vbsrl.v vr6, vr5, 2 4200 4201 vmul.h vr3, vr5, vr2 4202 vmadd.h vr3, vr6, vr1 4203 vadd.h vr3, vr3, vr7 4204 vsrai.h vr3, vr3, 6 4205 vsrlni.b.h vr3, vr3, 0 4206 fstx.s f3, a0, a6 4207 4208 addi.d a6, a6, 4 4209 addi.d t6, t6, 4 4210.Z1_UA0_LOOPX_BASEMAX2: 4211 andi t8, a2, 2 4212 beqz t8, .Z1_UA0_LOOPX_BASEMAX1 4213 4214 vldx vr5, a7, t6 4215 vsllwil.hu.bu vr5, vr5, 0 4216 vbsrl.v vr6, vr5, 2 4217 4218 vmul.h vr3, vr5, vr2 4219 vmadd.h vr3, vr6, vr1 4220 vadd.h vr3, vr3, vr7 4221 vsrai.h vr3, vr3, 6 4222 vsrlni.b.h vr3, vr3, 0 4223 vpickve2gr.bu t7, vr3, 0 4224 vpickve2gr.bu t8, vr3, 1 4225 stx.b t7, a0, a6 4226 addi.d a6, a6, 1 4227 stx.b t8, a0, a6 4228 addi.d a6, a6, 1 4229 addi.d t6, t6, 2 4230.Z1_UA0_LOOPX_BASEMAX1: 4231 andi t8, a2, 1 4232 beqz t8, .Z1_UA0_LOOPX_BASEMAX_MSET 4233 4234 add.d a2, a7, t6 4235 sub.d t7, a5, t4 4236 ld.bu t8, a2, 0 4237 mul.w t7, t7, t8 4238 ld.bu t8, a2, 1 4239 mul.w t8, t8, t4 4240 add.d t7, t7, t8 4241 addi.d t7, t7, 32 4242 srai.d t7, t7, 6 4243 stx.b t7, a0, a6 4244 4245 addi.d a6, a6, 1 4246.Z1_UA0_LOOPX_BASEMAX_MSET: //memset 4247 add.d t6, a0, a6 
//dst 4248 add.d t7, a7, t0 //src 4249 sub.d a2, a3, a6 //size 4250 pixel_set_8bpc_allw t6, t7, a2, t8, t4 4251.Z1_UA0_LOOPY_END: 4252 addi.d t2, t2, 1 4253 add.d a0, a0, a1 4254 add.d t3, t3, t1 4255 blt t2, a4, .Z1_UA0_LOOPY 4256 4257.Z1_END: 4258 addi.d sp, sp, 128 4259endfunc 4260 4261