; Copyright © 2023, VideoLAN and dav1d authors
; Copyright © 2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Identity byte indices. Only the first 16 are emitted on 32-bit builds; the
; 64-bit-only AVX2/AVX-512 code below reads up to all 64 of them. The functions
; clamp these indices with pminub to build "replicate the last valid byte"
; shuffle masks.
const pb_0to63, db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
                db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
                db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
                db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
%endif
; pshufb mask for the bw=8 paths: within each 8-byte half, bytes 0-3 are kept
; and byte 3 is replicated into bytes 4-7.
pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11

; pmaddubsw multipliers: each source byte pair (a, b) becomes the word a*1 + b*16.
pb_1_16: times 4 db 1, 16
%if ARCH_X86_64
; Byte value 32 in every byte; added to the upper half of the clamp vector in
; the AVX-512 .w32 path.
pb_32: times 4 db 32
%endif

; JMP_TABLE name, w0, w1, ...
; Emits one 32-bit entry per listed width holding the offset of the function's
; .w<N> label relative to (table - 2*4), and defines <name>_table as that biased
; address. With the bias of two entries, index 2 (= tzcnt(4)) selects the first
; listed width, so the functions index the table directly with tzcnt(bw).
%macro JMP_TABLE 2-*
    %xdefine %1_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%base %+ .w%2 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64
%if ARCH_X86_64
JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64
JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
%endif

SECTION .text

;------------------------------------------------------------------------------
; pal_idx_finish(dst, src, bw, bh, w, h) -- SSSE3 version
;
; What the code below does:
;   * Dispatches on tzcnt(bw) to one of .w4/.w8/.w16/.w32/.w64; source rows
;     are bw bytes apart and destination rows bw/2 bytes apart.
;   * Every pair of source bytes is merged into one destination byte as
;     src[2x] + 16*src[2x+1] (pmaddubsw with pb_1_16, then packuswb).
;   * When w < bw, source columns >= w of each row are first replaced by the
;     byte at column w-1 (horizontal padding).
;   * After the h processed rows, the last destination row is stored
;     repeatedly while the counter bh - h stays positive (vertical padding).
;
; NOTE(review): the loops handle several rows per iteration (4 for .w4/.w8,
; 2 for .w16) and the padding loops store 4 rows (2 for .w64) at a time, so
; h and bh are presumably multiples of 4, and bw presumably a power of two in
; 4..64 -- confirm against the caller. src is read with aligned loads (mova).
;
; Register roles: m3 = (1,16) multipliers, m4 = clamped shuffle indices,
; m5 = broadcast index of the last valid byte, r6 = jump-table base (later
; reused as scratch by .setup_padh).
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_ssse3_table
    LEA             r6, pal_idx_finish_ssse3_table
    tzcnt           bwd, bwm                ; jump-table index = tzcnt(bw)
    movifnidn       bhd, bhm
    movifnidn       wd, wm
    movifnidn       hd, hm
    movsxd          bwq, [r6+bwq*4]         ; offset of the .w<bw> label
    movddup         m3, [base+pb_1_16]
    add             bwq, r6
    sub             bhd, hd                 ; bhd = bh - h = rows of vertical padding
    jmp             bwq
.w4:                                        ; 16 src bytes = 4 rows -> 8 dst bytes
    mova            m0, [srcq]
    add             srcq, 16
    pmaddubsw       m0, m3
    packuswb        m0, m0
    movq            [dstq], m0
    add             dstq, 8
    sub             hd, 4
    jg .w4
    test            bhd, bhd
    jz .w4_end
    pshuflw         m0, m0, q3333           ; replicate the last row's 2 bytes 4x
.w4_padv:
    movq            [dstq], m0
    add             dstq, 8
    sub             bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:                                   ; w < 8: replicate byte 3 of each row
    pshufb          m0, m2
    pshufb          m1, m2
    jmp .w8_main
.w8:
    mova            m2, [base+pal_idx_w8_padh]
.w8_loop:                                   ; 32 src bytes = 4 rows -> 16 dst bytes
    mova            m0, [srcq+16*0]
    mova            m1, [srcq+16*1]
    cmp             wd, 8
    jl .w8_padh
.w8_main:
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    add             srcq, 16*2
    packuswb        m0, m1
    movu            [dstq], m0
    add             dstq, 16
    sub             hd, 4
    jg .w8_loop
    test            bhd, bhd
    jz .w8_end
    pshufd          m0, m0, q3333           ; replicate the last row's 4 bytes 4x
.w8_padv:
    movu            [dstq], m0
    add             dstq, 16
    sub             bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:                                  ; clamp columns to w-1 in both rows
    pshufb          m0, m4
    pshufb          m1, m4
    jmp .w16_main
.w16:
    cmp             wd, 16
    je .w16_loop
    call .setup_padh                        ; only needed when w < bw
.w16_loop:                                  ; 32 src bytes = 2 rows -> 16 dst bytes
    mova            m0, [srcq+16*0]
    mova            m1, [srcq+16*1]
    cmp             wd, 16
    jl .w16_padh
.w16_main:
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    add             srcq, 16*2
    packuswb        m0, m1
    movu            [dstq], m0
    add             dstq, 16
    sub             hd, 2
    jg .w16_loop
    test            bhd, bhd
    jz .w16_end
    punpckhqdq      m0, m0                  ; both halves = last row (8 bytes)
.w16_padv:                                  ; 32 bytes = 4 padded rows per iteration
    movu            [dstq+16*0], m0
    movu            [dstq+16*1], m0
    add             dstq, 16*2
    sub             bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:
    cmp             wd, 16
    jg .w32_padh2
    pshufb          m1, m0, m5              ; w <= 16: second half = last valid byte
    pshufb          m0, m4
    jmp .w32_main
.w32_padh2:                                 ; 16 < w < 32: only the second half is clamped
    pshufb          m1, m4
    jmp .w32_main
.w32:
    cmp             wd, 32
    je .w32_loop
    call .setup_padh
.w32_loop:                                  ; 32 src bytes = 1 row -> 16 dst bytes
    mova            m0, [srcq+16*0]
    mova            m1, [srcq+16*1]
    cmp             wd, 32
    jl .w32_padh
.w32_main:
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    add             srcq, 16*2
    packuswb        m0, m1
    movu            [dstq], m0
    add             dstq, 16
    dec             hd
    jg .w32_loop
    test            bhd, bhd
    jz .w32_end
.w32_padv:                                  ; m0 = last row; 4 padded rows per iteration
    movu            [dstq+16*0], m0
    movu            [dstq+16*1], m0
    movu            [dstq+16*2], m0
    movu            [dstq+16*3], m0
    add             dstq, 16*4
    sub             bhd, 4
    jg .w32_padv
.w32_end:
    RET
.w64_padh:                                  ; w <= 32: src bytes 32..63 are all padding
    cmp             wd, 16
    jg .w64_padh2
    pshufb          m1, m0, m5              ; w <= 16: m1 = last valid byte everywhere
    pshufb          m0, m4
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    packuswb        m0, m1                  ; dst bytes 0..15
    packuswb        m1, m1                  ; dst bytes 16..31 (all padding)
    jmp .w64_main
.w64_padh2:                                 ; 16 < w <= 32
    pshufb          m1, m4
    pmaddubsw       m0, m3
    pmaddubsw       m2, m1, m3
    pshufb          m1, m5                  ; broadcast the last valid byte
    pmaddubsw       m1, m3
    packuswb        m0, m2                  ; dst bytes 0..15
    packuswb        m1, m1                  ; dst bytes 16..31 (all padding)
    jmp .w64_main
.w64_padh3:                                 ; 32 < w < 64; m1/m2 = src bytes 32..47/48..63
    cmp             wd, 48
    jg .w64_padh4
    pshufb          m2, m1, m5              ; w <= 48: last quarter is all padding
    pshufb          m1, m4
    jmp .w64_main2
.w64_padh4:                                 ; 48 < w < 64: clamp the last quarter only
    pshufb          m2, m4
    jmp .w64_main2
.w64:
    cmp             wd, 64
    je .w64_loop
    call .setup_padh
.w64_loop:                                  ; 64 src bytes = 1 row -> 32 dst bytes
    mova            m0, [srcq+16*0]
    mova            m1, [srcq+16*1]
    cmp             wd, 32
    jle .w64_padh
    pmaddubsw       m0, m3
    pmaddubsw       m1, m3
    packuswb        m0, m1
    mova            m1, [srcq+16*2]
    mova            m2, [srcq+16*3]
    cmp             wd, 64
    jl .w64_padh3
.w64_main2:
    pmaddubsw       m1, m3
    pmaddubsw       m2, m3
    packuswb        m1, m2
.w64_main:                                  ; m0/m1 = dst bytes 0..15/16..31 of the row
    add             srcq, 16*4
    movu            [dstq+16*0], m0
    movu            [dstq+16*1], m1
    add             dstq, 16*2
    dec             hd
    jg .w64_loop
    test            bhd, bhd
    jz .w64_end
.w64_padv:                                  ; m0:m1 = last row; 2 padded rows per iteration
    movu            [dstq+16*0], m0
    movu            [dstq+16*1], m1
    movu            [dstq+16*2], m0
    movu            [dstq+16*3], m1
    add             dstq, 16*4
    sub             bhd, 2
    jg .w64_padv
.w64_end:
    RET
; Local subroutine (reached via call, returns with a plain ret).
; Out: m5 = (w-1)&15 broadcast to all bytes,
;      m4 = min(0..15, m5), i.e. identity indices clamped to the last valid byte.
; Clobbers r6 (so `base` is no longer usable afterwards) and m0.
.setup_padh:
    mova            m4, [base+pb_0to63]
    lea             r6d, [wq-1]
    and             r6d, 15
    movd            m5, r6d
    pxor            m0, m0
    pshufb          m5, m0                  ; all-zero mask broadcasts byte 0
    pminub          m4, m5
    ret

%if ARCH_X86_64

;------------------------------------------------------------------------------
; pal_idx_finish(dst, src, bw, bh, w, h) -- AVX2 version
;
; Same operation as the SSSE3 version above: pairs of source bytes are merged
; as src[2x] + 16*src[2x+1], columns >= w are replaced by column w-1, and the
; last row is repeated while bh - h stays positive.
;
; Differences visible in the code: wd is decremented once up front, so all
; width comparisons are made against bw-1, and wd doubles as the index of the
; last valid byte. m2 = (1,16) multipliers, m3/m4 = shuffle masks.
; NOTE(review): same assumptions on bw/bh/w/h granularity as the SSSE3
; version -- confirm against the caller.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx2_table
    lea             r6, [pal_idx_finish_avx2_table]
    tzcnt           bwd, bwd                ; jump-table index = tzcnt(bw)
    movifnidn       wd, wm
    movifnidn       hd, hm
    movsxd          bwq, [r6+bwq*4]
    vpbroadcastd    m2, [base+pb_1_16]
    dec             wd                      ; wd = w - 1 from here on
    add             bwq, r6
    sub             bhd, hd                 ; bhd = bh - h = rows of vertical padding
    jmp             bwq
.w4:                                        ; 16 src bytes = 4 rows -> 8 dst bytes
    mova            xm0, [srcq]
    add             srcq, 16
    pmaddubsw       xm0, xm2
    packuswb        xm0, xm0
    movq            [dstq], xm0
    add             dstq, 8
    sub             hd, 4
    jg .w4
    test            bhd, bhd
    jz .w4_end
    pshuflw         xm0, xm0, q3333         ; replicate the last row's 2 bytes 4x
.w4_padv:
    movq            [dstq], xm0
    add             dstq, 8
    sub             bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:                                   ; w < 8: replicate byte 3 of each row
    pshufb          xm0, xm3
    pshufb          xm1, xm3
    jmp .w8_main
.w8:
    mova            xm3, [base+pal_idx_w8_padh]
.w8_loop:                                   ; 32 src bytes = 4 rows -> 16 dst bytes
    mova            xm0, [srcq+16*0]
    mova            xm1, [srcq+16*1]
    cmp             wd, 7
    jl .w8_padh
.w8_main:
    pmaddubsw       xm0, xm2
    pmaddubsw       xm1, xm2
    add             srcq, 16*2
    packuswb        xm0, xm1
    movu            [dstq], xm0
    add             dstq, 16
    sub             hd, 4
    jg .w8_loop
    test            bhd, bhd
    jz .w8_end
    pshufd          xm0, xm0, q3333         ; replicate the last row's 4 bytes 4x
.w8_padv:
    movu            [dstq], xm0
    add             dstq, 16
    sub             bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:                                  ; one row per 128-bit lane
    pshufb          m0, m3
    pshufb          m1, m3
    jmp .w16_main
.w16:
    cmp             wd, 15
    je .w16_loop
    vbroadcasti128  m0, [base+pb_0to63]
    movd            xm3, wd
    vpbroadcastb    m3, xm3
    pminub          m3, m0                  ; m3 = min(0..15, w-1) in each lane
.w16_loop:                                  ; 64 src bytes = 4 rows -> 32 dst bytes
    mova            m0, [srcq+32*0]
    mova            m1, [srcq+32*1]
    cmp             wd, 15
    jl .w16_padh
.w16_main:
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    add             srcq, 32*2
    packuswb        m0, m1                  ; qwords hold rows 0, 2, 1, 3
    vpermq          m1, m0, q3120           ; restore row order 0, 1, 2, 3
    movu            [dstq], m1
    add             dstq, 32
    sub             hd, 4
    jg .w16_loop
    test            bhd, bhd
    jz .w16_end
    vpermq          m0, m0, q3333           ; replicate the last row (qword 3) 4x
.w16_padv:
    movu            [dstq], m0
    add             dstq, 32
    sub             bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:
    cmp             wd, 15
    jg .w32_padh2
    vinserti128     m0, xm0, 1              ; w <= 16: upper lane takes its padding
    vinserti128     m1, xm1, 1              ; from the lower lane
.w32_padh2:
    pshufb          m0, m3
    pshufb          m1, m3
    jmp .w32_main
.w32:
    cmp             wd, 31
    je .w32_loop
    movd            xm3, wd
    vpbroadcastb    m3, xm3
    pminub          m3, [base+pb_0to63]     ; m3 = min(0..31, w-1)
.w32_loop:                                  ; 64 src bytes = 2 rows -> 32 dst bytes
    mova            m0, [srcq+32*0]
    mova            m1, [srcq+32*1]
    cmp             wd, 31
    jl .w32_padh
.w32_main:
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    add             srcq, 32*2
    packuswb        m0, m1                  ; qwords: row0.lo, row1.lo, row0.hi, row1.hi
    vpermq          m1, m0, q3120           ; -> row0, row1
    movu            [dstq], m1
    add             dstq, 32
    sub             hd, 2
    jg .w32_loop
    test            bhd, bhd
    jz .w32_end
    vpermq          m0, m0, q3131           ; last row (qwords 1, 3) twice
.w32_padv:                                  ; 64 bytes = 4 padded rows per iteration
    movu            [dstq+32*0], m0
    movu            [dstq+32*1], m0
    add             dstq, 32*2
    sub             bhd, 4
    jg .w32_padv
.w32_end:
    RET
; .w64 padding: m3 = min(0..31, (w-1)&31), m4 = (w-1)&31 broadcast. pshufb only
; uses the low 4 index bits within each 128-bit lane, so the lane holding the
; last valid byte is first copied to wherever it is needed.
.w64_padh:
    cmp             wd, 15
    jg .w64_padh2
    vinserti128     m1, m0, xm0, 1          ; w <= 16: valid data is in lane 0 of m0
    pshufb          m0, m1, m3
    pshufb          m1, m4                  ; src bytes 32..63: last valid byte only
    jmp .w64_main
.w64_padh2:
    cmp             wd, 31
    jg .w64_padh3
    vperm2i128      m1, m0, m0, 0x11        ; 16 < w <= 32: m1 = upper lane of m0 twice
    pshufb          m0, m3
    pshufb          m1, m4
    jmp .w64_main
.w64_padh3:
    cmp             wd, 47
    jg .w64_padh4
    vinserti128     m1, xm1, 1              ; 32 < w <= 48: valid data is in lane 0 of m1
.w64_padh4:
    pshufb          m1, m3
    jmp .w64_main
.w64:
    cmp             wd, 63
    je .w64_loop
    mov             r6d, wd                 ; r6 is overwritten: `base` is unusable below
    and             r6d, 31
    movd            xm4, r6d
    vpbroadcastb    m4, xm4
    pminub          m3, m4, [pb_0to63]      ; addressed without `base` (r6 clobbered)
.w64_loop:                                  ; 64 src bytes = 1 row -> 32 dst bytes
    mova            m0, [srcq+32*0]
    mova            m1, [srcq+32*1]
    cmp             wd, 63
    jl .w64_padh
.w64_main:
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    add             srcq, 32*2
    packuswb        m0, m1
    vpermq          m0, m0, q3120           ; undo the per-lane interleave of packuswb
    movu            [dstq], m0
    add             dstq, 32
    dec             hd
    jg .w64_loop
    test            bhd, bhd
    jz .w64_end
.w64_padv:                                  ; m0 = last row; 4 padded rows per iteration
    movu            [dstq+32*0], m0
    movu            [dstq+32*1], m0
    movu            [dstq+32*2], m0
    movu            [dstq+32*3], m0
    add             dstq, 32*4
    sub             bhd, 4
    jg .w64_padv
.w64_end:
    RET

;------------------------------------------------------------------------------
; pal_idx_finish(dst, src, bw, bh, w, h) -- AVX-512 (icl) version
;
; Same operation as the versions above. As in the AVX2 version, wd holds w-1
; after the initial dec. m4 = (1,16) multipliers. The .w32/.w64 paths use
; cross-lane byte permutes: vpermb for horizontal padding and vpermt2b with
; indices 2*i to gather the low byte of every word from two registers.
; NOTE(review): every width path consumes 4 rows per iteration here, so h and
; bh are presumably multiples of 4 -- confirm against the caller.
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx512icl_table
    lea             r6, [pal_idx_finish_avx512icl_table]
    tzcnt           bwd, bwd                ; jump-table index = tzcnt(bw)
    movifnidn       wd, wm
    movifnidn       hd, hm
    movsxd          bwq, [r6+bwq*4]
    vpbroadcastd    m4, [base+pb_1_16]
    dec             wd                      ; wd = w - 1 from here on
    add             bwq, r6
    sub             bhd, hd                 ; bhd = bh - h = rows of vertical padding
    jmp             bwq
.w4:                                        ; 16 src bytes = 4 rows -> 8 dst bytes
    mova            xmm0, [srcq]
    add             srcq, 16
    pmaddubsw       xmm0, xm4
    packuswb        xmm0, xmm0
    movq            [dstq], xmm0
    add             dstq, 8
    sub             hd, 4
    jg .w4
    test            bhd, bhd
    jz .w4_end
    pshuflw         xmm0, xmm0, q3333       ; replicate the last row's 2 bytes 4x
.w4_padv:
    movq            [dstq], xmm0
    add             dstq, 8
    sub             bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:                                   ; w < 8: replicate byte 3 of each row
    pshufb          xmm0, xmm2
    pshufb          xmm1, xmm2
    jmp .w8_main
.w8:
    mova            xmm2, [base+pal_idx_w8_padh]
.w8_loop:                                   ; 32 src bytes = 4 rows -> 16 dst bytes
    mova            xmm0, [srcq+16*0]
    mova            xmm1, [srcq+16*1]
    cmp             wd, 7
    jl .w8_padh
.w8_main:
    pmaddubsw       xmm0, xm4
    pmaddubsw       xmm1, xm4
    add             srcq, 16*2
    packuswb        xmm0, xmm1
    movu            [dstq], xmm0
    add             dstq, 16
    sub             hd, 4
    jg .w8_loop
    test            bhd, bhd
    jz .w8_end
    pshufd          xmm0, xmm0, q3333       ; replicate the last row's 4 bytes 4x
.w8_padv:
    movu            [dstq], xmm0
    add             dstq, 16
    sub             bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:                                  ; one row per 128-bit lane
    pshufb          m0, m2
    jmp .w16_main
.w16:
    cmp             wd, 15
    je .w16_loop
    vbroadcasti32x4 m2, [base+pb_0to63]
    vpbroadcastb    m0, wd
    pminub          m2, m0                  ; m2 = min(0..15, w-1) in each lane
.w16_loop:                                  ; 64 src bytes = 4 rows -> 32 dst bytes
    mova            m0, [srcq]
    cmp             wd, 15
    jl .w16_padh
.w16_main:
    pmaddubsw       m0, m4
    add             srcq, 64
    vpmovwb         ym0, m0                 ; keep the low byte of each word
    movu            [dstq], ym0
    add             dstq, 32
    sub             hd, 4
    jg .w16_loop
    test            bhd, bhd
    jz .w16_end
    vpermq          ym0, ym0, q3333         ; replicate the last row (qword 3) 4x
.w16_padv:
    movu            [dstq], ym0
    add             dstq, 32
    sub             bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:                                  ; two rows per register, clamped per row
    vpermb          m0, m2, m0
    vpermb          m1, m2, m1
    jmp .w32_main
.w32:
    mova            m2, [base+pb_0to63]
    paddb           m3, m2, m2              ; m3 = 2*i: selects the low byte of word i
    cmp             wd, 31
    je .w32_loop
    vpbroadcastb    m0, wd
    mov             r6d, 0xff00             ; r6 is overwritten: `base` is unusable below
    kmovw           k1, r6d                 ; dwords 8..15 = upper 32 bytes (second row)
    vpaddd          m0{k1}, [pb_32] {1to16} ; second row clamps to 32 + (w-1)
    pminub          m2, m0
.w32_loop:                                  ; 128 src bytes = 4 rows -> 64 dst bytes
    mova            m0, [srcq+64*0]
    mova            m1, [srcq+64*1]
    cmp             wd, 31
    jl .w32_padh
.w32_main:
    pmaddubsw       m0, m4
    pmaddubsw       m1, m4
    add             srcq, 64*2
    vpermt2b        m0, m3, m1              ; gather the low bytes of all words in m0:m1
    movu            [dstq], m0
    add             dstq, 64
    sub             hd, 4
    jg .w32_loop
    test            bhd, bhd
    jz .w32_end
    vshufi32x4      m0, m0, q3333           ; replicate the last row (16 bytes) 4x
.w32_padv:
    movu            [dstq], m0
    add             dstq, 64
    sub             bhd, 4
    jg .w32_padv
.w32_end:
    RET
.w64_padh:                                  ; one row per register, clamp columns to w-1
    REPX {vpermb x, m5, x}, m0, m1, m2, m3
    jmp .w64_main
.w64:
    mova            m5, [base+pb_0to63]
    paddb           m6, m5, m5              ; m6 = 2*i: selects the low byte of word i
    cmp             wd, 63
    je .w64_loop
    vpbroadcastb    m0, wd
    pminub          m5, m0                  ; m5 = min(0..63, w-1)
.w64_loop:                                  ; 256 src bytes = 4 rows -> 128 dst bytes
    mova            m0, [srcq+64*0]
    mova            m1, [srcq+64*1]
    mova            m2, [srcq+64*2]
    mova            m3, [srcq+64*3]
    cmp             wd, 63
    jl .w64_padh
.w64_main:
    REPX {pmaddubsw x, m4}, m0, m1, m2, m3
    add             srcq, 64*4
    vpermt2b        m0, m6, m1              ; rows 0 and 1
    vpermt2b        m2, m6, m3              ; rows 2 and 3
    movu            [dstq+64*0], m0
    movu            [dstq+64*1], m2
    add             dstq, 64*2
    sub             hd, 4
    jg .w64_loop
    test            bhd, bhd
    jz .w64_end
    vshufi32x4      m2, m2, q3232           ; last row (upper 32 bytes) twice
.w64_padv:                                  ; 128 bytes = 4 padded rows per iteration
    movu            [dstq+64*0], m2
    movu            [dstq+64*1], m2
    add             dstq, 64*2
    sub             bhd, 4
    jg .w64_padv
.w64_end:
    RET

%endif ; ARCH_X86_64