; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 64 ; avoids cacheline splits 30 31min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 32pw_0xff00: times 8 dw 0xff00 33pw_32: times 8 dw 32 34 35%if ARCH_X86_64 36%define resp resq 37%define movp movq 38%define c_shuf q3333 39%macro DECODE_SYMBOL_ADAPT_INIT 0-1 40%endmacro 41%else 42%define resp resd 43%define movp movd 44%define c_shuf q1111 45%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok 46 mov t0, r0m 47 mov t1, r1m 48%if %1 == 0 49 mov t2, r2m 50%endif 51%if STACK_ALIGNMENT >= 16 52 sub esp, 40-%1*4 53%else 54 mov eax, esp 55 and esp, ~15 56 sub esp, 40-%1*4 57 mov [esp], eax 58%endif 59%endmacro 60%endif 61 62struc msac 63 .buf: resp 1 64 .end: resp 1 65 .dif: resp 1 66 .rng: resd 1 67 .cnt: resd 1 68 .update_cdf: resd 1 69endstruc 70 71%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y) 72 73SECTION .text 74 75%if WIN64 76DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8 77%define buf rsp+stack_offset+8 ; shadow space 78%elif UNIX64 79DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8 80%define buf rsp-40 ; red zone 81%else 82DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3 83%define buf esp+8 84%endif 85 86INIT_XMM sse2 87cglobal msac_decode_symbol_adapt4, 0, 6, 6 88 DECODE_SYMBOL_ADAPT_INIT 89 LEA rax, pw_0xff00 90 movd m2, [t0+msac.rng] 91 movq m1, [t1] 92 movp m3, [t0+msac.dif] 93 mov t3d, [t0+msac.update_cdf] 94 mov t4d, t2d 95 not t2 ; -(n_symbols + 1) 96 pshuflw m2, m2, q0000 97 movd [buf+12], m2 98 pand m2, [rax] 99 mova m0, m1 100 psrlw m1, 6 101 psllw m1, 7 102 pmulhuw m1, m2 103 movq m2, [rax+t2*2] 104 pshuflw m3, m3, c_shuf 105 paddw m1, m2 106 mova [buf+16], m1 107 psubusw m1, m3 108 pxor m2, m2 109 pcmpeqw m1, m2 ; c >= v 110 pmovmskb eax, m1 111 test t3d, t3d 112 jz .renorm ; !allow_update_cdf 113 114; update_cdf: 115 movzx t3d, word [t1+t4*2] ; count 116 pcmpeqw m2, m2 117 mov t2d, t3d 118 shr t3d, 4 119 cmp t4d, 3 120 sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 
4 121 cmp t2d, 32 122 adc t2d, 0 ; count + (count < 32) 123 movd m3, t3d 124 pavgw m2, m1 ; i >= val ? -1 : 32768 125 psubw m2, m0 ; for (i = 0; i < val; i++) 126 psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; 127 psraw m2, m3 ; for (; i < n_symbols; i++) 128 paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; 129 movq [t1], m0 130 mov [t1+t4*2], t2w 131 132.renorm: 133 tzcnt eax, eax 134 mov t4, [t0+msac.dif] 135 movzx t1d, word [buf+rax+16] ; v 136 movzx t2d, word [buf+rax+14] ; u 137 shr eax, 1 138.renorm2: 139%if ARCH_X86_64 == 0 140%if STACK_ALIGNMENT >= 16 141 add esp, 40 142%else 143 mov esp, [esp] 144%endif 145%endif 146 sub t2d, t1d ; rng 147 shl t1, gprsize*8-16 148 sub t4, t1 ; dif - v 149.renorm3: 150 mov t1d, [t0+msac.cnt] 151 movifnidn t7, t0 152.renorm4: 153 bsr ecx, t2d 154 xor ecx, 15 ; d 155.renorm5: 156 shl t2d, cl 157 shl t4, cl 158 mov [t7+msac.rng], t2d 159 sub t1d, ecx 160 jae .end ; no refill required 161 162; refill: 163%if ARCH_X86_64 == 0 164 push t5 165%endif 166 mov t2, [t7+msac.buf] 167 mov t5, [t7+msac.end] 168 lea rcx, [t2+gprsize] 169 sub rcx, t5 170 ja .refill_eob 171 mov t5, [t2] 172 lea ecx, [t1+16-gprsize*8] 173 not t5 174 bswap t5 175 shr t5, cl 176 neg ecx 177 shr ecx, 3 ; num_bytes_read 178 or t4, t5 179.refill_end: 180 add t2, rcx 181 lea t1d, [t1+rcx*8] ; cnt += num_bits_read 182 mov [t7+msac.buf], t2 183.refill_end2: 184%if ARCH_X86_64 == 0 185 pop t5 186%endif 187.end: 188 mov [t7+msac.cnt], t1d 189 mov [t7+msac.dif], t4 190 RET 191.pad_with_ones: 192 lea ecx, [t1-16] 193%if ARCH_X86_64 194 ror rcx, cl 195%else 196 shr ecx, cl 197%endif 198 or t4, rcx 199 jmp .refill_end2 200.refill_eob: ; avoid overreading the input buffer 201 cmp t2, t5 202 jae .pad_with_ones ; eob reached 203 ; We can safely do a register-sized load of the last bytes of the buffer 204 ; as this code is only reached if the msac buffer size is >= gprsize. 
205 mov t5, [t5-gprsize] 206 shl ecx, 3 207 shr t5, cl 208 lea ecx, [t1+16-gprsize*8] 209 not t5 210 bswap t5 211 shr t5, cl 212 neg ecx 213 or t4, t5 214 mov t5d, [t7+msac.end] 215 shr ecx, 3 216 sub t5d, t2d ; num_bytes_left 217 cmp ecx, t5d 218 cmovae ecx, t5d ; num_bytes_read 219 jmp .refill_end 220 221cglobal msac_decode_symbol_adapt8, 0, 6, 6 222 DECODE_SYMBOL_ADAPT_INIT 223 LEA rax, pw_0xff00 224 movd m2, [t0+msac.rng] 225 mova m1, [t1] 226 movp m3, [t0+msac.dif] 227 mov t3d, [t0+msac.update_cdf] 228 mov t4d, t2d 229 not t2 230 pshuflw m2, m2, q0000 231 movd [buf+12], m2 232 punpcklqdq m2, m2 233 mova m0, m1 234 psrlw m1, 6 235 pand m2, [rax] 236 psllw m1, 7 237 pmulhuw m1, m2 238 movu m2, [rax+t2*2] 239 pshuflw m3, m3, c_shuf 240 paddw m1, m2 241 punpcklqdq m3, m3 242 mova [buf+16], m1 243 psubusw m1, m3 244 pxor m2, m2 245 pcmpeqw m1, m2 246 pmovmskb eax, m1 247 test t3d, t3d 248 jz m(msac_decode_symbol_adapt4, SUFFIX).renorm 249 movzx t3d, word [t1+t4*2] 250 pcmpeqw m2, m2 251 mov t2d, t3d 252 shr t3d, 4 253 cmp t4d, 3 ; may be called with n_symbols <= 2 254 sbb t3d, -5 255 cmp t2d, 32 256 adc t2d, 0 257 movd m3, t3d 258 pavgw m2, m1 259 psubw m2, m0 260 psubw m0, m1 261 psraw m2, m3 262 paddw m0, m2 263 mova [t1], m0 264 mov [t1+t4*2], t2w 265 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm 266 267cglobal msac_decode_symbol_adapt16, 0, 6, 6 268 DECODE_SYMBOL_ADAPT_INIT 269 LEA rax, pw_0xff00 270 movd m4, [t0+msac.rng] 271 mova m2, [t1] 272 mova m3, [t1+16] 273 movp m5, [t0+msac.dif] 274 mov t3d, [t0+msac.update_cdf] 275 mov t4d, t2d 276 not t2 277%if WIN64 278 sub rsp, 48 ; need 36 bytes, shadow space is only 32 279%endif 280 pshuflw m4, m4, q0000 281 movd [buf-4], m4 282 punpcklqdq m4, m4 283 mova m0, m2 284 psrlw m2, 6 285 mova m1, m3 286 psrlw m3, 6 287 pand m4, [rax] 288 psllw m2, 7 289 psllw m3, 7 290 pmulhuw m2, m4 291 pmulhuw m3, m4 292 movu m4, [rax+t2*2] 293 pshuflw m5, m5, c_shuf 294 paddw m2, m4 295 psubw m4, [rax-pw_0xff00+pw_32] 296 
punpcklqdq m5, m5 297 paddw m3, m4 298 mova [buf], m2 299 psubusw m2, m5 300 mova [buf+16], m3 301 psubusw m3, m5 302 pxor m4, m4 303 pcmpeqw m2, m4 304 pcmpeqw m3, m4 305 packsswb m5, m2, m3 306 pmovmskb eax, m5 307 test t3d, t3d 308 jz .renorm 309 movzx t3d, word [t1+t4*2] 310 pcmpeqw m4, m4 311 mova m5, m4 312 lea t2d, [t3+80] ; only support n_symbols > 2 313 shr t2d, 4 314 cmp t3d, 32 315 adc t3d, 0 316 pavgw m4, m2 317 pavgw m5, m3 318 psubw m4, m0 319 psubw m0, m2 320 movd m2, t2d 321 psubw m5, m1 322 psubw m1, m3 323 psraw m4, m2 324 psraw m5, m2 325 paddw m0, m4 326 paddw m1, m5 327 mova [t1], m0 328 mova [t1+16], m1 329 mov [t1+t4*2], t3w 330.renorm: 331 tzcnt eax, eax 332 mov t4, [t0+msac.dif] 333 movzx t1d, word [buf+rax*2] 334 movzx t2d, word [buf+rax*2-2] 335%if WIN64 336 add rsp, 48 337%endif 338 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2 339 340cglobal msac_decode_bool_adapt, 0, 6, 0 341 movifnidn t1, r1mp 342 movifnidn t0, r0mp 343 movzx eax, word [t1] 344 movzx t3d, byte [t0+msac.rng+1] 345 mov t4, [t0+msac.dif] 346 mov t2d, [t0+msac.rng] 347%if ARCH_X86_64 348 mov t5d, eax 349%endif 350 and eax, ~63 351 imul eax, t3d 352%if UNIX64 353 mov t6, t4 354%endif 355 shr eax, 7 356 add eax, 4 ; v 357 mov t3d, eax 358 shl rax, gprsize*8-16 ; vw 359 sub t2d, t3d ; r - v 360 sub t4, rax ; dif - vw 361 setb al 362 cmovb t2d, t3d 363 mov t3d, [t0+msac.update_cdf] 364%if UNIX64 365 cmovb t4, t6 366%else 367 cmovb t4, [t0+msac.dif] 368%endif 369%if ARCH_X86_64 == 0 370 movzx eax, al 371%endif 372 test t3d, t3d 373 jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 374%if UNIX64 == 0 375 push t6 376%endif 377 movzx t6d, word [t1+2] 378%if ARCH_X86_64 == 0 379 push t5 380 movzx t5d, word [t1] 381%endif 382 movifnidn t7, t0 383 lea ecx, [t6+64] 384 cmp t6d, 32 385 adc t6d, 0 386 mov [t1+2], t6w 387 imul t6d, eax, -32769 388 shr ecx, 4 ; rate 389 add t6d, t5d ; if (bit) 390 sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1; 391 sar t6d, cl ; else 392 sub 
t5d, t6d ; cdf[0] -= cdf[0] >> rate; 393 mov [t1], t5w 394%if WIN64 395 mov t1d, [t7+msac.cnt] 396 pop t6 397 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4 398%else 399%if ARCH_X86_64 == 0 400 pop t5 401 pop t6 402%endif 403 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 404%endif 405 406cglobal msac_decode_bool_equi, 0, 6, 0 407 movifnidn t0, r0mp 408 mov t1d, [t0+msac.rng] 409 mov t4, [t0+msac.dif] 410 mov t2d, t1d 411 mov t1b, 8 412 mov t3, t4 413 mov eax, t1d 414 shr t1d, 1 ; v 415 shl rax, gprsize*8-17 ; vw 416 sub t2d, t1d ; r - v 417 sub t4, rax ; dif - vw 418 cmovb t2d, t1d 419 mov t1d, [t0+msac.cnt] 420 cmovb t4, t3 421 movifnidn t7, t0 422 mov ecx, 0xbfff 423 setb al ; the upper 32 bits contains garbage but that's OK 424 sub ecx, t2d 425 ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) 426 ; i.e. (0 <= d <= 2) and v < (3 << 14) 427 shr ecx, 14 ; d 428%if ARCH_X86_64 == 0 429 movzx eax, al 430%endif 431 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5 432 433cglobal msac_decode_bool, 0, 6, 0 434 movifnidn t0, r0mp 435 movifnidn t1d, r1m 436 movzx eax, byte [t0+msac.rng+1] ; r >> 8 437 mov t4, [t0+msac.dif] 438 mov t2d, [t0+msac.rng] 439 and t1d, ~63 440 imul eax, t1d 441 mov t3, t4 442 shr eax, 7 443 add eax, 4 ; v 444 mov t1d, eax 445 shl rax, gprsize*8-16 ; vw 446 sub t2d, t1d ; r - v 447 sub t4, rax ; dif - vw 448 cmovb t2d, t1d 449 cmovb t4, t3 450 setb al 451%if ARCH_X86_64 == 0 452 movzx eax, al 453%endif 454 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 455 456%macro HI_TOK 1 ; update_cdf 457%if ARCH_X86_64 == 0 458 mov eax, -24 459%endif 460%%loop: 461%if %1 462 movzx t2d, word [t1+3*2] 463%endif 464 mova m1, m0 465 pshuflw m2, m2, q0000 466 psrlw m1, 6 467 movd [buf+12], m2 468 pand m2, m4 469 psllw m1, 7 470 pmulhuw m1, m2 471%if ARCH_X86_64 == 0 472 add eax, 5 473 mov [buf+8], eax 474%endif 475 pshuflw m3, m3, c_shuf 476 paddw m1, m5 477 movq [buf+16], m1 478 psubusw m1, m3 479 pxor m2, m2 480 pcmpeqw m1, m2 481 
pmovmskb eax, m1 482%if %1 483 lea ecx, [t2+80] 484 pcmpeqw m2, m2 485 shr ecx, 4 486 cmp t2d, 32 487 adc t2d, 0 488 movd m3, ecx 489 pavgw m2, m1 490 psubw m2, m0 491 psubw m0, m1 492 psraw m2, m3 493 paddw m0, m2 494 movq [t1], m0 495 mov [t1+3*2], t2w 496%endif 497 tzcnt eax, eax 498 movzx ecx, word [buf+rax+16] 499 movzx t2d, word [buf+rax+14] 500%if ARCH_X86_64 501 add t6d, 5 502%endif 503 sub eax, 5 ; setup for merging the tok_br and tok branches 504 sub t2d, ecx 505 shl rcx, gprsize*8-16 506 sub t4, rcx 507 bsr ecx, t2d 508 xor ecx, 15 509 shl t2d, cl 510 shl t4, cl 511 movd m2, t2d 512 mov [t7+msac.rng], t2d 513 sub t5d, ecx 514 jae %%end 515%if UNIX64 == 0 516 push t8 517%endif 518 mov t2, [t7+msac.buf] 519 mov t8, [t7+msac.end] 520 lea rcx, [t2+gprsize] 521 sub rcx, t8 522 ja %%refill_eob 523 mov t8, [t2] 524 lea ecx, [t5+16-gprsize*8] 525 not t8 526 bswap t8 527 shr t8, cl 528 neg ecx 529 shr ecx, 3 530 or t4, t8 531%%refill_end: 532 add t2, rcx 533 lea t5d, [t5+rcx*8] 534 mov [t7+msac.buf], t2 535%%refill_end2: 536%if UNIX64 == 0 537 pop t8 538%endif 539%%end: 540 movp m3, t4 541%if ARCH_X86_64 542 add t6d, eax ; CF = tok_br < 3 || tok == 15 543 jnc %%loop 544 lea eax, [t6+30] 545%else 546 add eax, [buf+8] 547 jnc %%loop 548 add eax, 30 549%if STACK_ALIGNMENT >= 16 550 add esp, 36 551%else 552 mov esp, [esp] 553%endif 554%endif 555 mov [t7+msac.dif], t4 556 shr eax, 1 557 mov [t7+msac.cnt], t5d 558 RET 559%%pad_with_ones: 560 ; ensure that dif is padded with at least 15 bits of ones at the end 561 lea ecx, [t5-16] 562%if ARCH_X86_64 563 ror rcx, cl 564%else 565 shr ecx, cl 566%endif 567 or t4, rcx 568 jmp %%refill_end2 569%%refill_eob: 570 cmp t2, t8 571 jae %%pad_with_ones 572 mov t8, [t8-gprsize] 573 shl ecx, 3 574 shr t8, cl 575 lea ecx, [t5+16-gprsize*8] 576 not t8 577 bswap t8 578 shr t8, cl 579 neg ecx 580 or t4, t8 581 mov t8d, [t7+msac.end] 582 shr ecx, 3 583 sub t8d, t2d 584 cmp ecx, t8d 585 cmovae ecx, t8d 586 jmp %%refill_end 587%endmacro 588 
589cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 590 DECODE_SYMBOL_ADAPT_INIT 1 591%if ARCH_X86_64 == 0 && PIC 592 LEA t2, min_prob+12*2 593 %define base t2-(min_prob+12*2) 594%else 595 %define base 0 596%endif 597 movq m0, [t1] 598 movd m2, [t0+msac.rng] 599 mov eax, [t0+msac.update_cdf] 600 movq m4, [base+pw_0xff00] 601 movp m3, [t0+msac.dif] 602 movq m5, [base+min_prob+12*2] 603 mov t4, [t0+msac.dif] 604 mov t5d, [t0+msac.cnt] 605%if ARCH_X86_64 606 mov t6d, -24 607%endif 608 movifnidn t7, t0 609 test eax, eax 610 jz .no_update_cdf 611 HI_TOK 1 612.no_update_cdf: 613 HI_TOK 0 614 615%if ARCH_X86_64 616INIT_YMM avx2 617cglobal msac_decode_symbol_adapt16, 3, 6, 6 618 lea rax, [pw_0xff00] 619 vpbroadcastw m2, [t0+msac.rng] 620 mova m0, [t1] 621 vpbroadcastw m3, [t0+msac.dif+6] 622 vbroadcasti128 m4, [rax] 623 mov t3d, [t0+msac.update_cdf] 624 mov t4d, t2d 625 not t2 626 mov r5, rsp 627%if WIN64 628 and rsp, ~31 629 sub rsp, 40 630%else 631 and r5, ~31 632 %define buf r5-32 633%endif 634 psrlw m1, m0, 6 635 movd [buf-4], xm2 636 pand m2, m4 637 psllw m1, 7 638 pmulhuw m1, m2 639 paddw m1, [rax+t2*2] 640 mova [buf], m1 641 pmaxuw m1, m3 642 pcmpeqw m1, m3 643 pmovmskb eax, m1 644 test t3d, t3d 645 jz .renorm 646 movzx t3d, word [t1+t4*2] 647 pcmpeqw m2, m2 648 lea t2d, [t3+80] 649 shr t2d, 4 650 cmp t3d, 32 651 adc t3d, 0 652 movd xm3, t2d 653 pavgw m2, m1 654 psubw m2, m0 655 psubw m0, m1 656 psraw m2, xm3 657 paddw m0, m2 658 mova [t1], m0 659 mov [t1+t4*2], t3w 660.renorm: 661 tzcnt eax, eax 662 mov t4, [t0+msac.dif] 663 movzx t1d, word [buf+rax-0] 664 movzx t2d, word [buf+rax-2] 665 shr eax, 1 666%if WIN64 667 mov rsp, r5 668%endif 669 vzeroupper 670 jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 671%endif 672