; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
; 2024-06-18: Igor Pavlov : Public domain
;
; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
; function for check at link time.
; That code is tightly coupled with LzmaDec_TryDummy()
; and with other functions in LzmaDec.c file.
; CLzmaDec structure, (probs) array layout, input and output of
; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).

ifndef x64
; x64=1
; .err <x64_IS_REQUIRED>
endif

include 7zAsm.asm

MY_ASM_START

; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is defined, we use additional SEGMENT with 64-byte alignment.
; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is not defined, we use default SEGMENT (where default 16-byte alignment of segment is expected).
; The performance is almost identical in our tests.
; But the performance can depend from position of lzmadec code inside instruction cache
; or micro-op cache line (depending from low address bits in 32-byte/64-byte cache lines).
; And 64-byte alignment provides a more consistent speed regardless
; of the code's position in the executable.
; But also it's possible that code without Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT can be
; slightly faster than 64-bytes aligned code in some cases, if offset of lzmadec
; code in 64-byte block after compilation provides better speed by some reason.
; Note that Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT adds an extra section to the ELF file.
; If you don't want to get that extra section, do not define Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT.

; default: enable the dedicated 64-byte aligned segment on all targets
; (both branches currently select 1; the if/else is kept so each target
; can be switched independently).
ifndef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
else
  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
        align  num
        ; align  16
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned for 16-bytes.
        ; So we don't need large alignment inside our function.
        align  16
endm
endif


MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm


; _LZMA_SIZE_OPT  : use compact loops instead of unrolled bit-tree decoding
; _LZMA_PROB32    : use 32-bit probability entries instead of default 16-bit

; _LZMA_SIZE_OPT equ 1

; _LZMA_PROB32 equ 1

; PSHIFT = log2(sizeof(prob entry)): 2 for 32-bit probs, 1 for 16-bit probs.
; PLOAD/PSTORE hide the entry size from the rest of the code.
ifdef _LZMA_PROB32
        PSHIFT  equ 2
        PLOAD macro dest, mem
                mov     dest, dword ptr [mem]
        endm
        PSTORE macro src, mem
                mov     dword ptr [mem], src
        endm
else
        PSHIFT  equ 1
        PLOAD macro dest, mem
                movzx   dest, word ptr [mem]
        endm
        PSTORE macro src, mem
                ; @CatStr(src, _W) selects the 16-bit view of the register
                mov     word ptr [mem], @CatStr(src, _W)
        endm
endif

; byte scale factors for indexing the probs array
PMULT           equ (1 SHL PSHIFT)
PMULT_HALF      equ (1 SHL (PSHIFT - 1))
PMULT_2         equ (1 SHL (PSHIFT + 1))

; flag OR-ed into remainLen at fin_ERROR_MATCH_DIST to mark a data error
kMatchSpecLen_Error_Data equ (1 SHL 9)

; ---------- register allocation map (roles per register) ----------
;        x0      range
;        x1      pbPos / (prob) TREE
;        x2      probBranch / prm (MATCHED) / pbPos / cnt
;        x3      sym
;====== r4 ===   RSP
;        x5      cod
;        x6      t1 NORM_CALC / probs_state / dist
;        x7      t0 NORM_CALC / prob2 IF_BIT_1
;        x8      state
;        x9      match (MATCHED) / sym2 / dist2 / lpMask_reg
;        x10     kBitModelTotal_reg
;        r11     probs
;        x12     offs (MATCHED) / dic / len_temp
;        x13     processedPos
;        x14     bit (MATCHED) / dicPos
;        r15     buf


cod     equ x5
cod_L   equ x5_L
range   equ x0
state   equ x8
state_R equ r8
buf     equ r15
processedPos equ x13
kBitModelTotal_reg equ x10

probBranch   equ x2
probBranch_R equ r2
probBranch_W equ x2_W

pbPos   equ x1
pbPos_R equ r1

cnt     equ x2
cnt_R   equ r2

lpMask_reg equ x9
dicPos  equ r14

sym     equ x3
sym_R   equ r3
sym_L   equ x3_L

probs   equ r11
dic     equ r12

t0      equ x7
t0_W    equ x7_W
t0_R    equ r7

prob2   equ t0
prob2_W equ t0_W

t1      equ x6
t1_R    equ r6

probs_state   equ t1
probs_state_R equ t1_R

prm     equ r2
match   equ x9
match_R equ r9
offs    equ x12
offs_R  equ r12
bit     equ x14
bit_R   equ r14

sym2    equ x9
sym2_R  equ r9

len_temp equ x12

dist    equ sym
dist2   equ x9



kNumBitModelTotalBits equ 11
kBitModelTotal  equ (1 SHL kNumBitModelTotalBits)
kNumMoveBits    equ 5
kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
kTopValue       equ (1 SHL 24)

; shift the next input byte into the low byte of (cod) and widen (range)
NORM_2 macro
        ; movzx   t0, BYTE PTR [buf]
        shl     cod, 8
        mov     cod_L, BYTE PTR [buf]
        shl     range, 8
        ; or      cod, t0
        inc     buf
endm

; range-coder normalization: refill one byte when range drops below 2^24
NORM macro
        cmp     range, kTopValue
        jae     SHORT @F
        NORM_2
@@:
endm


; ---------- Branch MACROS ----------
; These update a single branch probability (probBranch) the classic LZMA way:
;   bit 0: prob += (kBitModelTotal - prob) >> kNumMoveBits
;   bit 1: prob -= prob >> kNumMoveBits, and range/cod are rebased past bound

UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
        mov     prob2, kBitModelTotal_reg
        sub     prob2, probBranch
        shr     prob2, kNumMoveBits
        add     probBranch, prob2
        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
endm


UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
        ; on entry prob2 still holds the old range; rebase range/cod past bound
        sub     prob2, range
        sub     cod, range
        mov     range, prob2
        mov     prob2, probBranch
        shr     probBranch, kNumMoveBits
        sub     prob2, probBranch
        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
endm


; load prob, normalize, then compute bound = (range >> 11) * prob into (range)
; and compare (cod) against it; old range is kept in prob2 for UPDATE_1
CMP_COD macro probsArray:req, probOffset:req, probDisp:req
        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
        NORM
        mov     prob2, range
        shr     range, kNumBitModelTotalBits
        imul    range, probBranch
        cmp     cod, range
endm


IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jae     toLabel
endm


IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
        UPDATE_0 probsArray, probOffset, probDisp
endm


IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jb      toLabel
endm


; ---------- CMOV MACROS ----------
; Branchless bit decoding: NORM_CALC leaves
;   range = bound, t0 = oldRange - bound, t1 = old cod, cod = cod - bound,
; and CF from the final "sub cod, range" selects bit 0 (CF=1) / bit 1 (CF=0)
; for the following cmovb/cmovae and sbb instructions.

NORM_CALC macro prob:req
        NORM
        mov     t0, range
        shr     range, kNumBitModelTotalBits
        imul    range, prob
        sub     t0, range
        mov     t1, cod
        sub     cod, range
endm

; probability update: t0 holds (kBitModelOffset or kBitModelTotal) selected by CF
PUP macro prob:req, probPtr:req
        sub     t0, prob
        ; only sar works for both 16/32 bit prob modes
        sar     t0, kNumMoveBits
        add     t0, prob
        PSTORE  t0, probPtr
endm


; update prob and fold the decoded bit into (sym): sbb uses CF from NORM_CALC
PUP_SUB macro prob:req, probPtr:req, symSub:req
        sbb     sym, symSub
        PUP     prob, probPtr
endm


PUP_COD macro prob:req, probPtr:req, symSub:req
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        mov     t1, sym
        cmovb   t0, kBitModelTotal_reg
        PUP_SUB prob, probPtr, symSub
endm


; BIT_0 / BIT_1 / BIT_2: one level of bit-tree decoding with prefetch of the
; two possible next probabilities, selected via cmov instead of a branch.

BIT_0 macro prob:req, probNext:req
        PLOAD   prob, probs + 1 * PMULT
        PLOAD   probNext, probs + 1 * PMULT_2

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
        cmovae  probNext, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 2
        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
endm


BIT_1 macro prob:req, probNext:req
        PLOAD   probNext, probs + sym_R * PMULT_2
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + sym_R * PMULT + PMULT
        cmovae  probNext, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
endm


; final tree level: subtract (symSub) to convert tree index to symbol value
BIT_2 macro prob:req, symSub:req
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
endm


; ---------- MATCHED LITERAL ----------
; Decode a literal using the match byte: (offs)/(bit) select between the
; two 256-entry subtables depending on the current bit of (match).

LITM_0 macro
        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     bit, offs
        and     bit, match
        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
        ; lea     prm, [probs + 256 * PMULT + 1 * PMULT]
        ; add     prm, bit_R
        xor     offs, bit
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 0
        PUP_SUB x1, prm, -2-1
endm


LITM macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        xor     offs, bit
        add     sym, sym
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
endm


LITM_2 macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        add     sym, sym

        NORM_CALC x1

        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
endm


; ---------- REVERSE BITS ----------
; Reverse bit-tree decoding (used for align bits): the tree pointer (sym2_R)
; walks the table directly; cmov selects the next node.

REV_0 macro prob:req, probNext:req
        ; PLOAD   prob, probs + 1 * PMULT
        ; lea     sym2_R, [probs + 2 * PMULT]
        ; PLOAD   probNext, probs + 2 * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 3 * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [probs + 3 * PMULT]
        cmovae  sym2_R, t1_R
        PUP     prob, probs + 1 * PMULT
endm


REV_1 macro prob:req, probNext:req, step:req
        add     sym2_R, step * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, sym2_R + step * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [sym2_R + step * PMULT]
        cmovae  sym2_R, t1_R
        PUP     prob, t1_R - step * PMULT_2
endm


; last reverse level: convert the walked pointer back to bit values in (sym)
REV_2 macro prob:req, step:req
        sub     sym2_R, probs
        shr     sym2, PSHIFT
        or      sym, sym2

        NORM_CALC prob

        cmovae  range, t0
        lea     t0, [sym - step]
        cmovb   sym, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        PUP     prob, probs + sym2_R * PMULT
endm


; variable-step reverse decode (SpecPos distances): (sym_R) is the current
; prob pointer, (sym2) is the step that doubles each iteration
REV_1_VAR macro prob:req
        PLOAD   prob, sym_R
        mov     probs, sym_R
        add     sym_R, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        lea     t0_R, [sym_R + 1 * sym2_R]
        cmovae  sym_R, t0_R
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        ; mov     t1, kBitModelTotal
        ; cmovb   t0, t1
        cmovb   t0, kBitModelTotal_reg
        add     sym2, sym2
        PUP     prob, probs
endm




; compute the literal probs subtable pointer and update the IsMatch prob;
; on entry sym holds the previous dictionary byte
LIT_PROBS macro lpMaskParam:req
        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
        mov     t0, processedPos
        shl     t0, 8
        add     sym, t0
        and     sym, lpMaskParam
        add     probs_state_R, pbPos_R
        mov     x1, LOC lc2
        lea     sym, dword ptr[sym_R + 2 * sym_R]
        add     probs, Literal * PMULT
        shl     sym, x1_L
        add     probs, sym_R
        UPDATE_0 probs_state_R, 0, IsMatch
        inc     processedPos
endm



kNumPosBitsMax  equ 4
kNumPosStatesMax equ (1 SHL kNumPosBitsMax)

kLenNumLowBits  equ 3
kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
kLenNumHighBits equ 8
kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
kNumLenProbs    equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)

LenLow          equ 0
LenChoice       equ LenLow
LenChoice2      equ (LenLow + kLenNumLowSymbols)
LenHigh         equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)

kNumStates      equ 12
kNumStates2     equ 16
kNumLitStates   equ 7

kStartPosModelIndex equ 4
kEndPosModelIndex   equ 14
kNumFullDistances   equ (1 SHL (kEndPosModelIndex SHR 1))

kNumPosSlotBits equ 6
kNumLenToPosStates equ 4

kNumAlignBits   equ 4
kAlignTableSize equ (1 SHL kNumAlignBits)

kMatchMinLen    equ 2
kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)

; Offsets below are relative to (probs_1664), i.e. the base pointer is the
; C probs array advanced by kStartOffset entries, so hot tables get short
; displacements. kAlign must land exactly on 0 (checked below).
kStartOffset    equ 1664
SpecPos         equ (-kStartOffset)
IsRep0Long      equ (SpecPos + kNumFullDistances)
RepLenCoder     equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
LenCoder        equ (RepLenCoder + kNumLenProbs)
IsMatch         equ (LenCoder + kNumLenProbs)
kAlign          equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
IsRep           equ (kAlign + kAlignTableSize)
IsRepG0         equ (IsRep + kNumStates)
IsRepG1         equ (IsRepG0 + kNumStates)
IsRepG2         equ (IsRepG1 + kNumStates)
PosSlot         equ (IsRepG2 + kNumStates)
Literal         equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
NUM_BASE_PROBS  equ (Literal + kStartOffset)

if kAlign ne 0
  .err <Stop_Compiling_Bad_LZMA_kAlign>
endif

if NUM_BASE_PROBS ne 1984
  .err <Stop_Compiling_Bad_LZMA_PROBS>
endif


PTR_FIELD equ dq ?

; mirror of the C-side CLzmaDec layout (see header comment: layouts must
; be equal in both C and ASM versions)
CLzmaDec_Asm struct
        lc      db ?
        lp      db ?
        pb      db ?
        _pad_   db ?
        dicSize dd ?

        probs_Spec      PTR_FIELD
        probs_1664      PTR_FIELD
        dic_Spec        PTR_FIELD
        dicBufSize      PTR_FIELD
        dicPos_Spec     PTR_FIELD
        buf_Spec        PTR_FIELD

        range_Spec      dd ?
        code_Spec       dd ?
        processedPos_Spec dd ?
        checkDicSize    dd ?
        rep0    dd ?
        rep1    dd ?
        rep2    dd ?
        rep3    dd ?
        state_Spec      dd ?
        remainLen       dd ?
CLzmaDec_Asm ends


; local frame kept below RSP for the duration of the decode loop
CLzmaDec_Asm_Loc struct
        OLD_RSP         PTR_FIELD
        lzmaPtr         PTR_FIELD
        _pad0_          PTR_FIELD
        _pad1_          PTR_FIELD
        _pad2_          PTR_FIELD
        dicBufSize      PTR_FIELD
        probs_Spec      PTR_FIELD
        dic_Spec        PTR_FIELD

        limit           PTR_FIELD
        bufLimit        PTR_FIELD
        lc2             dd ?
        lpMask          dd ?
        pbMask          dd ?
        checkDicSize    dd ?

        _pad_           dd ?
        remainLen       dd ?
        dicPos_Spec     PTR_FIELD
        rep0    dd ?
        rep1    dd ?
        rep2    dd ?
        rep3    dd ?
CLzmaDec_Asm_Loc ends


; GLOB_2 / LOC_0 : accessors used during init (base regs sym_R / r0)
; GLOB   / LOC   : accessors used at exit / in the loop (base regs r1 / RSP)
GLOB_2  equ [sym_R].CLzmaDec_Asm.
GLOB    equ [r1].CLzmaDec_Asm.
LOC_0   equ [r0].CLzmaDec_Asm_Loc.
LOC     equ [RSP].CLzmaDec_Asm_Loc.
; copy one dword field from the CLzmaDec structure into the local frame
COPY_VAR macro name
        mov     t0, GLOB_2 name
        mov     LOC_0 name, t0
endm


; write one dword field from the local frame back to the CLzmaDec structure
RESTORE_VAR macro name
        mov     t0, LOC name
        mov     GLOB name, t0
endm



IsMatchBranch_Pre macro reg
        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
        mov     pbPos, LOC pbMask
        and     pbPos, processedPos
        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
        lea     probs_state_R, [probs + 1 * state_R]
endm


IsMatchBranch macro reg
        IsMatchBranch_Pre
        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
endm


; stop decoding when either the input buffer or the dictionary limit is reached
CheckLimits macro reg
        cmp     buf, LOC bufLimit
        jae     fin_OK
        cmp     dicPos, LOC limit
        jae     fin_OK
endm



; RSP is (16x + 8) bytes aligned in WIN64-x64
; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)

; arguments arrive in the first three ABI parameter registers (see 7zAsm.asm):
;   PARAM_lzma     - CLzmaDec pointer
;   PARAM_limit    - dictionary position limit (offset; converted to pointer below)
;   PARAM_bufLimit - input buffer limit pointer
PARAM_lzma      equ REG_ABI_PARAM_0
PARAM_limit     equ REG_ABI_PARAM_1
PARAM_bufLimit  equ REG_ABI_PARAM_2

ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
  MY_ALIGN_16
endif
MY_PROC LzmaDec_DecodeReal_3, 3
MY_PUSH_PRESERVED_ABI_REGS

        ; build a 128-byte aligned local frame below the current stack;
        ; original RSP is preserved in OLD_RSP for the epilogue
        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
        and     r0, -128
        mov     r5, RSP
        mov     RSP, r0
        mov     LOC_0 Old_RSP, r5
        mov     LOC_0 lzmaPtr, PARAM_lzma

        mov     LOC_0 remainLen, 0  ; remainLen must be ZERO

        mov     LOC_0 bufLimit, PARAM_bufLimit
        mov     sym_R, PARAM_lzma  ; CLzmaDec_Asm_Loc pointer for GLOB_2
        mov     dic, GLOB_2 dic_Spec
        add     PARAM_limit, dic
        mov     LOC_0 limit, PARAM_limit

        COPY_VAR(rep0)
        COPY_VAR(rep1)
        COPY_VAR(rep2)
        COPY_VAR(rep3)

        mov     dicPos, GLOB_2 dicPos_Spec
        add     dicPos, dic
        mov     LOC_0 dicPos_Spec, dicPos
        mov     LOC_0 dic_Spec, dic

        mov     x1_L, GLOB_2 pb
        mov     t0, 1
        shl     t0, x1_L
        dec     t0
        mov     LOC_0 pbMask, t0

        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
        ; unsigned lc = p->prop.lc;
        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);

        mov     x1_L, GLOB_2 lc
        mov     x2, 100h
        mov     t0, x2
        shr     x2, x1_L
        ; inc     x1
        add     x1_L, PSHIFT
        mov     LOC_0 lc2, x1
        mov     x1_L, GLOB_2 lp
        shl     t0, x1_L
        sub     t0, x2
        mov     LOC_0 lpMask, t0
        mov     lpMask_reg, t0

        ; mov     probs, GLOB_2 probs_Spec
        ; add     probs, kStartOffset SHL PSHIFT
        mov     probs, GLOB_2 probs_1664
        mov     LOC_0 probs_Spec, probs

        mov     t0_R, GLOB_2 dicBufSize
        mov     LOC_0 dicBufSize, t0_R

        mov     x1, GLOB_2 checkDicSize
        mov     LOC_0 checkDicSize, x1

        mov     processedPos, GLOB_2 processedPos_Spec

        mov     state, GLOB_2 state_Spec
        shl     state, PSHIFT

        mov     buf, GLOB_2 buf_Spec
        mov     range, GLOB_2 range_Spec
        mov     cod, GLOB_2 code_Spec
        mov     kBitModelTotal_reg, kBitModelTotal
        xor     sym, sym

        ; if (processedPos != 0 || checkDicSize != 0)
        or      x1, processedPos
        jz      @f

        ; load the previous dictionary byte (wrapping to dicBufSize at pos 0)
        add     t0_R, dic
        cmp     dicPos, dic
        cmovnz  t0_R, dicPos
        movzx   sym, byte ptr[t0_R - 1]

@@:
        ; dispatch on the saved state to resume at the matching loop entry
        IsMatchBranch_Pre
        cmp     state, 4 * PMULT
        jb      lit_end
        cmp     state, kNumLitStates * PMULT
        jb      lit_matched_end
        jmp     lz_end




; ---------- LITERAL ----------
MY_ALIGN_64
lit_start:
        xor     state, state
lit_start_2:
        LIT_PROBS lpMask_reg

        ifdef _LZMA_SIZE_OPT

        PLOAD   x1, probs + 1 * PMULT
        mov     sym, 1
MY_ALIGN_16
lit_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 127
        jbe     lit_loop

        else

        ; unrolled 8-bit literal decode
        BIT_0   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2

        endif

        BIT_2   x2, 256 - 1

        ; mov     dic, LOC dic_Spec
        mov     probs, LOC probs_Spec
        IsMatchBranch_Pre
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        CheckLimits
lit_end:
        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start

        ; jmp     IsMatch_label

; ---------- MATCHES ----------
; MY_ALIGN_32
IsMatch_label:
        UPDATE_1 probs_state_R, pbPos_R, IsMatch
        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label

        add     probs, LenCoder * PMULT
        add     state, kNumStates * PMULT

; ---------- LEN DECODE ----------
; len_temp holds the bias that BIT_2 subtracts to produce the final length
len_decode:
        mov     len_temp, 8 - 1 - kMatchMinLen
        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
        UPDATE_1 probs, 0, 0
        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
        mov     len_temp, -1 - kMatchMinLen
        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
        UPDATE_1 probs, 0, 0
        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
        mov     sym, 1
        PLOAD   x1, probs + 1 * PMULT

MY_ALIGN_32
len8_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 64
        jb      len8_loop

        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
        jmp     short len_mid_2  ; we use short here for MASM that doesn't optimize that code as another assembler programs

MY_ALIGN_32
len_mid_0:
        UPDATE_0 probs, 0, 0
        add     probs, pbPos_R
        BIT_0   x2, x1
len_mid_2:
        BIT_1   x1, x2
        BIT_2   x2, len_temp
        mov     probs, LOC probs_Spec
        cmp     state, kNumStates * PMULT
        jb      copy_match


; ---------- DECODE DISTANCE ----------
        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);

        mov     t0, 3 + kMatchMinLen
        cmp     sym, 3 + kMatchMinLen
        cmovb   t0, sym
        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
        shl     t0, (kNumPosSlotBits + PSHIFT)
        add     probs, t0_R

        ; sym = Len
        ; mov     LOC remainLen, sym
        mov     len_temp, sym

        ifdef _LZMA_SIZE_OPT

        PLOAD   x1, probs + 1 * PMULT
        mov     sym, 1
MY_ALIGN_16
slot_loop:
        BIT_1   x1, x2
        mov     x1, x2
        cmp     sym, 32
        jb      slot_loop

        else

        ; unrolled 6-bit pos-slot decode
        BIT_0   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2
        BIT_1   x2, x1
        BIT_1   x1, x2

        endif

        mov     x1, sym
        BIT_2   x2, 64-1

        and     sym, 3
        mov     probs, LOC probs_Spec
        cmp     x1, 32 + kEndPosModelIndex / 2
        jb      short_dist

        ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
        sub     x1, (32 + 1 + kNumAlignBits)
        ; distance = (2 | (distance & 1));
        or      sym, 2
        PLOAD   x2, probs + 1 * PMULT
        shl     sym, kNumAlignBits + 1
        lea     sym2_R, [probs + 2 * PMULT]

        jmp     direct_norm
        ; lea     t1, [sym_R + (1 SHL kNumAlignBits)]
        ; cmp     range, kTopValue
        ; jb      direct_norm

; ---------- DIRECT DISTANCE ----------
MY_ALIGN_32
direct_loop:
        shr     range, 1
        mov     t0, cod
        sub     cod, range
        cmovs   cod, t0
        cmovns  sym, t1

        comment ~
        sub     cod, range
        mov     x2, cod
        sar     x2, 31
        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
        and     x2, range
        add     cod, x2
        ~
        dec     x1
        je      direct_end

        add     sym, sym
direct_norm:
        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
        cmp     range, kTopValue
        jae     near ptr direct_loop
        ; we align for 32 here with "near ptr" command above
        NORM_2
        jmp     direct_loop

MY_ALIGN_32
direct_end:
        ; prob =  + kAlign;
        ; distance <<= kNumAlignBits;
        REV_0   x2, x1
        REV_1   x1, x2, 2
        REV_1   x2, x1, 4
        REV_2   x1, 8

decode_dist_end:

        ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))

        mov     t1, LOC rep0
        mov     x1, LOC rep1
        mov     x2, LOC rep2

        mov     t0, LOC checkDicSize
        test    t0, t0
        cmove   t0, processedPos
        cmp     sym, t0
        jae     end_of_payload
        ; jmp     end_of_payload ; for debug

        ; rep3 = rep2;
        ; rep2 = rep1;
        ; rep1 = rep0;
        ; rep0 = distance + 1;

        inc     sym
        mov     LOC rep0, sym
        ; mov     sym, LOC remainLen
        mov     sym, len_temp
        mov     LOC rep1, t1
        mov     LOC rep2, x1
        mov     LOC rep3, x2

        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        cmp     state, (kNumStates + kNumLitStates) * PMULT
        mov     state, kNumLitStates * PMULT
        mov     t0, (kNumLitStates + 3) * PMULT
        cmovae  state, t0


; ---------- COPY MATCH ----------
copy_match:

        ; len += kMatchMinLen;
        ; add     sym, kMatchMinLen

        ; if ((rem = limit - dicPos) == 0)
        ; {
        ;   p->dicPos = dicPos;
        ;   return SZ_ERROR_DATA;
        ; }
        mov     cnt_R, LOC limit
        sub     cnt_R, dicPos
        jz      fin_dicPos_LIMIT

        ; curLen = ((rem < len) ? (unsigned)rem : len);
        cmp     cnt_R, sym_R
        ; cmovae  cnt_R, sym_R ; 64-bit
        cmovae  cnt, sym ; 32-bit

        mov     dic, LOC dic_Spec
        mov     x1, LOC rep0

        mov     t0_R, dicPos
        add     dicPos, cnt_R
        ; processedPos += curLen;
        add     processedPos, cnt
        ; len -= curLen;
        sub     sym, cnt
        mov     LOC remainLen, sym

        sub     t0_R, dic

        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
        sub     t0_R, r1
        jae     @f

        mov     r1, LOC dicBufSize
        add     t0_R, r1
        sub     r1, t0_R
        cmp     cnt_R, r1
        ja      copy_match_cross
@@:
        ; if (curLen <= dicBufSize - pos)

; ---------- COPY MATCH FAST ----------
        ; Byte *dest = dic + dicPos;
        ; mov     r1, dic
        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
        ; sub     t0_R, dicPos
        ; dicPos += curLen;

        ; const Byte *lim = dest + curLen;
        add     t0_R, dic
        movzx   sym, byte ptr[t0_R]
        add     t0_R, cnt_R
        neg     cnt_R
        ; lea     r1, [dicPos - 1]
copy_common:
        dec     dicPos
        ; cmp     LOC rep0, 1
        ; je      rep0Label

        ; t0_R - src_lim
        ; r1 - dest_lim - 1
        ; cnt_R - (-cnt)

        IsMatchBranch_Pre
        inc     cnt_R
        jz      copy_end
MY_ALIGN_16
@@:
        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
        inc     cnt_R
        jnz     @b

copy_end:
lz_end_match:
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        ; IsMatchBranch_Pre
        CheckLimits
lz_end:
        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label



; ---------- LITERAL MATCHED ----------

        LIT_PROBS LOC lpMask

        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        mov     x1, LOC rep0
        ; mov     dic, LOC dic_Spec
        mov     LOC dicPos_Spec, dicPos

        ; state -= (state < 10) ? 3 : 6;
        lea     t0, [state_R - 6 * PMULT]
        sub     state, 3 * PMULT
        cmp     state, 7 * PMULT
        cmovae  state, t0

        sub     dicPos, dic
        sub     dicPos, r1
        jae     @f
        add     dicPos, LOC dicBufSize
@@:
        comment ~
        xor     t0, t0
        sub     dicPos, r1
        cmovb   t0_R, LOC dicBufSize
        ~

        movzx   match, byte ptr[dic + dicPos * 1]

        ifdef _LZMA_SIZE_OPT

        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     bit, match
        mov     sym, 1
MY_ALIGN_16
litm_loop:
        LITM
        cmp     sym, 256
        jb      litm_loop
        sub     sym, 256

        else

        LITM_0
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM
        LITM_2

        endif

        mov     probs, LOC probs_Spec
        IsMatchBranch_Pre
        ; mov     dic, LOC dic_Spec
        mov     dicPos, LOC dicPos_Spec
        mov     byte ptr[dicPos], sym_L
        inc     dicPos

        CheckLimits
lit_matched_end:
        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
        ; IsMatchBranch
        mov     lpMask_reg, LOC lpMask
        sub     state, 3 * PMULT
        jmp     lit_start_2



; ---------- REP 0 LITERAL ----------
MY_ALIGN_32
IsRep0Short_label:
        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long

        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        mov     dic, LOC dic_Spec
        mov     t0_R, dicPos
        mov     probBranch, LOC rep0
        sub     t0_R, dic

        sub     probs, RepLenCoder * PMULT

        ; state = state < kNumLitStates ? 9 : 11;
        or      state, 1 * PMULT

        ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
        ; so we don't need the following (dicPos == limit) check here:
        ; cmp     dicPos, LOC limit
        ; jae     fin_dicPos_LIMIT_REP_SHORT

        inc     processedPos

        IsMatchBranch_Pre

;        xor     sym, sym
;        sub     t0_R, probBranch_R
;        cmovb   sym_R, LOC dicBufSize
;        add     t0_R, sym_R
        sub     t0_R, probBranch_R
        jae     @f
        add     t0_R, LOC dicBufSize
@@:
        movzx   sym, byte ptr[dic + t0_R * 1]
        jmp     lz_end_match


MY_ALIGN_32
IsRep_label:
        UPDATE_1 probs_state_R, 0, IsRep

        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
        ; So we don't check it here.

        ; mov     t0, processedPos
        ; or      t0, LOC checkDicSize
        ; jz      fin_ERROR_2

        ; state = state < kNumLitStates ? 8 : 11;
        cmp     state, kNumLitStates * PMULT
        mov     state, 8 * PMULT
        mov     probBranch, 11 * PMULT
        cmovae  state, probBranch

        ; prob = probs + RepLenCoder;
        add     probs, RepLenCoder * PMULT

        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
        jmp     len_decode

MY_ALIGN_32
IsRepG0_label:
        UPDATE_1 probs_state_R, 0, IsRepG0
        mov     dist2, LOC rep0
        mov     dist, LOC rep1
        mov     LOC rep1, dist2

        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
        mov     LOC rep0, dist
        jmp     len_decode

; MY_ALIGN_32
IsRepG1_label:
        UPDATE_1 probs_state_R, 0, IsRepG1
        mov     dist2, LOC rep2
        mov     LOC rep2, dist

        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
        mov     LOC rep0, dist2
        jmp     len_decode

; MY_ALIGN_32
IsRepG2_label:
        UPDATE_1 probs_state_R, 0, IsRepG2
        mov     dist, LOC rep3
        mov     LOC rep3, dist2
        mov     LOC rep0, dist
        jmp     len_decode



; ---------- SPEC SHORT DISTANCE ----------

MY_ALIGN_32
short_dist:
        sub     x1, 32 + 1
        jbe     decode_dist_end
        or      sym, 2
        shl     sym, x1_L
        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
        mov     sym2, PMULT  ; step
MY_ALIGN_32
spec_loop:
        REV_1_VAR x2
        dec     x1
        jnz     spec_loop

        ; convert the walked prob pointer back to the distance value
        mov     probs, LOC probs_Spec
        sub     sym, sym2
        sub     sym, SpecPos * PMULT
        sub     sym_R, probs
        shr     sym, PSHIFT

        jmp     decode_dist_end


; ---------- COPY MATCH CROSS ----------
; match source wraps around the end of the circular dictionary buffer
copy_match_cross:
        ; t0_R - src pos
        ; r1 - len to dicBufSize
        ; cnt_R - total copy len

        mov     t1_R, t0_R  ; srcPos
        mov     t0_R, dic
        mov     r1, LOC dicBufSize ;
        neg     cnt_R
@@:
        movzx   sym, byte ptr[t1_R * 1 + t0_R]
        inc     t1_R
        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
        inc     cnt_R
        cmp     t1_R, r1
        jne     @b

        ; continue from the start of the buffer via the fast-copy path
        movzx   sym, byte ptr[t0_R]
        sub     t0_R, cnt_R
        jmp     copy_common




; fin_dicPos_LIMIT_REP_SHORT:
        ; mov     sym, 1

fin_dicPos_LIMIT:
        mov     LOC remainLen, sym
        jmp     fin_OK
        ; For more strict mode we can stop decoding with error
        ; mov     sym, 1
        ; jmp     fin


fin_ERROR_MATCH_DIST:

        ; rep3 = rep2;
        ; rep2 = rep1;
        ; rep1 = rep0;
        ; rep0 = distance + 1;

        add     len_temp, kMatchSpecLen_Error_Data
        mov     LOC remainLen, len_temp

        mov     LOC rep0, sym
        mov     LOC rep1, t1
        mov     LOC rep2, x1
        mov     LOC rep3, x2

        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        cmp     state, (kNumStates + kNumLitStates) * PMULT
        mov     state, kNumLitStates * PMULT
        mov     t0, (kNumLitStates + 3) * PMULT
        cmovae  state, t0

        ; jmp fin_OK
        mov     sym, 1
        jmp     fin

end_of_payload:
        ; distance == 0xFFFFFFFF (sym wraps to 0) means end-of-stream marker;
        ; any other overlarge distance is a data error
        inc     sym
        jnz     fin_ERROR_MATCH_DIST

        mov     LOC remainLen, kMatchSpecLenStart
        sub     state, kNumStates * PMULT

fin_OK:
        xor     sym, sym

fin:
        ; write the decoder state back to the CLzmaDec structure; return sym
        ; (0 = OK, 1 = error) in x0
        NORM

        mov     r1, LOC lzmaPtr

        sub     dicPos, LOC dic_Spec
        mov     GLOB dicPos_Spec, dicPos
        mov     GLOB buf_Spec, buf
        mov     GLOB range_Spec, range
        mov     GLOB code_Spec, cod
        shr     state, PSHIFT
        mov     GLOB state_Spec, state
        mov     GLOB processedPos_Spec, processedPos

        RESTORE_VAR(remainLen)
        RESTORE_VAR(rep0)
        RESTORE_VAR(rep1)
        RESTORE_VAR(rep2)
        RESTORE_VAR(rep3)

        mov     x0, sym

        mov     RSP, LOC Old_RSP

MY_POP_PRESERVED_ABI_REGS
MY_ENDP

ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
_TEXT$LZMADECOPT ENDS
endif

end