xref: /aosp_15_r20/external/lzma/Asm/x86/LzmaDecOpt.asm (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2; 2024-06-18: Igor Pavlov : Public domain
3;
4; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
5; function for check at link time.
; That code is tightly coupled with LzmaDec_TryDummy()
; and with other functions in the LzmaDec.c file.
8; CLzmaDec structure, (probs) array layout, input and output of
9; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
10
11ifndef x64
12; x64=1
13; .err <x64_IS_REQUIRED>
14endif
15
16include 7zAsm.asm
17
18MY_ASM_START
19
20; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is     defined, we use additional SEGMENT with 64-byte alignment.
21; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is not defined, we use default SEGMENT (where default 16-byte alignment of segment is expected).
22; The performance is almost identical in our tests.
; But the performance can depend on the position of the lzmadec code inside the instruction cache
; or micro-op cache line (depending on the low address bits in 32-byte/64-byte cache lines).
25; And 64-byte alignment provides a more consistent speed regardless
26; of the code's position in the executable.
; But it's also possible that code without Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT can be
; slightly faster than 64-byte aligned code in some cases, if the offset of the lzmadec
; code in the 64-byte block after compilation happens to provide better speed.
30; Note that Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT adds an extra section to the ELF file.
31; If you don't want to get that extra section, do not define Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT.
32
ifndef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
; Default: use the dedicated 64-byte aligned segment on every target.
; NOTE(review): both branches currently select the same value (1);
; the if/else appears to be kept as a template for per-OS tuning.
if (IS_LINUX gt 0)
  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
else
  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
; Dedicated code segment aligned to a 64-byte boundary (cache line size).
_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
; MY_ALIGN num: align the next instruction to 'num' bytes; any power of
; two up to 64 is honored because the enclosing segment is 64-byte aligned.
MY_ALIGN macro num:req
        align  num
        ; align  16
endm
else
; MY_ALIGN num: in the default code segment only 16-byte alignment of the
; segment start can be assumed, so never request more than 16.
MY_ALIGN macro num:req
        ; We expect that ".text" is aligned for 16-bytes.
        ; So we don't need large alignment inside our function.
        align  16
endm
endif
54
55
; Fixed-size alignment helpers; all funnel through MY_ALIGN, which caps
; the actual alignment at 16 when no dedicated segment is used.
MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm
67
68
69; _LZMA_SIZE_OPT  equ 1
70
71; _LZMA_PROB32 equ 1
72
ifdef _LZMA_PROB32
        ; 32-bit probability counters: element size = 4 bytes (PSHIFT = 2).
        PSHIFT  equ 2
        PLOAD macro dest, mem
                mov     dest, dword ptr [mem]
        endm
        PSTORE  macro src, mem
                mov     dword ptr [mem], src
        endm
else
        ; 16-bit probability counters (default): element size = 2 bytes (PSHIFT = 1).
        PSHIFT  equ 1
        PLOAD macro dest, mem
                ; zero-extend the 16-bit prob so 32-bit arithmetic can be used on it
                movzx   dest, word ptr [mem]
        endm
        PSTORE macro src, mem
                ; store only the low 16 bits (@CatStr builds the xN_W register name)
                mov     word ptr [mem], @CatStr(src, _W)
        endm
endif
90
; Byte scale factors for indexing the probs array:
;   PMULT      = sizeof(one prob element)
;   PMULT_HALF = PMULT / 2
;   PMULT_2    = PMULT * 2
PMULT           equ (1 SHL PSHIFT)
PMULT_HALF      equ (1 SHL (PSHIFT - 1))
PMULT_2         equ (1 SHL (PSHIFT + 1))

; special length marker used to signal a data error;
; must match the value used by the C side (see file header about coupling)
kMatchSpecLen_Error_Data equ (1 SHL 9)
96
;       x0      range
;       x1      pbPos / (prob) TREE
;       x2      probBranch / prm (MATCHED) / pbPos / cnt
;       x3      sym
;====== r4 ===  RSP
;       x5      cod
;       x6      t1 NORM_CALC / probs_state / dist
;       x7      t0 NORM_CALC / prob2 IF_BIT_1
;       x8      state
;       x9      match (MATCHED) / sym2 / dist2 / lpMask_reg
;       x10     kBitModelTotal_reg
;       r11     probs
;       x12     offs (MATCHED) / dic / len_temp
;       x13     processedPos
;       x14     bit (MATCHED) / dicPos
;       r15     buf

; Symbolic register names. Several names alias the same physical register
; (see the map above) and are live only in disjoint phases of the decoder,
; e.g. probBranch/cnt/prm = x2/r2, offs/dic/len_temp = x12/r12,
; bit/dicPos = x14/r14, match/sym2/dist2/lpMask_reg = x9/r9.

cod     equ x5
cod_L   equ x5_L
range   equ x0
state   equ x8
state_R equ r8
buf     equ r15
processedPos equ x13
kBitModelTotal_reg equ x10

probBranch   equ x2
probBranch_R equ r2
probBranch_W equ x2_W

pbPos   equ x1
pbPos_R equ r1

cnt     equ x2
cnt_R   equ r2

lpMask_reg equ x9
dicPos  equ r14

sym     equ x3
sym_R   equ r3
sym_L   equ x3_L

probs   equ r11
dic     equ r12

t0      equ x7
t0_W    equ x7_W
t0_R    equ r7

prob2   equ t0
prob2_W equ t0_W

t1      equ x6
t1_R    equ r6

probs_state     equ t1
probs_state_R   equ t1_R

prm     equ r2
match   equ x9
match_R equ r9
offs    equ x12
offs_R  equ r12
bit     equ x14
bit_R   equ r14

sym2    equ x9
sym2_R  equ r9

len_temp equ x12

dist    equ sym
dist2   equ x9
172
173
174
; Range-coder model constants (standard LZMA values):
kNumBitModelTotalBits   equ 11
kBitModelTotal          equ (1 SHL kNumBitModelTotalBits)       ; 2048
kNumMoveBits            equ 5
kBitModelOffset         equ ((1 SHL kNumMoveBits) - 1)          ; 31
kTopValue               equ (1 SHL 24)                          ; normalization threshold for range
180
; NORM_2: unconditional range-coder normalization step:
; shift one input byte from [buf] into the low byte of cod,
; scale range by 256, and advance the input pointer.
NORM_2 macro
        ; movzx   t0, BYTE PTR [buf]
        shl     cod, 8
        mov     cod_L, BYTE PTR [buf]
        shl     range, 8
        ; or      cod, t0
        inc     buf
endm


; NORM: normalize only when range has fallen below kTopValue.
NORM macro
        cmp     range, kTopValue
        jae     SHORT @F
        NORM_2
@@:
endm
197
198
199; ---------- Branch MACROS ----------
200
; UPDATE_0: model update for a decoded 0-bit.
; Precondition (set up by CMP_COD): range already holds the bound,
; probBranch holds the current probability.
; Effect: prob += (kBitModelTotal - prob) >> kNumMoveBits, stored back.
UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
        mov     prob2, kBitModelTotal_reg
        sub     prob2, probBranch
        shr     prob2, kNumMoveBits
        add     probBranch, prob2
        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
endm


; UPDATE_1: range-coder and model update for a decoded 1-bit.
; Precondition (from CMP_COD): prob2 = old range, range = bound.
; Effect: range = old_range - bound; cod -= bound;
;         prob -= prob >> kNumMoveBits, stored back.
UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
        sub     prob2, range
        sub     cod, range
        mov     range, prob2
        mov     prob2, probBranch
        shr     probBranch, kNumMoveBits
        sub     prob2, probBranch
        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
endm


; CMP_COD: load the probability, normalize, compute
; bound = (range >> kNumBitModelTotalBits) * prob, and compare cod to it.
; Post: probBranch = prob, prob2 = old range, range = bound;
; flags: carry (b) => bit 0, no carry (ae) => bit 1.
CMP_COD macro probsArray:req, probOffset:req, probDisp:req
        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
        NORM
        mov     prob2, range
        shr     range, kNumBitModelTotalBits
        imul    range, probBranch
        cmp     cod, range
endm


; IF_BIT_1_NOUP: jump to toLabel when the decoded bit is 1;
; no model update is performed on either path.
IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jae     toLabel
endm


; IF_BIT_1: jump to toLabel on bit 1 (the jump target is expected to
; apply UPDATE_1 itself); on fall-through (bit 0) apply UPDATE_0 here.
IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
        UPDATE_0 probsArray, probOffset, probDisp
endm


; IF_BIT_0_NOUP: jump to toLabel when the decoded bit is 0;
; no model update is performed on either path.
IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jb      toLabel
endm
247
248
249; ---------- CMOV MACROS ----------
250
; NORM_CALC: normalize, then compute the bound and both tentative updates.
; Post: range = bound, t0 = old_range - bound, t1 = old cod,
;       cod = cod - bound.
; The carry flag from the final sub selects the bit:
;       b (carry)  => bit 0 (cod must be restored from t1)
;       ae         => bit 1 (range must be replaced by t0)
NORM_CALC macro prob:req
        NORM
        mov     t0, range
        shr     range, kNumBitModelTotalBits
        imul    range, prob
        sub     t0, range
        mov     t1, cod
        sub     cod, range
endm


; PUP: branch-free probability update shared by the 0/1 paths.
; t0 must hold kBitModelTotal (bit 0) or kBitModelOffset (bit 1);
; computes prob += (t0 - prob) >> kNumMoveBits (arithmetic shift) and stores it.
PUP macro prob:req, probPtr:req
        sub     t0, prob
       ; only sar works for both 16/32 bit prob modes
        sar     t0, kNumMoveBits
        add     t0, prob
        PSTORE  t0, probPtr
endm


; PUP_SUB: fold the decoded bit into the symbol using the carry left by
; NORM_CALC (sym = sym - symSub - CF), then update the probability.
PUP_SUB macro prob:req, probPtr:req, symSub:req
        sbb     sym, symSub
        PUP prob, probPtr
endm


; PUP_COD: common tail of the cmov-style bit decoders:
; restore cod on bit 0 (cmovb from t1), select the PUP constant into t0,
; save the pre-sbb sym into t1 (callers address probPtr via t1_R),
; then finish with PUP_SUB.
PUP_COD macro prob:req, probPtr:req, symSub:req
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        mov     t1, sym
        cmovb   t0, kBitModelTotal_reg
        PUP_SUB prob, probPtr, symSub
endm
284
285
; BIT_0: decode the first bit of a bit tree (root index = 1).
; Loads the root prob and prefetches both child probs, selecting the
; taken child into probNext with cmov.
; Leaves sym = 2 + bit (sbb with -1 adds 1 - CF to the initial 2).
BIT_0 macro prob:req, probNext:req
        PLOAD   prob, probs + 1 * PMULT
        PLOAD   probNext, probs + 1 * PMULT_2

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
        cmovae  probNext, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 2
        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
endm


; BIT_1: decode a middle tree bit. sym is the current tree index;
; it is doubled here and the new bit is folded in by PUP_COD's sbb.
; Both child probs are prefetched and selected with cmov.
BIT_1 macro prob:req, probNext:req
        PLOAD   probNext, probs + sym_R * PMULT_2
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + sym_R * PMULT + PMULT
        cmovae  probNext, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
endm


; BIT_2: decode the final tree bit; symSub removes the tree-index base
; so that sym ends up as the plain decoded symbol.
BIT_2 macro prob:req, symSub:req
        add     sym, sym

        NORM_CALC prob

        cmovae  range, t0
        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
endm
324
325
326; ---------- MATCHED LITERAL ----------
327
; LITM_0: first bit of a matched literal. On entry 'match' holds the
; match byte. offs/bit implement the switch between the "matched" and
; plain probability sub-tables: bit extracts the match byte's current
; top bit (after the PSHIFT+1 scaling), which offsets the prob address
; by 0 or 256 prob elements.
LITM_0 macro
        mov     offs, 256 * PMULT
        shl     match, (PSHIFT + 1)
        mov     bit, offs
        and     bit, match
        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
        ; lea     prm, [probs + 256 * PMULT + 1 * PMULT]
        ; add     prm, bit_R
        xor     offs, bit
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        cmovb   t0, kBitModelTotal_reg
        mov     sym, 0
        PUP_SUB x1, prm, -2-1
endm


; LITM: middle step of a matched literal. While the decoded bits agree
; with the match byte's bits, offs keeps selecting the matched sub-table;
; once they diverge, offs collapses so the plain table is used from then on.
LITM macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        xor     offs, bit
        add     sym, sym
        add     match, match

        NORM_CALC x1

        cmovae  offs, bit
        mov     bit, match
        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
endm


; LITM_2: final step of a matched literal; the 256 - 1 symSub strips the
; tree-index base so sym becomes the decoded literal byte.
LITM_2 macro
        and     bit, offs
        lea     prm, [probs + offs_R * 1]
        add     prm, bit_R
        PLOAD   x1, prm + sym_R * PMULT
        add     sym, sym

        NORM_CALC x1

        cmovae  range, t0
        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
endm
383
384
385; ---------- REVERSE BITS ----------
386
; REV_0: first bit of the fixed-size reverse bit tree (used after
; direct_end for the align-distance bits). sym2_R walks the tree as a
; byte pointer, starting at probs + 2*PMULT; 'prob' must already hold
; the root probability (probs[1]).
REV_0 macro prob:req, probNext:req
        ; PLOAD   prob, probs + 1 * PMULT
        ; lea     sym2_R, [probs + 2 * PMULT]
        ; PLOAD   probNext, probs + 2 * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, probs + 3 * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [probs + 3 * PMULT]
        cmovae  sym2_R, t1_R
        PUP prob, probs + 1 * PMULT
endm


; REV_1: middle step of the reverse bit tree; 'step' is the node stride
; (in prob elements) for the current tree level. Advances sym2_R to one
; of the two children with cmov and updates the parent probability.
REV_1 macro prob:req, probNext:req, step:req
        add     sym2_R, step * PMULT
        PLOAD   probNext, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        PLOAD   t0, sym2_R + step * PMULT
        cmovae  probNext, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        lea     t1_R, [sym2_R + step * PMULT]
        cmovae  sym2_R, t1_R
        PUP prob, t1_R - step * PMULT_2
endm


; REV_2: final step: convert the tree pointer in sym2_R back to an
; element index, OR the decoded (bit-reversed) bits into sym, and
; update the last probability.
REV_2 macro prob:req, step:req
        sub     sym2_R, probs
        shr     sym2, PSHIFT
        or      sym, sym2

        NORM_CALC prob

        cmovae  range, t0
        lea     t0, [sym - step]
        cmovb   sym, t0
        cmovb   cod, t1
        mov     t0, kBitModelOffset
        cmovb   t0, kBitModelTotal_reg
        PUP prob, probs + sym2_R * PMULT
endm


; REV_1_VAR: one step of the variable-length reverse bit tree
; (presumably for the SpecPos short distances — the caller is beyond
; this chunk). sym_R holds the current prob pointer and sym2 the
; current bit weight, doubled each step.
REV_1_VAR macro prob:req
        PLOAD   prob, sym_R
        mov     probs, sym_R
        add     sym_R, sym2_R

        NORM_CALC prob

        cmovae  range, t0
        lea     t0_R, [sym_R + 1 * sym2_R]
        cmovae  sym_R, t0_R
        mov     t0, kBitModelOffset
        cmovb   cod, t1
        ; mov     t1, kBitModelTotal
        ; cmovb   t0, t1
        cmovb   t0, kBitModelTotal_reg
        add     sym2, sym2
        PUP prob, probs
endm
460
461
462
463
; LIT_PROBS: compute the literal-probs base pointer from processedPos,
; the previous dictionary byte (expected in sym on entry) and lc/lp,
; apply UPDATE_0 to the IsMatch slot, and advance processedPos.
LIT_PROBS macro lpMaskParam:req
        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
        mov     t0, processedPos
        shl     t0, 8
        add     sym, t0
        and     sym, lpMaskParam
        add     probs_state_R, pbPos_R
        mov     x1, LOC lc2
        lea     sym, dword ptr[sym_R + 2 * sym_R]       ; sym *= 3
        add     probs, Literal * PMULT
        shl     sym, x1_L                               ; << (lc + PSHIFT): lc2 = lc + PSHIFT (see init)
        add     probs, sym_R
        UPDATE_0 probs_state_R, 0, IsMatch
        inc     processedPos
endm
479
480
481
; LZMA model constants:
kNumPosBitsMax          equ 4
kNumPosStatesMax        equ (1 SHL kNumPosBitsMax)

kLenNumLowBits          equ 3
kLenNumLowSymbols       equ (1 SHL kLenNumLowBits)
kLenNumHighBits         equ 8
kLenNumHighSymbols      equ (1 SHL kLenNumHighBits)
kNumLenProbs            equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)

; Offsets inside one length coder:
LenLow                  equ 0
LenChoice               equ LenLow
LenChoice2              equ (LenLow + kLenNumLowSymbols)
LenHigh                 equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)

kNumStates              equ 12
kNumStates2             equ 16
kNumLitStates           equ 7

kStartPosModelIndex     equ 4
kEndPosModelIndex       equ 14
kNumFullDistances       equ (1 SHL (kEndPosModelIndex SHR 1))

kNumPosSlotBits         equ 6
kNumLenToPosStates      equ 4

kNumAlignBits           equ 4
kAlignTableSize         equ (1 SHL kNumAlignBits)

kMatchMinLen            equ 2
kMatchSpecLenStart      equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)

; Probs-table offsets, expressed relative to the biased base pointer
; (probs_1664 = probs + kStartOffset elements), so SpecPos is negative.
kStartOffset    equ 1664
SpecPos         equ (-kStartOffset)
IsRep0Long      equ (SpecPos + kNumFullDistances)
RepLenCoder     equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
LenCoder        equ (RepLenCoder + kNumLenProbs)
IsMatch         equ (LenCoder + kNumLenProbs)
kAlign          equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
IsRep           equ (kAlign + kAlignTableSize)
IsRepG0         equ (IsRep + kNumStates)
IsRepG1         equ (IsRepG0 + kNumStates)
IsRepG2         equ (IsRepG1 + kNumStates)
PosSlot         equ (IsRepG2 + kNumStates)
Literal         equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
NUM_BASE_PROBS  equ (Literal + kStartOffset)

; Compile-time layout checks: kAlign must land exactly at the biased
; base (offset 0) and the total size must match the C-side layout.
if kAlign ne 0
  .err <Stop_Compiling_Bad_LZMA_kAlign>
endif

if NUM_BASE_PROBS ne 1984
  .err <Stop_Compiling_Bad_LZMA_PROBS>
endif
535
536
; Field type for pointer-sized struct members.
PTR_FIELD equ dq ?

; CLzmaDec_Asm: ASM-side mirror of the C CLzmaDec structure.
; The layout must stay identical to the C version (see the file header
; about C/ASM coupling).
CLzmaDec_Asm struct
        lc      db ?            ; literal context bits
        lp      db ?            ; literal position bits
        pb      db ?            ; position bits
        _pad_   db ?
        dicSize dd ?

        probs_Spec      PTR_FIELD
        probs_1664      PTR_FIELD       ; probs + 1664 elements (biased base used by this code)
        dic_Spec        PTR_FIELD
        dicBufSize      PTR_FIELD
        dicPos_Spec     PTR_FIELD
        buf_Spec        PTR_FIELD

        range_Spec      dd ?
        code_Spec       dd ?
        processedPos_Spec  dd ?
        checkDicSize    dd ?
        rep0    dd ?
        rep1    dd ?
        rep2    dd ?
        rep3    dd ?
        state_Spec      dd ?
        remainLen dd ?
CLzmaDec_Asm ends
564
565
; CLzmaDec_Asm_Loc: local-variable frame addressed via RSP while the
; decoder runs (RSP is switched to a 128-byte aligned block below the
; caller's frame in the prologue).
CLzmaDec_Asm_Loc struct
        OLD_RSP    PTR_FIELD            ; caller's RSP, restored on exit
        lzmaPtr    PTR_FIELD            ; saved CLzmaDec pointer
        _pad0_     PTR_FIELD
        _pad1_     PTR_FIELD
        _pad2_     PTR_FIELD
        dicBufSize PTR_FIELD
        probs_Spec PTR_FIELD            ; biased probs base (probs_1664)
        dic_Spec   PTR_FIELD

        limit      PTR_FIELD            ; dic + limit (absolute end pointer)
        bufLimit   PTR_FIELD            ; input buffer end pointer
        lc2       dd ?                  ; lc + PSHIFT (precomputed shift amount)
        lpMask    dd ?
        pbMask    dd ?
        checkDicSize   dd ?

        _pad_     dd ?
        remainLen dd ?
        dicPos_Spec     PTR_FIELD
        rep0      dd ?
        rep1      dd ?
        rep2      dd ?
        rep3      dd ?
CLzmaDec_Asm_Loc ends
591
592
; Struct-field addressing helpers:
;   GLOB_2 - CLzmaDec fields via sym_R (holds the struct pointer during init)
;   GLOB   - CLzmaDec fields via r1
;   LOC_0  - local frame via r0 (used before/while RSP is switched)
;   LOC    - local frame via RSP (normal case inside the decoder)
GLOB_2  equ [sym_R].CLzmaDec_Asm.
GLOB    equ [r1].CLzmaDec_Asm.
LOC_0   equ [r0].CLzmaDec_Asm_Loc.
LOC     equ [RSP].CLzmaDec_Asm_Loc.
597
598
; COPY_VAR: copy one same-named field from the CLzmaDec structure
; (GLOB_2, via sym_R) into the local frame (LOC_0, via r0). Clobbers t0.
COPY_VAR macro name
        mov     t0, GLOB_2 name
        mov     LOC_0 name, t0
endm


; RESTORE_VAR: write one local (LOC, via RSP) back into the CLzmaDec
; structure (GLOB, via r1). Clobbers t0.
RESTORE_VAR macro name
        mov     t0, LOC name
        mov     GLOB name, t0
endm
609
610
611
; IsMatchBranch_Pre: prepare the IsMatch probe:
;   pbPos         = (processedPos & pbMask) << (kNumPosBitsMax + PSHIFT)
;                   (byte offset; note kLenNumLowBits + 1 == kNumPosBitsMax)
;   probs_state_R = probs + state (state is kept pre-scaled by PSHIFT)
; The 'reg' parameter is unused.
IsMatchBranch_Pre macro reg
        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
        mov     pbPos, LOC pbMask
        and     pbPos, processedPos
        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
        lea     probs_state_R, [probs + 1 * state_R]
endm


; IsMatchBranch: probe the IsMatch bit and jump to IsMatch_label when it
; decodes to 1; UPDATE_0 is applied on the fall-through (literal) path.
; The 'reg' parameter is unused.
IsMatchBranch macro reg
        IsMatchBranch_Pre
        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
endm


; CheckLimits: leave the main loop (jump to fin_OK) when either the
; input buffer or the dictionary write position reaches its limit.
; The 'reg' parameter is unused.
CheckLimits macro reg
        cmp     buf, LOC bufLimit
        jae     fin_OK
        cmp     dicPos, LOC limit
        jae     fin_OK
endm
633
634
635
; RSP is (16x + 8) bytes aligned in WIN64-x64
; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)

; Incoming arguments in the ABI parameter registers (see 7zAsm.asm):
PARAM_lzma      equ REG_ABI_PARAM_0     ; CLzmaDec pointer
PARAM_limit     equ REG_ABI_PARAM_1     ; dic limit (offset; converted to a pointer by adding dic below)
PARAM_bufLimit  equ REG_ABI_PARAM_2     ; input buffer limit pointer

ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
  MY_ALIGN_16
endif
648MY_PROC LzmaDec_DecodeReal_3, 3
649MY_PUSH_PRESERVED_ABI_REGS
650
651        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
652        and     r0, -128
653        mov     r5, RSP
654        mov     RSP, r0
655        mov     LOC_0 Old_RSP, r5
656        mov     LOC_0 lzmaPtr, PARAM_lzma
657
658        mov     LOC_0 remainLen, 0  ; remainLen must be ZERO
659
660        mov     LOC_0 bufLimit, PARAM_bufLimit
661        mov     sym_R, PARAM_lzma  ;  CLzmaDec_Asm_Loc pointer for GLOB_2
662        mov     dic, GLOB_2 dic_Spec
663        add     PARAM_limit, dic
664        mov     LOC_0 limit, PARAM_limit
665
666        COPY_VAR(rep0)
667        COPY_VAR(rep1)
668        COPY_VAR(rep2)
669        COPY_VAR(rep3)
670
671        mov     dicPos, GLOB_2 dicPos_Spec
672        add     dicPos, dic
673        mov     LOC_0 dicPos_Spec, dicPos
674        mov     LOC_0 dic_Spec, dic
675
676        mov     x1_L, GLOB_2 pb
677        mov     t0, 1
678        shl     t0, x1_L
679        dec     t0
680        mov     LOC_0 pbMask, t0
681
682        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
683        ; unsigned lc = p->prop.lc;
684        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
685
686        mov     x1_L, GLOB_2 lc
687        mov     x2, 100h
688        mov     t0, x2
689        shr     x2, x1_L
690        ; inc     x1
691        add     x1_L, PSHIFT
692        mov     LOC_0 lc2, x1
693        mov     x1_L, GLOB_2 lp
694        shl     t0, x1_L
695        sub     t0, x2
696        mov     LOC_0 lpMask, t0
697        mov     lpMask_reg, t0
698
699        ; mov     probs, GLOB_2 probs_Spec
700        ; add     probs, kStartOffset SHL PSHIFT
701        mov     probs, GLOB_2 probs_1664
702        mov     LOC_0 probs_Spec, probs
703
704        mov     t0_R, GLOB_2 dicBufSize
705        mov     LOC_0 dicBufSize, t0_R
706
707        mov     x1, GLOB_2 checkDicSize
708        mov     LOC_0 checkDicSize, x1
709
710        mov     processedPos, GLOB_2 processedPos_Spec
711
712        mov     state, GLOB_2 state_Spec
713        shl     state, PSHIFT
714
715        mov     buf,   GLOB_2 buf_Spec
716        mov     range, GLOB_2 range_Spec
717        mov     cod,   GLOB_2 code_Spec
718        mov     kBitModelTotal_reg, kBitModelTotal
719        xor     sym, sym
720
721        ; if (processedPos != 0 || checkDicSize != 0)
722        or      x1, processedPos
723        jz      @f
724
725        add     t0_R, dic
726        cmp     dicPos, dic
727        cmovnz  t0_R, dicPos
728        movzx   sym, byte ptr[t0_R - 1]
729
730@@:
731        IsMatchBranch_Pre
732        cmp     state, 4 * PMULT
733        jb      lit_end
734        cmp     state, kNumLitStates * PMULT
735        jb      lit_matched_end
736        jmp     lz_end
737
738
739
740
741; ---------- LITERAL ----------
742MY_ALIGN_64
743lit_start:
744        xor     state, state
745lit_start_2:
746        LIT_PROBS lpMask_reg
747
748    ifdef _LZMA_SIZE_OPT
749
750        PLOAD   x1, probs + 1 * PMULT
751        mov     sym, 1
752MY_ALIGN_16
753lit_loop:
754        BIT_1   x1, x2
755        mov     x1, x2
756        cmp     sym, 127
757        jbe     lit_loop
758
759    else
760
761        BIT_0   x1, x2
762        BIT_1   x2, x1
763        BIT_1   x1, x2
764        BIT_1   x2, x1
765        BIT_1   x1, x2
766        BIT_1   x2, x1
767        BIT_1   x1, x2
768
769    endif
770
771        BIT_2   x2, 256 - 1
772
773        ; mov     dic, LOC dic_Spec
774        mov     probs, LOC probs_Spec
775        IsMatchBranch_Pre
776        mov     byte ptr[dicPos], sym_L
777        inc     dicPos
778
779        CheckLimits
780lit_end:
781        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
782
783        ; jmp     IsMatch_label
784
785; ---------- MATCHES ----------
786; MY_ALIGN_32
787IsMatch_label:
788        UPDATE_1 probs_state_R, pbPos_R, IsMatch
789        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
790
791        add     probs, LenCoder * PMULT
792        add     state, kNumStates * PMULT
793
794; ---------- LEN DECODE ----------
795len_decode:
796        mov     len_temp, 8 - 1 - kMatchMinLen
797        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
798        UPDATE_1 probs, 0, 0
799        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
800        mov     len_temp, -1 - kMatchMinLen
801        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
802        UPDATE_1 probs, 0, 0
803        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
804        mov     sym, 1
805        PLOAD   x1, probs + 1 * PMULT
806
807MY_ALIGN_32
808len8_loop:
809        BIT_1   x1, x2
810        mov     x1, x2
811        cmp     sym, 64
812        jb      len8_loop
813
814        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
815        jmp     short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs
816
817MY_ALIGN_32
818len_mid_0:
819        UPDATE_0 probs, 0, 0
820        add     probs, pbPos_R
821        BIT_0   x2, x1
822len_mid_2:
823        BIT_1   x1, x2
824        BIT_2   x2, len_temp
825        mov     probs, LOC probs_Spec
826        cmp     state, kNumStates * PMULT
827        jb      copy_match
828
829
830; ---------- DECODE DISTANCE ----------
831        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
832
833        mov     t0, 3 + kMatchMinLen
834        cmp     sym, 3 + kMatchMinLen
835        cmovb   t0, sym
836        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
837        shl     t0, (kNumPosSlotBits + PSHIFT)
838        add     probs, t0_R
839
840        ; sym = Len
841        ; mov     LOC remainLen, sym
842        mov     len_temp, sym
843
844    ifdef _LZMA_SIZE_OPT
845
846        PLOAD   x1, probs + 1 * PMULT
847        mov     sym, 1
848MY_ALIGN_16
849slot_loop:
850        BIT_1   x1, x2
851        mov     x1, x2
852        cmp     sym, 32
853        jb      slot_loop
854
855    else
856
857        BIT_0   x1, x2
858        BIT_1   x2, x1
859        BIT_1   x1, x2
860        BIT_1   x2, x1
861        BIT_1   x1, x2
862
863    endif
864
865        mov     x1, sym
866        BIT_2   x2, 64-1
867
868        and     sym, 3
869        mov     probs, LOC probs_Spec
870        cmp     x1, 32 + kEndPosModelIndex / 2
871        jb      short_dist
872
873        ;  unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
874        sub     x1, (32 + 1 + kNumAlignBits)
875        ;  distance = (2 | (distance & 1));
876        or      sym, 2
877        PLOAD   x2, probs + 1 * PMULT
878        shl     sym, kNumAlignBits + 1
879        lea     sym2_R, [probs + 2 * PMULT]
880
881        jmp     direct_norm
882        ; lea     t1, [sym_R + (1 SHL kNumAlignBits)]
883        ; cmp     range, kTopValue
884        ; jb      direct_norm
885
886; ---------- DIRECT DISTANCE ----------
887MY_ALIGN_32
888direct_loop:
889        shr     range, 1
890        mov     t0, cod
891        sub     cod, range
892        cmovs   cod, t0
893        cmovns  sym, t1
894
895        comment ~
896        sub     cod, range
897        mov     x2, cod
898        sar     x2, 31
899        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
900        and     x2, range
901        add     cod, x2
902        ~
903        dec     x1
904        je      direct_end
905
906        add     sym, sym
907direct_norm:
908        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
909        cmp     range, kTopValue
910        jae     near ptr direct_loop
911        ; we align for 32 here with "near ptr" command above
912        NORM_2
913        jmp     direct_loop
914
915MY_ALIGN_32
916direct_end:
917        ;  prob =  + kAlign;
918        ;  distance <<= kNumAlignBits;
919        REV_0   x2, x1
920        REV_1   x1, x2, 2
921        REV_1   x2, x1, 4
922        REV_2   x1, 8
923
924decode_dist_end:
925
926        ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
927
928        mov     t1, LOC rep0
929        mov     x1, LOC rep1
930        mov     x2, LOC rep2
931
932        mov     t0, LOC checkDicSize
933        test    t0, t0
934        cmove   t0, processedPos
935        cmp     sym, t0
936        jae     end_of_payload
937        ; jmp     end_of_payload ; for debug
938
939        ; rep3 = rep2;
940        ; rep2 = rep1;
941        ; rep1 = rep0;
942        ; rep0 = distance + 1;
943
944        inc     sym
945        mov     LOC rep0, sym
946        ; mov     sym, LOC remainLen
947        mov     sym, len_temp
948        mov     LOC rep1, t1
949        mov     LOC rep2, x1
950        mov     LOC rep3, x2
951
952        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
953        cmp     state, (kNumStates + kNumLitStates) * PMULT
954        mov     state, kNumLitStates * PMULT
955        mov     t0, (kNumLitStates + 3) * PMULT
956        cmovae  state, t0
957
958
959; ---------- COPY MATCH ----------
960copy_match:
961
962        ; len += kMatchMinLen;
963        ; add     sym, kMatchMinLen
964
965        ; if ((rem = limit - dicPos) == 0)
966        ; {
967        ;   p->dicPos = dicPos;
968        ;   return SZ_ERROR_DATA;
969        ; }
970        mov     cnt_R, LOC limit
971        sub     cnt_R, dicPos
972        jz      fin_dicPos_LIMIT
973
974        ; curLen = ((rem < len) ? (unsigned)rem : len);
975        cmp     cnt_R, sym_R
976        ; cmovae  cnt_R, sym_R ; 64-bit
977        cmovae  cnt, sym ; 32-bit
978
979        mov     dic, LOC dic_Spec
980        mov     x1, LOC rep0
981
982        mov     t0_R, dicPos
983        add     dicPos, cnt_R
984        ; processedPos += curLen;
985        add     processedPos, cnt
986        ; len -= curLen;
987        sub     sym, cnt
988        mov     LOC remainLen, sym
989
990        sub     t0_R, dic
991
992        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
993        sub     t0_R, r1
994        jae     @f
995
996        mov     r1, LOC dicBufSize
997        add     t0_R, r1
998        sub     r1, t0_R
999        cmp     cnt_R, r1
1000        ja      copy_match_cross
1001@@:
1002        ; if (curLen <= dicBufSize - pos)
1003
1004; ---------- COPY MATCH FAST ----------
1005        ; Byte *dest = dic + dicPos;
1006        ; mov     r1, dic
1007        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
1008        ; sub   t0_R, dicPos
1009        ; dicPos += curLen;
1010
1011        ; const Byte *lim = dest + curLen;
1012        add     t0_R, dic
1013        movzx   sym, byte ptr[t0_R]
1014        add     t0_R, cnt_R
1015        neg     cnt_R
1016        ; lea     r1, [dicPos - 1]
1017copy_common:
1018        dec     dicPos
1019        ; cmp   LOC rep0, 1
1020        ; je    rep0Label
1021
1022        ; t0_R - src_lim
1023        ; r1 - dest_lim - 1
1024        ; cnt_R - (-cnt)
1025
1026        IsMatchBranch_Pre
1027        inc     cnt_R
1028        jz      copy_end
1029MY_ALIGN_16
1030@@:
1031        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
1032        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
1033        inc     cnt_R
1034        jnz     @b
1035
1036copy_end:
1037lz_end_match:
1038        mov     byte ptr[dicPos], sym_L
1039        inc     dicPos
1040
1041        ; IsMatchBranch_Pre
1042        CheckLimits
1043lz_end:
1044        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1045
1046
1047
1048; ---------- LITERAL MATCHED ----------
1049
1050        LIT_PROBS LOC lpMask
1051
1052        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1053        mov     x1, LOC rep0
1054        ; mov     dic, LOC dic_Spec
1055        mov     LOC dicPos_Spec, dicPos
1056
1057        ; state -= (state < 10) ? 3 : 6;
1058        lea     t0, [state_R - 6 * PMULT]
1059        sub     state, 3 * PMULT
1060        cmp     state, 7 * PMULT
1061        cmovae  state, t0
1062
1063        sub     dicPos, dic
1064        sub     dicPos, r1
1065        jae     @f
1066        add     dicPos, LOC dicBufSize
1067@@:
1068        comment ~
1069        xor     t0, t0
1070        sub     dicPos, r1
1071        cmovb   t0_R, LOC dicBufSize
1072        ~
1073
1074        movzx   match, byte ptr[dic + dicPos * 1]
1075
1076    ifdef _LZMA_SIZE_OPT
1077
1078        mov     offs, 256 * PMULT
1079        shl     match, (PSHIFT + 1)
1080        mov     bit, match
1081        mov     sym, 1
1082MY_ALIGN_16
1083litm_loop:
1084        LITM
1085        cmp     sym, 256
1086        jb      litm_loop
1087        sub     sym, 256
1088
1089    else
1090
1091        LITM_0
1092        LITM
1093        LITM
1094        LITM
1095        LITM
1096        LITM
1097        LITM
1098        LITM_2
1099
1100    endif
1101
1102        mov     probs, LOC probs_Spec
1103        IsMatchBranch_Pre
1104        ; mov     dic, LOC dic_Spec
1105        mov     dicPos, LOC dicPos_Spec
1106        mov     byte ptr[dicPos], sym_L
1107        inc     dicPos
1108
1109        CheckLimits
1110lit_matched_end:
1111        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1112        ; IsMatchBranch
1113        mov     lpMask_reg, LOC lpMask
1114        sub     state, 3 * PMULT
1115        jmp     lit_start_2
1116
1117
1118
; ---------- REP 0 LITERAL ----------
; "Short rep" (SHORTREP): a 1-byte match at distance rep0.
MY_ALIGN_32
IsRep0Short_label:
        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long

        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
        mov     dic, LOC dic_Spec
        mov     t0_R, dicPos
        mov     probBranch, LOC rep0    ; probBranch reused as scratch for rep0
        sub     t0_R, dic               ; t0 = dicPos as an offset into dic

        sub     probs, RepLenCoder * PMULT  ; undo the add done on the IsRep path

        ; state = state < kNumLitStates ? 9 : 11;
        or      state, 1 * PMULT

        ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
        ; so we don't need the following (dicPos == limit) check here:
        ; cmp     dicPos, LOC limit
        ; jae     fin_dicPos_LIMIT_REP_SHORT

        inc     processedPos

        IsMatchBranch_Pre

; branchy wrap-around: src = dicPos - rep0, plus dicBufSize on underflow
; (the commented cmov variant below is the branchless alternative)
;        xor     sym, sym
;        sub     t0_R, probBranch_R
;        cmovb   sym_R, LOC dicBufSize
;        add     t0_R, sym_R
        sub     t0_R, probBranch_R
        jae     @f
        add     t0_R, LOC dicBufSize    ; wrap circular-buffer underflow
@@:
        movzx   sym, byte ptr[dic + t0_R * 1]   ; fetch the repeated byte
        jmp     lz_end_match
1154
1155
; ---------- REP MATCH ----------
; IsRep bit was 1: a repeated-distance match follows. The IsRepG0/1/2
; probability bits below select which of rep0..rep3 is used.
MY_ALIGN_32
IsRep_label:
        UPDATE_1 probs_state_R, 0, IsRep

        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
        ; So we don't check it here.

        ; mov     t0, processedPos
        ; or      t0, LOC checkDicSize
        ; jz      fin_ERROR_2

        ; state = state < kNumLitStates ? 8 : 11;
        cmp     state, kNumLitStates * PMULT
        mov     state, 8 * PMULT
        mov     probBranch, 11 * PMULT
        cmovae  state, probBranch       ; branchless state select

        ; prob = probs + RepLenCoder;
        add     probs, RepLenCoder * PMULT

        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
        ; IsRepG0 == 0: distance is rep0; IsRep0Long picks full match vs 1-byte rep
        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
        jmp     len_decode
1180
; ---------- REP1 / REP2 / REP3 ----------
; IsRepG0 == 1: distance is rep1, rep2 or rep3. Each case rotates the
; distance history so the selected distance becomes the new rep0:
;   rep1 case: (rep0,rep1)           -> (rep1,rep0)
;   rep2 case: (rep0,rep1,rep2)      -> (rep2,rep0,rep1)
;   rep3 case: (rep0,rep1,rep2,rep3) -> (rep3,rep0,rep1,rep2)
MY_ALIGN_32
IsRepG0_label:
        UPDATE_1 probs_state_R, 0, IsRepG0
        mov     dist2, LOC rep0
        mov     dist, LOC rep1
        mov     LOC rep1, dist2         ; rep1 = old rep0

        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
        mov     LOC rep0, dist          ; rep0 = old rep1
        jmp     len_decode

; MY_ALIGN_32
IsRepG1_label:
        UPDATE_1 probs_state_R, 0, IsRepG1
        mov     dist2, LOC rep2
        mov     LOC rep2, dist          ; rep2 = old rep1

        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
        mov     LOC rep0, dist2         ; rep0 = old rep2
        jmp     len_decode

; MY_ALIGN_32
IsRepG2_label:
        UPDATE_1 probs_state_R, 0, IsRepG2
        mov     dist, LOC rep3
        mov     LOC rep3, dist2         ; rep3 = old rep2
        mov     LOC rep0, dist          ; rep0 = old rep3
        jmp     len_decode
1209
1210
1211
; ---------- SPEC SHORT DISTANCE ----------
; Reverse bit-tree decode of the low distance bits using the SpecPos
; probability area (short distances have their extra bits coded with
; adaptive probabilities, reversed bit order).
; NOTE(review): x1 arrives biased by 32 from the slot decode above this
; view -- 'sub x1, 32 + 1' removes that bias and pre-decrements the
; level count; confirm against the distance-slot code in the full file.
MY_ALIGN_32
short_dist:
        sub     x1, 32 + 1              ; x1 = number of remaining tree levels
        jbe     decode_dist_end         ; smallest slots have no extra bits
        or      sym, 2
        shl     sym, x1_L
        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
        mov     sym2, PMULT ; step
MY_ALIGN_32
spec_loop:
        REV_1_VAR x2                    ; decode one reversed bit; advances sym/sym2
        dec     x1
        jnz     spec_loop

        ; convert the final probability pointer back into a numeric distance
        mov     probs, LOC probs_Spec
        sub     sym, sym2
        sub     sym, SpecPos * PMULT
        sub     sym_R, probs
        shr     sym, PSHIFT             ; undo the PMULT/PSHIFT scaling

        jmp     decode_dist_end
1235
1236
; ---------- COPY MATCH CROSS ----------
; Copies a match whose source runs past the end of the circular
; dictionary buffer: copy byte-by-byte until the source hits
; dicBufSize, then continue via copy_common with the source rewound
; to the start of the buffer.
copy_match_cross:
        ; t0_R - src pos
        ; r1 - len to dicBufSize
        ; cnt_R - total copy len

        mov     t1_R, t0_R         ; srcPos
        mov     t0_R, dic
        mov     r1, LOC dicBufSize   ;
        neg     cnt_R                   ; count copied bytes upward toward zero
@@:
        movzx   sym, byte ptr[t1_R * 1 + t0_R]
        inc     t1_R
        mov     byte ptr[cnt_R * 1 + dicPos], sym_L  ; dest indexed off dicPos
        inc     cnt_R
        cmp     t1_R, r1
        jne     @b                      ; until src offset reaches dicBufSize

        ; wrap: remaining source restarts at dic[0]
        movzx   sym, byte ptr[t0_R]
        sub     t0_R, cnt_R             ; rebias src base (cnt_R is negative remaining)
        jmp     copy_common
1258
1259
1260
1261
1262; fin_dicPos_LIMIT_REP_SHORT:
1263        ; mov     sym, 1
1264
1265fin_dicPos_LIMIT:
1266        mov     LOC remainLen, sym
1267        jmp     fin_OK
1268        ; For more strict mode we can stop decoding with error
1269        ; mov     sym, 1
1270        ; jmp     fin
1271
1272
1273fin_ERROR_MATCH_DIST:
1274
1275        ; rep3 = rep2;
1276        ; rep2 = rep1;
1277        ; rep1 = rep0;
1278        ; rep0 = distance + 1;
1279
1280        add     len_temp, kMatchSpecLen_Error_Data
1281        mov     LOC remainLen, len_temp
1282
1283        mov     LOC rep0, sym
1284        mov     LOC rep1, t1
1285        mov     LOC rep2, x1
1286        mov     LOC rep3, x2
1287
1288        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1289        cmp     state, (kNumStates + kNumLitStates) * PMULT
1290        mov     state, kNumLitStates * PMULT
1291        mov     t0, (kNumLitStates + 3) * PMULT
1292        cmovae  state, t0
1293
1294        ; jmp     fin_OK
1295        mov     sym, 1
1296        jmp     fin
1297
; Distance decode produced the maximum value. sym == 0xFFFFFFFF (so
; 'inc' gives 0) is the end-of-payload marker; any other overlong
; distance is a match-distance error.
end_of_payload:
        inc     sym
        jnz     fin_ERROR_MATCH_DIST

        mov     LOC remainLen, kMatchSpecLenStart   ; signal end marker to C code
        sub     state, kNumStates * PMULT

; success exit: return 0
fin_OK:
        xor     sym, sym

; common exit: write the decoder state back into the CLzmaDec structure
; (accessed through GLOB via the saved lzmaPtr) and restore the stack.
fin:
        NORM                            ; final range-coder normalization

        mov     r1, LOC lzmaPtr         ; r1 = structure base used by GLOB

        sub     dicPos, LOC dic_Spec    ; convert dicPos pointer back to an index
        mov     GLOB dicPos_Spec, dicPos
        mov     GLOB buf_Spec, buf
        mov     GLOB range_Spec, range
        mov     GLOB code_Spec, cod
        shr     state, PSHIFT           ; undo the PMULT (1 << PSHIFT) state scaling
        mov     GLOB state_Spec, state
        mov     GLOB processedPos_Spec, processedPos

        RESTORE_VAR(remainLen)
        RESTORE_VAR(rep0)
        RESTORE_VAR(rep1)
        RESTORE_VAR(rep2)
        RESTORE_VAR(rep3)

        mov     x0, sym                 ; return value: 0 = OK, 1 = error

        mov     RSP, LOC Old_RSP        ; restore caller's stack pointer

MY_POP_PRESERVED_ABI_REGS
MY_ENDP
1334
1335ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
1336_TEXT$LZMADECOPT ENDS
1337endif
1338
1339end
1340