xref: /aosp_15_r20/external/lzma/CPP/7zip/Compress/LzxDecoder.cpp (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1 // LzxDecoder.cpp
2 
3 #include "StdAfx.h"
4 
5 #include <string.h>
6 // #include <stdio.h>
7 
8 // #define SHOW_DEBUG_INFO
9 
10 #ifdef SHOW_DEBUG_INFO
11 #include <stdio.h>
12 #define PRF(x) x
13 #else
14 #define PRF(x)
15 #endif
16 
17 #include "../../../C/Alloc.h"
18 #include "../../../C/RotateDefs.h"
19 #include "../../../C/CpuArch.h"
20 
21 #include "LzxDecoder.h"
22 
23 
// Compile-time selection of the SSE2 path for the x86 (0xE8) filter.
// Enabled only when the target guarantees SSE2 (AMD64, __SSE2__, or
// MSVC /arch:SSE2) AND the compiler is new enough for the intrinsics used.
#ifdef MY_CPU_X86_OR_AMD64
#if defined(MY_CPU_AMD64)  \
    || defined(__SSE2__) \
    || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
    || 0 && defined(_MSC_VER) && (_MSC_VER >= 1400) // set (1 &&) for debug

#if defined(__clang__) && (__clang_major__ >= 2) \
    || defined(__GNUC__) && (__GNUC__ >= 4) \
    || defined(_MSC_VER) && (_MSC_VER >= 1400)
#define Z7_LZX_X86_FILTER_USE_SSE2
#endif
#endif
#endif


#ifdef Z7_LZX_X86_FILTER_USE_SSE2
// #ifdef MY_CPU_X86_OR_AMD64
#include <emmintrin.h> // SSE2
// #endif
  // MY_CTZ(dest, mask): count trailing zero bits of a non-zero mask.
  // ctz_type matches the destination type expected by the chosen intrinsic
  // (int for __builtin_ctz, unsigned long for _BitScanForward).
  #if defined(__clang__) || defined(__GNUC__)
    typedef int ctz_type;
    #define MY_CTZ(dest, mask) dest = __builtin_ctz((UInt32)(mask))
  #else  // #if defined(_MSC_VER)
    #if (_MSC_VER >= 1600)
      // #include <intrin.h>
    #endif
    typedef unsigned long ctz_type;
    #define MY_CTZ(dest, mask)  _BitScanForward(&dest, (mask));
  #endif // _MSC_VER
#endif
54 
// When the window buffer is filled, we must wrap the position to zero,
// and we want to wrap at the same points where original-lzx must wrap.
// But wrapping is only possible at points where a chunk is finished.
// Usually (chunk_size == 32KB), but (chunk_size != 32KB) also is allowed.
// So we don't use additional buffer space over the required (winSize).
// And we can't use a large overwrite after (len) in CopyLzMatch().
// But we are allowed to write 3 bytes after (len), because
// (delta <= _winSize - 3).

// #define k_Lz_OverwriteSize  0  // for debug : to disable overwrite
#define k_Lz_OverwriteSize  3 // = kNumReps
#if k_Lz_OverwriteSize > 0
// (k_Lz_OutBufSize_Add >= k_Lz_OverwriteSize) is required.
// We use value 4 to simplify memset() code.
#define k_Lz_OutBufSize_Add  (k_Lz_OverwriteSize + 1) // == 4
#else
#define k_Lz_OutBufSize_Add  0
#endif
73 
// Copies an LZ match of (len) bytes to (dest) from (src == dest - delta).
// Overlap is handled explicitly: for (delta < 4) the short repeating
// pattern is materialized; for (delta >= 4) forward copying is safe.
// Preconditions: (len != 0), (0 < delta <= _winSize - 3).
// With (k_Lz_OverwriteSize >= 3), up to 3 bytes after (dest + len) may be
// overwritten; the output buffer reserves k_Lz_OutBufSize_Add bytes for that.
Z7_FORCE_INLINE
void CopyLzMatch(Byte *dest, const Byte *src, UInt32 len, UInt32 delta)
{
  if (delta >= 4)
  {
#if k_Lz_OverwriteSize >= 3
    // optimized code with overwrite to reduce the number of branches:
    // copy a 4-byte head unconditionally, then align (len) down below.
  #ifdef MY_CPU_LE_UNALIGN
    *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
  #else
    dest[0] = src[0];
    dest[1] = src[1];
    dest[2] = src[2];
    dest[3] = src[3];
  #endif
    len--;
    src++;
    dest++;
    {
#else
    // no overwrite in out buffer
    dest[0] = src[0];
    {
      const unsigned m = (unsigned)len & 1;
      src += m;
      dest += m;
    }
    if (len &= ~(unsigned)1)
    {
      dest[0] = src[0];
      dest[1] = src[1];
#endif
      // len == 0 is allowed here
      {
        // align (len) down to a multiple of 4; the bytes skipped here
        // were already stored by the copies above:
        const unsigned m = (unsigned)len & 3;
        src += m;
        dest += m;
      }
      if (len &= ~(unsigned)3)
      {
#ifdef MY_CPU_LE_UNALIGN
      #if 1
        // one 4-byte pre-copy makes the remaining length a multiple of 8,
        // then copy 8 bytes per iteration:
        *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
        {
          const unsigned m = (unsigned)len & 7;
          dest += m;
          src += m;
        }
        if (len &= ~(unsigned)7)
          do
          {
            *(UInt32 *)(void *)(dest    ) = *(const UInt32 *)(const void *)(src);
            *(UInt32 *)(void *)(dest + 4) = *(const UInt32 *)(const void *)(src + 4);
            src += 8;
            dest += 8;
          }
          while (len -= 8);
      #else
        // gcc-11 -O3 for x64 generates incorrect code here
        do
        {
          *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
          src += 4;
          dest += 4;
        }
        while (len -= 4);
      #endif
#else
        // byte-wise, 4x unrolled loop for targets without cheap
        // unaligned 32-bit access:
        do
        {
          const Byte b0 = src[0];
          const Byte b1 = src[1];
          dest[0] = b0;
          dest[1] = b1;
          const Byte b2 = src[2];
          const Byte b3 = src[3];
          dest[2] = b2;
          dest[3] = b3;
          src += 4;
          dest += 4;
        }
        while (len -= 4);
#endif
      }
    }
  }
  else // (delta < 4)
  {
    // overlapping match: the source pattern is only (delta) bytes long
    const unsigned b0 = *src;
    *dest = (Byte)b0;
    if (len >= 2)
    {
      if (delta < 2)
      {
        // (delta == 1) : fill with the single byte (b0)
        dest += (unsigned)len & 1;
        dest[0] = (Byte)b0;
        dest[1] = (Byte)b0;
        dest += (unsigned)len & 2;
        if (len &= ~(unsigned)3)
        {
#ifdef MY_CPU_LE_UNALIGN
          #ifdef MY_CPU_64BIT
          // replicate the byte to a 64-bit word and store 8 bytes per step:
          const UInt64 a = (UInt64)b0 * 0x101010101010101;
          *(UInt32 *)(void *)dest = (UInt32)a;
          dest += (unsigned)len & 7;
          if (len &= ~(unsigned)7)
          {
            // *(UInt64 *)(void *)dest = a;
            // dest += 8;
            // len -= 8;
            // if (len)
            {
              // const ptrdiff_t delta = (ptrdiff_t)dest & 7;
              // dest -= delta;
              do
              {
                *(UInt64 *)(void *)dest = a;
                dest += 8;
              }
              while (len -= 8);
              // dest += delta - 8;
              // *(UInt64 *)(void *)dest = a;
            }
          }
          #else
          // replicate the byte to a 32-bit word and store 4 bytes per step:
          const UInt32 a = (UInt32)b0 * 0x1010101;
          do
          {
            *(UInt32 *)(void *)dest = a;
            dest += 4;
          }
          while (len -= 4);
          #endif
#else
          do
          {
            dest[0] = (Byte)b0;
            dest[1] = (Byte)b0;
            dest[2] = (Byte)b0;
            dest[3] = (Byte)b0;
            dest += 4;
          }
          while (len -= 4);
#endif
        }
      }
      else if (delta == 2)
      {
        // (delta == 2) : alternate two bytes; align so the loop writes pairs
        const unsigned m = (unsigned)len & 1;
        len &= ~(unsigned)1;
        src += m;
        dest += m;
        {
          const Byte a0 = src[0];
          const Byte a1 = src[1];
          do
          {
            dest[0] = a0;
            dest[1] = a1;
            dest += 2;
          }
          while (len -= 2);
        }
      }
      else /* if (delta == 3) */
      {
        // (delta == 3) : repeat the 3-byte pattern (b0, b1, b2)
        const unsigned b1 = src[1];
        dest[1] = (Byte)b1;
        if (len -= 2)
        {
          const unsigned b2 = src[2];
          dest += 2;
          do
          {
            dest[0] = (Byte)b2;  if (--len == 0) break;
            dest[1] = (Byte)b0;  if (--len == 0) break;
            dest[2] = (Byte)b1;
            dest += 3;
          }
          while (--len);
        }
      }
    }
  }
}
261 
262 // #define Z7_LZX_SHOW_STAT
263 #ifdef Z7_LZX_SHOW_STAT
264 #include <stdio.h>
265 #endif
266 
267 namespace NCompress {
268 namespace NLzx {
269 
// Optional decode statistics, collected via UPDATE_STAT() and printed
// at program exit. Debug-only: compiled out unless Z7_LZX_SHOW_STAT is set.
// #define Z7_LZX_SHOW_STAT
#ifdef Z7_LZX_SHOW_STAT
static UInt32 g_stats_Num_x86[3];
static UInt32 g_stats_NumTables;
static UInt32 g_stats_NumLits;
static UInt32 g_stats_NumAlign;
static UInt32 g_stats_main[kMainTableSize];
static UInt32 g_stats_len[kNumLenSymbols];
static UInt32 g_stats_main_levels[kNumHuffmanBits + 1];
static UInt32 g_stats_len_levels[kNumHuffmanBits + 1];
#define UPDATE_STAT(a) a
// Prints a single counter value.
static void PrintVal(UInt32 v)
{
  printf("\n    : %9u", v);
}
// Prints a named histogram: the total, then each entry with its percentage.
static void PrintStat(const char *name, const UInt32 *a, size_t num)
{
  printf("\n\n==== %s:", name);
  UInt32 sum = 0;
  size_t i;
  for (i = 0; i < num; i++)
    sum += a[i];
  PrintVal(sum);
  if (sum != 0)
  {
    for (i = 0; i < num; i++)
    {
      if (i % 8 == 0)
        printf("\n");
      printf("\n%3x : %9u : %5.2f", (unsigned)i, (unsigned)a[i], (double)a[i] * 100 / sum);
    }
  }
  printf("\n");
}

// Static object whose destructor dumps all collected statistics at exit.
static struct CStat
{
  ~CStat()
  {
    PrintStat("x86_filter", g_stats_Num_x86, Z7_ARRAY_SIZE(g_stats_Num_x86));
    printf("\nTables:"); PrintVal(g_stats_NumTables);
    printf("\nLits:");   PrintVal(g_stats_NumLits);
    printf("\nAlign:");  PrintVal(g_stats_NumAlign);
    PrintStat("Main", g_stats_main, Z7_ARRAY_SIZE(g_stats_main));
    PrintStat("Len", g_stats_len, Z7_ARRAY_SIZE(g_stats_len));
    PrintStat("Main Levels", g_stats_main_levels, Z7_ARRAY_SIZE(g_stats_main_levels));
    PrintStat("Len Levels", g_stats_len_levels, Z7_ARRAY_SIZE(g_stats_len_levels));
  }
} g_stat;
#else
// in normal builds UPDATE_STAT(a) compiles to nothing:
#define UPDATE_STAT(a)
#endif
322 
323 
324 
325 /*
326 3 p015  : ivb-   : or r32,r32 / add r32,r32
327 4 p0156 : hsw+
328 5 p0156b: adl+
329 2 p0_5  : ivb-   : shl r32,i8
330 2 p0__6 : hsw+
331 1 p5    : ivb-   : jb
332 2 p0__6 : hsw+
333 2 p0_5  : wsm-    : SSE2  : pcmpeqb  : _mm_cmpeq_epi8
334 2 p_15  : snb-bdw
335 2 p01   : skl+
336 1 p0              : SSE2  : pmovmskb : _mm_movemask_epi8
337 */
338 /*
339   v24.00: the code was fixed for more compatibility with original-ms-cab-decoder.
340   for ((Int32)translationSize >= 0) : LZX specification shows the code with signed Int32.
341   for ((Int32)translationSize <  0) : no specification for that case, but we support that case.
342   We suppose our code now is compatible with original-ms-cab-decoder.
343 
344   Starting byte of data stream (real_pos == 0) is special corner case,
345   where we don't need any conversion (as in original-ms-cab-decoder).
346   Our optimization: we use unsigned (UInt32 pos) (pos = -1 - real_pos).
347   So (pos) is always negative: ((Int32)pos < 0).
348   It allows us to use simple comparison (v > pos) instead of more complex comparisons.
349 */
// (p) will point 5 bytes after 0xe8 byte:
// pos == -1 - (p - 5 - data_start) == 4 + data_start - p
// (FILTER_PROCESSED_SIZE_DELTA == 4) is optimized value for better speed in some compilers:
#define FILTER_PROCESSED_SIZE_DELTA  4

#if defined(MY_CPU_X86_OR_AMD64) || defined(MY_CPU_ARM_OR_ARM64)
  // optimized branch:
  // size_t must be at least 32-bit for this branch.
  #if 1 // use 1 for simpler code
    // use integer (low 32 bits of pointer) instead of pointer
    #define X86_FILTER_PREPARE  processedSize4 = (UInt32)(size_t)(ptrdiff_t)data + \
        (UInt32)(4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
    #define X86_FILTER_CALC_pos(p)  const UInt32 pos = processedSize4 - (UInt32)(size_t)(ptrdiff_t)p;
  #else
    // note: (dataStart) pointer can point out of array ranges:
    #define X86_FILTER_PREPARE  const Byte *dataStart = data + \
                (4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
    #define X86_FILTER_CALC_pos(p)  const UInt32 pos = (UInt32)(size_t)(dataStart - p);
  #endif
#else
  // non-optimized branch for unusual platforms (16-bit size_t or unusual size_t):
    #define X86_FILTER_PREPARE  processedSize4 = \
        (UInt32)(4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
    #define X86_FILTER_CALC_pos(p)  const UInt32 pos = processedSize4 - (UInt32)(size_t)(p - data);
#endif

// loads the 32-bit value stored in the 4 bytes before (p):
#define X86_TRANSLATE_PRE(p) \
    UInt32 v = GetUi32((p) - 4);

// converts the loaded value between address forms and stores it back;
// the three branches match the v24.00 compatibility note above:
#define X86_TRANSLATE_POST(p) \
  { \
    X86_FILTER_CALC_pos(p) \
    if (v < translationSize) { \
      UPDATE_STAT(g_stats_Num_x86[0]++;) \
      v += pos + 1; \
      SetUi32((p) - 4, v) \
    } \
    else if (v > pos) { \
      UPDATE_STAT(g_stats_Num_x86[1]++;) \
      v += translationSize; \
      SetUi32((p) - 4, v) \
    } else { UPDATE_STAT(g_stats_Num_x86[2]++;) } \
  }
393 
394 
/*
  if (   defined(Z7_LZX_X86_FILTER_USE_SSE2)
      && defined(Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED))
    the function can read up to aligned_for_32_up_from(size) bytes in (data).
*/
// Scans (data, size) for 0xE8 bytes (x86 call opcode) and converts the
// 32-bit value that follows each one (see X86_TRANSLATE_POST above).
// processedSize < (1 << 30)
Z7_NO_INLINE
static void x86_Filter4(Byte *data, size_t size, UInt32 processedSize4, UInt32 translationSize)
{
  // the last kResidue bytes are never translated; a sentinel 0xE8 is placed
  // at (lim) so the scan loops need no per-iteration end-of-buffer check:
  const size_t kResidue = 10;
  if (size <= kResidue)
    return;
  Byte * const lim = data + size - kResidue + 4;
  const Byte save = lim[0];
  lim[0] = 0xe8;
  X86_FILTER_PREPARE
  Byte *p = data;

// restore the sentinel byte and return when the scan reaches (lim):
#define FILTER_RETURN_IF_LIM(_p_)  if (_p_ > lim) { lim[0] = save; return; }

#ifdef Z7_LZX_X86_FILTER_USE_SSE2

// sse2-aligned/sse2-unaligned provide same speed on real data.
// but the code is smaller for sse2-unaligned version.
// for debug : define it to get alternative version with aligned 128-bit reads:
// #define Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED

#define FILTER_MASK_INT  UInt32
#define FILTER_NUM_VECTORS_IN_CHUNK   2
#define FILTER_CHUNK_BYTES_OFFSET     (16 * FILTER_NUM_VECTORS_IN_CHUNK - 5)

#ifdef Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
  // aligned version doesn't uses additional space if buf size is aligned for 32
  #define k_Filter_OutBufSize_Add  0
  #define k_Filter_OutBufSize_AlignMask  (16 * FILTER_NUM_VECTORS_IN_CHUNK - 1)
  #define FILTER_LOAD_128(p)  _mm_load_si128 ((const __m128i *)(const void *)(p))
#else
  #define k_Filter_OutBufSize_Add  (16 * FILTER_NUM_VECTORS_IN_CHUNK)
  #define k_Filter_OutBufSize_AlignMask 0
  #define FILTER_LOAD_128(p)  _mm_loadu_si128((const __m128i *)(const void *)(p))
#endif

// reads the next 32 bytes and produces two 16-bit masks marking 0xE8 positions:
#define GET_E8_MASK(dest, dest1, p) \
{ \
  __m128i v0 = FILTER_LOAD_128(p); \
  __m128i v1 = FILTER_LOAD_128(p + 16); \
  p += 16 * FILTER_NUM_VECTORS_IN_CHUNK; \
  v0 = _mm_cmpeq_epi8(v0, k_e8_Vector); \
  v1 = _mm_cmpeq_epi8(v1, k_e8_Vector); \
  dest  = (unsigned)_mm_movemask_epi8(v0); \
  dest1 = (unsigned)_mm_movemask_epi8(v1); \
}

  const __m128i k_e8_Vector = _mm_set1_epi32((Int32)(UInt32)0xe8e8e8e8);
  for (;;)
  {
      // for debug: define it for smaller code:
      // #define Z7_LZX_X86_FILTER_CALC_IN_LOOP
      // without Z7_LZX_X86_FILTER_CALC_IN_LOOP, we can get faster and simpler loop
    FILTER_MASK_INT mask;
    {
      FILTER_MASK_INT mask1;
      // scan 32 bytes per iteration until some 0xE8 byte is found
      // (the sentinel guarantees termination):
      do
      {
        GET_E8_MASK(mask, mask1, p)
        #ifndef Z7_LZX_X86_FILTER_CALC_IN_LOOP
          mask += mask1;
        #else
          mask |= mask1 << 16;
        #endif
      }
      while (!mask);

      #ifndef Z7_LZX_X86_FILTER_CALC_IN_LOOP
        // rebuild the exact combined 32-bit mask from the cheap sum:
        mask -= mask1;
        mask |= mask1 << 16;
      #endif
    }

#ifdef Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
    for (;;)
    {
      ctz_type index;
      typedef
      #ifdef MY_CPU_64BIT
        UInt64
      #else
        UInt32
      #endif
        SUPER_MASK_INT;
      SUPER_MASK_INT superMask;
      {
        MY_CTZ(index, mask);
        Byte *p2 = p - FILTER_CHUNK_BYTES_OFFSET + (unsigned)index;
        X86_TRANSLATE_PRE(p2)
        // clear the 5 mask bits covered by this instruction (0xE8 + 4 bytes):
        superMask = ~(SUPER_MASK_INT)0x1f << index;
        FILTER_RETURN_IF_LIM(p2)
        X86_TRANSLATE_POST(p2)
        mask &= (UInt32)superMask;
      }
      if (mask)
        continue;
      if (index <= FILTER_CHUNK_BYTES_OFFSET)
        break;
      {
        FILTER_MASK_INT mask1;
        GET_E8_MASK(mask, mask1, p)
        mask &=
            #ifdef MY_CPU_64BIT
              (UInt32)(superMask >> 32);
            #else
              ((FILTER_MASK_INT)0 - 1) << ((int)index - FILTER_CHUNK_BYTES_OFFSET);
            #endif
        mask |= mask1 << 16;
      }
      if (!mask)
        break;
    }
#else // ! Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
    {
      // we use simplest version without loop:
      // for (;;)
      {
        ctz_type index;
        MY_CTZ(index, mask);
        /*
        printf("\np=%p, mask=%8x, index = %2d, p + index = %x\n",
            (p - 16 * FILTER_NUM_VECTORS_IN_CHUNK), (unsigned)mask,
            (unsigned)index, (unsigned)((unsigned)(ptrdiff_t)(p - 16 * FILTER_NUM_VECTORS_IN_CHUNK) + index));
        */
        p += (size_t)(unsigned)index - FILTER_CHUNK_BYTES_OFFSET;
        FILTER_RETURN_IF_LIM(p)
        // mask &= ~(FILTER_MASK_INT)0x1f << index;  mask >>= index;
        X86_TRANSLATE_PRE(p)
        X86_TRANSLATE_POST(p)
        // if (!mask) break; // p += 16 * FILTER_NUM_VECTORS_IN_CHUNK;
      }
    }
#endif // ! Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
  }

#else // ! Z7_LZX_X86_FILTER_USE_SSE2

#define k_Filter_OutBufSize_Add  0
#define k_Filter_OutBufSize_AlignMask 0

  // scalar scan: 4 bytes checked per inner iteration
  for (;;)
  {
    for (;;)
    {
      if (p[0] == 0xe8) { p += 5; break; }
      if (p[1] == 0xe8) { p += 6; break; }
      if (p[2] == 0xe8) { p += 7; break; }
      p += 4;
      if (p[-1] == 0xe8) { p += 4; break; }
    }
    FILTER_RETURN_IF_LIM(p)
    X86_TRANSLATE_PRE(p)
    X86_TRANSLATE_POST(p)
  }

#endif // ! Z7_LZX_X86_FILTER_USE_SSE2
}
559 
560 
561 CDecoder::CDecoder() throw():
562     _win(NULL),
563     _isUncompressedBlock(false),
564     _skipByte(false),
565     _keepHistory(false),
566     _keepHistoryForNext(true),
567     _needAlloc(true),
568     _wimMode(false),
569     _numDictBits(15),
570     _unpackBlockSize(0),
571     _x86_translationSize(0),
572     _x86_buf(NULL),
573     _unpackedData(NULL)
574 {
575   {
576     // it's better to get empty virtual entries, if mispredicted value can be used:
577     memset(_reps, 0, kPosSlotOffset * sizeof(_reps[0]));
578     memset(_extra, 0, kPosSlotOffset);
579 #define SET_NUM_BITS(i) i // #define NUM_BITS_DELTA 31
580     _extra[kPosSlotOffset + 0] = SET_NUM_BITS(0);
581     _extra[kPosSlotOffset + 1] = SET_NUM_BITS(0);
582     // reps[0] = 0 - (kNumReps - 1);
583     // reps[1] = 1 - (kNumReps - 1);
584     UInt32 a = 2 - (kNumReps - 1);
585     UInt32 delta = 1;
586     unsigned i;
587     for (i = 0; i < kNumLinearPosSlotBits; i++)
588     {
589       _extra[(size_t)i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
590       _extra[(size_t)i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
591       _reps [(size_t)i * 2 + 2 + kPosSlotOffset] = a;  a += delta;
592       _reps [(size_t)i * 2 + 3 + kPosSlotOffset] = a;  a += delta;
593       delta += delta;
594     }
595     for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
596     {
597       _extra[(size_t)i + kPosSlotOffset] = SET_NUM_BITS(kNumLinearPosSlotBits);
598       _reps [(size_t)i + kPosSlotOffset] = a;
599       a += (UInt32)1 << kNumLinearPosSlotBits;
600     }
601   }
602 }
603 
604 CDecoder::~CDecoder() throw()
605 {
606   if (_needAlloc)
607     // BigFree
608     z7_AlignedFree
609       (_win);
610   z7_AlignedFree(_x86_buf);
611 }
612 
// Runs the x86 (0xE8) filter over the data produced since the last flush.
// In keep-history mode the chunk is first copied into a separate buffer
// (_x86_buf) so the window itself is not modified.
// Returns S_OK; E_NOTIMPL if the chunk exceeds 32 KB in keep-history mode;
// E_OUTOFMEMORY if the side buffer cannot be allocated.
HRESULT CDecoder::Flush() throw()
{
  // UInt32 t = _x86_processedSize; for (int y = 0; y < 50; y++) { _x86_processedSize = t; // benchmark: (branch predicted)
  if (_x86_translationSize != 0)
  {
    Byte *destData = _win + _writePos;
    const UInt32 curSize = _pos - _writePos;
    if (_keepHistoryForNext)
    {
      const size_t kChunkSize = (size_t)1 << 15;
      if (curSize > kChunkSize)
        return E_NOTIMPL;
      if (!_x86_buf)
      {
        // (kChunkSize % 32 == 0) is required in some cases, because
        // the filter can read data by 32-bytes chunks in some cases.
        // if (chunk_size > (1 << 15)) is possible, then we must change this code:
        const size_t kAllocSize = kChunkSize + k_Filter_OutBufSize_Add;
        _x86_buf = (Byte *)z7_AlignedAlloc(kAllocSize);
        if (!_x86_buf)
          return E_OUTOFMEMORY;
        #if 0 != k_Filter_OutBufSize_Add || \
            0 != k_Filter_OutBufSize_AlignMask
          // x86_Filter4() can read after curSize.
          // So we set all data to zero to prevent reading of uninitialized data:
          memset(_x86_buf, 0, kAllocSize); // optional
        #endif
      }
      // for (int yy = 0; yy < 1; yy++) // for debug
      memcpy(_x86_buf, destData, curSize);
      _unpackedData = _x86_buf;
      destData = _x86_buf;
    }
    else
    {
      // x86_Filter4() can overread after (curSize),
      // so we can do memset() after (curSize):
      // k_Filter_OutBufSize_AlignMask also can be used
      // if (!_overDict) memset(destData + curSize, 0, k_Filter_OutBufSize_Add);
    }
    x86_Filter4(destData, curSize, _x86_processedSize - FILTER_PROCESSED_SIZE_DELTA, _x86_translationSize);
    _x86_processedSize += (UInt32)curSize;
    // the filter is defined only for the first 1 GB of the stream;
    // after that the translation is disabled:
    if (_x86_processedSize >= ((UInt32)1 << 30))
      _x86_translationSize = 0;
  }
  // }
  return S_OK;
}
661 
662 
663 
// (NUM_DELTA_BYTES == 2) reduces the code in main loop.
#if 1
  #define NUM_DELTA_BYTES  2
#else
  #define NUM_DELTA_BYTES  0
#endif

#define NUM_DELTA_BIT_OFFSET_BITS  (NUM_DELTA_BYTES * 8)

// (_bitOffset) grows toward (0 - NUM_DELTA_BIT_OFFSET_BITS); passing that
// limit means the input was overread:
#if NUM_DELTA_BIT_OFFSET_BITS > 0
  #define DECODE_ERROR_CODE  0
  #define IS_OVERFLOW_bitOffset(bo)  ((bo) >= 0)
  // ( >= 0) comparison after bitOffset change gives simpler commands than ( > 0) comparison
#else
  #define DECODE_ERROR_CODE  1
  #define IS_OVERFLOW_bitOffset(bo)  ((bo) >  0)
#endif

// (numBits != 0)
#define GET_VAL_BASE(numBits)  (_value >> (32 - (numBits)))

// Huffman-decode one symbol from the high 32 bits of (_value):
#define Z7_LZX_HUFF_DECODE( sym, huff, kNumTableBits, move_pos_op, check_op, error_op) \
    Z7_HUFF_DECODE_VAL_IN_HIGH32(sym, huff, kNumHuffmanBits, kNumTableBits,  \
        _value, check_op, error_op, move_pos_op, NORMALIZE, bs)

#define Z7_LZX_HUFF_DECODE_CHECK_YES(sym, huff, kNumTableBits, move_pos_op) \
        Z7_LZX_HUFF_DECODE(          sym, huff, kNumTableBits, move_pos_op, \
            Z7_HUFF_DECODE_ERROR_SYM_CHECK_YES, { return DECODE_ERROR_CODE; })

#define Z7_LZX_HUFF_DECODE_CHECK_NO( sym, huff, kNumTableBits, move_pos_op) \
        Z7_LZX_HUFF_DECODE(          sym, huff, kNumTableBits, move_pos_op, \
            Z7_HUFF_DECODE_ERROR_SYM_CHECK_NO, {})

// reloads (_value) with the next 32 stream bits at the current bit position:
#define NORMALIZE \
{ \
  const Byte *ptr = _buf + (_bitOffset >> 4) * 2; \
  /* _value = (((UInt32)GetUi16(ptr) << 16) | GetUi16(ptr + 2)) << (_bitOffset & 15); */ \
  const UInt32 v = GetUi32(ptr); \
  _value = rotlFixed (v, ((int)_bitOffset & 15) + 16); \
}

#define MOVE_POS(bs, numBits) \
{ \
  _bitOffset += numBits; \
}

#define MOVE_POS_STAT(bs, numBits) \
{ \
  UPDATE_STAT(g_stats_len_levels[numBits]++;) \
  MOVE_POS(bs, numBits); \
}

// advance the bit position and fail on overread:
#define MOVE_POS_CHECK(bs, numBits) \
{ \
  if (IS_OVERFLOW_bitOffset(_bitOffset += numBits)) return DECODE_ERROR_CODE; \
}

#define MOVE_POS_CHECK_STAT(bs, numBits) \
{ \
  UPDATE_STAT(g_stats_main_levels[numBits]++;) \
  MOVE_POS_CHECK(bs, numBits) \
}


// (numBits == 0) is supported

#ifdef Z7_HUFF_USE_64BIT_LIMIT

#define MACRO_ReadBitsBig_pre(numBits) \
{ \
  _bitOffset += (numBits); \
  _value >>= 32 - (numBits); \
}

#else

// (>> 1 >> (31 ^ numBits)) equals (>> (32 - numBits)) for numBits in [0, 31]
// and, unlike a single shift, is well-defined for (numBits == 0):
#define MACRO_ReadBitsBig_pre(numBits) \
{ \
  _bitOffset += (numBits); \
  _value = (UInt32)((UInt32)_value >> 1 >> (31 ^ (numBits))); \
}

#endif


#define MACRO_ReadBitsBig_add(dest) \
  { dest += (UInt32)_value; }

#define MACRO_ReadBitsBig_add3(dest) \
  { dest += (UInt32)(_value) << 3; }


// (numBits != 0)
#define MACRO_ReadBits_NonZero(val, numBits) \
{ \
  val = (UInt32)(_value >> (32 - (numBits))); \
  MOVE_POS(bs, numBits); \
  NORMALIZE \
}
763 
764 
765 struct CBitDecoder
766 {
767   ptrdiff_t _bitOffset;
768   const Byte *_buf;
769 
770   Z7_FORCE_INLINE
771   UInt32 GetVal() const
772   {
773     const Byte *ptr = _buf + (_bitOffset >> 4) * 2;
774     const UInt32 v = GetUi32(ptr);
775     return rotlFixed (v, ((int)_bitOffset & 15) + 16);
776   }
777 
778   Z7_FORCE_INLINE
779   bool IsOverRead() const
780   {
781     return _bitOffset > (int)(0 - NUM_DELTA_BIT_OFFSET_BITS);
782   }
783 
784 
785   Z7_FORCE_INLINE
786   bool WasBitStreamFinishedOK() const
787   {
788     // we check that all 0-15 unused bits are zeros:
789     if (_bitOffset == 0 - NUM_DELTA_BIT_OFFSET_BITS)
790       return true;
791     if ((_bitOffset + NUM_DELTA_BIT_OFFSET_BITS + 15) & ~(ptrdiff_t)15)
792       return false;
793     const Byte *ptr = _buf - NUM_DELTA_BYTES - 2;
794     if ((UInt16)(GetUi16(ptr) << (_bitOffset & 15)))
795       return false;
796     return true;
797   }
798 
799   // (numBits != 0)
800   Z7_FORCE_INLINE
801   UInt32 ReadBits_NonZero(unsigned numBits) throw()
802   {
803     const UInt32 val = GetVal() >> (32 - numBits);
804     _bitOffset += numBits;
805     return val;
806   }
807 };
808 
809 
810 class CBitByteDecoder: public CBitDecoder
811 {
812   size_t _size;
813 public:
814 
815   Z7_FORCE_INLINE
816   void Init_ByteMode(const Byte *data, size_t size)
817   {
818     _buf = data;
819     _size = size;
820   }
821 
822   Z7_FORCE_INLINE
823   void Init_BitMode(const Byte *data, size_t size)
824   {
825     _size = size & 1;
826     size &= ~(size_t)1;
827     _buf = data + size + NUM_DELTA_BYTES;
828     _bitOffset = 0 - (ptrdiff_t)(size * 8) - NUM_DELTA_BIT_OFFSET_BITS;
829   }
830 
831   Z7_FORCE_INLINE
832   void Switch_To_BitMode()
833   {
834     Init_BitMode(_buf, _size);
835   }
836 
837   Z7_FORCE_INLINE
838   bool Switch_To_ByteMode()
839   {
840     /* here we check that unused bits in high 16-bits word are zeros.
841        If high word is full (all 16-bits are unused),
842        we check that all 16-bits are zeros.
843        So we check and skip (1-16 bits) unused bits */
844     if ((GetVal() >> (16 + (_bitOffset & 15))) != 0)
845       return false;
846     _bitOffset += 16;
847     _bitOffset &= ~(ptrdiff_t)15;
848     if (_bitOffset > 0 - NUM_DELTA_BIT_OFFSET_BITS)
849       return false;
850     const ptrdiff_t delta = _bitOffset >> 3;
851     _size = (size_t)((ptrdiff_t)(_size) - delta - NUM_DELTA_BYTES);
852     _buf += delta;
853     // _bitOffset = 0; // optional
854     return true;
855   }
856 
857   Z7_FORCE_INLINE
858   size_t GetRem() const { return _size; }
859 
860   Z7_FORCE_INLINE
861   UInt32 ReadUInt32()
862   {
863     const Byte *ptr = _buf;
864     const UInt32 v = GetUi32(ptr);
865     _buf += 4;
866     _size -= 4;
867     return v;
868   }
869 
870   Z7_FORCE_INLINE
871   void CopyTo(Byte *dest, size_t size)
872   {
873     memcpy(dest, _buf, size);
874     _buf += size;
875     _size -= size;
876   }
877 
878   Z7_FORCE_INLINE
879   bool IsOneDirectByteLeft() const
880   {
881     return GetRem() == 1;
882   }
883 
884   Z7_FORCE_INLINE
885   Byte DirectReadByte()
886   {
887     _size--;
888     return *_buf++;
889   }
890 };
891 
892 
893 // numBits != 0
894 // Z7_FORCE_INLINE
895 Z7_NO_INLINE
896 static
897 UInt32 ReadBits(CBitDecoder &_bitStream, unsigned numBits)
898 {
899   return _bitStream.ReadBits_NonZero(numBits);
900 }
901 
// note: RIF returns (false), so it is only correct in functions whose
// error result converts from false (bool, or integer where error == 0):
#define RIF(x) { if (!(x)) return false; }


/*
MSVC compiler adds extra move operation,
  if we access array with 32-bit index
    array[calc_index_32_bit(32-bit_var)]
    where calc_index_32_bit operations are: ((unsigned)a>>cnt), &, ^, |
  clang is also affected for ((unsigned)a>>cnt) in byte array.
*/
912 
913 // it can overread input buffer for 7-17 bytes.
914 // (levels != levelsEnd)
915 Z7_NO_INLINE
916 static ptrdiff_t ReadTable(ptrdiff_t _bitOffset, const Byte *_buf, Byte *levels, const Byte *levelsEnd)
917 {
918   const unsigned kNumTableBits_Level = 7;
919   NHuffman::CDecoder256<kNumHuffmanBits, kLevelTableSize, kNumTableBits_Level> _levelDecoder;
920   NHuffman::CValueInt _value;
921   // optional check to reduce size of overread zone:
922   if (_bitOffset > (int)0 - (int)NUM_DELTA_BIT_OFFSET_BITS - (int)(kLevelTableSize * kNumLevelBits))
923     return DECODE_ERROR_CODE;
924   NORMALIZE
925   {
926     Byte levels2[kLevelTableSize / 4 * 4];
927     for (size_t i = 0; i < kLevelTableSize / 4 * 4; i += 4)
928     {
929       UInt32 val;
930       MACRO_ReadBits_NonZero(val, kNumLevelBits * 4)
931       levels2[i + 0] = (Byte)((val >> (3 * kNumLevelBits)));
932       levels2[i + 1] = (Byte)((val >> (2 * kNumLevelBits)) & ((1u << kNumLevelBits) - 1));
933       levels2[i + 2] = (Byte)((Byte)val >> (1 * kNumLevelBits));
934       levels2[i + 3] = (Byte)((val) & ((1u << kNumLevelBits) - 1));
935     }
936     RIF(_levelDecoder.Build(levels2, NHuffman::k_BuildMode_Full))
937   }
938 
939   do
940   {
941     unsigned sym;
942     Z7_LZX_HUFF_DECODE_CHECK_NO(sym, &_levelDecoder, kNumTableBits_Level, MOVE_POS_CHECK)
943     // Z7_HUFF_DECODE_CHECK(sym, &_levelDecoder, kNumHuffmanBits, kNumTableBits_Level, &bitStream, return false)
944     // sym = _levelDecoder.Decode(&bitStream);
945     // if (!_levelDecoder.Decode_SymCheck_MovePosCheck(&bitStream, sym)) return false;
946 
947     if (sym <= kNumHuffmanBits)
948     {
949       int delta = (int)*levels - (int)sym;
950       delta += delta < 0 ? kNumHuffmanBits + 1 : 0;
951       *levels++ = (Byte)delta;
952       continue;
953     }
954 
955     unsigned num;
956     int symbol;
957 
958     if (sym < kLevelSym_Same)
959     {
960       // sym -= kLevelSym_Zero1;
961       MACRO_ReadBits_NonZero(num, kLevelSym_Zero1_NumBits + (sym - kLevelSym_Zero1))
962       num += (sym << kLevelSym_Zero1_NumBits) - (kLevelSym_Zero1 << kLevelSym_Zero1_NumBits) + kLevelSym_Zero1_Start;
963       symbol = 0;
964     }
965     // else if (sym != kLevelSym_Same) return DECODE_ERROR_CODE;
966     else // (sym == kLevelSym_Same)
967     {
968       MACRO_ReadBits_NonZero(num, kLevelSym_Same_NumBits)
969       num += kLevelSym_Same_Start;
970       // + (unsigned)bitStream.ReadBitsSmall(kLevelSym_Same_NumBits);
971       // Z7_HUFF_DECODE_CHECK(sym, &_levelDecoder, kNumHuffmanBits, kNumTableBits_Level, &bitStream, return DECODE_ERROR_CODE)
972       // if (!_levelDecoder.Decode2(&bitStream, sym)) return DECODE_ERROR_CODE;
973       // sym = _levelDecoder.Decode(&bitStream);
974 
975       Z7_LZX_HUFF_DECODE_CHECK_NO(sym, &_levelDecoder, kNumTableBits_Level, MOVE_POS)
976 
977       if (sym > kNumHuffmanBits) return DECODE_ERROR_CODE;
978       symbol = *levels - (int)sym;
979       symbol += symbol < 0 ? kNumHuffmanBits + 1 : 0;
980     }
981 
982     if (num > (size_t)(levelsEnd - levels))
983       return false;
984     const Byte *limit = levels + num;
985     do
986       *levels++ = (Byte)symbol;
987     while (levels != limit);
988   }
989   while (levels != levelsEnd);
990 
991   return _bitOffset;
992 }
993 
994 
// NOTE(review): presumably the offset used when mapping main-table match
// symbols (>= 256 literals) to position-slot table entries — the decode
// loop that uses it is further down the file; confirm there.
static const unsigned kPosSlotDelta = 256 / kNumLenSlots - kPosSlotOffset;


// Reads one level table via ReadTable() and makes the enclosing (bool)
// function return false if the bit stream was overread:
#define READ_TABLE(_bitStream, levels, levelsEnd) \
{ \
  _bitStream._bitOffset = ReadTable(_bitStream._bitOffset, _bitStream._buf, levels, levelsEnd); \
  if (_bitStream.IsOverRead()) return false; \
}
1003 
// can over-read input buffer for less than 32 bytes
//
// Reads one LZX block header and prepares all tables needed to decode it:
//  - block type and unpacked block size
//  - uncompressed block: switches the stream to byte mode and reads the
//    3 repeat distances stored in the header
//  - aligned-offset block: reads the align code lengths and builds the
//    align decoder (unless degenerate), and adjusts the _extra[] table
//  - verbatim/aligned: reads main and length code-length tables and
//    builds the main and length Huffman decoders
// Returns false on malformed data or stream over-read.
bool CDecoder::ReadTables(CBitByteDecoder &_bitStream) throw()
{
  UPDATE_STAT(g_stats_NumTables++;)
  {
    const unsigned blockType = (unsigned)ReadBits(_bitStream, kBlockType_NumBits);
    // if (blockType > kBlockType_Uncompressed || blockType == 0)
    if ((unsigned)(blockType - 1) > kBlockType_Uncompressed - 1)
      return false;
    // default block size (32 KB) is used in wim mode when the size-flag bit is set
    _unpackBlockSize = 1u << 15;
    if (!_wimMode || ReadBits(_bitStream, 1) == 0)
    {
      _unpackBlockSize = ReadBits(_bitStream, 16);
      // wimlib supports chunks larger than 32KB (unsupported by MS wim).
      if (!_wimMode || _numDictBits >= 16)
      {
        // non-wim (cab) mode and large-chunk wim store a 24-bit block size
        _unpackBlockSize <<= 8;
        _unpackBlockSize |= ReadBits(_bitStream, 8);
      }
    }

    PRF(printf("\nBlockSize = %6d   %s  ", _unpackBlockSize, (_pos & 1) ? "@@@" : "   "));

    _isUncompressedBlock = (blockType == kBlockType_Uncompressed);
    _skipByte = false;

    if (_isUncompressedBlock)
    {
      // an odd-sized uncompressed block is followed by an alignment byte
      // that must be zero; it is consumed later (see CodeSpec)
      _skipByte = ((_unpackBlockSize & 1) != 0);
      // printf("\n UncompressedBlock %d", _unpackBlockSize);
      PRF(printf(" UncompressedBlock ");)
      // if (_unpackBlockSize & 1) { PRF(printf(" ######### ")); }
      if (!_bitStream.Switch_To_ByteMode())
        return false;
      if (_bitStream.GetRem() < kNumReps * 4)
        return false;
      // the header of an uncompressed block stores the repeat distances
      for (unsigned i = 0; i < kNumReps; i++)
      {
        const UInt32 rep = _bitStream.ReadUInt32();
        // here we allow only such values for (rep) that can be set also by LZ code:
        if (rep == 0 || rep > _winSize - kNumReps)
          return false;
        _reps[(size_t)i + kPosSlotOffset] = rep;
      }
      // printf("\n");
      return true;
    }

    // _numAlignBits = 64;
    // const UInt32 k_numAlignBits_PosSlots_MAX = 64 + kPosSlotDelta;
    // _numAlignBits_PosSlots = k_numAlignBits_PosSlots_MAX;
    // sentinel: "align decoder not used"; CodeLz compares distances
    // against _numAlignBits_Dist to decide whether to use it
    const UInt32 k_numAlignBits_Dist_MAX = (UInt32)(Int32)-1;
    _numAlignBits_Dist = k_numAlignBits_Dist_MAX;
    if (blockType == kBlockType_Aligned)
    {
      Byte levels[kAlignTableSize];
      // unsigned not0 = 0;
      unsigned not3 = 0;
      for (unsigned i = 0; i < kAlignTableSize; i++)
      {
        const unsigned val = ReadBits(_bitStream, kNumAlignLevelBits);
        levels[i] = (Byte)val;
        // not0 |= val;
        // (not3) stays 0 only if every align code length equals 3; in that
        // degenerate case align symbols carry no information, so the block
        // is decoded as if it were verbatim (align decoder stays unused)
        not3 |= (val ^ 3);
      }
      // static unsigned number = 0, all = 0; all++;
      // if (!not0) return false; // Build(true) will test this case
      if (not3)
      {
        // _numAlignBits_PosSlots = (kNumAlignBits + 1) * 2 + kPosSlotDelta;
        // _numAlignBits = kNumAlignBits;
        _numAlignBits_Dist = (1u << (kNumAlignBits + 1)) - (kNumReps - 1);
        RIF(_alignDecoder.Build(levels, true)) // full
      }
      // else { number++; if (number % 4 == 0) printf("\nnumber= %u : %u%%", number, number * 100 / all); }
    }
    // if (_numAlignBits_PosSlots == k_numAlignBits_PosSlots_MAX)
    if (_numAlignBits_Dist == k_numAlignBits_Dist_MAX)
    {
      // align decoder unused: _extra[] holds the full number of direct
      // distance bits for each position slot
      size_t i;
      for (i = 3; i < kNumLinearPosSlotBits; i++)
      {
        _extra[i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
        _extra[i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
      }
      for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
        _extra[i + kPosSlotOffset] = (Byte)SET_NUM_BITS(kNumLinearPosSlotBits);
    }
    else
    {
      // aligned-offset block: store direct-bit counts reduced by 3 —
      // presumably the low 3 distance bits come from the align decoder
      // (see the align-decoder path in CodeLz)
      size_t i;
      for (i = 3; i < kNumLinearPosSlotBits; i++)
      {
        _extra[i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i) - 3);
        _extra[i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i) - 3);
      }
      for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
        _extra[i + kPosSlotOffset] = (Byte)(SET_NUM_BITS(kNumLinearPosSlotBits) - 3);
    }
  }

  // main table: 256 literal symbols, then the pos/len slot symbols
  READ_TABLE(_bitStream, _mainLevels, _mainLevels + 256)
  READ_TABLE(_bitStream, _mainLevels + 256, _mainLevels + 256 + _numPosLenSlots)
  const unsigned end = 256 + _numPosLenSlots;
  // unused tail of the main table gets zero code lengths
  memset(_mainLevels + end, 0, kMainTableSize - end);
  // #define NUM_CYC 1
  // unsigned j; for (j = 0; j < NUM_CYC; j++)
  RIF(_mainDecoder.Build(_mainLevels, NHuffman::k_BuildMode_Full))
  // if (kNumLenSymols_Big_Start)
  memset(_lenLevels, 0, kNumLenSymols_Big_Start);
  READ_TABLE(_bitStream,
      _lenLevels + kNumLenSymols_Big_Start,
      _lenLevels + kNumLenSymols_Big_Start + kNumLenSymbols)
  // for (j = 0; j < NUM_CYC; j++)
  RIF(_lenDecoder.Build(_lenLevels, NHuffman::k_BuildMode_Full_or_Empty))
  return true;
}
1121 
1122 
1123 
// Core LZ decode loop: decodes exactly (next) bytes of literals/matches
// into the window at dec->_pos, reading the bit stream described by
// (_bitOffset, _buf). Returns the updated bit offset, or DECODE_ERROR_CODE
// on malformed data. The bit-stream state (_bitOffset, _value) is advanced
// through the NORMALIZE / MACRO_* / Z7_LZX_* helper macros defined earlier
// in this file.
static ptrdiff_t CodeLz(CDecoder *dec, size_t next, ptrdiff_t _bitOffset, const Byte *_buf) throw()
{
  {
    Byte *const win = dec->_win;
    const UInt32 winSize = dec->_winSize;
    Byte *pos = win + dec->_pos;
    const Byte * const posEnd = pos + next;
    NHuffman::CValueInt _value;

    NORMALIZE

#if 1
  #define HUFF_DEC_PREFIX  dec->
#else
    const NHuffman::CDecoder<kNumHuffmanBits, kMainTableSize, kNumTableBits_Main> _mainDecoder = dec->_mainDecoder;
    const NHuffman::CDecoder256<kNumHuffmanBits, kNumLenSymbols, kNumTableBits_Len> _lenDecoder = dec->_lenDecoder;
    const NHuffman::CDecoder7b<kAlignTableSize> _alignDecoder = dec->_alignDecoder;
  #define HUFF_DEC_PREFIX
#endif

    do
    {
      unsigned sym;
      // printf("\npos = %6u", pos - win);
      {
        // decode next main-table symbol: literal (< 256) or pos/len slot
        const NHuffman::CDecoder<kNumHuffmanBits, kMainTableSize, kNumTableBits_Main>
            *mainDecoder = & HUFF_DEC_PREFIX _mainDecoder;
        Z7_LZX_HUFF_DECODE_CHECK_NO(sym, mainDecoder, kNumTableBits_Main, MOVE_POS_CHECK_STAT)
      }
      // if (!_mainDecoder.Decode_SymCheck_MovePosCheck(&bitStream, sym)) return DECODE_ERROR_CODE;
      // sym = _mainDecoder.Decode(&bitStream);
      // if (bitStream.WasExtraReadError_Fast()) return DECODE_ERROR_CODE;

      // printf(" sym = %3x", sym);
      UPDATE_STAT(g_stats_main[sym]++;)

      if (sym < 256)
      {
        // literal byte
        UPDATE_STAT(g_stats_NumLits++;)
        *pos++ = (Byte)sym;
      }
      else
      {
        // match: symbol encodes (position slot, length slot) pair
        // sym -= 256;
        // if (sym >= _numPosLenSlots) return DECODE_ERROR_CODE;
        const unsigned posSlot = sym / kNumLenSlots;
        unsigned len = sym % kNumLenSlots + kMatchMinLen;
        if (len == kNumLenSlots - 1 + kMatchMinLen)
        {
          // maximum length slot: the real length needs an extra symbol
          // from the length decoder
          const NHuffman::CDecoder256<kNumHuffmanBits, kNumLenSymbols, kNumTableBits_Len>
              *lenDecoder = & HUFF_DEC_PREFIX _lenDecoder;
          Z7_LZX_HUFF_DECODE_CHECK_YES(len, lenDecoder, kNumTableBits_Len, MOVE_POS_STAT)
          // if (!_lenDecoder.Decode2(&bitStream, len)) return DECODE_ERROR_CODE;
          // len = _lenDecoder.Decode(&bitStream);
          // if (len >= kNumLenSymbols) return DECODE_ERROR_CODE;
          UPDATE_STAT(g_stats_len[len - kNumLenSymols_Big_Start]++;)
          len += kNumLenSlots - 1 + kMatchMinLen - kNumLenSymols_Big_Start;
        }
        /*
        if ((next -= len) < 0)
          return DECODE_ERROR_CODE;
        */
        UInt32 dist;

        dist = dec->_reps[(size_t)posSlot - kPosSlotDelta];
        if (posSlot < kNumReps + 256 / kNumLenSlots)
        {
          // repeat distance: swap the selected rep with rep0
          // (dist is stored into _reps[kPosSlotOffset + 0] below)
          // if (posSlot != kNumReps + kPosSlotDelta)
          // if (posSlot - (kNumReps + kPosSlotDelta + 1) < 2)
          dec->_reps[(size_t)posSlot - kPosSlotDelta] = dec->_reps[kPosSlotOffset];
          /*
          if (posSlot != kPosSlotDelta)
          {
            UInt32 temp = dist;
            if (posSlot == kPosSlotDelta + 1)
            {
              dist = reps[1];
              reps[1] = temp;
            }
            else
            {
              dist = reps[2];
              reps[2] = temp;
            }
            // dist = reps[(size_t)(posSlot) - kPosSlotDelta];
            // reps[(size_t)(posSlot) - kPosSlotDelta] = reps[0];
            // reps[(size_t)(posSlot) - kPosSlotDelta] = temp;
          }
          */
        }
        else // if (posSlot != kNumReps + kPosSlotDelta)
        {
          // new distance: base (preloaded from _reps at the biased slot
          // index) plus direct bits read from the stream
          unsigned numDirectBits;
#if 0
          if (posSlot < kNumPowerPosSlots + kPosSlotDelta)
          {
            numDirectBits = (posSlot - 2 - kPosSlotDelta) >> 1;
            dist = (UInt32)(2 | (posSlot & 1)) << numDirectBits;
          }
          else
          {
            numDirectBits = kNumLinearPosSlotBits;
            dist = (UInt32)(posSlot - 0x22 - kPosSlotDelta) << kNumLinearPosSlotBits;
          }
          dist -= kNumReps - 1;
#else
          numDirectBits = dec->_extra[(size_t)posSlot - kPosSlotDelta];
          // dist = reps[(size_t)(posSlot) - kPosSlotDelta];
#endif
          // shift the repeat-distance history: rep1 -> rep2, rep0 -> rep1
          dec->_reps[kPosSlotOffset + 2] =
          dec->_reps[kPosSlotOffset + 1];
          dec->_reps[kPosSlotOffset + 1] =
          dec->_reps[kPosSlotOffset + 0];

          // dist += val; dist += bitStream.ReadBitsBig(numDirectBits);
          // if (posSlot >= _numAlignBits_PosSlots)
          // if (numDirectBits >= _numAlignBits)
          // if (val >= _numAlignBits_Dist)
          // UInt32 val; MACRO_ReadBitsBig(val , numDirectBits)
          // dist += val;
          // dist += (UInt32)((UInt32)_value >> 1 >> (/* 31 ^ */ (numDirectBits)));
          // MOVE_POS((numDirectBits ^ 31))
          MACRO_ReadBitsBig_pre(numDirectBits)
          // dist += (UInt32)_value;
          if (dist >= dec->_numAlignBits_Dist)
          {
            // aligned-offset path: the high direct bits come from the
            // stream, the low (align) bits from the align Huffman decoder
            // if (numDirectBits != _numAlignBits)
            {
              // UInt32 val;
              // dist -= (UInt32)_value;
              MACRO_ReadBitsBig_add3(dist)
              NORMALIZE
              // dist += (val << kNumAlignBits);
              // dist += bitStream.ReadBitsSmall(numDirectBits - kNumAlignBits) << kNumAlignBits;
            }
            {
              // const unsigned alignTemp = _alignDecoder.Decode(&bitStream);
              const NHuffman::CDecoder7b<kAlignTableSize> *alignDecoder = & HUFF_DEC_PREFIX _alignDecoder;
              unsigned alignTemp;
              UPDATE_STAT(g_stats_NumAlign++;)
              Z7_HUFF_DECODER_7B_DECODE(alignTemp, alignDecoder, GET_VAL_BASE, MOVE_POS, bs)
              // NORMALIZE
              // if (alignTemp >= kAlignTableSize) return DECODE_ERROR_CODE;
              dist += alignTemp;
            }
          }
          else
          {
            // verbatim path: all direct bits come straight from the stream
            {
              MACRO_ReadBitsBig_add(dist)
              // dist += bitStream.ReadBitsSmall(numDirectBits - kNumAlignBits) << kNumAlignBits;
            }
          }
          NORMALIZE
          /*
          else
          {
            UInt32 val;
            MACRO_ReadBitsBig(val, numDirectBits)
            dist += val;
            // dist += bitStream.ReadBitsBig(numDirectBits);
          }
          */
        }
        // new rep0 = distance of this match
        dec->_reps[kPosSlotOffset + 0] = dist;

        Byte *dest = pos;
        // a match must not run past the end of the requested chunk
        if (len > (size_t)(posEnd - pos))
          return DECODE_ERROR_CODE;
        Int32 srcPos = (Int32)(pos - win);
        pos += len;
        srcPos -= (Int32)dist;
        if (srcPos < 0) // fast version
        {
          // source is before the current window start: only valid when the
          // window has already been filled once (_overDict); wrap the source
          // to the window tail and split the copy at the window boundary
          if (!dec->_overDict)
            return DECODE_ERROR_CODE;
          srcPos &= winSize - 1;
          UInt32 rem = winSize - (UInt32)srcPos;
          if (len > rem)
          {
            len -= rem;
            const Byte *src = win + (UInt32)srcPos;
            do
              *dest++ = *src++;
            while (--rem);
            srcPos = 0;
          }
        }
        CopyLzMatch(dest, win + (UInt32)srcPos, len, dist);
      }
    }
    while (pos != posEnd);

    return _bitOffset;
  }
}
1320 
1321 
1322 
1323 
// inSize != 0
// outSize != 0 ???
//
// Decodes exactly (outSize) bytes from (inData, inSize) into the window,
// crossing block boundaries as needed: reads new block headers/tables when
// the current block is exhausted, copies raw bytes for uncompressed blocks,
// and calls CodeLz() for compressed blocks. Returns S_FALSE on malformed
// or truncated data.
HRESULT CDecoder::CodeSpec(const Byte *inData, size_t inSize, UInt32 outSize) throw()
{
  // ((inSize & 1) != 0) case is possible, if current call will be finished with Uncompressed Block.
  CBitByteDecoder _bitStream;
  // resume in byte mode if the previous call ended inside an uncompressed block
  if (_keepHistory && _isUncompressedBlock)
    _bitStream.Init_ByteMode(inData, inSize);
  else
    _bitStream.Init_BitMode(inData, inSize);

  if (!_keepHistory)
  {
    // fresh stream: reset all per-stream decoder state
    _isUncompressedBlock = false;
    _skipByte = false;
    _unpackBlockSize = 0;
    memset(_mainLevels, 0, sizeof(_mainLevels));
    memset(_lenLevels, 0, sizeof(_lenLevels));
    {
      // x86 call-translation filter size: fixed default in wim mode;
      // otherwise read the optional 32-bit value from the stream header
      _x86_translationSize = 12000000;
      if (!_wimMode)
      {
        _x86_translationSize = 0;
        if (ReadBits(_bitStream, 1) != 0)
        {
          UInt32 v = ReadBits(_bitStream, 16) << 16;
          v       |= ReadBits(_bitStream, 16);
          _x86_translationSize = v;
        }
      }
      _x86_processedSize = 0;
    }
    // initial repeat distances are all 1
    _reps[0 + kPosSlotOffset] = 1;
    _reps[1 + kPosSlotOffset] = 1;
    _reps[2 + kPosSlotOffset] = 1;
  }

  while (outSize)
  {
    /*
    // check it for bit mode only:
    if (_bitStream.WasExtraReadError_Fast())
      return S_FALSE;
    */
    if (_unpackBlockSize == 0)
    {
      // current block finished: consume the pending alignment byte of an
      // odd-sized uncompressed block (must be zero), then read the next
      // block header and tables
      if (_skipByte)
      {
        if (_bitStream.GetRem() < 1)
          return S_FALSE;
        if (_bitStream.DirectReadByte() != 0)
          return S_FALSE;
      }
      if (_isUncompressedBlock)
        _bitStream.Switch_To_BitMode();
      if (!ReadTables(_bitStream))
        return S_FALSE;
      continue;
    }

    // _unpackBlockSize != 0
    // decode the smaller of (remaining block size, remaining output)
    UInt32 next = _unpackBlockSize;
    if (next > outSize)
        next = outSize;
    // next != 0

    // PRF(printf("\nnext = %d", (unsigned)next);)

    if (_isUncompressedBlock)
    {
      // raw copy from the input stream into the window
      if (_bitStream.GetRem() < next)
        return S_FALSE;
      _bitStream.CopyTo(_win + _pos, next);
      _pos += next;
      _unpackBlockSize -= next;
    }
    else
    {
      _unpackBlockSize -= next;
      _bitStream._bitOffset = CodeLz(this, next, _bitStream._bitOffset, _bitStream._buf);
      if (_bitStream.IsOverRead())
        return S_FALSE;
      _pos += next;
    }
    outSize -= next;
  }

  // outSize == 0

  if (_isUncompressedBlock)
  {
    /* we don't know where skipByte can be placed, if it's end of chunk:
        1) in current chunk - there are such cab archives, if chunk is last
        2) in next chunk - are there such archives ? */
    if (_unpackBlockSize == 0
        && _skipByte
        // && outSize == 0
        && _bitStream.IsOneDirectByteLeft())
    {
      _skipByte = false;
      if (_bitStream.DirectReadByte() != 0)
        return S_FALSE;
    }
  }

  // the whole input chunk must be consumed, and (for compressed data)
  // the bit stream must end in a valid state
  if (_bitStream.GetRem() != 0)
    return S_FALSE;
  if (!_isUncompressedBlock)
    if (!_bitStream.WasBitStreamFinishedOK())
      return S_FALSE;
  return S_OK;
}
1436 
1437 
// Extra bytes reserved after the window buffer: the larger of the overwrite
// margins needed by the x86 filter and by the LZ match copier.
#if k_Filter_OutBufSize_Add > k_Lz_OutBufSize_Add
  #define k_OutBufSize_Add  k_Filter_OutBufSize_Add
#else
  #define k_OutBufSize_Add  k_Lz_OutBufSize_Add
#endif
1443 
1444 HRESULT CDecoder::Code_WithExceedReadWrite(const Byte *inData, size_t inSize, UInt32 outSize) throw()
1445 {
1446   if (!_keepHistory)
1447   {
1448     _pos = 0;
1449     _overDict = false;
1450   }
1451   else if (_pos == _winSize)
1452   {
1453     _pos = 0;
1454     _overDict = true;
1455 #if k_OutBufSize_Add > 0
1456     // data after (_winSize) can be used, because we can use overwrite.
1457     // memset(_win + _winSize, 0, k_OutBufSize_Add);
1458 #endif
1459   }
1460   _writePos = _pos;
1461   _unpackedData = _win + _pos;
1462 
1463   if (outSize > _winSize - _pos)
1464     return S_FALSE;
1465 
1466   PRF(printf("\ninSize = %d", (unsigned)inSize);)
1467   PRF(if ((inSize & 1) != 0) printf("---------");)
1468 
1469   if (inSize == 0)
1470     return S_FALSE;
1471   const HRESULT res = CodeSpec(inData, inSize, outSize);
1472   const HRESULT res2 = Flush();
1473   return (res == S_OK ? res2 : res);
1474 }
1475 
1476 
1477 HRESULT CDecoder::SetParams2(unsigned numDictBits) throw()
1478 {
1479   if (numDictBits < kNumDictBits_Min ||
1480       numDictBits > kNumDictBits_Max)
1481     return E_INVALIDARG;
1482   _numDictBits = (Byte)numDictBits;
1483   const unsigned numPosSlots2 = (numDictBits < 20) ?
1484       numDictBits : 17 + (1u << (numDictBits - 18));
1485   _numPosLenSlots = numPosSlots2 * (kNumLenSlots * 2);
1486   return S_OK;
1487 }
1488 
1489 
1490 HRESULT CDecoder::Set_DictBits_and_Alloc(unsigned numDictBits) throw()
1491 {
1492   RINOK(SetParams2(numDictBits))
1493   const UInt32 newWinSize = (UInt32)1 << numDictBits;
1494   if (_needAlloc)
1495   {
1496     if (!_win || newWinSize != _winSize)
1497     {
1498       // BigFree
1499       z7_AlignedFree
1500         (_win);
1501       _winSize = 0;
1502       const size_t alloc_size = newWinSize + k_OutBufSize_Add;
1503       _win = (Byte *)
1504           // BigAlloc
1505           z7_AlignedAlloc
1506           (alloc_size);
1507       if (!_win)
1508         return E_OUTOFMEMORY;
1509       // optional:
1510       memset(_win, 0, alloc_size);
1511     }
1512   }
1513   _winSize = newWinSize;
1514   return S_OK;
1515 }
1516 
1517 }}
1518