// LzxDecoder.cpp

#include "StdAfx.h"

#include <string.h>
// #include <stdio.h>

// #define SHOW_DEBUG_INFO

#ifdef SHOW_DEBUG_INFO
#include <stdio.h>
#define PRF(x) x
#else
#define PRF(x)
#endif

#include "../../../C/Alloc.h"
#include "../../../C/RotateDefs.h"
#include "../../../C/CpuArch.h"

#include "LzxDecoder.h"


#ifdef MY_CPU_X86_OR_AMD64
#if defined(MY_CPU_AMD64) \
    || defined(__SSE2__) \
    || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
    || 0 && defined(_MSC_VER) && (_MSC_VER >= 1400) // set (1 &&) for debug

#if defined(__clang__) && (__clang_major__ >= 2) \
    || defined(__GNUC__) && (__GNUC__ >= 4) \
    || defined(_MSC_VER) && (_MSC_VER >= 1400)
#define Z7_LZX_X86_FILTER_USE_SSE2
#endif
#endif
#endif
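
// Note (summary of the two #if blocks above): the SSE2 x86 filter is enabled
// only when the target is guaranteed to support SSE2 (x64, or x86 built with
// SSE2 code generation) and the compiler is known to provide <emmintrin.h>.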


#ifdef Z7_LZX_X86_FILTER_USE_SSE2
// #ifdef MY_CPU_X86_OR_AMD64
#include <emmintrin.h> // SSE2
// #endif
#if defined(__clang__) || defined(__GNUC__)
typedef int ctz_type;
#define MY_CTZ(dest, mask) dest = __builtin_ctz((UInt32)(mask))
#else // #if defined(_MSC_VER)
#if (_MSC_VER >= 1600)
// #include <intrin.h>
#endif
typedef unsigned long ctz_type;
#define MY_CTZ(dest, mask) _BitScanForward(&dest, (mask));
#endif // _MSC_VER
#endif
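
// MY_CTZ(dest, mask) stores the index of the lowest set bit of (mask) to (dest).
// Example: mask == 0x28 (bits 3 and 5 are set) gives dest == 3.
// (mask != 0) is required by both implementations.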

// when the window buffer is filled, we must wrap the position to zero,
// and we want to wrap at the same points where the original lzx decoder must wrap.
// But the wrapping is possible only at a point where a chunk is finished.
// Usually (chunk_size == 32KB), but (chunk_size != 32KB) also is allowed.
// So we don't use additional buffer space over the required (winSize).
// And we can't use a large overwrite after (len) in CopyLzMatch().
// But we are allowed to write 3 bytes after (len), because
// (delta <= _winSize - 3).
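// Example of that worst case: for (len == 1) and (delta >= 4), the 4-byte
// branch of CopyLzMatch() below still stores 4 bytes, that is 3 bytes after
// the match end; k_Lz_OverwriteSize / k_Lz_OutBufSize_Add reserve that slack.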

// #define k_Lz_OverwriteSize 0 // for debug : to disable overwrite
#define k_Lz_OverwriteSize 3 // = kNumReps
#if k_Lz_OverwriteSize > 0
// (k_Lz_OutBufSize_Add >= k_Lz_OverwriteSize) is required.
// we use value 4 to simplify memset() code.
#define k_Lz_OutBufSize_Add (k_Lz_OverwriteSize + 1) // == 4
#else
#define k_Lz_OutBufSize_Add 0
#endif

// (len != 0)
// (0 < delta <= _winSize - 3)
Z7_FORCE_INLINE
void CopyLzMatch(Byte *dest, const Byte *src, UInt32 len, UInt32 delta)
{
  if (delta >= 4)
  {
#if k_Lz_OverwriteSize >= 3
    // optimized code with overwrite to reduce the number of branches
#ifdef MY_CPU_LE_UNALIGN
    *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
#else
    dest[0] = src[0];
    dest[1] = src[1];
    dest[2] = src[2];
    dest[3] = src[3];
#endif
    len--;
    src++;
    dest++;
    {
#else
    // no overwrite in out buffer
    dest[0] = src[0];
    {
      const unsigned m = (unsigned)len & 1;
      src += m;
      dest += m;
    }
    if (len &= ~(unsigned)1)
    {
      dest[0] = src[0];
      dest[1] = src[1];
#endif
      // len == 0 is allowed here
      {
        const unsigned m = (unsigned)len & 3;
        src += m;
        dest += m;
      }
      if (len &= ~(unsigned)3)
      {
#ifdef MY_CPU_LE_UNALIGN
#if 1
        *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
        {
          const unsigned m = (unsigned)len & 7;
          dest += m;
          src += m;
        }
        if (len &= ~(unsigned)7)
          do
          {
            *(UInt32 *)(void *)(dest    ) = *(const UInt32 *)(const void *)(src);
            *(UInt32 *)(void *)(dest + 4) = *(const UInt32 *)(const void *)(src + 4);
            src += 8;
            dest += 8;
          }
          while (len -= 8);
#else
        // gcc-11 -O3 for x64 generates incorrect code here
        do
        {
          *(UInt32 *)(void *)(dest) = *(const UInt32 *)(const void *)(src);
          src += 4;
          dest += 4;
        }
        while (len -= 4);
#endif
#else
        do
        {
          const Byte b0 = src[0];
          const Byte b1 = src[1];
          dest[0] = b0;
          dest[1] = b1;
          const Byte b2 = src[2];
          const Byte b3 = src[3];
          dest[2] = b2;
          dest[3] = b3;
          src += 4;
          dest += 4;
        }
        while (len -= 4);
#endif
      }
    }
  }
  else // (delta < 4)
  {
    const unsigned b0 = *src;
    *dest = (Byte)b0;
    if (len >= 2)
    {
      if (delta < 2)
      {
        dest += (unsigned)len & 1;
        dest[0] = (Byte)b0;
        dest[1] = (Byte)b0;
        dest += (unsigned)len & 2;
        if (len &= ~(unsigned)3)
        {
#ifdef MY_CPU_LE_UNALIGN
#ifdef MY_CPU_64BIT
          const UInt64 a = (UInt64)b0 * 0x101010101010101;
          *(UInt32 *)(void *)dest = (UInt32)a;
          dest += (unsigned)len & 7;
          if (len &= ~(unsigned)7)
          {
            // *(UInt64 *)(void *)dest = a;
            // dest += 8;
            // len -= 8;
            // if (len)
            {
              // const ptrdiff_t delta = (ptrdiff_t)dest & 7;
              // dest -= delta;
              do
              {
                *(UInt64 *)(void *)dest = a;
                dest += 8;
              }
              while (len -= 8);
              // dest += delta - 8;
              // *(UInt64 *)(void *)dest = a;
            }
          }
#else
          const UInt32 a = (UInt32)b0 * 0x1010101;
          do
          {
            *(UInt32 *)(void *)dest = a;
            dest += 4;
          }
          while (len -= 4);
#endif
#else
          do
          {
            dest[0] = (Byte)b0;
            dest[1] = (Byte)b0;
            dest[2] = (Byte)b0;
            dest[3] = (Byte)b0;
            dest += 4;
          }
          while (len -= 4);
#endif
        }
      }
      else if (delta == 2)
      {
        const unsigned m = (unsigned)len & 1;
        len &= ~(unsigned)1;
        src += m;
        dest += m;
        {
          const Byte a0 = src[0];
          const Byte a1 = src[1];
          do
          {
            dest[0] = a0;
            dest[1] = a1;
            dest += 2;
          }
          while (len -= 2);
        }
      }
      else /* if (delta == 3) */
      {
        const unsigned b1 = src[1];
        dest[1] = (Byte)b1;
        if (len -= 2)
        {
          const unsigned b2 = src[2];
          dest += 2;
          do
          {
            dest[0] = (Byte)b2;  if (--len == 0) break;
            dest[1] = (Byte)b0;  if (--len == 0) break;
            dest[2] = (Byte)b1;
            dest += 3;
          }
          while (--len);
        }
      }
    }
  }
}

// #define Z7_LZX_SHOW_STAT
#ifdef Z7_LZX_SHOW_STAT
#include <stdio.h>
#endif

namespace NCompress {
namespace NLzx {

// #define Z7_LZX_SHOW_STAT
#ifdef Z7_LZX_SHOW_STAT
static UInt32 g_stats_Num_x86[3];
static UInt32 g_stats_NumTables;
static UInt32 g_stats_NumLits;
static UInt32 g_stats_NumAlign;
static UInt32 g_stats_main[kMainTableSize];
static UInt32 g_stats_len[kNumLenSymbols];
static UInt32 g_stats_main_levels[kNumHuffmanBits + 1];
static UInt32 g_stats_len_levels[kNumHuffmanBits + 1];
#define UPDATE_STAT(a) a
static void PrintVal(UInt32 v)
{
  printf("\n : %9u", v);
}
static void PrintStat(const char *name, const UInt32 *a, size_t num)
{
  printf("\n\n==== %s:", name);
  UInt32 sum = 0;
  size_t i;
  for (i = 0; i < num; i++)
    sum += a[i];
  PrintVal(sum);
  if (sum != 0)
  {
    for (i = 0; i < num; i++)
    {
      if (i % 8 == 0)
        printf("\n");
      printf("\n%3x : %9u : %5.2f", (unsigned)i, (unsigned)a[i], (double)a[i] * 100 / sum);
    }
  }
  printf("\n");
}

static struct CStat
{
  ~CStat()
  {
    PrintStat("x86_filter", g_stats_Num_x86, Z7_ARRAY_SIZE(g_stats_Num_x86));
    printf("\nTables:"); PrintVal(g_stats_NumTables);
    printf("\nLits:"); PrintVal(g_stats_NumLits);
    printf("\nAlign:"); PrintVal(g_stats_NumAlign);
    PrintStat("Main", g_stats_main, Z7_ARRAY_SIZE(g_stats_main));
    PrintStat("Len", g_stats_len, Z7_ARRAY_SIZE(g_stats_len));
    PrintStat("Main Levels", g_stats_main_levels, Z7_ARRAY_SIZE(g_stats_main_levels));
    PrintStat("Len Levels", g_stats_len_levels, Z7_ARRAY_SIZE(g_stats_len_levels));
  }
} g_stat;
#else
#define UPDATE_STAT(a)
#endif



/*
  3 p015 : ivb- : or r32,r32 / add r32,r32
  4 p0156 : hsw+
  5 p0156b: adl+
  2 p0_5 : ivb- : shl r32,i8
  2 p0__6 : hsw+
  1 p5 : ivb- : jb
  2 p0__6 : hsw+
  2 p0_5 : wsm- : SSE2 : pcmpeqb : _mm_cmpeq_epi8
  2 p_15 : snb-bdw
  2 p01 : skl+
  1 p0 : SSE2 : pmovmskb : _mm_movemask_epi8
*/
/*
v24.00: the code was fixed for better compatibility with the original-ms-cab-decoder:
  for ((Int32)translationSize >= 0) : the LZX specification shows the code with signed Int32.
  for ((Int32)translationSize < 0)  : there is no specification for that case, but we support it.
We suppose our code now is compatible with the original-ms-cab-decoder.

The starting byte of the data stream (real_pos == 0) is a special corner case,
where we don't need any conversion (as in the original-ms-cab-decoder).
Our optimization: we use an unsigned (UInt32 pos) (pos = -1 - real_pos).
So (pos) is always negative: ((Int32)pos < 0).
It allows us to use the simple comparison (v > pos) instead of more complex comparisons.
*/
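
// Example of the (pos) encoding: for (real_pos == 0) we get (pos == 0xFFFFFFFF),
// for (real_pos == 0x10) we get (pos == 0xFFFFFFEF). So the unsigned check
// (v > pos) in X86_TRANSLATE_POST is equivalent to ((Int32)v >= -(Int32)real_pos).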
// (p) will point 5 bytes after the 0xe8 byte:
//   pos == -1 - (p - 5 - data_start) == 4 + data_start - p
// (FILTER_PROCESSED_SIZE_DELTA == 4) is an optimized value for better speed in some compilers:
#define FILTER_PROCESSED_SIZE_DELTA 4

#if defined(MY_CPU_X86_OR_AMD64) || defined(MY_CPU_ARM_OR_ARM64)
// optimized branch:
// size_t must be at least 32-bit for this branch.
#if 1 // use 1 for simpler code
// use integer (low 32 bits of pointer) instead of pointer
#define X86_FILTER_PREPARE      processedSize4 = (UInt32)(size_t)(ptrdiff_t)data + \
        (UInt32)(4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
#define X86_FILTER_CALC_pos(p)  const UInt32 pos = processedSize4 - (UInt32)(size_t)(ptrdiff_t)p;
#else
// note: (dataStart) pointer can point out of array ranges:
#define X86_FILTER_PREPARE      const Byte *dataStart = data + \
        (4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
#define X86_FILTER_CALC_pos(p)  const UInt32 pos = (UInt32)(size_t)(dataStart - p);
#endif
#else
// non-optimized branch for unusual platforms (16-bit size_t or unusual size_t):
#define X86_FILTER_PREPARE      processedSize4 = \
        (UInt32)(4 - FILTER_PROCESSED_SIZE_DELTA) - processedSize4;
#define X86_FILTER_CALC_pos(p)  const UInt32 pos = processedSize4 - (UInt32)(size_t)(p - data);
#endif

#define X86_TRANSLATE_PRE(p) \
    UInt32 v = GetUi32((p) - 4);

#define X86_TRANSLATE_POST(p) \
    { \
      X86_FILTER_CALC_pos(p) \
      if (v < translationSize) { \
        UPDATE_STAT(g_stats_Num_x86[0]++;) \
        v += pos + 1; \
        SetUi32((p) - 4, v) \
      } \
      else if (v > pos) { \
        UPDATE_STAT(g_stats_Num_x86[1]++;) \
        v += translationSize; \
        SetUi32((p) - 4, v) \
      } else { UPDATE_STAT(g_stats_Num_x86[2]++;) } \
    }
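
// Worked example for X86_TRANSLATE_POST: if the 0xe8 byte is at offset 0x10
// (from the translation origin), then (pos == -1 - 0x10). A stored absolute
// target (v == 0x30 < translationSize) becomes (v + pos + 1 == 0x20), which
// restores the original relative call offset; this is the inverse of the
// encoder-side (relative -> absolute) conversion.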


/*
  if ( defined(Z7_LZX_X86_FILTER_USE_SSE2)
    && defined(Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED))
  the function can read up to aligned_for_32_up_from(size) bytes in (data).
*/
// processedSize < (1 << 30)
Z7_NO_INLINE
static void x86_Filter4(Byte *data, size_t size, UInt32 processedSize4, UInt32 translationSize)
{
  const size_t kResidue = 10;
  if (size <= kResidue)
    return;
  Byte * const lim = data + size - kResidue + 4;
  const Byte save = lim[0];
  lim[0] = 0xe8;
  X86_FILTER_PREPARE
  Byte *p = data;

#define FILTER_RETURN_IF_LIM(_p_)  if (_p_ > lim) { lim[0] = save; return; }

#ifdef Z7_LZX_X86_FILTER_USE_SSE2

  // sse2-aligned/sse2-unaligned provide the same speed on real data,
  // but the code is smaller for the sse2-unaligned version.
  // for debug : define it to get the alternative version with aligned 128-bit reads:
  // #define Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED

#define FILTER_MASK_INT UInt32
#define FILTER_NUM_VECTORS_IN_CHUNK 2
#define FILTER_CHUNK_BYTES_OFFSET (16 * FILTER_NUM_VECTORS_IN_CHUNK - 5)

#ifdef Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
  // the aligned version doesn't use additional space, if the buf size is aligned for 32
#define k_Filter_OutBufSize_Add 0
#define k_Filter_OutBufSize_AlignMask (16 * FILTER_NUM_VECTORS_IN_CHUNK - 1)
#define FILTER_LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
#else
#define k_Filter_OutBufSize_Add (16 * FILTER_NUM_VECTORS_IN_CHUNK)
#define k_Filter_OutBufSize_AlignMask 0
#define FILTER_LOAD_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#endif

#define GET_E8_MASK(dest, dest1, p) \
  { \
    __m128i v0 = FILTER_LOAD_128(p); \
    __m128i v1 = FILTER_LOAD_128(p + 16); \
    p += 16 * FILTER_NUM_VECTORS_IN_CHUNK; \
    v0 = _mm_cmpeq_epi8(v0, k_e8_Vector); \
    v1 = _mm_cmpeq_epi8(v1, k_e8_Vector); \
    dest  = (unsigned)_mm_movemask_epi8(v0); \
    dest1 = (unsigned)_mm_movemask_epi8(v1); \
  }
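
  // After GET_E8_MASK: bit (i) of (dest) is set iff byte p[(int)i - 32] == 0xe8
  // (the first 16 bytes of the chunk just loaded), and (dest1) covers the
  // second 16 bytes; note that (p) was already advanced past the 32-byte chunk.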

  const __m128i k_e8_Vector = _mm_set1_epi32((Int32)(UInt32)0xe8e8e8e8);
  for (;;)
  {
    // for debug: define it for smaller code:
    // #define Z7_LZX_X86_FILTER_CALC_IN_LOOP
    // without Z7_LZX_X86_FILTER_CALC_IN_LOOP, we can get a faster and simpler loop
    FILTER_MASK_INT mask;
    {
      FILTER_MASK_INT mask1;
      do
      {
        GET_E8_MASK(mask, mask1, p)
#ifndef Z7_LZX_X86_FILTER_CALC_IN_LOOP
        mask += mask1;
#else
        mask |= mask1 << 16;
#endif
      }
      while (!mask);

#ifndef Z7_LZX_X86_FILTER_CALC_IN_LOOP
      mask -= mask1;
      mask |= mask1 << 16;
#endif
    }

#ifdef Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
    for (;;)
    {
      ctz_type index;
      typedef
#ifdef MY_CPU_64BIT
        UInt64
#else
        UInt32
#endif
        SUPER_MASK_INT;
      SUPER_MASK_INT superMask;
      {
        MY_CTZ(index, mask);
        Byte *p2 = p - FILTER_CHUNK_BYTES_OFFSET + (unsigned)index;
        X86_TRANSLATE_PRE(p2)
        superMask = ~(SUPER_MASK_INT)0x1f << index;
        FILTER_RETURN_IF_LIM(p2)
        X86_TRANSLATE_POST(p2)
        mask &= (UInt32)superMask;
      }
      if (mask)
        continue;
      if (index <= FILTER_CHUNK_BYTES_OFFSET)
        break;
      {
        FILTER_MASK_INT mask1;
        GET_E8_MASK(mask, mask1, p)
        mask &=
#ifdef MY_CPU_64BIT
          (UInt32)(superMask >> 32);
#else
          ((FILTER_MASK_INT)0 - 1) << ((int)index - FILTER_CHUNK_BYTES_OFFSET);
#endif
        mask |= mask1 << 16;
      }
      if (!mask)
        break;
    }
#else // ! Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
    {
      // we use the simplest version without a loop:
      // for (;;)
      {
        ctz_type index;
        MY_CTZ(index, mask);
        /*
        printf("\np=%p, mask=%8x, index = %2d, p + index = %x\n",
            (p - 16 * FILTER_NUM_VECTORS_IN_CHUNK), (unsigned)mask,
            (unsigned)index, (unsigned)((unsigned)(ptrdiff_t)(p - 16 * FILTER_NUM_VECTORS_IN_CHUNK) + index));
        */
        p += (size_t)(unsigned)index - FILTER_CHUNK_BYTES_OFFSET;
        FILTER_RETURN_IF_LIM(p)
        // mask &= ~(FILTER_MASK_INT)0x1f << index;  mask >>= index;
        X86_TRANSLATE_PRE(p)
        X86_TRANSLATE_POST(p)
        // if (!mask) break;  // p += 16 * FILTER_NUM_VECTORS_IN_CHUNK;
      }
    }
#endif // ! Z7_LZX_X86_FILTER_USE_SSE2_ALIGNED
  }

#else // ! Z7_LZX_X86_FILTER_USE_SSE2

#define k_Filter_OutBufSize_Add 0
#define k_Filter_OutBufSize_AlignMask 0


  for (;;)
  {
    for (;;)
    {
      if (p[0] == 0xe8) { p += 5; break; }
      if (p[1] == 0xe8) { p += 6; break; }
      if (p[2] == 0xe8) { p += 7; break; }
      p += 4;
      if (p[-1] == 0xe8) { p += 4; break; }
    }
    FILTER_RETURN_IF_LIM(p)
    X86_TRANSLATE_PRE(p)
    X86_TRANSLATE_POST(p)
  }

#endif // ! Z7_LZX_X86_FILTER_USE_SSE2
}


CDecoder::CDecoder() throw():
    _win(NULL),
    _isUncompressedBlock(false),
    _skipByte(false),
    _keepHistory(false),
    _keepHistoryForNext(true),
    _needAlloc(true),
    _wimMode(false),
    _numDictBits(15),
    _unpackBlockSize(0),
    _x86_translationSize(0),
    _x86_buf(NULL),
    _unpackedData(NULL)
{
  {
    // it's better to get empty virtual entries, if mispredicted value can be used:
    memset(_reps, 0, kPosSlotOffset * sizeof(_reps[0]));
    memset(_extra, 0, kPosSlotOffset);
#define SET_NUM_BITS(i) i // #define NUM_BITS_DELTA 31
    _extra[kPosSlotOffset + 0] = SET_NUM_BITS(0);
    _extra[kPosSlotOffset + 1] = SET_NUM_BITS(0);
    // reps[0] = 0 - (kNumReps - 1);
    // reps[1] = 1 - (kNumReps - 1);
    UInt32 a = 2 - (kNumReps - 1);
    UInt32 delta = 1;
    unsigned i;
    for (i = 0; i < kNumLinearPosSlotBits; i++)
    {
      _extra[(size_t)i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
      _extra[(size_t)i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
      _reps [(size_t)i * 2 + 2 + kPosSlotOffset] = a;  a += delta;
      _reps [(size_t)i * 2 + 3 + kPosSlotOffset] = a;  a += delta;
      delta += delta;
    }
    for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
    {
      _extra[(size_t)i + kPosSlotOffset] = SET_NUM_BITS(kNumLinearPosSlotBits);
      _reps [(size_t)i + kPosSlotOffset] = a;
      a += (UInt32)1 << kNumLinearPosSlotBits;
    }
  }
}

CDecoder::~CDecoder() throw()
{
  if (_needAlloc)
    // BigFree
    z7_AlignedFree
    (_win);
  z7_AlignedFree(_x86_buf);
}

HRESULT CDecoder::Flush() throw()
{
  // UInt32 t = _x86_processedSize; for (int y = 0; y < 50; y++) { _x86_processedSize = t; // benchmark: (branch predicted)
  if (_x86_translationSize != 0)
  {
    Byte *destData = _win + _writePos;
    const UInt32 curSize = _pos - _writePos;
    if (_keepHistoryForNext)
    {
      const size_t kChunkSize = (size_t)1 << 15;
      if (curSize > kChunkSize)
        return E_NOTIMPL;
      if (!_x86_buf)
      {
        // (kChunkSize % 32 == 0) is required in some cases, because
        // the filter can read data by 32-bytes chunks in some cases.
        // if (chunk_size > (1 << 15)) is possible, then we must change the code:
        const size_t kAllocSize = kChunkSize + k_Filter_OutBufSize_Add;
        _x86_buf = (Byte *)z7_AlignedAlloc(kAllocSize);
        if (!_x86_buf)
          return E_OUTOFMEMORY;
#if 0 != k_Filter_OutBufSize_Add || \
    0 != k_Filter_OutBufSize_AlignMask
        // x86_Filter4() can read after curSize.
        // So we set all data to zero to prevent reading of uninitialized data:
        memset(_x86_buf, 0, kAllocSize); // optional
#endif
      }
      // for (int yy = 0; yy < 1; yy++) // for debug
      memcpy(_x86_buf, destData, curSize);
      _unpackedData = _x86_buf;
      destData = _x86_buf;
    }
    else
    {
      // x86_Filter4() can overread after (curSize),
      // so we can do memset() after (curSize):
      // k_Filter_OutBufSize_AlignMask also can be used
      // if (!_overDict) memset(destData + curSize, 0, k_Filter_OutBufSize_Add);
    }
    x86_Filter4(destData, curSize, _x86_processedSize - FILTER_PROCESSED_SIZE_DELTA, _x86_translationSize);
    _x86_processedSize += (UInt32)curSize;
    if (_x86_processedSize >= ((UInt32)1 << 30))
      _x86_translationSize = 0;
  }
  // }
  return S_OK;
}



// (NUM_DELTA_BYTES == 2) reduces the code in the main loop.
#if 1
#define NUM_DELTA_BYTES 2
#else
#define NUM_DELTA_BYTES 0
#endif

#define NUM_DELTA_BIT_OFFSET_BITS (NUM_DELTA_BYTES * 8)

#if NUM_DELTA_BIT_OFFSET_BITS > 0
#define DECODE_ERROR_CODE 0
#define IS_OVERFLOW_bitOffset(bo) ((bo) >= 0)
// the ( >= 0) comparison after a bitOffset change gives simpler commands than the ( > 0) comparison
#else
#define DECODE_ERROR_CODE 1
#define IS_OVERFLOW_bitOffset(bo) ((bo) > 0)
#endif
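
// The bitOffset convention used below: (_bitOffset) counts bits relative to
// the end of the input, so it stays negative while unread bits remain.
// Init_BitMode() sets it to (0 - size * 8 - NUM_DELTA_BIT_OFFSET_BITS),
// and IS_OVERFLOW_bitOffset() fires when decoding has run past the end.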

// (numBits != 0)
#define GET_VAL_BASE(numBits) (_value >> (32 - (numBits)))

#define Z7_LZX_HUFF_DECODE(sym, huff, kNumTableBits, move_pos_op, check_op, error_op) \
  Z7_HUFF_DECODE_VAL_IN_HIGH32(sym, huff, kNumHuffmanBits, kNumTableBits, \
      _value, check_op, error_op, move_pos_op, NORMALIZE, bs)

#define Z7_LZX_HUFF_DECODE_CHECK_YES(sym, huff, kNumTableBits, move_pos_op) \
        Z7_LZX_HUFF_DECODE(sym, huff, kNumTableBits, move_pos_op, \
            Z7_HUFF_DECODE_ERROR_SYM_CHECK_YES, { return DECODE_ERROR_CODE; })

#define Z7_LZX_HUFF_DECODE_CHECK_NO(sym, huff, kNumTableBits, move_pos_op) \
        Z7_LZX_HUFF_DECODE(sym, huff, kNumTableBits, move_pos_op, \
            Z7_HUFF_DECODE_ERROR_SYM_CHECK_NO, {})

#define NORMALIZE \
{ \
  const Byte *ptr = _buf + (_bitOffset >> 4) * 2; \
  /* _value = (((UInt32)GetUi16(ptr) << 16) | GetUi16(ptr + 2)) << (_bitOffset & 15); */ \
  const UInt32 v = GetUi32(ptr); \
  _value = rotlFixed(v, ((int)_bitOffset & 15) + 16); \
}
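
// NORMALIZE reloads (_value): it reads the pair of little-endian 16-bit words
// that contains the current bit position, and rotates it so that the next
// unread bit becomes the top bit of (_value). The (_bitOffset & 15) bits that
// were already consumed from the current word wrap around to the low bits,
// which the decode macros never use. This matches the LZX convention of
// MSB-first bit order within a stream of LE 16-bit words.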

#define MOVE_POS(bs, numBits) \
{ \
  _bitOffset += numBits; \
}

#define MOVE_POS_STAT(bs, numBits) \
{ \
  UPDATE_STAT(g_stats_len_levels[numBits]++;) \
  MOVE_POS(bs, numBits); \
}

#define MOVE_POS_CHECK(bs, numBits) \
{ \
  if (IS_OVERFLOW_bitOffset(_bitOffset += numBits)) return DECODE_ERROR_CODE; \
}

#define MOVE_POS_CHECK_STAT(bs, numBits) \
{ \
  UPDATE_STAT(g_stats_main_levels[numBits]++;) \
  MOVE_POS_CHECK(bs, numBits) \
}


// (numBits == 0) is supported

#ifdef Z7_HUFF_USE_64BIT_LIMIT

#define MACRO_ReadBitsBig_pre(numBits) \
{ \
  _bitOffset += (numBits); \
  _value >>= 32 - (numBits); \
}

#else

#define MACRO_ReadBitsBig_pre(numBits) \
{ \
  _bitOffset += (numBits); \
  _value = (UInt32)((UInt32)_value >> 1 >> (31 ^ (numBits))); \
}

#endif


#define MACRO_ReadBitsBig_add(dest) \
  { dest += (UInt32)_value; }

#define MACRO_ReadBitsBig_add3(dest) \
  { dest += (UInt32)(_value) << 3; }
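
// The (<< 3) in MACRO_ReadBitsBig_add3 pre-shifts the verbatim distance bits
// by kNumAlignBits (== 3 in LZX): in aligned-offset blocks the low 3 bits of
// a large distance are decoded separately with the aligned-offset Huffman code.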


// (numBits != 0)
#define MACRO_ReadBits_NonZero(val, numBits) \
{ \
  val = (UInt32)(_value >> (32 - (numBits))); \
  MOVE_POS(bs, numBits); \
  NORMALIZE \
}


struct CBitDecoder
{
  ptrdiff_t _bitOffset;
  const Byte *_buf;

  Z7_FORCE_INLINE
  UInt32 GetVal() const
  {
    const Byte *ptr = _buf + (_bitOffset >> 4) * 2;
    const UInt32 v = GetUi32(ptr);
    return rotlFixed(v, ((int)_bitOffset & 15) + 16);
  }

  Z7_FORCE_INLINE
  bool IsOverRead() const
  {
    return _bitOffset > (int)(0 - NUM_DELTA_BIT_OFFSET_BITS);
  }


  Z7_FORCE_INLINE
  bool WasBitStreamFinishedOK() const
  {
    // we check that all 0-15 unused bits are zeros:
    if (_bitOffset == 0 - NUM_DELTA_BIT_OFFSET_BITS)
      return true;
    if ((_bitOffset + NUM_DELTA_BIT_OFFSET_BITS + 15) & ~(ptrdiff_t)15)
      return false;
    const Byte *ptr = _buf - NUM_DELTA_BYTES - 2;
    if ((UInt16)(GetUi16(ptr) << (_bitOffset & 15)))
      return false;
    return true;
  }

  // (numBits != 0)
  Z7_FORCE_INLINE
  UInt32 ReadBits_NonZero(unsigned numBits) throw()
  {
    const UInt32 val = GetVal() >> (32 - numBits);
    _bitOffset += numBits;
    return val;
  }
};


class CBitByteDecoder: public CBitDecoder
{
  size_t _size;
public:

  Z7_FORCE_INLINE
  void Init_ByteMode(const Byte *data, size_t size)
  {
    _buf = data;
    _size = size;
  }

  Z7_FORCE_INLINE
  void Init_BitMode(const Byte *data, size_t size)
  {
    _size = size & 1;
    size &= ~(size_t)1;
    _buf = data + size + NUM_DELTA_BYTES;
    _bitOffset = 0 - (ptrdiff_t)(size * 8) - NUM_DELTA_BIT_OFFSET_BITS;
  }

  Z7_FORCE_INLINE
  void Switch_To_BitMode()
  {
    Init_BitMode(_buf, _size);
  }

  Z7_FORCE_INLINE
  bool Switch_To_ByteMode()
  {
    /* here we check that the unused bits in the high 16-bit word are zeros.
       If the high word is full (all 16 bits are unused),
       we check that all 16 bits are zeros.
       So we check and skip (1-16) unused bits */
    if ((GetVal() >> (16 + (_bitOffset & 15))) != 0)
      return false;
    _bitOffset += 16;
    _bitOffset &= ~(ptrdiff_t)15;
    if (_bitOffset > 0 - NUM_DELTA_BIT_OFFSET_BITS)
      return false;
    const ptrdiff_t delta = _bitOffset >> 3;
    _size = (size_t)((ptrdiff_t)_size - delta - NUM_DELTA_BYTES);
    _buf += delta;
    // _bitOffset = 0; // optional
    return true;
  }

  Z7_FORCE_INLINE
  size_t GetRem() const { return _size; }

  Z7_FORCE_INLINE
  UInt32 ReadUInt32()
  {
    const Byte *ptr = _buf;
    const UInt32 v = GetUi32(ptr);
    _buf += 4;
    _size -= 4;
    return v;
  }

  Z7_FORCE_INLINE
  void CopyTo(Byte *dest, size_t size)
  {
    memcpy(dest, _buf, size);
    _buf += size;
    _size -= size;
  }

  Z7_FORCE_INLINE
  bool IsOneDirectByteLeft() const
  {
    return GetRem() == 1;
  }

  Z7_FORCE_INLINE
  Byte DirectReadByte()
  {
    _size--;
    return *_buf++;
  }
};
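
// Typical mode switching (as used by ReadTables() / CodeSpec() below):
// Init_BitMode() for a compressed chunk; Switch_To_ByteMode() when an
// uncompressed-block header is met, then ReadUInt32() / CopyTo() for the raw
// data; Switch_To_BitMode() again at the next block header. Init_BitMode()
// keeps the odd trailing byte count in (_size), because the bit stream
// consumes whole 16-bit words.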


// (numBits != 0)
// Z7_FORCE_INLINE
Z7_NO_INLINE
static
UInt32 ReadBits(CBitDecoder &_bitStream, unsigned numBits)
{
  return _bitStream.ReadBits_NonZero(numBits);
}

#define RIF(x) { if (!(x)) return false; }


/*
  The MSVC compiler adds an extra move operation,
  if we access an array with a 32-bit index as in
    array[calc_index_32_bit(32-bit_var)]
  where the calc_index_32_bit operations are: ((unsigned)a >> cnt), &, ^, |
  clang is also affected for ((unsigned)a >> cnt) in a byte array.
*/

// this function can overread the input buffer for 7-17 bytes.
// (levels != levelsEnd)
Z7_NO_INLINE
static ptrdiff_t ReadTable(ptrdiff_t _bitOffset, const Byte *_buf, Byte *levels, const Byte *levelsEnd)
{
  const unsigned kNumTableBits_Level = 7;
  NHuffman::CDecoder256<kNumHuffmanBits, kLevelTableSize, kNumTableBits_Level> _levelDecoder;
  NHuffman::CValueInt _value;
  // optional check to reduce the size of the overread zone:
  if (_bitOffset > (int)0 - (int)NUM_DELTA_BIT_OFFSET_BITS - (int)(kLevelTableSize * kNumLevelBits))
    return DECODE_ERROR_CODE;
  NORMALIZE
  {
    Byte levels2[kLevelTableSize / 4 * 4];
    for (size_t i = 0; i < kLevelTableSize / 4 * 4; i += 4)
    {
      UInt32 val;
      MACRO_ReadBits_NonZero(val, kNumLevelBits * 4)
      levels2[i + 0] = (Byte)((val >> (3 * kNumLevelBits)));
      levels2[i + 1] = (Byte)((val >> (2 * kNumLevelBits)) & ((1u << kNumLevelBits) - 1));
      levels2[i + 2] = (Byte)((Byte)val >> (1 * kNumLevelBits));
      levels2[i + 3] = (Byte)((val) & ((1u << kNumLevelBits) - 1));
    }
    RIF(_levelDecoder.Build(levels2, NHuffman::k_BuildMode_Full))
  }

  do
  {
    unsigned sym;
    Z7_LZX_HUFF_DECODE_CHECK_NO(sym, &_levelDecoder, kNumTableBits_Level, MOVE_POS_CHECK)
    // Z7_HUFF_DECODE_CHECK(sym, &_levelDecoder, kNumHuffmanBits, kNumTableBits_Level, &bitStream, return false)
    // sym = _levelDecoder.Decode(&bitStream);
    // if (!_levelDecoder.Decode_SymCheck_MovePosCheck(&bitStream, sym)) return false;

    if (sym <= kNumHuffmanBits)
    {
      int delta = (int)*levels - (int)sym;
      delta += delta < 0 ? kNumHuffmanBits + 1 : 0;
      *levels++ = (Byte)delta;
      continue;
    }

    unsigned num;
    int symbol;

    if (sym < kLevelSym_Same)
    {
      // sym -= kLevelSym_Zero1;
      MACRO_ReadBits_NonZero(num, kLevelSym_Zero1_NumBits + (sym - kLevelSym_Zero1))
      num += (sym << kLevelSym_Zero1_NumBits) - (kLevelSym_Zero1 << kLevelSym_Zero1_NumBits) + kLevelSym_Zero1_Start;
      symbol = 0;
    }
    // else if (sym != kLevelSym_Same) return DECODE_ERROR_CODE;
    else // (sym == kLevelSym_Same)
    {
      MACRO_ReadBits_NonZero(num, kLevelSym_Same_NumBits)
      num += kLevelSym_Same_Start;
      // + (unsigned)bitStream.ReadBitsSmall(kLevelSym_Same_NumBits);
      // Z7_HUFF_DECODE_CHECK(sym, &_levelDecoder, kNumHuffmanBits, kNumTableBits_Level, &bitStream, return DECODE_ERROR_CODE)
      // if (!_levelDecoder.Decode2(&bitStream, sym)) return DECODE_ERROR_CODE;
      // sym = _levelDecoder.Decode(&bitStream);

      Z7_LZX_HUFF_DECODE_CHECK_NO(sym, &_levelDecoder, kNumTableBits_Level, MOVE_POS)

      if (sym > kNumHuffmanBits) return DECODE_ERROR_CODE;
      symbol = *levels - (int)sym;
      symbol += symbol < 0 ? kNumHuffmanBits + 1 : 0;
    }

    if (num > (size_t)(levelsEnd - levels))
      return DECODE_ERROR_CODE;
    const Byte *limit = levels + num;
    do
      *levels++ = (Byte)symbol;
    while (levels != limit);
  }
  while (levels != levelsEnd);

  return _bitOffset;
}


static const unsigned kPosSlotDelta = 256 / kNumLenSlots - kPosSlotOffset;


#define READ_TABLE(_bitStream, levels, levelsEnd) \
{ \
  _bitStream._bitOffset = ReadTable(_bitStream._bitOffset, _bitStream._buf, levels, levelsEnd); \
  if (_bitStream.IsOverRead()) return false; \
}

// this function can over-read the input buffer for less than 32 bytes
bool CDecoder::ReadTables(CBitByteDecoder &_bitStream) throw()
{
  UPDATE_STAT(g_stats_NumTables++;)
  {
    const unsigned blockType = (unsigned)ReadBits(_bitStream, kBlockType_NumBits);
    // if (blockType > kBlockType_Uncompressed || blockType == 0)
    if ((unsigned)(blockType - 1) > kBlockType_Uncompressed - 1)
      return false;
    _unpackBlockSize = 1u << 15;
    if (!_wimMode || ReadBits(_bitStream, 1) == 0)
    {
      _unpackBlockSize = ReadBits(_bitStream, 16);
      // wimlib supports chunks larger than 32KB (not supported by MS wim).
      if (!_wimMode || _numDictBits >= 16)
      {
        _unpackBlockSize <<= 8;
        _unpackBlockSize |= ReadBits(_bitStream, 8);
      }
    }

    PRF(printf("\nBlockSize = %6d %s ", _unpackBlockSize, (_pos & 1) ? "@@@" : " "));

    _isUncompressedBlock = (blockType == kBlockType_Uncompressed);
    _skipByte = false;

    if (_isUncompressedBlock)
    {
      _skipByte = ((_unpackBlockSize & 1) != 0);
      // printf("\n UncompressedBlock %d", _unpackBlockSize);
      PRF(printf(" UncompressedBlock ");)
      // if (_unpackBlockSize & 1) { PRF(printf(" ######### ")); }
      if (!_bitStream.Switch_To_ByteMode())
        return false;
      if (_bitStream.GetRem() < kNumReps * 4)
        return false;
      for (unsigned i = 0; i < kNumReps; i++)
      {
        const UInt32 rep = _bitStream.ReadUInt32();
        // here we allow only such values for (rep) that can be set also by LZ code:
        if (rep == 0 || rep > _winSize - kNumReps)
          return false;
        _reps[(size_t)i + kPosSlotOffset] = rep;
      }
      // printf("\n");
      return true;
    }

    // _numAlignBits = 64;
    // const UInt32 k_numAlignBits_PosSlots_MAX = 64 + kPosSlotDelta;
    // _numAlignBits_PosSlots = k_numAlignBits_PosSlots_MAX;
    const UInt32 k_numAlignBits_Dist_MAX = (UInt32)(Int32)-1;
    _numAlignBits_Dist = k_numAlignBits_Dist_MAX;
    if (blockType == kBlockType_Aligned)
    {
      Byte levels[kAlignTableSize];
      // unsigned not0 = 0;
      unsigned not3 = 0;
      for (unsigned i = 0; i < kAlignTableSize; i++)
      {
        const unsigned val = ReadBits(_bitStream, kNumAlignLevelBits);
        levels[i] = (Byte)val;
        // not0 |= val;
        not3 |= (val ^ 3);
      }
      // static unsigned number = 0, all = 0; all++;
      // if (!not0) return false; // Build(true) will test this case
      if (not3)
      {
        // _numAlignBits_PosSlots = (kNumAlignBits + 1) * 2 + kPosSlotDelta;
        // _numAlignBits = kNumAlignBits;
        _numAlignBits_Dist = (1u << (kNumAlignBits + 1)) - (kNumReps - 1);
        RIF(_alignDecoder.Build(levels, true)) // full
      }
      // else { number++; if (number % 4 == 0) printf("\nnumber= %u : %u%%", number, number * 100 / all); }
    }
    // if (_numAlignBits_PosSlots == k_numAlignBits_PosSlots_MAX)
    if (_numAlignBits_Dist == k_numAlignBits_Dist_MAX)
    {
      size_t i;
      for (i = 3; i < kNumLinearPosSlotBits; i++)
      {
        _extra[i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
        _extra[i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i));
      }
      for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
        _extra[i + kPosSlotOffset] = (Byte)SET_NUM_BITS(kNumLinearPosSlotBits);
    }
    else
    {
      size_t i;
      for (i = 3; i < kNumLinearPosSlotBits; i++)
      {
        _extra[i * 2 + 2 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i) - 3);
        _extra[i * 2 + 3 + kPosSlotOffset] = (Byte)(SET_NUM_BITS(i) - 3);
      }
      for (i = kNumLinearPosSlotBits * 2 + 2; i < kNumPosSlots; i++)
        _extra[i + kPosSlotOffset] = (Byte)(SET_NUM_BITS(kNumLinearPosSlotBits) - 3);
    }
  }

  READ_TABLE(_bitStream, _mainLevels, _mainLevels + 256)
  READ_TABLE(_bitStream, _mainLevels + 256, _mainLevels + 256 + _numPosLenSlots)
  const unsigned end = 256 + _numPosLenSlots;
  memset(_mainLevels + end, 0, kMainTableSize - end);
  // #define NUM_CYC 1
  // unsigned j; for (j = 0; j < NUM_CYC; j++)
  RIF(_mainDecoder.Build(_mainLevels, NHuffman::k_BuildMode_Full))
  // if (kNumLenSymols_Big_Start)
  memset(_lenLevels, 0, kNumLenSymols_Big_Start);
  READ_TABLE(_bitStream,
      _lenLevels + kNumLenSymols_Big_Start,
      _lenLevels + kNumLenSymols_Big_Start + kNumLenSymbols)
  // for (j = 0; j < NUM_CYC; j++)
  RIF(_lenDecoder.Build(_lenLevels, NHuffman::k_BuildMode_Full_or_Empty))
  return true;
}



static ptrdiff_t CodeLz(CDecoder *dec, size_t next, ptrdiff_t _bitOffset, const Byte *_buf) throw()
{
  {
    Byte *const win = dec->_win;
    const UInt32 winSize = dec->_winSize;
    Byte *pos = win + dec->_pos;
    const Byte * const posEnd = pos + next;
    NHuffman::CValueInt _value;

    NORMALIZE

#if 1
#define HUFF_DEC_PREFIX dec->
#else
    const NHuffman::CDecoder<kNumHuffmanBits, kMainTableSize, kNumTableBits_Main> _mainDecoder = dec->_mainDecoder;
    const NHuffman::CDecoder256<kNumHuffmanBits, kNumLenSymbols, kNumTableBits_Len> _lenDecoder = dec->_lenDecoder;
    const NHuffman::CDecoder7b<kAlignTableSize> _alignDecoder = dec->_alignDecoder;
#define HUFF_DEC_PREFIX
#endif

    do
    {
      unsigned sym;
      // printf("\npos = %6u", pos - win);
      {
        const NHuffman::CDecoder<kNumHuffmanBits, kMainTableSize, kNumTableBits_Main>
            *mainDecoder = & HUFF_DEC_PREFIX _mainDecoder;
        Z7_LZX_HUFF_DECODE_CHECK_NO(sym, mainDecoder, kNumTableBits_Main, MOVE_POS_CHECK_STAT)
      }
      // if (!_mainDecoder.Decode_SymCheck_MovePosCheck(&bitStream, sym)) return DECODE_ERROR_CODE;
      // sym = _mainDecoder.Decode(&bitStream);
      // if (bitStream.WasExtraReadError_Fast()) return DECODE_ERROR_CODE;

      // printf(" sym = %3x", sym);
      UPDATE_STAT(g_stats_main[sym]++;)

      if (sym < 256)
      {
        UPDATE_STAT(g_stats_NumLits++;)
        *pos++ = (Byte)sym;
      }
      else
      {
        // sym -= 256;
        // if (sym >= _numPosLenSlots) return DECODE_ERROR_CODE;
        const unsigned posSlot = sym / kNumLenSlots;
        unsigned len = sym % kNumLenSlots + kMatchMinLen;
        if (len == kNumLenSlots - 1 + kMatchMinLen)
        {
          const NHuffman::CDecoder256<kNumHuffmanBits, kNumLenSymbols, kNumTableBits_Len>
              *lenDecoder = & HUFF_DEC_PREFIX _lenDecoder;
          Z7_LZX_HUFF_DECODE_CHECK_YES(len, lenDecoder, kNumTableBits_Len, MOVE_POS_STAT)
          // if (!_lenDecoder.Decode2(&bitStream, len)) return DECODE_ERROR_CODE;
          // len = _lenDecoder.Decode(&bitStream);
          // if (len >= kNumLenSymbols) return DECODE_ERROR_CODE;
          UPDATE_STAT(g_stats_len[len - kNumLenSymols_Big_Start]++;)
          len += kNumLenSlots - 1 + kMatchMinLen - kNumLenSymols_Big_Start;
        }
        /*
        if ((next -= len) < 0)
          return DECODE_ERROR_CODE;
        */
        UInt32 dist;

        dist = dec->_reps[(size_t)posSlot - kPosSlotDelta];
        if (posSlot < kNumReps + 256 / kNumLenSlots)
        {
          // if (posSlot != kNumReps + kPosSlotDelta)
          // if (posSlot - (kNumReps + kPosSlotDelta + 1) < 2)
          dec->_reps[(size_t)posSlot - kPosSlotDelta] = dec->_reps[kPosSlotOffset];
          /*
          if (posSlot != kPosSlotDelta)
          {
            UInt32 temp = dist;
            if (posSlot == kPosSlotDelta + 1)
            {
              dist = reps[1];
              reps[1] = temp;
            }
            else
            {
              dist = reps[2];
              reps[2] = temp;
            }
            // dist = reps[(size_t)(posSlot) - kPosSlotDelta];
            // reps[(size_t)(posSlot) - kPosSlotDelta] = reps[0];
            // reps[(size_t)(posSlot) - kPosSlotDelta] = temp;
          }
          */
        }
        else // if (posSlot != kNumReps + kPosSlotDelta)
        {
          unsigned numDirectBits;
#if 0
          if (posSlot < kNumPowerPosSlots + kPosSlotDelta)
          {
            numDirectBits = (posSlot - 2 - kPosSlotDelta) >> 1;
            dist = (UInt32)(2 | (posSlot & 1)) << numDirectBits;
          }
          else
          {
            numDirectBits = kNumLinearPosSlotBits;
            dist = (UInt32)(posSlot - 0x22 - kPosSlotDelta) << kNumLinearPosSlotBits;
          }
          dist -= kNumReps - 1;
#else
          numDirectBits = dec->_extra[(size_t)posSlot - kPosSlotDelta];
          // dist = reps[(size_t)(posSlot) - kPosSlotDelta];
#endif
          dec->_reps[kPosSlotOffset + 2] =
          dec->_reps[kPosSlotOffset + 1];
          dec->_reps[kPosSlotOffset + 1] =
          dec->_reps[kPosSlotOffset + 0];

          // dist += val; dist += bitStream.ReadBitsBig(numDirectBits);
          // if (posSlot >= _numAlignBits_PosSlots)
          // if (numDirectBits >= _numAlignBits)
          // if (val >= _numAlignBits_Dist)
          // UInt32 val; MACRO_ReadBitsBig(val, numDirectBits)
          // dist += val;
          // dist += (UInt32)((UInt32)_value >> 1 >> (/* 31 ^ */ (numDirectBits)));
          // MOVE_POS((numDirectBits ^ 31))
          MACRO_ReadBitsBig_pre(numDirectBits)
          // dist += (UInt32)_value;
          if (dist >= dec->_numAlignBits_Dist)
          {
            // if (numDirectBits != _numAlignBits)
            {
              // UInt32 val;
              // dist -= (UInt32)_value;
              MACRO_ReadBitsBig_add3(dist)
              NORMALIZE
              // dist += (val << kNumAlignBits);
              // dist += bitStream.ReadBitsSmall(numDirectBits - kNumAlignBits) << kNumAlignBits;
            }
            {
              // const unsigned alignTemp = _alignDecoder.Decode(&bitStream);
              const NHuffman::CDecoder7b<kAlignTableSize> *alignDecoder = & HUFF_DEC_PREFIX _alignDecoder;
              unsigned alignTemp;
              UPDATE_STAT(g_stats_NumAlign++;)
              Z7_HUFF_DECODER_7B_DECODE(alignTemp, alignDecoder, GET_VAL_BASE, MOVE_POS, bs)
              // NORMALIZE
              // if (alignTemp >= kAlignTableSize) return DECODE_ERROR_CODE;
              dist += alignTemp;
            }
          }
          else
          {
            {
              MACRO_ReadBitsBig_add(dist)
              // dist += bitStream.ReadBitsSmall(numDirectBits - kNumAlignBits) << kNumAlignBits;
            }
          }
          NORMALIZE
          /*
          else
          {
            UInt32 val;
            MACRO_ReadBitsBig(val, numDirectBits)
            dist += val;
            // dist += bitStream.ReadBitsBig(numDirectBits);
          }
          */
        }
        dec->_reps[kPosSlotOffset + 0] = dist;

        Byte *dest = pos;
        if (len > (size_t)(posEnd - pos))
          return DECODE_ERROR_CODE;
        Int32 srcPos = (Int32)(pos - win);
        pos += len;
        srcPos -= (Int32)dist;
        if (srcPos < 0) // fast version
        {
          if (!dec->_overDict)
            return DECODE_ERROR_CODE;
          srcPos &= winSize - 1;
          UInt32 rem = winSize - (UInt32)srcPos;
          if (len > rem)
          {
            len -= rem;
            const Byte *src = win + (UInt32)srcPos;
            do
              *dest++ = *src++;
            while (--rem);
            srcPos = 0;
          }
        }
        CopyLzMatch(dest, win + (UInt32)srcPos, len, dist);
      }
    }
    while (pos != posEnd);

    return _bitOffset;
  }
}




// inSize != 0
// outSize != 0 ???
HRESULT CDecoder::CodeSpec(const Byte *inData, size_t inSize, UInt32 outSize) throw()
{
  // the ((inSize & 1) != 0) case is possible, if the current call will be finished with an Uncompressed Block.
  CBitByteDecoder _bitStream;
  if (_keepHistory && _isUncompressedBlock)
    _bitStream.Init_ByteMode(inData, inSize);
  else
    _bitStream.Init_BitMode(inData, inSize);

  if (!_keepHistory)
  {
    _isUncompressedBlock = false;
    _skipByte = false;
    _unpackBlockSize = 0;
    memset(_mainLevels, 0, sizeof(_mainLevels));
    memset(_lenLevels, 0, sizeof(_lenLevels));
    {
      _x86_translationSize = 12000000;
      if (!_wimMode)
      {
        _x86_translationSize = 0;
        if (ReadBits(_bitStream, 1) != 0)
        {
          UInt32 v = ReadBits(_bitStream, 16) << 16;
          v |= ReadBits(_bitStream, 16);
          _x86_translationSize = v;
        }
      }
      _x86_processedSize = 0;
    }
    _reps[0 + kPosSlotOffset] = 1;
    _reps[1 + kPosSlotOffset] = 1;
    _reps[2 + kPosSlotOffset] = 1;
  }

  while (outSize)
  {
    /*
    // check it for bit mode only:
    if (_bitStream.WasExtraReadError_Fast())
      return S_FALSE;
    */
    if (_unpackBlockSize == 0)
    {
      if (_skipByte)
      {
        if (_bitStream.GetRem() < 1)
          return S_FALSE;
        if (_bitStream.DirectReadByte() != 0)
          return S_FALSE;
      }
      if (_isUncompressedBlock)
        _bitStream.Switch_To_BitMode();
      if (!ReadTables(_bitStream))
        return S_FALSE;
      continue;
    }

    // _unpackBlockSize != 0
    UInt32 next = _unpackBlockSize;
    if (next > outSize)
      next = outSize;
    // next != 0

    // PRF(printf("\nnext = %d", (unsigned)next);)

    if (_isUncompressedBlock)
    {
      if (_bitStream.GetRem() < next)
        return S_FALSE;
      _bitStream.CopyTo(_win + _pos, next);
      _pos += next;
      _unpackBlockSize -= next;
    }
    else
    {
      _unpackBlockSize -= next;
      _bitStream._bitOffset = CodeLz(this, next, _bitStream._bitOffset, _bitStream._buf);
      if (_bitStream.IsOverRead())
        return S_FALSE;
      _pos += next;
    }
    outSize -= next;
  }

  // outSize == 0

  if (_isUncompressedBlock)
  {
    /* we don't know where the skipByte can be placed, if it's the end of a chunk:
         1) in the current chunk - there are such cab archives, if the chunk is last
         2) in the next chunk - are there such archives? */
    if (_unpackBlockSize == 0
        && _skipByte
        // && outSize == 0
        && _bitStream.IsOneDirectByteLeft())
    {
      _skipByte = false;
      if (_bitStream.DirectReadByte() != 0)
        return S_FALSE;
    }
  }

  if (_bitStream.GetRem() != 0)
    return S_FALSE;
  if (!_isUncompressedBlock)
    if (!_bitStream.WasBitStreamFinishedOK())
      return S_FALSE;
  return S_OK;
}


#if k_Filter_OutBufSize_Add > k_Lz_OutBufSize_Add
#define k_OutBufSize_Add k_Filter_OutBufSize_Add
#else
#define k_OutBufSize_Add k_Lz_OutBufSize_Add
#endif

HRESULT CDecoder::Code_WithExceedReadWrite(const Byte *inData, size_t inSize, UInt32 outSize) throw()
{
  if (!_keepHistory)
  {
    _pos = 0;
    _overDict = false;
  }
  else if (_pos == _winSize)
  {
    _pos = 0;
    _overDict = true;
#if k_OutBufSize_Add > 0
    // data after (_winSize) can be used, because we can use overwrite.
    // memset(_win + _winSize, 0, k_OutBufSize_Add);
#endif
  }
  _writePos = _pos;
  _unpackedData = _win + _pos;

  if (outSize > _winSize - _pos)
    return S_FALSE;

  PRF(printf("\ninSize = %d", (unsigned)inSize);)
  PRF(if ((inSize & 1) != 0) printf("---------");)

  if (inSize == 0)
    return S_FALSE;
  const HRESULT res = CodeSpec(inData, inSize, outSize);
  const HRESULT res2 = Flush();
  return (res == S_OK ? res2 : res);
}


HRESULT CDecoder::SetParams2(unsigned numDictBits) throw()
{
  if (numDictBits < kNumDictBits_Min ||
      numDictBits > kNumDictBits_Max)
    return E_INVALIDARG;
  _numDictBits = (Byte)numDictBits;
  const unsigned numPosSlots2 = (numDictBits < 20) ?
      numDictBits : 17 + (1u << (numDictBits - 18));
  _numPosLenSlots = numPosSlots2 * (kNumLenSlots * 2);
  return S_OK;
}


HRESULT CDecoder::Set_DictBits_and_Alloc(unsigned numDictBits) throw()
{
  RINOK(SetParams2(numDictBits))
  const UInt32 newWinSize = (UInt32)1 << numDictBits;
  if (_needAlloc)
  {
    if (!_win || newWinSize != _winSize)
    {
      // BigFree
      z7_AlignedFree
      (_win);
      _winSize = 0;
      const size_t alloc_size = newWinSize + k_OutBufSize_Add;
      _win = (Byte *)
        // BigAlloc
        z7_AlignedAlloc
        (alloc_size);
      if (!_win)
        return E_OUTOFMEMORY;
      // optional:
      memset(_win, 0, alloc_size);
    }
  }
  _winSize = newWinSize;
  return S_OK;
}

}}