xref: /aosp_15_r20/external/lzma/C/LzFind.c (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1 /* LzFind.c -- Match finder for LZ algorithms
2 2024-03-01 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include <string.h>
7 // #include <stdio.h>
8 
9 #include "CpuArch.h"
10 #include "LzFind.h"
11 #include "LzHash.h"
12 
13 #define kBlockMoveAlign       (1 << 7)    // alignment for memmove()
14 #define kBlockSizeAlign       (1 << 16)   // alignment for block allocation
15 #define kBlockSizeReserveMin  (1 << 24)   // it's 1/256 from 4 GB dictinary
16 
17 #define kEmptyHashValue 0
18 
19 #define kMaxValForNormalize ((UInt32)0)
20 // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
21 
22 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
23 
24 #define GET_AVAIL_BYTES(p) \
25   Inline_MatchFinder_GetNumAvailableBytes(p)
26 
27 
28 // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29 #define kFix5HashSize kFix4HashSize
30 
31 /*
32  HASH2_CALC:
33    if (hv) match, then cur[0] and cur[1] also match
34 */
35 #define HASH2_CALC hv = GetUi16(cur);
36 
37 // (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
38 
39 /*
40  HASH3_CALC:
41    if (cur[0]) and (h2) match, then cur[1]            also match
42    if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
43 */
44 #define HASH3_CALC { \
45   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46   h2 = temp & (kHash2Size - 1); \
47   hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
48 
49 #define HASH4_CALC { \
50   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51   h2 = temp & (kHash2Size - 1); \
52   temp ^= ((UInt32)cur[2] << 8); \
53   h3 = temp & (kHash3Size - 1); \
54   hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
55 
56 #define HASH5_CALC { \
57   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58   h2 = temp & (kHash2Size - 1); \
59   temp ^= ((UInt32)cur[2] << 8); \
60   h3 = temp & (kHash3Size - 1); \
61   temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62   /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63   hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
64 
65 #define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
66 
67 
LzInWindow_Free(CMatchFinder * p,ISzAllocPtr alloc)68 static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
69 {
70   // if (!p->directInput)
71   {
72     ISzAlloc_Free(alloc, p->bufBase);
73     p->bufBase = NULL;
74   }
75 }
76 
77 
LzInWindow_Create2(CMatchFinder * p,UInt32 blockSize,ISzAllocPtr alloc)78 static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
79 {
80   if (blockSize == 0)
81     return 0;
82   if (!p->bufBase || p->blockSize != blockSize)
83   {
84     // size_t blockSizeT;
85     LzInWindow_Free(p, alloc);
86     p->blockSize = blockSize;
87     // blockSizeT = blockSize;
88 
89     // printf("\nblockSize = 0x%x\n", blockSize);
90     /*
91     #if defined _WIN64
92     // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
93     // we use UInt32 type for (p->blockSize), because
94     // we don't want to wrap over 4 GiB,
95     // when we use (p->streamPos - p->pos) that is UInt32.
96     if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
97     {
98       blockSizeT = ((size_t)1 << 32);
99       printf("\nchanged to blockSizeT = 4GiB\n");
100     }
101     #endif
102     */
103 
104     p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
105     // printf("\nbufferBase = %p\n", p->bufBase);
106     // return 0; // for debug
107   }
108   return (p->bufBase != NULL);
109 }
110 
MatchFinder_GetPointerToCurrentPos(void * p)111 static const Byte *MatchFinder_GetPointerToCurrentPos(void *p)
112 {
113   return ((CMatchFinder *)p)->buffer;
114 }
115 
MatchFinder_GetNumAvailableBytes(void * p)116 static UInt32 MatchFinder_GetNumAvailableBytes(void *p)
117 {
118   return GET_AVAIL_BYTES((CMatchFinder *)p);
119 }
120 
121 
122 Z7_NO_INLINE
MatchFinder_ReadBlock(CMatchFinder * p)123 static void MatchFinder_ReadBlock(CMatchFinder *p)
124 {
125   if (p->streamEndWasReached || p->result != SZ_OK)
126     return;
127 
128   /* We use (p->streamPos - p->pos) value.
129      (p->streamPos < p->pos) is allowed. */
130 
131   if (p->directInput)
132   {
133     UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
134     if (curSize > p->directInputRem)
135       curSize = (UInt32)p->directInputRem;
136     p->streamPos += curSize;
137     p->directInputRem -= curSize;
138     if (p->directInputRem == 0)
139       p->streamEndWasReached = 1;
140     return;
141   }
142 
143   for (;;)
144   {
145     const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
146     size_t size = (size_t)(p->bufBase + p->blockSize - dest);
147     if (size == 0)
148     {
149       /* we call ReadBlock() after NeedMove() and MoveBlock().
150          NeedMove() and MoveBlock() povide more than (keepSizeAfter)
151          to the end of (blockSize).
152          So we don't execute this branch in normal code flow.
153          We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock().
154       */
155       // p->result = SZ_ERROR_FAIL; // we can show error here
156       return;
157     }
158 
159     // #define kRead 3
160     // if (size > kRead) size = kRead; // for debug
161 
162     /*
163     // we need cast (Byte *)dest.
164     #ifdef __clang__
165       #pragma GCC diagnostic ignored "-Wcast-qual"
166     #endif
167     */
168     p->result = ISeqInStream_Read(p->stream,
169         p->bufBase + (dest - p->bufBase), &size);
170     if (p->result != SZ_OK)
171       return;
172     if (size == 0)
173     {
174       p->streamEndWasReached = 1;
175       return;
176     }
177     p->streamPos += (UInt32)size;
178     if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
179       return;
180     /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function
181          (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
182   }
183 
184   // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
185 }
186 
187 
188 
189 Z7_NO_INLINE
MatchFinder_MoveBlock(CMatchFinder * p)190 void MatchFinder_MoveBlock(CMatchFinder *p)
191 {
192   const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
193   const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
194   p->buffer = p->bufBase + keepBefore;
195   memmove(p->bufBase,
196       p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
197       keepBefore + (size_t)GET_AVAIL_BYTES(p));
198 }
199 
/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   can fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize),
     the condition (p->blockSize > dataSize + p->keepSizeAfter) is met,
     so there is no MoveBlock() in that case.
*/
207 
MatchFinder_NeedMove(CMatchFinder * p)208 int MatchFinder_NeedMove(CMatchFinder *p)
209 {
210   if (p->directInput)
211     return 0;
212   if (p->streamEndWasReached || p->result != SZ_OK)
213     return 0;
214   return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
215 }
216 
MatchFinder_ReadIfRequired(CMatchFinder * p)217 void MatchFinder_ReadIfRequired(CMatchFinder *p)
218 {
219   if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
220     MatchFinder_ReadBlock(p);
221 }
222 
223 
224 
MatchFinder_SetDefaultSettings(CMatchFinder * p)225 static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
226 {
227   p->cutValue = 32;
228   p->btMode = 1;
229   p->numHashBytes = 4;
230   p->numHashBytes_Min = 2;
231   p->numHashOutBits = 0;
232   p->bigHash = 0;
233 }
234 
235 #define kCrcPoly 0xEDB88320
236 
MatchFinder_Construct(CMatchFinder * p)237 void MatchFinder_Construct(CMatchFinder *p)
238 {
239   unsigned i;
240   p->buffer = NULL;
241   p->bufBase = NULL;
242   p->directInput = 0;
243   p->stream = NULL;
244   p->hash = NULL;
245   p->expectedDataSize = (UInt64)(Int64)-1;
246   MatchFinder_SetDefaultSettings(p);
247 
248   for (i = 0; i < 256; i++)
249   {
250     UInt32 r = (UInt32)i;
251     unsigned j;
252     for (j = 0; j < 8; j++)
253       r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
254     p->crc[i] = r;
255   }
256 }
257 
258 #undef kCrcPoly
259 
MatchFinder_FreeThisClassMemory(CMatchFinder * p,ISzAllocPtr alloc)260 static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
261 {
262   ISzAlloc_Free(alloc, p->hash);
263   p->hash = NULL;
264 }
265 
MatchFinder_Free(CMatchFinder * p,ISzAllocPtr alloc)266 void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
267 {
268   MatchFinder_FreeThisClassMemory(p, alloc);
269   LzInWindow_Free(p, alloc);
270 }
271 
AllocRefs(size_t num,ISzAllocPtr alloc)272 static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
273 {
274   const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
275   if (sizeInBytes / sizeof(CLzRef) != num)
276     return NULL;
277   return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
278 }
279 
280 #if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
281   #error Stop_Compiling_Bad_Reserve
282 #endif
283 
284 
285 
GetBlockSize(CMatchFinder * p,UInt32 historySize)286 static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
287 {
288   UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
289   /*
290   if (historySize > kMaxHistorySize)
291     return 0;
292   */
293   // printf("\nhistorySize == 0x%x\n", historySize);
294 
295   if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore)  // if 32-bit overflow
296     return 0;
297 
298   {
299     const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
300     const UInt32 rem = kBlockSizeMax - blockSize;
301     const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
302         + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
303     if (blockSize >= kBlockSizeMax
304         || rem < kBlockSizeReserveMin) // we reject settings that will be slow
305       return 0;
306     if (reserve >= rem)
307       blockSize = kBlockSizeMax;
308     else
309     {
310       blockSize += reserve;
311       blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
312     }
313   }
314   // printf("\n LzFind_blockSize = %x\n", blockSize);
315   // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
316   return blockSize;
317 }
318 
319 
320 // input is historySize
MatchFinder_GetHashMask2(CMatchFinder * p,UInt32 hs)321 static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
322 {
323   if (p->numHashBytes == 2)
324     return (1 << 16) - 1;
325   if (hs != 0)
326     hs--;
327   hs |= (hs >> 1);
328   hs |= (hs >> 2);
329   hs |= (hs >> 4);
330   hs |= (hs >> 8);
331   // we propagated 16 bits in (hs). Low 16 bits must be set later
332   if (hs >= (1 << 24))
333   {
334     if (p->numHashBytes == 3)
335       hs = (1 << 24) - 1;
336     /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
337   }
338   // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
339   hs |= (1 << 16) - 1; /* don't change it! */
340   // bt5: we adjust the size with recommended minimum size
341   if (p->numHashBytes >= 5)
342     hs |= (256 << kLzHash_CrcShift_2) - 1;
343   return hs;
344 }
345 
346 // input is historySize
MatchFinder_GetHashMask(CMatchFinder * p,UInt32 hs)347 static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
348 {
349   if (p->numHashBytes == 2)
350     return (1 << 16) - 1;
351   if (hs != 0)
352     hs--;
353   hs |= (hs >> 1);
354   hs |= (hs >> 2);
355   hs |= (hs >> 4);
356   hs |= (hs >> 8);
357   // we propagated 16 bits in (hs). Low 16 bits must be set later
358   hs >>= 1;
359   if (hs >= (1 << 24))
360   {
361     if (p->numHashBytes == 3)
362       hs = (1 << 24) - 1;
363     else
364       hs >>= 1;
365     /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
366   }
367   // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
368   hs |= (1 << 16) - 1; /* don't change it! */
369   // bt5: we adjust the size with recommended minimum size
370   if (p->numHashBytes >= 5)
371     hs |= (256 << kLzHash_CrcShift_2) - 1;
372   return hs;
373 }
374 
375 
MatchFinder_Create(CMatchFinder * p,UInt32 historySize,UInt32 keepAddBufferBefore,UInt32 matchMaxLen,UInt32 keepAddBufferAfter,ISzAllocPtr alloc)376 int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
377     UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
378     ISzAllocPtr alloc)
379 {
380   /* we need one additional byte in (p->keepSizeBefore),
381      since we use MoveBlock() after (p->pos++) and before dictionary using */
382   // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
383   p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
384 
385   keepAddBufferAfter += matchMaxLen;
386   /* we need (p->keepSizeAfter >= p->numHashBytes) */
387   if (keepAddBufferAfter < p->numHashBytes)
388     keepAddBufferAfter = p->numHashBytes;
389   // keepAddBufferAfter -= 2; // for debug
390   p->keepSizeAfter = keepAddBufferAfter;
391 
392   if (p->directInput)
393     p->blockSize = 0;
394   if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
395   {
396     size_t hashSizeSum;
397     {
398       UInt32 hs;
399       UInt32 hsCur;
400 
401       if (p->numHashOutBits != 0)
402       {
403         unsigned numBits = p->numHashOutBits;
404         const unsigned nbMax =
405             (p->numHashBytes == 2 ? 16 :
406             (p->numHashBytes == 3 ? 24 : 32));
407         if (numBits > nbMax)
408           numBits = nbMax;
409         if (numBits >= 32)
410           hs = (UInt32)0 - 1;
411         else
412           hs = ((UInt32)1 << numBits) - 1;
413         // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
414         hs |= (1 << 16) - 1; /* don't change it! */
415         if (p->numHashBytes >= 5)
416           hs |= (256 << kLzHash_CrcShift_2) - 1;
417         {
418           const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
419           if (hs > hs2)
420             hs = hs2;
421         }
422         hsCur = hs;
423         if (p->expectedDataSize < historySize)
424         {
425           const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
426           if (hsCur > hs2)
427             hsCur = hs2;
428         }
429       }
430       else
431       {
432         hs = MatchFinder_GetHashMask(p, historySize);
433         hsCur = hs;
434         if (p->expectedDataSize < historySize)
435         {
436           hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
437           if (hsCur > hs) // is it possible?
438             hsCur = hs;
439         }
440       }
441 
442       p->hashMask = hsCur;
443 
444       hashSizeSum = hs;
445       hashSizeSum++;
446       if (hashSizeSum < hs)
447         return 0;
448       {
449         UInt32 fixedHashSize = 0;
450         if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
451         if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
452         // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
453         hashSizeSum += fixedHashSize;
454         p->fixedHashSize = fixedHashSize;
455       }
456     }
457 
458     p->matchMaxLen = matchMaxLen;
459 
460     {
461       size_t newSize;
462       size_t numSons;
463       const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
464       p->historySize = historySize;
465       p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)
466 
467       numSons = newCyclicBufferSize;
468       if (p->btMode)
469         numSons <<= 1;
470       newSize = hashSizeSum + numSons;
471 
472       if (numSons < newCyclicBufferSize || newSize < numSons)
473         return 0;
474 
475       // aligned size is not required here, but it can be better for some loops
476       #define NUM_REFS_ALIGN_MASK 0xF
477       newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;
478 
479       // 22.02: we don't reallocate buffer, if old size is enough
480       if (p->hash && p->numRefs >= newSize)
481         return 1;
482 
483       MatchFinder_FreeThisClassMemory(p, alloc);
484       p->numRefs = newSize;
485       p->hash = AllocRefs(newSize, alloc);
486 
487       if (p->hash)
488       {
489         p->son = p->hash + hashSizeSum;
490         return 1;
491       }
492     }
493   }
494 
495   MatchFinder_Free(p, alloc);
496   return 0;
497 }
498 
499 
MatchFinder_SetLimits(CMatchFinder * p)500 static void MatchFinder_SetLimits(CMatchFinder *p)
501 {
502   UInt32 k;
503   UInt32 n = kMaxValForNormalize - p->pos;
504   if (n == 0)
505     n = (UInt32)(Int32)-1;  // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)
506 
507   k = p->cyclicBufferSize - p->cyclicBufferPos;
508   if (k < n)
509     n = k;
510 
511   k = GET_AVAIL_BYTES(p);
512   {
513     const UInt32 ksa = p->keepSizeAfter;
514     UInt32 mm = p->matchMaxLen;
515     if (k > ksa)
516       k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
517     else if (k >= mm)
518     {
519       // the limitation for (p->lenLimit) update
520       k -= mm;   // optimization : to reduce the number of checks
521       k++;
522       // k = 1; // non-optimized version : for debug
523     }
524     else
525     {
526       mm = k;
527       if (k != 0)
528         k = 1;
529     }
530     p->lenLimit = mm;
531   }
532   if (k < n)
533     n = k;
534 
535   p->posLimit = p->pos + n;
536 }
537 
538 
MatchFinder_Init_LowHash(CMatchFinder * p)539 void MatchFinder_Init_LowHash(CMatchFinder *p)
540 {
541   size_t i;
542   CLzRef *items = p->hash;
543   const size_t numItems = p->fixedHashSize;
544   for (i = 0; i < numItems; i++)
545     items[i] = kEmptyHashValue;
546 }
547 
548 
MatchFinder_Init_HighHash(CMatchFinder * p)549 void MatchFinder_Init_HighHash(CMatchFinder *p)
550 {
551   size_t i;
552   CLzRef *items = p->hash + p->fixedHashSize;
553   const size_t numItems = (size_t)p->hashMask + 1;
554   for (i = 0; i < numItems; i++)
555     items[i] = kEmptyHashValue;
556 }
557 
558 
MatchFinder_Init_4(CMatchFinder * p)559 void MatchFinder_Init_4(CMatchFinder *p)
560 {
561   if (!p->directInput)
562     p->buffer = p->bufBase;
563   {
564     /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
565        the code in CMatchFinderMt expects (pos = 1) */
566     p->pos =
567     p->streamPos =
568         1; // it's smallest optimal value. do not change it
569         // 0; // for debug
570   }
571   p->result = SZ_OK;
572   p->streamEndWasReached = 0;
573 }
574 
575 
576 // (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
577 #define CYC_TO_POS_OFFSET 0
578 // #define CYC_TO_POS_OFFSET 1 // for debug
579 
MatchFinder_Init(void * _p)580 void MatchFinder_Init(void *_p)
581 {
582   CMatchFinder *p = (CMatchFinder *)_p;
583   MatchFinder_Init_HighHash(p);
584   MatchFinder_Init_LowHash(p);
585   MatchFinder_Init_4(p);
586   // if (readData)
587   MatchFinder_ReadBlock(p);
588 
589   /* if we init (cyclicBufferPos = pos), then we can use one variable
590      instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
591   p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
592   // p->cyclicBufferPos = 0; // smallest value
593   // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
594   MatchFinder_SetLimits(p);
595 }
596 
597 
598 
599 #ifdef MY_CPU_X86_OR_AMD64
600   #if defined(__clang__) && (__clang_major__ >= 4) \
601     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
602     // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
603 
604       #define USE_LZFIND_SATUR_SUB_128
605       #define USE_LZFIND_SATUR_SUB_256
606       #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
607       #define LZFIND_ATTRIB_AVX2  __attribute__((__target__("avx2")))
608   #elif defined(_MSC_VER)
609     #if (_MSC_VER >= 1600)
610       #define USE_LZFIND_SATUR_SUB_128
611     #endif
612     #if (_MSC_VER >= 1900)
613       #define USE_LZFIND_SATUR_SUB_256
614     #endif
615   #endif
616 
617 #elif defined(MY_CPU_ARM64) \
618   /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */
619 
620   #if  defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
621     || defined(__GNUC__) && (__GNUC__ >= 6)
622       #define USE_LZFIND_SATUR_SUB_128
623     #ifdef MY_CPU_ARM64
624       // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
625     #else
626       #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon")))
627     #endif
628 
629   #elif defined(_MSC_VER)
630     #if (_MSC_VER >= 1910)
631       #define USE_LZFIND_SATUR_SUB_128
632     #endif
633   #endif
634 
635   #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
636     #include <arm64_neon.h>
637   #else
638     #include <arm_neon.h>
639   #endif
640 
641 #endif
642 
643 
644 #ifdef USE_LZFIND_SATUR_SUB_128
645 
646 // #define Z7_SHOW_HW_STATUS
647 
648 #ifdef Z7_SHOW_HW_STATUS
649 #include <stdio.h>
650 #define PRF(x) x
651 PRF(;)
652 #else
653 #define PRF(x)
654 #endif
655 
656 
657 #ifdef MY_CPU_ARM_OR_ARM64
658 
659 #ifdef MY_CPU_ARM64
660 // #define FORCE_LZFIND_SATUR_SUB_128
661 #endif
662 typedef uint32x4_t LzFind_v128;
663 #define SASUB_128_V(v, s) \
664   vsubq_u32(vmaxq_u32(v, s), s)
665 
666 #else // MY_CPU_ARM_OR_ARM64
667 
668 #include <smmintrin.h> // sse4.1
669 
670 typedef __m128i LzFind_v128;
671 // SSE 4.1
672 #define SASUB_128_V(v, s)   \
673   _mm_sub_epi32(_mm_max_epu32(v, s), s)
674 
675 #endif // MY_CPU_ARM_OR_ARM64
676 
677 
678 #define SASUB_128(i) \
679   *(      LzFind_v128 *)(      void *)(items + (i) * 4) = SASUB_128_V( \
680   *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
681 
682 
683 Z7_NO_INLINE
684 static
685 #ifdef LZFIND_ATTRIB_SSE41
686 LZFIND_ATTRIB_SSE41
687 #endif
688 void
689 Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue,CLzRef * items,const CLzRef * lim)690 LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
691 {
692   const LzFind_v128 sub2 =
693     #ifdef MY_CPU_ARM_OR_ARM64
694       vdupq_n_u32(subValue);
695     #else
696       _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
697     #endif
698   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
699   do
700   {
701     SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
702     SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
703   }
704   while (items != lim);
705 }
706 
707 
708 
709 #ifdef USE_LZFIND_SATUR_SUB_256
710 
711 #include <immintrin.h> // avx
712 /*
713 clang :immintrin.h uses
714 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
715     defined(__AVX2__)
716 #include <avx2intrin.h>
717 #endif
718 so we need <avxintrin.h> for clang-cl */
719 
720 #if defined(__clang__)
721 #include <avxintrin.h>
722 #include <avx2intrin.h>
723 #endif
724 
725 // AVX2:
726 #define SASUB_256(i) \
727     *(      __m256i *)(      void *)(items + (i) * 8) = \
728    _mm256_sub_epi32(_mm256_max_epu32( \
729     *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
730 
731 Z7_NO_INLINE
732 static
733 #ifdef LZFIND_ATTRIB_AVX2
734 LZFIND_ATTRIB_AVX2
735 #endif
736 void
737 Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue,CLzRef * items,const CLzRef * lim)738 LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
739 {
740   const __m256i sub2 = _mm256_set_epi32(
741       (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
742       (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
743   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
744   do
745   {
746     SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
747     SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
748   }
749   while (items != lim);
750 }
751 #endif // USE_LZFIND_SATUR_SUB_256
752 
753 #ifndef FORCE_LZFIND_SATUR_SUB_128
754 typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
755     UInt32 subValue, CLzRef *items, const CLzRef *lim);
756 static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
757 #endif // FORCE_LZFIND_SATUR_SUB_128
758 
759 #endif // USE_LZFIND_SATUR_SUB_128
760 
761 
762 // kEmptyHashValue must be zero
763 // #define SASUB_32(i)  { UInt32 v = items[i];  UInt32 m = v - subValue;  if (v < subValue) m = kEmptyHashValue;  items[i] = m; }
764 #define SASUB_32(i)  { UInt32 v = items[i];  if (v < subValue) v = subValue; items[i] = v - subValue; }
765 
766 #ifdef FORCE_LZFIND_SATUR_SUB_128
767 
768 #define DEFAULT_SaturSub LzFind_SaturSub_128
769 
770 #else
771 
772 #define DEFAULT_SaturSub LzFind_SaturSub_32
773 
774 Z7_NO_INLINE
775 static
776 void
777 Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue,CLzRef * items,const CLzRef * lim)778 LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
779 {
780   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
781   do
782   {
783     SASUB_32(0)  SASUB_32(1)  items += 2;
784     SASUB_32(0)  SASUB_32(1)  items += 2;
785     SASUB_32(0)  SASUB_32(1)  items += 2;
786     SASUB_32(0)  SASUB_32(1)  items += 2;
787   }
788   while (items != lim);
789 }
790 
791 #endif
792 
793 
794 Z7_NO_INLINE
MatchFinder_Normalize3(UInt32 subValue,CLzRef * items,size_t numItems)795 void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
796 {
797   #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
798   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
799   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
800   {
801     SASUB_32(0)
802     items++;
803   }
804   {
805     const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
806     CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
807     numItems &= k_Align_Mask;
808     if (items != lim)
809     {
810       #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
811         if (g_LzFind_SaturSub)
812           g_LzFind_SaturSub(subValue, items, lim);
813         else
814       #endif
815           DEFAULT_SaturSub(subValue, items, lim);
816     }
817     items = lim;
818   }
819   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
820   for (; numItems != 0; numItems--)
821   {
822     SASUB_32(0)
823     items++;
824   }
825 }
826 
827 
828 
829 // call MatchFinder_CheckLimits() only after (p->pos++) update
830 
831 Z7_NO_INLINE
MatchFinder_CheckLimits(CMatchFinder * p)832 static void MatchFinder_CheckLimits(CMatchFinder *p)
833 {
834   if (// !p->streamEndWasReached && p->result == SZ_OK &&
835       p->keepSizeAfter == GET_AVAIL_BYTES(p))
836   {
837     // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
838     if (MatchFinder_NeedMove(p))
839       MatchFinder_MoveBlock(p);
840     MatchFinder_ReadBlock(p);
841   }
842 
843   if (p->pos == kMaxValForNormalize)
844   if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
845     /*
846        if we disable normalization for last bytes of data, and
847        if (data_size == 4 GiB), we don't call wastfull normalization,
848        but (pos) will be wrapped over Zero (0) in that case.
849        And we cannot resume later to normal operation
850     */
851   {
852     // MatchFinder_Normalize(p);
853     /* after normalization we need (p->pos >= p->historySize + 1); */
854     /* we can reduce subValue to aligned value, if want to keep alignment
855        of (p->pos) and (p->buffer) for speculated accesses. */
856     const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
857     // const UInt32 subValue = (1 << 15); // for debug
858     // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
859     MatchFinder_REDUCE_OFFSETS(p, subValue)
860     MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
861     {
862       size_t numSonRefs = p->cyclicBufferSize;
863       if (p->btMode)
864         numSonRefs <<= 1;
865       MatchFinder_Normalize3(subValue, p->son, numSonRefs);
866     }
867   }
868 
869   if (p->cyclicBufferPos == p->cyclicBufferSize)
870     p->cyclicBufferPos = 0;
871 
872   MatchFinder_SetLimits(p);
873 }
874 
875 
876 /*
877   (lenLimit > maxLen)
878 */
879 Z7_FORCE_INLINE
Hc_GetMatchesSpec(size_t lenLimit,UInt32 curMatch,UInt32 pos,const Byte * cur,CLzRef * son,size_t _cyclicBufferPos,UInt32 _cyclicBufferSize,UInt32 cutValue,UInt32 * d,unsigned maxLen)880 static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
881     size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
882     UInt32 *d, unsigned maxLen)
883 {
884   /*
885   son[_cyclicBufferPos] = curMatch;
886   for (;;)
887   {
888     UInt32 delta = pos - curMatch;
889     if (cutValue-- == 0 || delta >= _cyclicBufferSize)
890       return d;
891     {
892       const Byte *pb = cur - delta;
893       curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
894       if (pb[maxLen] == cur[maxLen] && *pb == *cur)
895       {
896         UInt32 len = 0;
897         while (++len != lenLimit)
898           if (pb[len] != cur[len])
899             break;
900         if (maxLen < len)
901         {
902           maxLen = len;
903           *d++ = len;
904           *d++ = delta - 1;
905           if (len == lenLimit)
906             return d;
907         }
908       }
909     }
910   }
911   */
912 
913   const Byte *lim = cur + lenLimit;
914   son[_cyclicBufferPos] = curMatch;
915 
916   do
917   {
918     UInt32 delta;
919 
920     if (curMatch == 0)
921       break;
922     // if (curMatch2 >= curMatch) return NULL;
923     delta = pos - curMatch;
924     if (delta >= _cyclicBufferSize)
925       break;
926     {
927       ptrdiff_t diff;
928       curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
929       diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
930       if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
931       {
932         const Byte *c = cur;
933         while (*c == c[diff])
934         {
935           if (++c == lim)
936           {
937             d[0] = (UInt32)(lim - cur);
938             d[1] = delta - 1;
939             return d + 2;
940           }
941         }
942         {
943           const unsigned len = (unsigned)(c - cur);
944           if (maxLen < len)
945           {
946             maxLen = len;
947             d[0] = (UInt32)len;
948             d[1] = delta - 1;
949             d += 2;
950           }
951         }
952       }
953     }
954   }
955   while (--cutValue);
956 
957   return d;
958 }
959 
960 
961 Z7_FORCE_INLINE
GetMatchesSpec1(UInt32 lenLimit,UInt32 curMatch,UInt32 pos,const Byte * cur,CLzRef * son,size_t _cyclicBufferPos,UInt32 _cyclicBufferSize,UInt32 cutValue,UInt32 * d,UInt32 maxLen)962 UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
963     size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
964     UInt32 *d, UInt32 maxLen)
965 {
966   CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
967   CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
968   unsigned len0 = 0, len1 = 0;
969 
970   UInt32 cmCheck;
971 
972   // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
973 
974   cmCheck = (UInt32)(pos - _cyclicBufferSize);
975   if ((UInt32)pos <= _cyclicBufferSize)
976     cmCheck = 0;
977 
978   if (cmCheck < curMatch)
979   do
980   {
981     const UInt32 delta = pos - curMatch;
982     {
983       CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
984       const Byte *pb = cur - delta;
985       unsigned len = (len0 < len1 ? len0 : len1);
986       const UInt32 pair0 = pair[0];
987       if (pb[len] == cur[len])
988       {
989         if (++len != lenLimit && pb[len] == cur[len])
990           while (++len != lenLimit)
991             if (pb[len] != cur[len])
992               break;
993         if (maxLen < len)
994         {
995           maxLen = (UInt32)len;
996           *d++ = (UInt32)len;
997           *d++ = delta - 1;
998           if (len == lenLimit)
999           {
1000             *ptr1 = pair0;
1001             *ptr0 = pair[1];
1002             return d;
1003           }
1004         }
1005       }
1006       if (pb[len] < cur[len])
1007       {
1008         *ptr1 = curMatch;
1009         // const UInt32 curMatch2 = pair[1];
1010         // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue;  return NULL; }
1011         // curMatch = curMatch2;
1012         curMatch = pair[1];
1013         ptr1 = pair + 1;
1014         len1 = len;
1015       }
1016       else
1017       {
1018         *ptr0 = curMatch;
1019         curMatch = pair[0];
1020         ptr0 = pair;
1021         len0 = len;
1022       }
1023     }
1024   }
1025   while(--cutValue && cmCheck < curMatch);
1026 
1027   *ptr0 = *ptr1 = kEmptyHashValue;
1028   return d;
1029 }
1030 
1031 
/* SkipMatchesSpec() re-links the binary tree for position (pos) exactly as
   GetMatchesSpec1() does, but records no matches — used when the encoder
   skips bytes that are already covered by a chosen match. */
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;

  UInt32 cmCheck;

  /* smallest position still inside the dictionary window */
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos ||  // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            /* Full-length match: adopt this node's children and stop. */
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while (--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return;
}
1089 
1090 
/* Advance the match finder by one byte; refill/normalize when the
   position limit is reached. */
#define MOVE_POS \
  p->cyclicBufferPos++; \
  p->buffer++; \
  { const UInt32 pos1 = p->pos + 1; \
    p->pos = pos1; \
    if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }

#define MOVE_POS_RET MOVE_POS return distances;
1099 
1100 Z7_NO_INLINE
MatchFinder_MovePos(CMatchFinder * p)1101 static void MatchFinder_MovePos(CMatchFinder *p)
1102 {
1103   /* we go here at the end of stream data, when (avail < num_hash_bytes)
1104      We don't update sons[cyclicBufferPos << btMode].
1105      So (sons) record will contain junk. And we cannot resume match searching
1106      to normal operation, even if we will provide more input data in buffer.
1107      p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
1108      if (p->btMode)
1109         p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
1110   */
1111   MOVE_POS
1112 }
1113 
/* Common prologue for GetMatches/Skip handlers: declares (hv, cur, curMatch),
   loads lenLimit, and bails out via (ret_op) when fewer than (minLen) bytes
   are available. */
#define GET_MATCHES_HEADER2(minLen, ret_op) \
  UInt32 hv; const Byte *cur; UInt32 curMatch; \
  UInt32 lenLimit = p->lenLimit; \
  if (lenLimit < minLen) { MatchFinder_MovePos(p);  ret_op; } \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
#define SKIP_HEADER(minLen)  \
  do { GET_MATCHES_HEADER2(minLen, continue)

/* Argument list shared by GetMatchesSpec1 / SkipMatchesSpec / Hc_GetMatchesSpec. */
#define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, \
    p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

#define SKIP_FOOTER  \
    SkipMatchesSpec(MF_PARAMS(p)); \
    MOVE_POS \
  } while (--num);

#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \
  MOVE_POS_RET

#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)



/* Extend maxLen by comparing (cur) against (cur - d2), up to lenLimit. */
#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }
1150 
/* Binary-tree match finder with a 2-byte hash. */
static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(1)
}
1160 
/* Binary-tree match finder using the zip-style 3-byte hash (Deflate support). */
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)
}
1169 
1170 
/* mmm = max valid match distance: min(cyclicBufferSize, pos). */
#define SET_mmm  \
  mmm = p->cyclicBufferSize; \
  if (pos < mmm) \
    mmm = pos;
1176 
/* Binary-tree match finder with 2- and 3-byte hash tables.
   The 2-byte hash supplies a quick candidate (d2) before the tree walk. */
static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;

  if (d2 < mmm && *(cur - d2) == *cur)
  {
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      /* Longest possible match found: update the tree without searching. */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1217 
1218 
/* Binary-tree match finder with 2-, 3- and 4-byte hash tables.
   Short candidates from the 2/3-byte hashes (d2, d3) are checked first. */
static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* Longest possible match found: update the tree without searching. */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1286 
1287 
/* Binary-tree match finder with 2-, 3- and 5-byte hash tables. */
static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* Longest possible match found: update the tree without searching. */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1360 
1361 
/* Hash-chain match finder with 2-, 3- and 4-byte hash tables. */
static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* Longest possible match found: just link the chain and return. */
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1429 
1430 
/* Hash-chain match finder with 2-, 3- and 5-byte hash tables. */
static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* Longest possible match found: just link the chain and return. */
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1503 
1504 
/* Hash-chain match finder using the zip-style 3-byte hash (Deflate support). */
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)
}
1513 
1514 
/* Skip (num) positions for the 2-byte-hash binary-tree finder. */
static void Bt2_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1526 
/* Skip (num) positions for the zip-hash binary-tree finder. */
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1537 
/* Skip (num) positions for the 3-byte-hash binary-tree finder,
   keeping all hash tables up to date. */
static void Bt3_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1553 
/* Skip (num) positions for the 4-byte-hash binary-tree finder,
   keeping all hash tables up to date. */
static void Bt4_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1570 
/* Skip (num) positions for the 5-byte-hash binary-tree finder,
   keeping all hash tables up to date. */
static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1588 
1589 
/* Batched skip loop for hash-chain finders: processes up to
   (posLimit - pos) positions per refill to avoid per-byte limit checks. */
#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
      son = p->son + cycPos; \
      p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
    UInt32 curMatch; \
    UInt32 hv;


#define HC_SKIP_FOOTER \
    cur++;  pos++;  *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \
1617 
1618 
/* Skip (num) positions for the 4-byte-hash hash-chain finder. */
static void Hc4_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1633 
1634 
/* Skip (num) positions for the 5-byte-hash hash-chain finder. */
static void Hc5_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1650 
1651 
/* Skip (num) positions for the zip-hash hash-chain finder. */
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;

  HC_SKIP_FOOTER
}
1662 
1663 
/* Fill (vTable) with the handler set matching p->btMode / p->numHashBytes:
   hash-chain (Hc4/Hc5) when btMode is off, otherwise binary-tree (Bt2..Bt5). */
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
{
  vTable->Init = MatchFinder_Init;
  vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos;
  if (!p->btMode)
  {
    if (p->numHashBytes <= 4)
    {
      vTable->GetMatches = Hc4_MatchFinder_GetMatches;
      vTable->Skip = Hc4_MatchFinder_Skip;
    }
    else
    {
      vTable->GetMatches = Hc5_MatchFinder_GetMatches;
      vTable->Skip = Hc5_MatchFinder_Skip;
    }
  }
  else if (p->numHashBytes == 2)
  {
    vTable->GetMatches = Bt2_MatchFinder_GetMatches;
    vTable->Skip = Bt2_MatchFinder_Skip;
  }
  else if (p->numHashBytes == 3)
  {
    vTable->GetMatches = Bt3_MatchFinder_GetMatches;
    vTable->Skip = Bt3_MatchFinder_Skip;
  }
  else if (p->numHashBytes == 4)
  {
    vTable->GetMatches = Bt4_MatchFinder_GetMatches;
    vTable->Skip = Bt4_MatchFinder_Skip;
  }
  else
  {
    vTable->GetMatches = Bt5_MatchFinder_GetMatches;
    vTable->Skip = Bt5_MatchFinder_Skip;
  }
}
1703 
1704 
1705 
/* One-time CPU-feature dispatch: selects the SIMD saturating-subtract
   routine (NEON / SSE4.1 / AVX2) used by hash normalization, when the
   build enables runtime selection. */
void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

    #ifdef USE_LZFIND_SATUR_SUB_256
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
    #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}
1742 
1743 
/* Undefine file-local helper macros so they don't leak into unity builds. */
#undef MOVE_POS
#undef MOVE_POS_RET
#undef PRF