xref: /aosp_15_r20/external/zstd/lib/dictBuilder/zdict.c (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1*01826a49SYabin Cui /*
2*01826a49SYabin Cui  * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui  * All rights reserved.
4*01826a49SYabin Cui  *
5*01826a49SYabin Cui  * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui  * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui  * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui  */
10*01826a49SYabin Cui 
11*01826a49SYabin Cui 
12*01826a49SYabin Cui /*-**************************************
13*01826a49SYabin Cui *  Tuning parameters
14*01826a49SYabin Cui ****************************************/
15*01826a49SYabin Cui #define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
16*01826a49SYabin Cui #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17*01826a49SYabin Cui #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
18*01826a49SYabin Cui 
19*01826a49SYabin Cui 
20*01826a49SYabin Cui /*-**************************************
21*01826a49SYabin Cui *  Compiler Options
22*01826a49SYabin Cui ****************************************/
23*01826a49SYabin Cui /* Unix Large Files support (>4GB) */
24*01826a49SYabin Cui #define _FILE_OFFSET_BITS 64
25*01826a49SYabin Cui #if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
26*01826a49SYabin Cui #  ifndef _LARGEFILE_SOURCE
27*01826a49SYabin Cui #  define _LARGEFILE_SOURCE
28*01826a49SYabin Cui #  endif
29*01826a49SYabin Cui #elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
30*01826a49SYabin Cui #  ifndef _LARGEFILE64_SOURCE
31*01826a49SYabin Cui #  define _LARGEFILE64_SOURCE
32*01826a49SYabin Cui #  endif
33*01826a49SYabin Cui #endif
34*01826a49SYabin Cui 
35*01826a49SYabin Cui 
36*01826a49SYabin Cui /*-*************************************
37*01826a49SYabin Cui *  Dependencies
38*01826a49SYabin Cui ***************************************/
39*01826a49SYabin Cui #include <stdlib.h>        /* malloc, free */
40*01826a49SYabin Cui #include <string.h>        /* memset */
41*01826a49SYabin Cui #include <stdio.h>         /* fprintf, fopen, ftello64 */
42*01826a49SYabin Cui #include <time.h>          /* clock */
43*01826a49SYabin Cui 
44*01826a49SYabin Cui #ifndef ZDICT_STATIC_LINKING_ONLY
45*01826a49SYabin Cui #  define ZDICT_STATIC_LINKING_ONLY
46*01826a49SYabin Cui #endif
47*01826a49SYabin Cui 
48*01826a49SYabin Cui #include "../common/mem.h"           /* read */
49*01826a49SYabin Cui #include "../common/fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
50*01826a49SYabin Cui #include "../common/huf.h"           /* HUF_buildCTable, HUF_writeCTable */
51*01826a49SYabin Cui #include "../common/zstd_internal.h" /* includes zstd.h */
52*01826a49SYabin Cui #include "../common/xxhash.h"        /* XXH64 */
53*01826a49SYabin Cui #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
54*01826a49SYabin Cui #include "../zdict.h"
55*01826a49SYabin Cui #include "divsufsort.h"
56*01826a49SYabin Cui #include "../common/bits.h"          /* ZSTD_NbCommonBytes */
57*01826a49SYabin Cui 
58*01826a49SYabin Cui 
59*01826a49SYabin Cui /*-*************************************
60*01826a49SYabin Cui *  Constants
61*01826a49SYabin Cui ***************************************/
62*01826a49SYabin Cui #define KB *(1 <<10)
63*01826a49SYabin Cui #define MB *(1 <<20)
64*01826a49SYabin Cui #define GB *(1U<<30)
65*01826a49SYabin Cui 
66*01826a49SYabin Cui #define DICTLISTSIZE_DEFAULT 10000
67*01826a49SYabin Cui 
68*01826a49SYabin Cui #define NOISELENGTH 32
69*01826a49SYabin Cui 
70*01826a49SYabin Cui static const U32 g_selectivity_default = 9;
71*01826a49SYabin Cui 
72*01826a49SYabin Cui 
73*01826a49SYabin Cui /*-*************************************
74*01826a49SYabin Cui *  Console display
75*01826a49SYabin Cui ***************************************/
76*01826a49SYabin Cui #undef  DISPLAY
77*01826a49SYabin Cui #define DISPLAY(...)         do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)
78*01826a49SYabin Cui #undef  DISPLAYLEVEL
79*01826a49SYabin Cui #define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0)    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
80*01826a49SYabin Cui 
ZDICT_clockSpan(clock_t nPrevious)81*01826a49SYabin Cui static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
82*01826a49SYabin Cui 
ZDICT_printHex(const void * ptr,size_t length)83*01826a49SYabin Cui static void ZDICT_printHex(const void* ptr, size_t length)
84*01826a49SYabin Cui {
85*01826a49SYabin Cui     const BYTE* const b = (const BYTE*)ptr;
86*01826a49SYabin Cui     size_t u;
87*01826a49SYabin Cui     for (u=0; u<length; u++) {
88*01826a49SYabin Cui         BYTE c = b[u];
89*01826a49SYabin Cui         if (c<32 || c>126) c = '.';   /* non-printable char */
90*01826a49SYabin Cui         DISPLAY("%c", c);
91*01826a49SYabin Cui     }
92*01826a49SYabin Cui }
93*01826a49SYabin Cui 
94*01826a49SYabin Cui 
95*01826a49SYabin Cui /*-********************************************************
96*01826a49SYabin Cui *  Helper functions
97*01826a49SYabin Cui **********************************************************/
ZDICT_isError(size_t errorCode)98*01826a49SYabin Cui unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
99*01826a49SYabin Cui 
ZDICT_getErrorName(size_t errorCode)100*01826a49SYabin Cui const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
101*01826a49SYabin Cui 
ZDICT_getDictID(const void * dictBuffer,size_t dictSize)102*01826a49SYabin Cui unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
103*01826a49SYabin Cui {
104*01826a49SYabin Cui     if (dictSize < 8) return 0;
105*01826a49SYabin Cui     if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
106*01826a49SYabin Cui     return MEM_readLE32((const char*)dictBuffer + 4);
107*01826a49SYabin Cui }
108*01826a49SYabin Cui 
ZDICT_getDictHeaderSize(const void * dictBuffer,size_t dictSize)109*01826a49SYabin Cui size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
110*01826a49SYabin Cui {
111*01826a49SYabin Cui     size_t headerSize;
112*01826a49SYabin Cui     if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
113*01826a49SYabin Cui 
114*01826a49SYabin Cui     {   ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
115*01826a49SYabin Cui         U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
116*01826a49SYabin Cui         if (!bs || !wksp) {
117*01826a49SYabin Cui             headerSize = ERROR(memory_allocation);
118*01826a49SYabin Cui         } else {
119*01826a49SYabin Cui             ZSTD_reset_compressedBlockState(bs);
120*01826a49SYabin Cui             headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
121*01826a49SYabin Cui         }
122*01826a49SYabin Cui 
123*01826a49SYabin Cui         free(bs);
124*01826a49SYabin Cui         free(wksp);
125*01826a49SYabin Cui     }
126*01826a49SYabin Cui 
127*01826a49SYabin Cui     return headerSize;
128*01826a49SYabin Cui }
129*01826a49SYabin Cui 
130*01826a49SYabin Cui /*-********************************************************
131*01826a49SYabin Cui *  Dictionary training functions
132*01826a49SYabin Cui **********************************************************/
133*01826a49SYabin Cui /*! ZDICT_count() :
134*01826a49SYabin Cui     Count the nb of common bytes between 2 pointers.
135*01826a49SYabin Cui     Note : this function presumes end of buffer followed by noisy guard band.
136*01826a49SYabin Cui */
ZDICT_count(const void * pIn,const void * pMatch)137*01826a49SYabin Cui static size_t ZDICT_count(const void* pIn, const void* pMatch)
138*01826a49SYabin Cui {
139*01826a49SYabin Cui     const char* const pStart = (const char*)pIn;
140*01826a49SYabin Cui     for (;;) {
141*01826a49SYabin Cui         size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
142*01826a49SYabin Cui         if (!diff) {
143*01826a49SYabin Cui             pIn = (const char*)pIn+sizeof(size_t);
144*01826a49SYabin Cui             pMatch = (const char*)pMatch+sizeof(size_t);
145*01826a49SYabin Cui             continue;
146*01826a49SYabin Cui         }
147*01826a49SYabin Cui         pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);
148*01826a49SYabin Cui         return (size_t)((const char*)pIn - pStart);
149*01826a49SYabin Cui     }
150*01826a49SYabin Cui }
151*01826a49SYabin Cui 
152*01826a49SYabin Cui 
153*01826a49SYabin Cui typedef struct {
154*01826a49SYabin Cui     U32 pos;
155*01826a49SYabin Cui     U32 length;
156*01826a49SYabin Cui     U32 savings;
157*01826a49SYabin Cui } dictItem;
158*01826a49SYabin Cui 
ZDICT_initDictItem(dictItem * d)159*01826a49SYabin Cui static void ZDICT_initDictItem(dictItem* d)
160*01826a49SYabin Cui {
161*01826a49SYabin Cui     d->pos = 1;
162*01826a49SYabin Cui     d->length = 0;
163*01826a49SYabin Cui     d->savings = (U32)(-1);
164*01826a49SYabin Cui }
165*01826a49SYabin Cui 
166*01826a49SYabin Cui 
167*01826a49SYabin Cui #define LLIMIT 64          /* heuristic determined experimentally */
168*01826a49SYabin Cui #define MINMATCHLENGTH 7   /* heuristic determined experimentally */
ZDICT_analyzePos(BYTE * doneMarks,const int * suffix,U32 start,const void * buffer,U32 minRatio,U32 notificationLevel)169*01826a49SYabin Cui static dictItem ZDICT_analyzePos(
170*01826a49SYabin Cui                        BYTE* doneMarks,
171*01826a49SYabin Cui                        const int* suffix, U32 start,
172*01826a49SYabin Cui                        const void* buffer, U32 minRatio, U32 notificationLevel)
173*01826a49SYabin Cui {
174*01826a49SYabin Cui     U32 lengthList[LLIMIT] = {0};
175*01826a49SYabin Cui     U32 cumulLength[LLIMIT] = {0};
176*01826a49SYabin Cui     U32 savings[LLIMIT] = {0};
177*01826a49SYabin Cui     const BYTE* b = (const BYTE*)buffer;
178*01826a49SYabin Cui     size_t maxLength = LLIMIT;
179*01826a49SYabin Cui     size_t pos = (size_t)suffix[start];
180*01826a49SYabin Cui     U32 end = start;
181*01826a49SYabin Cui     dictItem solution;
182*01826a49SYabin Cui 
183*01826a49SYabin Cui     /* init */
184*01826a49SYabin Cui     memset(&solution, 0, sizeof(solution));
185*01826a49SYabin Cui     doneMarks[pos] = 1;
186*01826a49SYabin Cui 
187*01826a49SYabin Cui     /* trivial repetition cases */
188*01826a49SYabin Cui     if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
189*01826a49SYabin Cui        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
190*01826a49SYabin Cui        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
191*01826a49SYabin Cui         /* skip and mark segment */
192*01826a49SYabin Cui         U16 const pattern16 = MEM_read16(b+pos+4);
193*01826a49SYabin Cui         U32 u, patternEnd = 6;
194*01826a49SYabin Cui         while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
195*01826a49SYabin Cui         if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
196*01826a49SYabin Cui         for (u=1; u<patternEnd; u++)
197*01826a49SYabin Cui             doneMarks[pos+u] = 1;
198*01826a49SYabin Cui         return solution;
199*01826a49SYabin Cui     }
200*01826a49SYabin Cui 
201*01826a49SYabin Cui     /* look forward */
202*01826a49SYabin Cui     {   size_t length;
203*01826a49SYabin Cui         do {
204*01826a49SYabin Cui             end++;
205*01826a49SYabin Cui             length = ZDICT_count(b + pos, b + suffix[end]);
206*01826a49SYabin Cui         } while (length >= MINMATCHLENGTH);
207*01826a49SYabin Cui     }
208*01826a49SYabin Cui 
209*01826a49SYabin Cui     /* look backward */
210*01826a49SYabin Cui     {   size_t length;
211*01826a49SYabin Cui         do {
212*01826a49SYabin Cui             length = ZDICT_count(b + pos, b + *(suffix+start-1));
213*01826a49SYabin Cui             if (length >=MINMATCHLENGTH) start--;
214*01826a49SYabin Cui         } while(length >= MINMATCHLENGTH);
215*01826a49SYabin Cui     }
216*01826a49SYabin Cui 
217*01826a49SYabin Cui     /* exit if not found a minimum nb of repetitions */
218*01826a49SYabin Cui     if (end-start < minRatio) {
219*01826a49SYabin Cui         U32 idx;
220*01826a49SYabin Cui         for(idx=start; idx<end; idx++)
221*01826a49SYabin Cui             doneMarks[suffix[idx]] = 1;
222*01826a49SYabin Cui         return solution;
223*01826a49SYabin Cui     }
224*01826a49SYabin Cui 
225*01826a49SYabin Cui     {   int i;
226*01826a49SYabin Cui         U32 mml;
227*01826a49SYabin Cui         U32 refinedStart = start;
228*01826a49SYabin Cui         U32 refinedEnd = end;
229*01826a49SYabin Cui 
230*01826a49SYabin Cui         DISPLAYLEVEL(4, "\n");
231*01826a49SYabin Cui         DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
232*01826a49SYabin Cui         DISPLAYLEVEL(4, "\n");
233*01826a49SYabin Cui 
234*01826a49SYabin Cui         for (mml = MINMATCHLENGTH ; ; mml++) {
235*01826a49SYabin Cui             BYTE currentChar = 0;
236*01826a49SYabin Cui             U32 currentCount = 0;
237*01826a49SYabin Cui             U32 currentID = refinedStart;
238*01826a49SYabin Cui             U32 id;
239*01826a49SYabin Cui             U32 selectedCount = 0;
240*01826a49SYabin Cui             U32 selectedID = currentID;
241*01826a49SYabin Cui             for (id =refinedStart; id < refinedEnd; id++) {
242*01826a49SYabin Cui                 if (b[suffix[id] + mml] != currentChar) {
243*01826a49SYabin Cui                     if (currentCount > selectedCount) {
244*01826a49SYabin Cui                         selectedCount = currentCount;
245*01826a49SYabin Cui                         selectedID = currentID;
246*01826a49SYabin Cui                     }
247*01826a49SYabin Cui                     currentID = id;
248*01826a49SYabin Cui                     currentChar = b[ suffix[id] + mml];
249*01826a49SYabin Cui                     currentCount = 0;
250*01826a49SYabin Cui                 }
251*01826a49SYabin Cui                 currentCount ++;
252*01826a49SYabin Cui             }
253*01826a49SYabin Cui             if (currentCount > selectedCount) {  /* for last */
254*01826a49SYabin Cui                 selectedCount = currentCount;
255*01826a49SYabin Cui                 selectedID = currentID;
256*01826a49SYabin Cui             }
257*01826a49SYabin Cui 
258*01826a49SYabin Cui             if (selectedCount < minRatio)
259*01826a49SYabin Cui                 break;
260*01826a49SYabin Cui             refinedStart = selectedID;
261*01826a49SYabin Cui             refinedEnd = refinedStart + selectedCount;
262*01826a49SYabin Cui         }
263*01826a49SYabin Cui 
264*01826a49SYabin Cui         /* evaluate gain based on new dict */
265*01826a49SYabin Cui         start = refinedStart;
266*01826a49SYabin Cui         pos = suffix[refinedStart];
267*01826a49SYabin Cui         end = start;
268*01826a49SYabin Cui         memset(lengthList, 0, sizeof(lengthList));
269*01826a49SYabin Cui 
270*01826a49SYabin Cui         /* look forward */
271*01826a49SYabin Cui         {   size_t length;
272*01826a49SYabin Cui             do {
273*01826a49SYabin Cui                 end++;
274*01826a49SYabin Cui                 length = ZDICT_count(b + pos, b + suffix[end]);
275*01826a49SYabin Cui                 if (length >= LLIMIT) length = LLIMIT-1;
276*01826a49SYabin Cui                 lengthList[length]++;
277*01826a49SYabin Cui             } while (length >=MINMATCHLENGTH);
278*01826a49SYabin Cui         }
279*01826a49SYabin Cui 
280*01826a49SYabin Cui         /* look backward */
281*01826a49SYabin Cui         {   size_t length = MINMATCHLENGTH;
282*01826a49SYabin Cui             while ((length >= MINMATCHLENGTH) & (start > 0)) {
283*01826a49SYabin Cui                 length = ZDICT_count(b + pos, b + suffix[start - 1]);
284*01826a49SYabin Cui                 if (length >= LLIMIT) length = LLIMIT - 1;
285*01826a49SYabin Cui                 lengthList[length]++;
286*01826a49SYabin Cui                 if (length >= MINMATCHLENGTH) start--;
287*01826a49SYabin Cui             }
288*01826a49SYabin Cui         }
289*01826a49SYabin Cui 
290*01826a49SYabin Cui         /* largest useful length */
291*01826a49SYabin Cui         memset(cumulLength, 0, sizeof(cumulLength));
292*01826a49SYabin Cui         cumulLength[maxLength-1] = lengthList[maxLength-1];
293*01826a49SYabin Cui         for (i=(int)(maxLength-2); i>=0; i--)
294*01826a49SYabin Cui             cumulLength[i] = cumulLength[i+1] + lengthList[i];
295*01826a49SYabin Cui 
296*01826a49SYabin Cui         for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
297*01826a49SYabin Cui         maxLength = i;
298*01826a49SYabin Cui 
299*01826a49SYabin Cui         /* reduce maxLength in case of final into repetitive data */
300*01826a49SYabin Cui         {   U32 l = (U32)maxLength;
301*01826a49SYabin Cui             BYTE const c = b[pos + maxLength-1];
302*01826a49SYabin Cui             while (b[pos+l-2]==c) l--;
303*01826a49SYabin Cui             maxLength = l;
304*01826a49SYabin Cui         }
305*01826a49SYabin Cui         if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
306*01826a49SYabin Cui 
307*01826a49SYabin Cui         /* calculate savings */
308*01826a49SYabin Cui         savings[5] = 0;
309*01826a49SYabin Cui         for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
310*01826a49SYabin Cui             savings[i] = savings[i-1] + (lengthList[i] * (i-3));
311*01826a49SYabin Cui 
312*01826a49SYabin Cui         DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f)  \n",
313*01826a49SYabin Cui                      (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
314*01826a49SYabin Cui 
315*01826a49SYabin Cui         solution.pos = (U32)pos;
316*01826a49SYabin Cui         solution.length = (U32)maxLength;
317*01826a49SYabin Cui         solution.savings = savings[maxLength];
318*01826a49SYabin Cui 
319*01826a49SYabin Cui         /* mark positions done */
320*01826a49SYabin Cui         {   U32 id;
321*01826a49SYabin Cui             for (id=start; id<end; id++) {
322*01826a49SYabin Cui                 U32 p, pEnd, length;
323*01826a49SYabin Cui                 U32 const testedPos = (U32)suffix[id];
324*01826a49SYabin Cui                 if (testedPos == pos)
325*01826a49SYabin Cui                     length = solution.length;
326*01826a49SYabin Cui                 else {
327*01826a49SYabin Cui                     length = (U32)ZDICT_count(b+pos, b+testedPos);
328*01826a49SYabin Cui                     if (length > solution.length) length = solution.length;
329*01826a49SYabin Cui                 }
330*01826a49SYabin Cui                 pEnd = (U32)(testedPos + length);
331*01826a49SYabin Cui                 for (p=testedPos; p<pEnd; p++)
332*01826a49SYabin Cui                     doneMarks[p] = 1;
333*01826a49SYabin Cui     }   }   }
334*01826a49SYabin Cui 
335*01826a49SYabin Cui     return solution;
336*01826a49SYabin Cui }
337*01826a49SYabin Cui 
338*01826a49SYabin Cui 
isIncluded(const void * in,const void * container,size_t length)339*01826a49SYabin Cui static int isIncluded(const void* in, const void* container, size_t length)
340*01826a49SYabin Cui {
341*01826a49SYabin Cui     const char* const ip = (const char*) in;
342*01826a49SYabin Cui     const char* const into = (const char*) container;
343*01826a49SYabin Cui     size_t u;
344*01826a49SYabin Cui 
345*01826a49SYabin Cui     for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
346*01826a49SYabin Cui         if (ip[u] != into[u]) break;
347*01826a49SYabin Cui     }
348*01826a49SYabin Cui 
349*01826a49SYabin Cui     return u==length;
350*01826a49SYabin Cui }
351*01826a49SYabin Cui 
352*01826a49SYabin Cui /*! ZDICT_tryMerge() :
353*01826a49SYabin Cui     check if dictItem can be merged, do it if possible
354*01826a49SYabin Cui     @return : id of destination elt, 0 if not merged
355*01826a49SYabin Cui */
ZDICT_tryMerge(dictItem * table,dictItem elt,U32 eltNbToSkip,const void * buffer)356*01826a49SYabin Cui static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
357*01826a49SYabin Cui {
358*01826a49SYabin Cui     const U32 tableSize = table->pos;
359*01826a49SYabin Cui     const U32 eltEnd = elt.pos + elt.length;
360*01826a49SYabin Cui     const char* const buf = (const char*) buffer;
361*01826a49SYabin Cui 
362*01826a49SYabin Cui     /* tail overlap */
363*01826a49SYabin Cui     U32 u; for (u=1; u<tableSize; u++) {
364*01826a49SYabin Cui         if (u==eltNbToSkip) continue;
365*01826a49SYabin Cui         if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
366*01826a49SYabin Cui             /* append */
367*01826a49SYabin Cui             U32 const addedLength = table[u].pos - elt.pos;
368*01826a49SYabin Cui             table[u].length += addedLength;
369*01826a49SYabin Cui             table[u].pos = elt.pos;
370*01826a49SYabin Cui             table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
371*01826a49SYabin Cui             table[u].savings += elt.length / 8;    /* rough approx bonus */
372*01826a49SYabin Cui             elt = table[u];
373*01826a49SYabin Cui             /* sort : improve rank */
374*01826a49SYabin Cui             while ((u>1) && (table[u-1].savings < elt.savings))
375*01826a49SYabin Cui                 table[u] = table[u-1], u--;
376*01826a49SYabin Cui             table[u] = elt;
377*01826a49SYabin Cui             return u;
378*01826a49SYabin Cui     }   }
379*01826a49SYabin Cui 
380*01826a49SYabin Cui     /* front overlap */
381*01826a49SYabin Cui     for (u=1; u<tableSize; u++) {
382*01826a49SYabin Cui         if (u==eltNbToSkip) continue;
383*01826a49SYabin Cui 
384*01826a49SYabin Cui         if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
385*01826a49SYabin Cui             /* append */
386*01826a49SYabin Cui             int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
387*01826a49SYabin Cui             table[u].savings += elt.length / 8;    /* rough approx bonus */
388*01826a49SYabin Cui             if (addedLength > 0) {   /* otherwise, elt fully included into existing */
389*01826a49SYabin Cui                 table[u].length += addedLength;
390*01826a49SYabin Cui                 table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
391*01826a49SYabin Cui             }
392*01826a49SYabin Cui             /* sort : improve rank */
393*01826a49SYabin Cui             elt = table[u];
394*01826a49SYabin Cui             while ((u>1) && (table[u-1].savings < elt.savings))
395*01826a49SYabin Cui                 table[u] = table[u-1], u--;
396*01826a49SYabin Cui             table[u] = elt;
397*01826a49SYabin Cui             return u;
398*01826a49SYabin Cui         }
399*01826a49SYabin Cui 
400*01826a49SYabin Cui         if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
401*01826a49SYabin Cui             if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
402*01826a49SYabin Cui                 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
403*01826a49SYabin Cui                 table[u].pos = elt.pos;
404*01826a49SYabin Cui                 table[u].savings += (U32)(elt.savings * addedLength / elt.length);
405*01826a49SYabin Cui                 table[u].length = MIN(elt.length, table[u].length + 1);
406*01826a49SYabin Cui                 return u;
407*01826a49SYabin Cui             }
408*01826a49SYabin Cui         }
409*01826a49SYabin Cui     }
410*01826a49SYabin Cui 
411*01826a49SYabin Cui     return 0;
412*01826a49SYabin Cui }
413*01826a49SYabin Cui 
414*01826a49SYabin Cui 
ZDICT_removeDictItem(dictItem * table,U32 id)415*01826a49SYabin Cui static void ZDICT_removeDictItem(dictItem* table, U32 id)
416*01826a49SYabin Cui {
417*01826a49SYabin Cui     /* convention : table[0].pos stores nb of elts */
418*01826a49SYabin Cui     U32 const max = table[0].pos;
419*01826a49SYabin Cui     U32 u;
420*01826a49SYabin Cui     if (!id) return;   /* protection, should never happen */
421*01826a49SYabin Cui     for (u=id; u<max-1; u++)
422*01826a49SYabin Cui         table[u] = table[u+1];
423*01826a49SYabin Cui     table->pos--;
424*01826a49SYabin Cui }
425*01826a49SYabin Cui 
426*01826a49SYabin Cui 
ZDICT_insertDictItem(dictItem * table,U32 maxSize,dictItem elt,const void * buffer)427*01826a49SYabin Cui static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
428*01826a49SYabin Cui {
429*01826a49SYabin Cui     /* merge if possible */
430*01826a49SYabin Cui     U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
431*01826a49SYabin Cui     if (mergeId) {
432*01826a49SYabin Cui         U32 newMerge = 1;
433*01826a49SYabin Cui         while (newMerge) {
434*01826a49SYabin Cui             newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
435*01826a49SYabin Cui             if (newMerge) ZDICT_removeDictItem(table, mergeId);
436*01826a49SYabin Cui             mergeId = newMerge;
437*01826a49SYabin Cui         }
438*01826a49SYabin Cui         return;
439*01826a49SYabin Cui     }
440*01826a49SYabin Cui 
441*01826a49SYabin Cui     /* insert */
442*01826a49SYabin Cui     {   U32 current;
443*01826a49SYabin Cui         U32 nextElt = table->pos;
444*01826a49SYabin Cui         if (nextElt >= maxSize) nextElt = maxSize-1;
445*01826a49SYabin Cui         current = nextElt-1;
446*01826a49SYabin Cui         while (table[current].savings < elt.savings) {
447*01826a49SYabin Cui             table[current+1] = table[current];
448*01826a49SYabin Cui             current--;
449*01826a49SYabin Cui         }
450*01826a49SYabin Cui         table[current+1] = elt;
451*01826a49SYabin Cui         table->pos = nextElt+1;
452*01826a49SYabin Cui     }
453*01826a49SYabin Cui }
454*01826a49SYabin Cui 
455*01826a49SYabin Cui 
ZDICT_dictSize(const dictItem * dictList)456*01826a49SYabin Cui static U32 ZDICT_dictSize(const dictItem* dictList)
457*01826a49SYabin Cui {
458*01826a49SYabin Cui     U32 u, dictSize = 0;
459*01826a49SYabin Cui     for (u=1; u<dictList[0].pos; u++)
460*01826a49SYabin Cui         dictSize += dictList[u].length;
461*01826a49SYabin Cui     return dictSize;
462*01826a49SYabin Cui }
463*01826a49SYabin Cui 
464*01826a49SYabin Cui 
ZDICT_trainBuffer_legacy(dictItem * dictList,U32 dictListSize,const void * const buffer,size_t bufferSize,const size_t * fileSizes,unsigned nbFiles,unsigned minRatio,U32 notificationLevel)465*01826a49SYabin Cui static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
466*01826a49SYabin Cui                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
467*01826a49SYabin Cui                             const size_t* fileSizes, unsigned nbFiles,
468*01826a49SYabin Cui                             unsigned minRatio, U32 notificationLevel)
469*01826a49SYabin Cui {
470*01826a49SYabin Cui     int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
471*01826a49SYabin Cui     int* const suffix = suffix0+1;
472*01826a49SYabin Cui     U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
473*01826a49SYabin Cui     BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
474*01826a49SYabin Cui     U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
475*01826a49SYabin Cui     size_t result = 0;
476*01826a49SYabin Cui     clock_t displayClock = 0;
477*01826a49SYabin Cui     clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
478*01826a49SYabin Cui 
479*01826a49SYabin Cui #   undef  DISPLAYUPDATE
480*01826a49SYabin Cui #   define DISPLAYUPDATE(l, ...)                                   \
481*01826a49SYabin Cui         do {                                                       \
482*01826a49SYabin Cui             if (notificationLevel>=l) {                            \
483*01826a49SYabin Cui                 if (ZDICT_clockSpan(displayClock) > refreshRate) { \
484*01826a49SYabin Cui                     displayClock = clock();                        \
485*01826a49SYabin Cui                     DISPLAY(__VA_ARGS__);                          \
486*01826a49SYabin Cui                 }                                                  \
487*01826a49SYabin Cui                 if (notificationLevel>=4) fflush(stderr);          \
488*01826a49SYabin Cui             }                                                      \
489*01826a49SYabin Cui         } while (0)
490*01826a49SYabin Cui 
491*01826a49SYabin Cui     /* init */
492*01826a49SYabin Cui     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
493*01826a49SYabin Cui     if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
494*01826a49SYabin Cui         result = ERROR(memory_allocation);
495*01826a49SYabin Cui         goto _cleanup;
496*01826a49SYabin Cui     }
497*01826a49SYabin Cui     if (minRatio < MINRATIO) minRatio = MINRATIO;
498*01826a49SYabin Cui     memset(doneMarks, 0, bufferSize+16);
499*01826a49SYabin Cui 
500*01826a49SYabin Cui     /* limit sample set size (divsufsort limitation)*/
501*01826a49SYabin Cui     if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
502*01826a49SYabin Cui     while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
503*01826a49SYabin Cui 
504*01826a49SYabin Cui     /* sort */
505*01826a49SYabin Cui     DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
506*01826a49SYabin Cui     {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
507*01826a49SYabin Cui         if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
508*01826a49SYabin Cui     }
509*01826a49SYabin Cui     suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
510*01826a49SYabin Cui     suffix0[0] = (int)bufferSize;           /* leads into noise */
511*01826a49SYabin Cui     /* build reverse suffix sort */
512*01826a49SYabin Cui     {   size_t pos;
513*01826a49SYabin Cui         for (pos=0; pos < bufferSize; pos++)
514*01826a49SYabin Cui             reverseSuffix[suffix[pos]] = (U32)pos;
515*01826a49SYabin Cui         /* note filePos tracks borders between samples.
516*01826a49SYabin Cui            It's not used at this stage, but planned to become useful in a later update */
517*01826a49SYabin Cui         filePos[0] = 0;
518*01826a49SYabin Cui         for (pos=1; pos<nbFiles; pos++)
519*01826a49SYabin Cui             filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
520*01826a49SYabin Cui     }
521*01826a49SYabin Cui 
522*01826a49SYabin Cui     DISPLAYLEVEL(2, "finding patterns ... \n");
523*01826a49SYabin Cui     DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
524*01826a49SYabin Cui 
525*01826a49SYabin Cui     {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
526*01826a49SYabin Cui             dictItem solution;
527*01826a49SYabin Cui             if (doneMarks[cursor]) { cursor++; continue; }
528*01826a49SYabin Cui             solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
529*01826a49SYabin Cui             if (solution.length==0) { cursor++; continue; }
530*01826a49SYabin Cui             ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
531*01826a49SYabin Cui             cursor += solution.length;
532*01826a49SYabin Cui             DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
533*01826a49SYabin Cui     }   }
534*01826a49SYabin Cui 
535*01826a49SYabin Cui _cleanup:
536*01826a49SYabin Cui     free(suffix0);
537*01826a49SYabin Cui     free(reverseSuffix);
538*01826a49SYabin Cui     free(doneMarks);
539*01826a49SYabin Cui     free(filePos);
540*01826a49SYabin Cui     return result;
541*01826a49SYabin Cui }
542*01826a49SYabin Cui 
543*01826a49SYabin Cui 
ZDICT_fillNoise(void * buffer,size_t length)544*01826a49SYabin Cui static void ZDICT_fillNoise(void* buffer, size_t length)
545*01826a49SYabin Cui {
546*01826a49SYabin Cui     unsigned const prime1 = 2654435761U;
547*01826a49SYabin Cui     unsigned const prime2 = 2246822519U;
548*01826a49SYabin Cui     unsigned acc = prime1;
549*01826a49SYabin Cui     size_t p=0;
550*01826a49SYabin Cui     for (p=0; p<length; p++) {
551*01826a49SYabin Cui         acc *= prime2;
552*01826a49SYabin Cui         ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
553*01826a49SYabin Cui     }
554*01826a49SYabin Cui }
555*01826a49SYabin Cui 
556*01826a49SYabin Cui 
557*01826a49SYabin Cui typedef struct
558*01826a49SYabin Cui {
559*01826a49SYabin Cui     ZSTD_CDict* dict;    /* dictionary */
560*01826a49SYabin Cui     ZSTD_CCtx* zc;     /* working context */
561*01826a49SYabin Cui     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
562*01826a49SYabin Cui } EStats_ress_t;
563*01826a49SYabin Cui 
564*01826a49SYabin Cui #define MAXREPOFFSET 1024
565*01826a49SYabin Cui 
ZDICT_countEStats(EStats_ress_t esr,const ZSTD_parameters * params,unsigned * countLit,unsigned * offsetcodeCount,unsigned * matchlengthCount,unsigned * litlengthCount,U32 * repOffsets,const void * src,size_t srcSize,U32 notificationLevel)566*01826a49SYabin Cui static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
567*01826a49SYabin Cui                               unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
568*01826a49SYabin Cui                               const void* src, size_t srcSize,
569*01826a49SYabin Cui                               U32 notificationLevel)
570*01826a49SYabin Cui {
571*01826a49SYabin Cui     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
572*01826a49SYabin Cui     size_t cSize;
573*01826a49SYabin Cui 
574*01826a49SYabin Cui     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
575*01826a49SYabin Cui     {   size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
576*01826a49SYabin Cui         if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
577*01826a49SYabin Cui 
578*01826a49SYabin Cui     }
579*01826a49SYabin Cui     cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
580*01826a49SYabin Cui     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
581*01826a49SYabin Cui 
582*01826a49SYabin Cui     if (cSize) {  /* if == 0; block is not compressible */
583*01826a49SYabin Cui         const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
584*01826a49SYabin Cui 
585*01826a49SYabin Cui         /* literals stats */
586*01826a49SYabin Cui         {   const BYTE* bytePtr;
587*01826a49SYabin Cui             for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
588*01826a49SYabin Cui                 countLit[*bytePtr]++;
589*01826a49SYabin Cui         }
590*01826a49SYabin Cui 
591*01826a49SYabin Cui         /* seqStats */
592*01826a49SYabin Cui         {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
593*01826a49SYabin Cui             ZSTD_seqToCodes(seqStorePtr);
594*01826a49SYabin Cui 
595*01826a49SYabin Cui             {   const BYTE* codePtr = seqStorePtr->ofCode;
596*01826a49SYabin Cui                 U32 u;
597*01826a49SYabin Cui                 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
598*01826a49SYabin Cui             }
599*01826a49SYabin Cui 
600*01826a49SYabin Cui             {   const BYTE* codePtr = seqStorePtr->mlCode;
601*01826a49SYabin Cui                 U32 u;
602*01826a49SYabin Cui                 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
603*01826a49SYabin Cui             }
604*01826a49SYabin Cui 
605*01826a49SYabin Cui             {   const BYTE* codePtr = seqStorePtr->llCode;
606*01826a49SYabin Cui                 U32 u;
607*01826a49SYabin Cui                 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
608*01826a49SYabin Cui             }
609*01826a49SYabin Cui 
610*01826a49SYabin Cui             if (nbSeq >= 2) { /* rep offsets */
611*01826a49SYabin Cui                 const seqDef* const seq = seqStorePtr->sequencesStart;
612*01826a49SYabin Cui                 U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
613*01826a49SYabin Cui                 U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
614*01826a49SYabin Cui                 if (offset1 >= MAXREPOFFSET) offset1 = 0;
615*01826a49SYabin Cui                 if (offset2 >= MAXREPOFFSET) offset2 = 0;
616*01826a49SYabin Cui                 repOffsets[offset1] += 3;
617*01826a49SYabin Cui                 repOffsets[offset2] += 1;
618*01826a49SYabin Cui     }   }   }
619*01826a49SYabin Cui }
620*01826a49SYabin Cui 
ZDICT_totalSampleSize(const size_t * fileSizes,unsigned nbFiles)621*01826a49SYabin Cui static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
622*01826a49SYabin Cui {
623*01826a49SYabin Cui     size_t total=0;
624*01826a49SYabin Cui     unsigned u;
625*01826a49SYabin Cui     for (u=0; u<nbFiles; u++) total += fileSizes[u];
626*01826a49SYabin Cui     return total;
627*01826a49SYabin Cui }
628*01826a49SYabin Cui 
629*01826a49SYabin Cui typedef struct { U32 offset; U32 count; } offsetCount_t;
630*01826a49SYabin Cui 
ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1],U32 val,U32 count)631*01826a49SYabin Cui static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
632*01826a49SYabin Cui {
633*01826a49SYabin Cui     U32 u;
634*01826a49SYabin Cui     table[ZSTD_REP_NUM].offset = val;
635*01826a49SYabin Cui     table[ZSTD_REP_NUM].count = count;
636*01826a49SYabin Cui     for (u=ZSTD_REP_NUM; u>0; u--) {
637*01826a49SYabin Cui         offsetCount_t tmp;
638*01826a49SYabin Cui         if (table[u-1].count >= table[u].count) break;
639*01826a49SYabin Cui         tmp = table[u-1];
640*01826a49SYabin Cui         table[u-1] = table[u];
641*01826a49SYabin Cui         table[u] = tmp;
642*01826a49SYabin Cui     }
643*01826a49SYabin Cui }
644*01826a49SYabin Cui 
645*01826a49SYabin Cui /* ZDICT_flatLit() :
646*01826a49SYabin Cui  * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
647*01826a49SYabin Cui  * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
648*01826a49SYabin Cui  */
ZDICT_flatLit(unsigned * countLit)649*01826a49SYabin Cui static void ZDICT_flatLit(unsigned* countLit)
650*01826a49SYabin Cui {
651*01826a49SYabin Cui     int u;
652*01826a49SYabin Cui     for (u=1; u<256; u++) countLit[u] = 2;
653*01826a49SYabin Cui     countLit[0]   = 4;
654*01826a49SYabin Cui     countLit[253] = 1;
655*01826a49SYabin Cui     countLit[254] = 1;
656*01826a49SYabin Cui }
657*01826a49SYabin Cui 
658*01826a49SYabin Cui #define OFFCODE_MAX 30  /* only applicable to first block */
ZDICT_analyzeEntropy(void * dstBuffer,size_t maxDstSize,int compressionLevel,const void * srcBuffer,const size_t * fileSizes,unsigned nbFiles,const void * dictBuffer,size_t dictBufferSize,unsigned notificationLevel)659*01826a49SYabin Cui static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
660*01826a49SYabin Cui                                    int compressionLevel,
661*01826a49SYabin Cui                              const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
662*01826a49SYabin Cui                              const void* dictBuffer, size_t  dictBufferSize,
663*01826a49SYabin Cui                                    unsigned notificationLevel)
664*01826a49SYabin Cui {
665*01826a49SYabin Cui     unsigned countLit[256];
666*01826a49SYabin Cui     HUF_CREATE_STATIC_CTABLE(hufTable, 255);
667*01826a49SYabin Cui     unsigned offcodeCount[OFFCODE_MAX+1];
668*01826a49SYabin Cui     short offcodeNCount[OFFCODE_MAX+1];
669*01826a49SYabin Cui     U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
670*01826a49SYabin Cui     unsigned matchLengthCount[MaxML+1];
671*01826a49SYabin Cui     short matchLengthNCount[MaxML+1];
672*01826a49SYabin Cui     unsigned litLengthCount[MaxLL+1];
673*01826a49SYabin Cui     short litLengthNCount[MaxLL+1];
674*01826a49SYabin Cui     U32 repOffset[MAXREPOFFSET];
675*01826a49SYabin Cui     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
676*01826a49SYabin Cui     EStats_ress_t esr = { NULL, NULL, NULL };
677*01826a49SYabin Cui     ZSTD_parameters params;
678*01826a49SYabin Cui     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
679*01826a49SYabin Cui     size_t pos = 0, errorCode;
680*01826a49SYabin Cui     size_t eSize = 0;
681*01826a49SYabin Cui     size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
682*01826a49SYabin Cui     size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
683*01826a49SYabin Cui     BYTE* dstPtr = (BYTE*)dstBuffer;
684*01826a49SYabin Cui     U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
685*01826a49SYabin Cui 
686*01826a49SYabin Cui     /* init */
687*01826a49SYabin Cui     DEBUGLOG(4, "ZDICT_analyzeEntropy");
688*01826a49SYabin Cui     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
689*01826a49SYabin Cui     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
690*01826a49SYabin Cui     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
691*01826a49SYabin Cui     for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
692*01826a49SYabin Cui     for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
693*01826a49SYabin Cui     memset(repOffset, 0, sizeof(repOffset));
694*01826a49SYabin Cui     repOffset[1] = repOffset[4] = repOffset[8] = 1;
695*01826a49SYabin Cui     memset(bestRepOffset, 0, sizeof(bestRepOffset));
696*01826a49SYabin Cui     if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
697*01826a49SYabin Cui     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
698*01826a49SYabin Cui 
699*01826a49SYabin Cui     esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
700*01826a49SYabin Cui     esr.zc = ZSTD_createCCtx();
701*01826a49SYabin Cui     esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
702*01826a49SYabin Cui     if (!esr.dict || !esr.zc || !esr.workPlace) {
703*01826a49SYabin Cui         eSize = ERROR(memory_allocation);
704*01826a49SYabin Cui         DISPLAYLEVEL(1, "Not enough memory \n");
705*01826a49SYabin Cui         goto _cleanup;
706*01826a49SYabin Cui     }
707*01826a49SYabin Cui 
708*01826a49SYabin Cui     /* collect stats on all samples */
709*01826a49SYabin Cui     for (u=0; u<nbFiles; u++) {
710*01826a49SYabin Cui         ZDICT_countEStats(esr, &params,
711*01826a49SYabin Cui                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
712*01826a49SYabin Cui                          (const char*)srcBuffer + pos, fileSizes[u],
713*01826a49SYabin Cui                           notificationLevel);
714*01826a49SYabin Cui         pos += fileSizes[u];
715*01826a49SYabin Cui     }
716*01826a49SYabin Cui 
717*01826a49SYabin Cui     if (notificationLevel >= 4) {
718*01826a49SYabin Cui         /* writeStats */
719*01826a49SYabin Cui         DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
720*01826a49SYabin Cui         for (u=0; u<=offcodeMax; u++) {
721*01826a49SYabin Cui             DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
722*01826a49SYabin Cui     }   }
723*01826a49SYabin Cui 
724*01826a49SYabin Cui     /* analyze, build stats, starting with literals */
725*01826a49SYabin Cui     {   size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
726*01826a49SYabin Cui         if (HUF_isError(maxNbBits)) {
727*01826a49SYabin Cui             eSize = maxNbBits;
728*01826a49SYabin Cui             DISPLAYLEVEL(1, " HUF_buildCTable error \n");
729*01826a49SYabin Cui             goto _cleanup;
730*01826a49SYabin Cui         }
731*01826a49SYabin Cui         if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
732*01826a49SYabin Cui             DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
733*01826a49SYabin Cui             ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
734*01826a49SYabin Cui             maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
735*01826a49SYabin Cui             assert(maxNbBits==9);
736*01826a49SYabin Cui         }
737*01826a49SYabin Cui         huffLog = (U32)maxNbBits;
738*01826a49SYabin Cui     }
739*01826a49SYabin Cui 
740*01826a49SYabin Cui     /* looking for most common first offsets */
741*01826a49SYabin Cui     {   U32 offset;
742*01826a49SYabin Cui         for (offset=1; offset<MAXREPOFFSET; offset++)
743*01826a49SYabin Cui             ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
744*01826a49SYabin Cui     }
745*01826a49SYabin Cui     /* note : the result of this phase should be used to better appreciate the impact on statistics */
746*01826a49SYabin Cui 
747*01826a49SYabin Cui     total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
748*01826a49SYabin Cui     errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
749*01826a49SYabin Cui     if (FSE_isError(errorCode)) {
750*01826a49SYabin Cui         eSize = errorCode;
751*01826a49SYabin Cui         DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
752*01826a49SYabin Cui         goto _cleanup;
753*01826a49SYabin Cui     }
754*01826a49SYabin Cui     Offlog = (U32)errorCode;
755*01826a49SYabin Cui 
756*01826a49SYabin Cui     total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
757*01826a49SYabin Cui     errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
758*01826a49SYabin Cui     if (FSE_isError(errorCode)) {
759*01826a49SYabin Cui         eSize = errorCode;
760*01826a49SYabin Cui         DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
761*01826a49SYabin Cui         goto _cleanup;
762*01826a49SYabin Cui     }
763*01826a49SYabin Cui     mlLog = (U32)errorCode;
764*01826a49SYabin Cui 
765*01826a49SYabin Cui     total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
766*01826a49SYabin Cui     errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
767*01826a49SYabin Cui     if (FSE_isError(errorCode)) {
768*01826a49SYabin Cui         eSize = errorCode;
769*01826a49SYabin Cui         DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
770*01826a49SYabin Cui         goto _cleanup;
771*01826a49SYabin Cui     }
772*01826a49SYabin Cui     llLog = (U32)errorCode;
773*01826a49SYabin Cui 
774*01826a49SYabin Cui     /* write result to buffer */
775*01826a49SYabin Cui     {   size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
776*01826a49SYabin Cui         if (HUF_isError(hhSize)) {
777*01826a49SYabin Cui             eSize = hhSize;
778*01826a49SYabin Cui             DISPLAYLEVEL(1, "HUF_writeCTable error \n");
779*01826a49SYabin Cui             goto _cleanup;
780*01826a49SYabin Cui         }
781*01826a49SYabin Cui         dstPtr += hhSize;
782*01826a49SYabin Cui         maxDstSize -= hhSize;
783*01826a49SYabin Cui         eSize += hhSize;
784*01826a49SYabin Cui     }
785*01826a49SYabin Cui 
786*01826a49SYabin Cui     {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
787*01826a49SYabin Cui         if (FSE_isError(ohSize)) {
788*01826a49SYabin Cui             eSize = ohSize;
789*01826a49SYabin Cui             DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
790*01826a49SYabin Cui             goto _cleanup;
791*01826a49SYabin Cui         }
792*01826a49SYabin Cui         dstPtr += ohSize;
793*01826a49SYabin Cui         maxDstSize -= ohSize;
794*01826a49SYabin Cui         eSize += ohSize;
795*01826a49SYabin Cui     }
796*01826a49SYabin Cui 
797*01826a49SYabin Cui     {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
798*01826a49SYabin Cui         if (FSE_isError(mhSize)) {
799*01826a49SYabin Cui             eSize = mhSize;
800*01826a49SYabin Cui             DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
801*01826a49SYabin Cui             goto _cleanup;
802*01826a49SYabin Cui         }
803*01826a49SYabin Cui         dstPtr += mhSize;
804*01826a49SYabin Cui         maxDstSize -= mhSize;
805*01826a49SYabin Cui         eSize += mhSize;
806*01826a49SYabin Cui     }
807*01826a49SYabin Cui 
808*01826a49SYabin Cui     {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
809*01826a49SYabin Cui         if (FSE_isError(lhSize)) {
810*01826a49SYabin Cui             eSize = lhSize;
811*01826a49SYabin Cui             DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
812*01826a49SYabin Cui             goto _cleanup;
813*01826a49SYabin Cui         }
814*01826a49SYabin Cui         dstPtr += lhSize;
815*01826a49SYabin Cui         maxDstSize -= lhSize;
816*01826a49SYabin Cui         eSize += lhSize;
817*01826a49SYabin Cui     }
818*01826a49SYabin Cui 
819*01826a49SYabin Cui     if (maxDstSize<12) {
820*01826a49SYabin Cui         eSize = ERROR(dstSize_tooSmall);
821*01826a49SYabin Cui         DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
822*01826a49SYabin Cui         goto _cleanup;
823*01826a49SYabin Cui     }
824*01826a49SYabin Cui # if 0
825*01826a49SYabin Cui     MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
826*01826a49SYabin Cui     MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
827*01826a49SYabin Cui     MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
828*01826a49SYabin Cui #else
829*01826a49SYabin Cui     /* at this stage, we don't use the result of "most common first offset",
830*01826a49SYabin Cui      * as the impact of statistics is not properly evaluated */
831*01826a49SYabin Cui     MEM_writeLE32(dstPtr+0, repStartValue[0]);
832*01826a49SYabin Cui     MEM_writeLE32(dstPtr+4, repStartValue[1]);
833*01826a49SYabin Cui     MEM_writeLE32(dstPtr+8, repStartValue[2]);
834*01826a49SYabin Cui #endif
835*01826a49SYabin Cui     eSize += 12;
836*01826a49SYabin Cui 
837*01826a49SYabin Cui _cleanup:
838*01826a49SYabin Cui     ZSTD_freeCDict(esr.dict);
839*01826a49SYabin Cui     ZSTD_freeCCtx(esr.zc);
840*01826a49SYabin Cui     free(esr.workPlace);
841*01826a49SYabin Cui 
842*01826a49SYabin Cui     return eSize;
843*01826a49SYabin Cui }
844*01826a49SYabin Cui 
845*01826a49SYabin Cui 
846*01826a49SYabin Cui /**
847*01826a49SYabin Cui  * @returns the maximum repcode value
848*01826a49SYabin Cui  */
ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])849*01826a49SYabin Cui static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
850*01826a49SYabin Cui {
851*01826a49SYabin Cui     U32 maxRep = reps[0];
852*01826a49SYabin Cui     int r;
853*01826a49SYabin Cui     for (r = 1; r < ZSTD_REP_NUM; ++r)
854*01826a49SYabin Cui         maxRep = MAX(maxRep, reps[r]);
855*01826a49SYabin Cui     return maxRep;
856*01826a49SYabin Cui }
857*01826a49SYabin Cui 
ZDICT_finalizeDictionary(void * dictBuffer,size_t dictBufferCapacity,const void * customDictContent,size_t dictContentSize,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_params_t params)858*01826a49SYabin Cui size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
859*01826a49SYabin Cui                           const void* customDictContent, size_t dictContentSize,
860*01826a49SYabin Cui                           const void* samplesBuffer, const size_t* samplesSizes,
861*01826a49SYabin Cui                           unsigned nbSamples, ZDICT_params_t params)
862*01826a49SYabin Cui {
863*01826a49SYabin Cui     size_t hSize;
864*01826a49SYabin Cui #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
865*01826a49SYabin Cui     BYTE header[HBUFFSIZE];
866*01826a49SYabin Cui     int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
867*01826a49SYabin Cui     U32 const notificationLevel = params.notificationLevel;
868*01826a49SYabin Cui     /* The final dictionary content must be at least as large as the largest repcode */
869*01826a49SYabin Cui     size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
870*01826a49SYabin Cui     size_t paddingSize;
871*01826a49SYabin Cui 
872*01826a49SYabin Cui     /* check conditions */
873*01826a49SYabin Cui     DEBUGLOG(4, "ZDICT_finalizeDictionary");
874*01826a49SYabin Cui     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
875*01826a49SYabin Cui     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
876*01826a49SYabin Cui 
877*01826a49SYabin Cui     /* dictionary header */
878*01826a49SYabin Cui     MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
879*01826a49SYabin Cui     {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
880*01826a49SYabin Cui         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
881*01826a49SYabin Cui         U32 const dictID = params.dictID ? params.dictID : compliantID;
882*01826a49SYabin Cui         MEM_writeLE32(header+4, dictID);
883*01826a49SYabin Cui     }
884*01826a49SYabin Cui     hSize = 8;
885*01826a49SYabin Cui 
886*01826a49SYabin Cui     /* entropy tables */
887*01826a49SYabin Cui     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
888*01826a49SYabin Cui     DISPLAYLEVEL(2, "statistics ... \n");
889*01826a49SYabin Cui     {   size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
890*01826a49SYabin Cui                                   compressionLevel,
891*01826a49SYabin Cui                                   samplesBuffer, samplesSizes, nbSamples,
892*01826a49SYabin Cui                                   customDictContent, dictContentSize,
893*01826a49SYabin Cui                                   notificationLevel);
894*01826a49SYabin Cui         if (ZDICT_isError(eSize)) return eSize;
895*01826a49SYabin Cui         hSize += eSize;
896*01826a49SYabin Cui     }
897*01826a49SYabin Cui 
898*01826a49SYabin Cui     /* Shrink the content size if it doesn't fit in the buffer */
899*01826a49SYabin Cui     if (hSize + dictContentSize > dictBufferCapacity) {
900*01826a49SYabin Cui         dictContentSize = dictBufferCapacity - hSize;
901*01826a49SYabin Cui     }
902*01826a49SYabin Cui 
903*01826a49SYabin Cui     /* Pad the dictionary content with zeros if it is too small */
904*01826a49SYabin Cui     if (dictContentSize < minContentSize) {
905*01826a49SYabin Cui         RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
906*01826a49SYabin Cui                         "dictBufferCapacity too small to fit max repcode");
907*01826a49SYabin Cui         paddingSize = minContentSize - dictContentSize;
908*01826a49SYabin Cui     } else {
909*01826a49SYabin Cui         paddingSize = 0;
910*01826a49SYabin Cui     }
911*01826a49SYabin Cui 
912*01826a49SYabin Cui     {
913*01826a49SYabin Cui         size_t const dictSize = hSize + paddingSize + dictContentSize;
914*01826a49SYabin Cui 
915*01826a49SYabin Cui         /* The dictionary consists of the header, optional padding, and the content.
916*01826a49SYabin Cui          * The padding comes before the content because the "best" position in the
917*01826a49SYabin Cui          * dictionary is the last byte.
918*01826a49SYabin Cui          */
919*01826a49SYabin Cui         BYTE* const outDictHeader = (BYTE*)dictBuffer;
920*01826a49SYabin Cui         BYTE* const outDictPadding = outDictHeader + hSize;
921*01826a49SYabin Cui         BYTE* const outDictContent = outDictPadding + paddingSize;
922*01826a49SYabin Cui 
923*01826a49SYabin Cui         assert(dictSize <= dictBufferCapacity);
924*01826a49SYabin Cui         assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
925*01826a49SYabin Cui 
926*01826a49SYabin Cui         /* First copy the customDictContent into its final location.
927*01826a49SYabin Cui          * `customDictContent` and `dictBuffer` may overlap, so we must
928*01826a49SYabin Cui          * do this before any other writes into the output buffer.
929*01826a49SYabin Cui          * Then copy the header & padding into the output buffer.
930*01826a49SYabin Cui          */
931*01826a49SYabin Cui         memmove(outDictContent, customDictContent, dictContentSize);
932*01826a49SYabin Cui         memcpy(outDictHeader, header, hSize);
933*01826a49SYabin Cui         memset(outDictPadding, 0, paddingSize);
934*01826a49SYabin Cui 
935*01826a49SYabin Cui         return dictSize;
936*01826a49SYabin Cui     }
937*01826a49SYabin Cui }
938*01826a49SYabin Cui 
939*01826a49SYabin Cui 
ZDICT_addEntropyTablesFromBuffer_advanced(void * dictBuffer,size_t dictContentSize,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_params_t params)940*01826a49SYabin Cui static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
941*01826a49SYabin Cui         void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
942*01826a49SYabin Cui         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
943*01826a49SYabin Cui         ZDICT_params_t params)
944*01826a49SYabin Cui {
945*01826a49SYabin Cui     int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
946*01826a49SYabin Cui     U32 const notificationLevel = params.notificationLevel;
947*01826a49SYabin Cui     size_t hSize = 8;
948*01826a49SYabin Cui 
949*01826a49SYabin Cui     /* calculate entropy tables */
950*01826a49SYabin Cui     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
951*01826a49SYabin Cui     DISPLAYLEVEL(2, "statistics ... \n");
952*01826a49SYabin Cui     {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
953*01826a49SYabin Cui                                   compressionLevel,
954*01826a49SYabin Cui                                   samplesBuffer, samplesSizes, nbSamples,
955*01826a49SYabin Cui                                   (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
956*01826a49SYabin Cui                                   notificationLevel);
957*01826a49SYabin Cui         if (ZDICT_isError(eSize)) return eSize;
958*01826a49SYabin Cui         hSize += eSize;
959*01826a49SYabin Cui     }
960*01826a49SYabin Cui 
961*01826a49SYabin Cui     /* add dictionary header (after entropy tables) */
962*01826a49SYabin Cui     MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
963*01826a49SYabin Cui     {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
964*01826a49SYabin Cui         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
965*01826a49SYabin Cui         U32 const dictID = params.dictID ? params.dictID : compliantID;
966*01826a49SYabin Cui         MEM_writeLE32((char*)dictBuffer+4, dictID);
967*01826a49SYabin Cui     }
968*01826a49SYabin Cui 
969*01826a49SYabin Cui     if (hSize + dictContentSize < dictBufferCapacity)
970*01826a49SYabin Cui         memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
971*01826a49SYabin Cui     return MIN(dictBufferCapacity, hSize+dictContentSize);
972*01826a49SYabin Cui }
973*01826a49SYabin Cui 
974*01826a49SYabin Cui /*! ZDICT_trainFromBuffer_unsafe_legacy() :
975*01826a49SYabin Cui *   Warning : `samplesBuffer` must be followed by noisy guard band !!!
976*01826a49SYabin Cui *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
977*01826a49SYabin Cui */
ZDICT_trainFromBuffer_unsafe_legacy(void * dictBuffer,size_t maxDictSize,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_legacy_params_t params)978*01826a49SYabin Cui static size_t ZDICT_trainFromBuffer_unsafe_legacy(
979*01826a49SYabin Cui                             void* dictBuffer, size_t maxDictSize,
980*01826a49SYabin Cui                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
981*01826a49SYabin Cui                             ZDICT_legacy_params_t params)
982*01826a49SYabin Cui {
983*01826a49SYabin Cui     U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
984*01826a49SYabin Cui     dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
985*01826a49SYabin Cui     unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
986*01826a49SYabin Cui     unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
987*01826a49SYabin Cui     size_t const targetDictSize = maxDictSize;
988*01826a49SYabin Cui     size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
989*01826a49SYabin Cui     size_t dictSize = 0;
990*01826a49SYabin Cui     U32 const notificationLevel = params.zParams.notificationLevel;
991*01826a49SYabin Cui 
992*01826a49SYabin Cui     /* checks */
993*01826a49SYabin Cui     if (!dictList) return ERROR(memory_allocation);
994*01826a49SYabin Cui     if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
995*01826a49SYabin Cui     if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
996*01826a49SYabin Cui 
997*01826a49SYabin Cui     /* init */
998*01826a49SYabin Cui     ZDICT_initDictItem(dictList);
999*01826a49SYabin Cui 
1000*01826a49SYabin Cui     /* build dictionary */
1001*01826a49SYabin Cui     ZDICT_trainBuffer_legacy(dictList, dictListSize,
1002*01826a49SYabin Cui                        samplesBuffer, samplesBuffSize,
1003*01826a49SYabin Cui                        samplesSizes, nbSamples,
1004*01826a49SYabin Cui                        minRep, notificationLevel);
1005*01826a49SYabin Cui 
1006*01826a49SYabin Cui     /* display best matches */
1007*01826a49SYabin Cui     if (params.zParams.notificationLevel>= 3) {
1008*01826a49SYabin Cui         unsigned const nb = MIN(25, dictList[0].pos);
1009*01826a49SYabin Cui         unsigned const dictContentSize = ZDICT_dictSize(dictList);
1010*01826a49SYabin Cui         unsigned u;
1011*01826a49SYabin Cui         DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1012*01826a49SYabin Cui         DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1013*01826a49SYabin Cui         for (u=1; u<nb; u++) {
1014*01826a49SYabin Cui             unsigned const pos = dictList[u].pos;
1015*01826a49SYabin Cui             unsigned const length = dictList[u].length;
1016*01826a49SYabin Cui             U32 const printedLength = MIN(40, length);
1017*01826a49SYabin Cui             if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1018*01826a49SYabin Cui                 free(dictList);
1019*01826a49SYabin Cui                 return ERROR(GENERIC);   /* should never happen */
1020*01826a49SYabin Cui             }
1021*01826a49SYabin Cui             DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
1022*01826a49SYabin Cui                          u, length, pos, (unsigned)dictList[u].savings);
1023*01826a49SYabin Cui             ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
1024*01826a49SYabin Cui             DISPLAYLEVEL(3, "| \n");
1025*01826a49SYabin Cui     }   }
1026*01826a49SYabin Cui 
1027*01826a49SYabin Cui 
1028*01826a49SYabin Cui     /* create dictionary */
1029*01826a49SYabin Cui     {   unsigned dictContentSize = ZDICT_dictSize(dictList);
1030*01826a49SYabin Cui         if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
1031*01826a49SYabin Cui         if (dictContentSize < targetDictSize/4) {
1032*01826a49SYabin Cui             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1033*01826a49SYabin Cui             if (samplesBuffSize < 10 * targetDictSize)
1034*01826a49SYabin Cui                 DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
1035*01826a49SYabin Cui             if (minRep > MINRATIO) {
1036*01826a49SYabin Cui                 DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
1037*01826a49SYabin Cui                 DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1038*01826a49SYabin Cui             }
1039*01826a49SYabin Cui         }
1040*01826a49SYabin Cui 
1041*01826a49SYabin Cui         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
1042*01826a49SYabin Cui             unsigned proposedSelectivity = selectivity-1;
1043*01826a49SYabin Cui             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
1044*01826a49SYabin Cui             DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
1045*01826a49SYabin Cui             DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
1046*01826a49SYabin Cui             DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
1047*01826a49SYabin Cui         }
1048*01826a49SYabin Cui 
1049*01826a49SYabin Cui         /* limit dictionary size */
1050*01826a49SYabin Cui         {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
1051*01826a49SYabin Cui             U32 currentSize = 0;
1052*01826a49SYabin Cui             U32 n; for (n=1; n<max; n++) {
1053*01826a49SYabin Cui                 currentSize += dictList[n].length;
1054*01826a49SYabin Cui                 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1055*01826a49SYabin Cui             }
1056*01826a49SYabin Cui             dictList->pos = n;
1057*01826a49SYabin Cui             dictContentSize = currentSize;
1058*01826a49SYabin Cui         }
1059*01826a49SYabin Cui 
1060*01826a49SYabin Cui         /* build dict content */
1061*01826a49SYabin Cui         {   U32 u;
1062*01826a49SYabin Cui             BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
1063*01826a49SYabin Cui             for (u=1; u<dictList->pos; u++) {
1064*01826a49SYabin Cui                 U32 l = dictList[u].length;
1065*01826a49SYabin Cui                 ptr -= l;
1066*01826a49SYabin Cui                 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
1067*01826a49SYabin Cui                 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
1068*01826a49SYabin Cui         }   }
1069*01826a49SYabin Cui 
1070*01826a49SYabin Cui         dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
1071*01826a49SYabin Cui                                                              samplesBuffer, samplesSizes, nbSamples,
1072*01826a49SYabin Cui                                                              params.zParams);
1073*01826a49SYabin Cui     }
1074*01826a49SYabin Cui 
1075*01826a49SYabin Cui     /* clean up */
1076*01826a49SYabin Cui     free(dictList);
1077*01826a49SYabin Cui     return dictSize;
1078*01826a49SYabin Cui }
1079*01826a49SYabin Cui 
1080*01826a49SYabin Cui 
1081*01826a49SYabin Cui /* ZDICT_trainFromBuffer_legacy() :
1082*01826a49SYabin Cui  * issue : samplesBuffer need to be followed by a noisy guard band.
1083*01826a49SYabin Cui  * work around : duplicate the buffer, and add the noise */
ZDICT_trainFromBuffer_legacy(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_legacy_params_t params)1084*01826a49SYabin Cui size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1085*01826a49SYabin Cui                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1086*01826a49SYabin Cui                               ZDICT_legacy_params_t params)
1087*01826a49SYabin Cui {
1088*01826a49SYabin Cui     size_t result;
1089*01826a49SYabin Cui     void* newBuff;
1090*01826a49SYabin Cui     size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1091*01826a49SYabin Cui     if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
1092*01826a49SYabin Cui 
1093*01826a49SYabin Cui     newBuff = malloc(sBuffSize + NOISELENGTH);
1094*01826a49SYabin Cui     if (!newBuff) return ERROR(memory_allocation);
1095*01826a49SYabin Cui 
1096*01826a49SYabin Cui     memcpy(newBuff, samplesBuffer, sBuffSize);
1097*01826a49SYabin Cui     ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
1098*01826a49SYabin Cui 
1099*01826a49SYabin Cui     result =
1100*01826a49SYabin Cui         ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1101*01826a49SYabin Cui                                             samplesSizes, nbSamples, params);
1102*01826a49SYabin Cui     free(newBuff);
1103*01826a49SYabin Cui     return result;
1104*01826a49SYabin Cui }
1105*01826a49SYabin Cui 
1106*01826a49SYabin Cui 
ZDICT_trainFromBuffer(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples)1107*01826a49SYabin Cui size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1108*01826a49SYabin Cui                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1109*01826a49SYabin Cui {
1110*01826a49SYabin Cui     ZDICT_fastCover_params_t params;
1111*01826a49SYabin Cui     DEBUGLOG(3, "ZDICT_trainFromBuffer");
1112*01826a49SYabin Cui     memset(&params, 0, sizeof(params));
1113*01826a49SYabin Cui     params.d = 8;
1114*01826a49SYabin Cui     params.steps = 4;
1115*01826a49SYabin Cui     /* Use default level since no compression level information is available */
1116*01826a49SYabin Cui     params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1117*01826a49SYabin Cui #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1118*01826a49SYabin Cui     params.zParams.notificationLevel = DEBUGLEVEL;
1119*01826a49SYabin Cui #endif
1120*01826a49SYabin Cui     return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1121*01826a49SYabin Cui                                                samplesBuffer, samplesSizes, nbSamples,
1122*01826a49SYabin Cui                                                &params);
1123*01826a49SYabin Cui }
1124*01826a49SYabin Cui 
ZDICT_addEntropyTablesFromBuffer(void * dictBuffer,size_t dictContentSize,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples)1125*01826a49SYabin Cui size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1126*01826a49SYabin Cui                                   const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1127*01826a49SYabin Cui {
1128*01826a49SYabin Cui     ZDICT_params_t params;
1129*01826a49SYabin Cui     memset(&params, 0, sizeof(params));
1130*01826a49SYabin Cui     return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1131*01826a49SYabin Cui                                                      samplesBuffer, samplesSizes, nbSamples,
1132*01826a49SYabin Cui                                                      params);
1133*01826a49SYabin Cui }
1134