1*01826a49SYabin Cui /*
2*01826a49SYabin Cui * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui * All rights reserved.
4*01826a49SYabin Cui *
5*01826a49SYabin Cui * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui */
10*01826a49SYabin Cui
11*01826a49SYabin Cui
12*01826a49SYabin Cui /*-**************************************
13*01826a49SYabin Cui * Tuning parameters
14*01826a49SYabin Cui ****************************************/
15*01826a49SYabin Cui #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
16*01826a49SYabin Cui #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17*01826a49SYabin Cui #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
18*01826a49SYabin Cui
19*01826a49SYabin Cui
20*01826a49SYabin Cui /*-**************************************
21*01826a49SYabin Cui * Compiler Options
22*01826a49SYabin Cui ****************************************/
23*01826a49SYabin Cui /* Unix Large Files support (>4GB) */
24*01826a49SYabin Cui #define _FILE_OFFSET_BITS 64
25*01826a49SYabin Cui #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
26*01826a49SYabin Cui # ifndef _LARGEFILE_SOURCE
27*01826a49SYabin Cui # define _LARGEFILE_SOURCE
28*01826a49SYabin Cui # endif
29*01826a49SYabin Cui #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
30*01826a49SYabin Cui # ifndef _LARGEFILE64_SOURCE
31*01826a49SYabin Cui # define _LARGEFILE64_SOURCE
32*01826a49SYabin Cui # endif
33*01826a49SYabin Cui #endif
34*01826a49SYabin Cui
35*01826a49SYabin Cui
36*01826a49SYabin Cui /*-*************************************
37*01826a49SYabin Cui * Dependencies
38*01826a49SYabin Cui ***************************************/
39*01826a49SYabin Cui #include <stdlib.h> /* malloc, free */
40*01826a49SYabin Cui #include <string.h> /* memset */
41*01826a49SYabin Cui #include <stdio.h> /* fprintf, fopen, ftello64 */
42*01826a49SYabin Cui #include <time.h> /* clock */
43*01826a49SYabin Cui
44*01826a49SYabin Cui #ifndef ZDICT_STATIC_LINKING_ONLY
45*01826a49SYabin Cui # define ZDICT_STATIC_LINKING_ONLY
46*01826a49SYabin Cui #endif
47*01826a49SYabin Cui
48*01826a49SYabin Cui #include "../common/mem.h" /* read */
49*01826a49SYabin Cui #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
50*01826a49SYabin Cui #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
51*01826a49SYabin Cui #include "../common/zstd_internal.h" /* includes zstd.h */
52*01826a49SYabin Cui #include "../common/xxhash.h" /* XXH64 */
53*01826a49SYabin Cui #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
54*01826a49SYabin Cui #include "../zdict.h"
55*01826a49SYabin Cui #include "divsufsort.h"
56*01826a49SYabin Cui #include "../common/bits.h" /* ZSTD_NbCommonBytes */
57*01826a49SYabin Cui
58*01826a49SYabin Cui
59*01826a49SYabin Cui /*-*************************************
60*01826a49SYabin Cui * Constants
61*01826a49SYabin Cui ***************************************/
62*01826a49SYabin Cui #define KB *(1 <<10)
63*01826a49SYabin Cui #define MB *(1 <<20)
64*01826a49SYabin Cui #define GB *(1U<<30)
65*01826a49SYabin Cui
66*01826a49SYabin Cui #define DICTLISTSIZE_DEFAULT 10000
67*01826a49SYabin Cui
68*01826a49SYabin Cui #define NOISELENGTH 32
69*01826a49SYabin Cui
70*01826a49SYabin Cui static const U32 g_selectivity_default = 9;
71*01826a49SYabin Cui
72*01826a49SYabin Cui
73*01826a49SYabin Cui /*-*************************************
74*01826a49SYabin Cui * Console display
75*01826a49SYabin Cui ***************************************/
76*01826a49SYabin Cui #undef DISPLAY
77*01826a49SYabin Cui #define DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)
78*01826a49SYabin Cui #undef DISPLAYLEVEL
79*01826a49SYabin Cui #define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
80*01826a49SYabin Cui
ZDICT_clockSpan(clock_t nPrevious)81*01826a49SYabin Cui static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
82*01826a49SYabin Cui
ZDICT_printHex(const void * ptr,size_t length)83*01826a49SYabin Cui static void ZDICT_printHex(const void* ptr, size_t length)
84*01826a49SYabin Cui {
85*01826a49SYabin Cui const BYTE* const b = (const BYTE*)ptr;
86*01826a49SYabin Cui size_t u;
87*01826a49SYabin Cui for (u=0; u<length; u++) {
88*01826a49SYabin Cui BYTE c = b[u];
89*01826a49SYabin Cui if (c<32 || c>126) c = '.'; /* non-printable char */
90*01826a49SYabin Cui DISPLAY("%c", c);
91*01826a49SYabin Cui }
92*01826a49SYabin Cui }
93*01826a49SYabin Cui
94*01826a49SYabin Cui
95*01826a49SYabin Cui /*-********************************************************
96*01826a49SYabin Cui * Helper functions
97*01826a49SYabin Cui **********************************************************/
ZDICT_isError(size_t errorCode)98*01826a49SYabin Cui unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
99*01826a49SYabin Cui
ZDICT_getErrorName(size_t errorCode)100*01826a49SYabin Cui const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
101*01826a49SYabin Cui
ZDICT_getDictID(const void * dictBuffer,size_t dictSize)102*01826a49SYabin Cui unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
103*01826a49SYabin Cui {
104*01826a49SYabin Cui if (dictSize < 8) return 0;
105*01826a49SYabin Cui if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
106*01826a49SYabin Cui return MEM_readLE32((const char*)dictBuffer + 4);
107*01826a49SYabin Cui }
108*01826a49SYabin Cui
ZDICT_getDictHeaderSize(const void * dictBuffer,size_t dictSize)109*01826a49SYabin Cui size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
110*01826a49SYabin Cui {
111*01826a49SYabin Cui size_t headerSize;
112*01826a49SYabin Cui if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
113*01826a49SYabin Cui
114*01826a49SYabin Cui { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
115*01826a49SYabin Cui U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
116*01826a49SYabin Cui if (!bs || !wksp) {
117*01826a49SYabin Cui headerSize = ERROR(memory_allocation);
118*01826a49SYabin Cui } else {
119*01826a49SYabin Cui ZSTD_reset_compressedBlockState(bs);
120*01826a49SYabin Cui headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
121*01826a49SYabin Cui }
122*01826a49SYabin Cui
123*01826a49SYabin Cui free(bs);
124*01826a49SYabin Cui free(wksp);
125*01826a49SYabin Cui }
126*01826a49SYabin Cui
127*01826a49SYabin Cui return headerSize;
128*01826a49SYabin Cui }
129*01826a49SYabin Cui
130*01826a49SYabin Cui /*-********************************************************
131*01826a49SYabin Cui * Dictionary training functions
132*01826a49SYabin Cui **********************************************************/
133*01826a49SYabin Cui /*! ZDICT_count() :
134*01826a49SYabin Cui Count the nb of common bytes between 2 pointers.
135*01826a49SYabin Cui Note : this function presumes end of buffer followed by noisy guard band.
136*01826a49SYabin Cui */
ZDICT_count(const void * pIn,const void * pMatch)137*01826a49SYabin Cui static size_t ZDICT_count(const void* pIn, const void* pMatch)
138*01826a49SYabin Cui {
139*01826a49SYabin Cui const char* const pStart = (const char*)pIn;
140*01826a49SYabin Cui for (;;) {
141*01826a49SYabin Cui size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
142*01826a49SYabin Cui if (!diff) {
143*01826a49SYabin Cui pIn = (const char*)pIn+sizeof(size_t);
144*01826a49SYabin Cui pMatch = (const char*)pMatch+sizeof(size_t);
145*01826a49SYabin Cui continue;
146*01826a49SYabin Cui }
147*01826a49SYabin Cui pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);
148*01826a49SYabin Cui return (size_t)((const char*)pIn - pStart);
149*01826a49SYabin Cui }
150*01826a49SYabin Cui }
151*01826a49SYabin Cui
152*01826a49SYabin Cui
153*01826a49SYabin Cui typedef struct {
154*01826a49SYabin Cui U32 pos;
155*01826a49SYabin Cui U32 length;
156*01826a49SYabin Cui U32 savings;
157*01826a49SYabin Cui } dictItem;
158*01826a49SYabin Cui
ZDICT_initDictItem(dictItem * d)159*01826a49SYabin Cui static void ZDICT_initDictItem(dictItem* d)
160*01826a49SYabin Cui {
161*01826a49SYabin Cui d->pos = 1;
162*01826a49SYabin Cui d->length = 0;
163*01826a49SYabin Cui d->savings = (U32)(-1);
164*01826a49SYabin Cui }
165*01826a49SYabin Cui
166*01826a49SYabin Cui
167*01826a49SYabin Cui #define LLIMIT 64 /* heuristic determined experimentally */
168*01826a49SYabin Cui #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
ZDICT_analyzePos(BYTE * doneMarks,const int * suffix,U32 start,const void * buffer,U32 minRatio,U32 notificationLevel)169*01826a49SYabin Cui static dictItem ZDICT_analyzePos(
170*01826a49SYabin Cui BYTE* doneMarks,
171*01826a49SYabin Cui const int* suffix, U32 start,
172*01826a49SYabin Cui const void* buffer, U32 minRatio, U32 notificationLevel)
173*01826a49SYabin Cui {
174*01826a49SYabin Cui U32 lengthList[LLIMIT] = {0};
175*01826a49SYabin Cui U32 cumulLength[LLIMIT] = {0};
176*01826a49SYabin Cui U32 savings[LLIMIT] = {0};
177*01826a49SYabin Cui const BYTE* b = (const BYTE*)buffer;
178*01826a49SYabin Cui size_t maxLength = LLIMIT;
179*01826a49SYabin Cui size_t pos = (size_t)suffix[start];
180*01826a49SYabin Cui U32 end = start;
181*01826a49SYabin Cui dictItem solution;
182*01826a49SYabin Cui
183*01826a49SYabin Cui /* init */
184*01826a49SYabin Cui memset(&solution, 0, sizeof(solution));
185*01826a49SYabin Cui doneMarks[pos] = 1;
186*01826a49SYabin Cui
187*01826a49SYabin Cui /* trivial repetition cases */
188*01826a49SYabin Cui if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
189*01826a49SYabin Cui ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
190*01826a49SYabin Cui ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
191*01826a49SYabin Cui /* skip and mark segment */
192*01826a49SYabin Cui U16 const pattern16 = MEM_read16(b+pos+4);
193*01826a49SYabin Cui U32 u, patternEnd = 6;
194*01826a49SYabin Cui while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
195*01826a49SYabin Cui if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
196*01826a49SYabin Cui for (u=1; u<patternEnd; u++)
197*01826a49SYabin Cui doneMarks[pos+u] = 1;
198*01826a49SYabin Cui return solution;
199*01826a49SYabin Cui }
200*01826a49SYabin Cui
201*01826a49SYabin Cui /* look forward */
202*01826a49SYabin Cui { size_t length;
203*01826a49SYabin Cui do {
204*01826a49SYabin Cui end++;
205*01826a49SYabin Cui length = ZDICT_count(b + pos, b + suffix[end]);
206*01826a49SYabin Cui } while (length >= MINMATCHLENGTH);
207*01826a49SYabin Cui }
208*01826a49SYabin Cui
209*01826a49SYabin Cui /* look backward */
210*01826a49SYabin Cui { size_t length;
211*01826a49SYabin Cui do {
212*01826a49SYabin Cui length = ZDICT_count(b + pos, b + *(suffix+start-1));
213*01826a49SYabin Cui if (length >=MINMATCHLENGTH) start--;
214*01826a49SYabin Cui } while(length >= MINMATCHLENGTH);
215*01826a49SYabin Cui }
216*01826a49SYabin Cui
217*01826a49SYabin Cui /* exit if not found a minimum nb of repetitions */
218*01826a49SYabin Cui if (end-start < minRatio) {
219*01826a49SYabin Cui U32 idx;
220*01826a49SYabin Cui for(idx=start; idx<end; idx++)
221*01826a49SYabin Cui doneMarks[suffix[idx]] = 1;
222*01826a49SYabin Cui return solution;
223*01826a49SYabin Cui }
224*01826a49SYabin Cui
225*01826a49SYabin Cui { int i;
226*01826a49SYabin Cui U32 mml;
227*01826a49SYabin Cui U32 refinedStart = start;
228*01826a49SYabin Cui U32 refinedEnd = end;
229*01826a49SYabin Cui
230*01826a49SYabin Cui DISPLAYLEVEL(4, "\n");
231*01826a49SYabin Cui DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
232*01826a49SYabin Cui DISPLAYLEVEL(4, "\n");
233*01826a49SYabin Cui
234*01826a49SYabin Cui for (mml = MINMATCHLENGTH ; ; mml++) {
235*01826a49SYabin Cui BYTE currentChar = 0;
236*01826a49SYabin Cui U32 currentCount = 0;
237*01826a49SYabin Cui U32 currentID = refinedStart;
238*01826a49SYabin Cui U32 id;
239*01826a49SYabin Cui U32 selectedCount = 0;
240*01826a49SYabin Cui U32 selectedID = currentID;
241*01826a49SYabin Cui for (id =refinedStart; id < refinedEnd; id++) {
242*01826a49SYabin Cui if (b[suffix[id] + mml] != currentChar) {
243*01826a49SYabin Cui if (currentCount > selectedCount) {
244*01826a49SYabin Cui selectedCount = currentCount;
245*01826a49SYabin Cui selectedID = currentID;
246*01826a49SYabin Cui }
247*01826a49SYabin Cui currentID = id;
248*01826a49SYabin Cui currentChar = b[ suffix[id] + mml];
249*01826a49SYabin Cui currentCount = 0;
250*01826a49SYabin Cui }
251*01826a49SYabin Cui currentCount ++;
252*01826a49SYabin Cui }
253*01826a49SYabin Cui if (currentCount > selectedCount) { /* for last */
254*01826a49SYabin Cui selectedCount = currentCount;
255*01826a49SYabin Cui selectedID = currentID;
256*01826a49SYabin Cui }
257*01826a49SYabin Cui
258*01826a49SYabin Cui if (selectedCount < minRatio)
259*01826a49SYabin Cui break;
260*01826a49SYabin Cui refinedStart = selectedID;
261*01826a49SYabin Cui refinedEnd = refinedStart + selectedCount;
262*01826a49SYabin Cui }
263*01826a49SYabin Cui
264*01826a49SYabin Cui /* evaluate gain based on new dict */
265*01826a49SYabin Cui start = refinedStart;
266*01826a49SYabin Cui pos = suffix[refinedStart];
267*01826a49SYabin Cui end = start;
268*01826a49SYabin Cui memset(lengthList, 0, sizeof(lengthList));
269*01826a49SYabin Cui
270*01826a49SYabin Cui /* look forward */
271*01826a49SYabin Cui { size_t length;
272*01826a49SYabin Cui do {
273*01826a49SYabin Cui end++;
274*01826a49SYabin Cui length = ZDICT_count(b + pos, b + suffix[end]);
275*01826a49SYabin Cui if (length >= LLIMIT) length = LLIMIT-1;
276*01826a49SYabin Cui lengthList[length]++;
277*01826a49SYabin Cui } while (length >=MINMATCHLENGTH);
278*01826a49SYabin Cui }
279*01826a49SYabin Cui
280*01826a49SYabin Cui /* look backward */
281*01826a49SYabin Cui { size_t length = MINMATCHLENGTH;
282*01826a49SYabin Cui while ((length >= MINMATCHLENGTH) & (start > 0)) {
283*01826a49SYabin Cui length = ZDICT_count(b + pos, b + suffix[start - 1]);
284*01826a49SYabin Cui if (length >= LLIMIT) length = LLIMIT - 1;
285*01826a49SYabin Cui lengthList[length]++;
286*01826a49SYabin Cui if (length >= MINMATCHLENGTH) start--;
287*01826a49SYabin Cui }
288*01826a49SYabin Cui }
289*01826a49SYabin Cui
290*01826a49SYabin Cui /* largest useful length */
291*01826a49SYabin Cui memset(cumulLength, 0, sizeof(cumulLength));
292*01826a49SYabin Cui cumulLength[maxLength-1] = lengthList[maxLength-1];
293*01826a49SYabin Cui for (i=(int)(maxLength-2); i>=0; i--)
294*01826a49SYabin Cui cumulLength[i] = cumulLength[i+1] + lengthList[i];
295*01826a49SYabin Cui
296*01826a49SYabin Cui for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
297*01826a49SYabin Cui maxLength = i;
298*01826a49SYabin Cui
299*01826a49SYabin Cui /* reduce maxLength in case of final into repetitive data */
300*01826a49SYabin Cui { U32 l = (U32)maxLength;
301*01826a49SYabin Cui BYTE const c = b[pos + maxLength-1];
302*01826a49SYabin Cui while (b[pos+l-2]==c) l--;
303*01826a49SYabin Cui maxLength = l;
304*01826a49SYabin Cui }
305*01826a49SYabin Cui if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
306*01826a49SYabin Cui
307*01826a49SYabin Cui /* calculate savings */
308*01826a49SYabin Cui savings[5] = 0;
309*01826a49SYabin Cui for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
310*01826a49SYabin Cui savings[i] = savings[i-1] + (lengthList[i] * (i-3));
311*01826a49SYabin Cui
312*01826a49SYabin Cui DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
313*01826a49SYabin Cui (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
314*01826a49SYabin Cui
315*01826a49SYabin Cui solution.pos = (U32)pos;
316*01826a49SYabin Cui solution.length = (U32)maxLength;
317*01826a49SYabin Cui solution.savings = savings[maxLength];
318*01826a49SYabin Cui
319*01826a49SYabin Cui /* mark positions done */
320*01826a49SYabin Cui { U32 id;
321*01826a49SYabin Cui for (id=start; id<end; id++) {
322*01826a49SYabin Cui U32 p, pEnd, length;
323*01826a49SYabin Cui U32 const testedPos = (U32)suffix[id];
324*01826a49SYabin Cui if (testedPos == pos)
325*01826a49SYabin Cui length = solution.length;
326*01826a49SYabin Cui else {
327*01826a49SYabin Cui length = (U32)ZDICT_count(b+pos, b+testedPos);
328*01826a49SYabin Cui if (length > solution.length) length = solution.length;
329*01826a49SYabin Cui }
330*01826a49SYabin Cui pEnd = (U32)(testedPos + length);
331*01826a49SYabin Cui for (p=testedPos; p<pEnd; p++)
332*01826a49SYabin Cui doneMarks[p] = 1;
333*01826a49SYabin Cui } } }
334*01826a49SYabin Cui
335*01826a49SYabin Cui return solution;
336*01826a49SYabin Cui }
337*01826a49SYabin Cui
338*01826a49SYabin Cui
isIncluded(const void * in,const void * container,size_t length)339*01826a49SYabin Cui static int isIncluded(const void* in, const void* container, size_t length)
340*01826a49SYabin Cui {
341*01826a49SYabin Cui const char* const ip = (const char*) in;
342*01826a49SYabin Cui const char* const into = (const char*) container;
343*01826a49SYabin Cui size_t u;
344*01826a49SYabin Cui
345*01826a49SYabin Cui for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
346*01826a49SYabin Cui if (ip[u] != into[u]) break;
347*01826a49SYabin Cui }
348*01826a49SYabin Cui
349*01826a49SYabin Cui return u==length;
350*01826a49SYabin Cui }
351*01826a49SYabin Cui
352*01826a49SYabin Cui /*! ZDICT_tryMerge() :
353*01826a49SYabin Cui check if dictItem can be merged, do it if possible
354*01826a49SYabin Cui @return : id of destination elt, 0 if not merged
355*01826a49SYabin Cui */
ZDICT_tryMerge(dictItem * table,dictItem elt,U32 eltNbToSkip,const void * buffer)356*01826a49SYabin Cui static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
357*01826a49SYabin Cui {
358*01826a49SYabin Cui const U32 tableSize = table->pos;
359*01826a49SYabin Cui const U32 eltEnd = elt.pos + elt.length;
360*01826a49SYabin Cui const char* const buf = (const char*) buffer;
361*01826a49SYabin Cui
362*01826a49SYabin Cui /* tail overlap */
363*01826a49SYabin Cui U32 u; for (u=1; u<tableSize; u++) {
364*01826a49SYabin Cui if (u==eltNbToSkip) continue;
365*01826a49SYabin Cui if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
366*01826a49SYabin Cui /* append */
367*01826a49SYabin Cui U32 const addedLength = table[u].pos - elt.pos;
368*01826a49SYabin Cui table[u].length += addedLength;
369*01826a49SYabin Cui table[u].pos = elt.pos;
370*01826a49SYabin Cui table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
371*01826a49SYabin Cui table[u].savings += elt.length / 8; /* rough approx bonus */
372*01826a49SYabin Cui elt = table[u];
373*01826a49SYabin Cui /* sort : improve rank */
374*01826a49SYabin Cui while ((u>1) && (table[u-1].savings < elt.savings))
375*01826a49SYabin Cui table[u] = table[u-1], u--;
376*01826a49SYabin Cui table[u] = elt;
377*01826a49SYabin Cui return u;
378*01826a49SYabin Cui } }
379*01826a49SYabin Cui
380*01826a49SYabin Cui /* front overlap */
381*01826a49SYabin Cui for (u=1; u<tableSize; u++) {
382*01826a49SYabin Cui if (u==eltNbToSkip) continue;
383*01826a49SYabin Cui
384*01826a49SYabin Cui if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
385*01826a49SYabin Cui /* append */
386*01826a49SYabin Cui int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
387*01826a49SYabin Cui table[u].savings += elt.length / 8; /* rough approx bonus */
388*01826a49SYabin Cui if (addedLength > 0) { /* otherwise, elt fully included into existing */
389*01826a49SYabin Cui table[u].length += addedLength;
390*01826a49SYabin Cui table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
391*01826a49SYabin Cui }
392*01826a49SYabin Cui /* sort : improve rank */
393*01826a49SYabin Cui elt = table[u];
394*01826a49SYabin Cui while ((u>1) && (table[u-1].savings < elt.savings))
395*01826a49SYabin Cui table[u] = table[u-1], u--;
396*01826a49SYabin Cui table[u] = elt;
397*01826a49SYabin Cui return u;
398*01826a49SYabin Cui }
399*01826a49SYabin Cui
400*01826a49SYabin Cui if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
401*01826a49SYabin Cui if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
402*01826a49SYabin Cui size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
403*01826a49SYabin Cui table[u].pos = elt.pos;
404*01826a49SYabin Cui table[u].savings += (U32)(elt.savings * addedLength / elt.length);
405*01826a49SYabin Cui table[u].length = MIN(elt.length, table[u].length + 1);
406*01826a49SYabin Cui return u;
407*01826a49SYabin Cui }
408*01826a49SYabin Cui }
409*01826a49SYabin Cui }
410*01826a49SYabin Cui
411*01826a49SYabin Cui return 0;
412*01826a49SYabin Cui }
413*01826a49SYabin Cui
414*01826a49SYabin Cui
ZDICT_removeDictItem(dictItem * table,U32 id)415*01826a49SYabin Cui static void ZDICT_removeDictItem(dictItem* table, U32 id)
416*01826a49SYabin Cui {
417*01826a49SYabin Cui /* convention : table[0].pos stores nb of elts */
418*01826a49SYabin Cui U32 const max = table[0].pos;
419*01826a49SYabin Cui U32 u;
420*01826a49SYabin Cui if (!id) return; /* protection, should never happen */
421*01826a49SYabin Cui for (u=id; u<max-1; u++)
422*01826a49SYabin Cui table[u] = table[u+1];
423*01826a49SYabin Cui table->pos--;
424*01826a49SYabin Cui }
425*01826a49SYabin Cui
426*01826a49SYabin Cui
ZDICT_insertDictItem(dictItem * table,U32 maxSize,dictItem elt,const void * buffer)427*01826a49SYabin Cui static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
428*01826a49SYabin Cui {
429*01826a49SYabin Cui /* merge if possible */
430*01826a49SYabin Cui U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
431*01826a49SYabin Cui if (mergeId) {
432*01826a49SYabin Cui U32 newMerge = 1;
433*01826a49SYabin Cui while (newMerge) {
434*01826a49SYabin Cui newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
435*01826a49SYabin Cui if (newMerge) ZDICT_removeDictItem(table, mergeId);
436*01826a49SYabin Cui mergeId = newMerge;
437*01826a49SYabin Cui }
438*01826a49SYabin Cui return;
439*01826a49SYabin Cui }
440*01826a49SYabin Cui
441*01826a49SYabin Cui /* insert */
442*01826a49SYabin Cui { U32 current;
443*01826a49SYabin Cui U32 nextElt = table->pos;
444*01826a49SYabin Cui if (nextElt >= maxSize) nextElt = maxSize-1;
445*01826a49SYabin Cui current = nextElt-1;
446*01826a49SYabin Cui while (table[current].savings < elt.savings) {
447*01826a49SYabin Cui table[current+1] = table[current];
448*01826a49SYabin Cui current--;
449*01826a49SYabin Cui }
450*01826a49SYabin Cui table[current+1] = elt;
451*01826a49SYabin Cui table->pos = nextElt+1;
452*01826a49SYabin Cui }
453*01826a49SYabin Cui }
454*01826a49SYabin Cui
455*01826a49SYabin Cui
ZDICT_dictSize(const dictItem * dictList)456*01826a49SYabin Cui static U32 ZDICT_dictSize(const dictItem* dictList)
457*01826a49SYabin Cui {
458*01826a49SYabin Cui U32 u, dictSize = 0;
459*01826a49SYabin Cui for (u=1; u<dictList[0].pos; u++)
460*01826a49SYabin Cui dictSize += dictList[u].length;
461*01826a49SYabin Cui return dictSize;
462*01826a49SYabin Cui }
463*01826a49SYabin Cui
464*01826a49SYabin Cui
ZDICT_trainBuffer_legacy(dictItem * dictList,U32 dictListSize,const void * const buffer,size_t bufferSize,const size_t * fileSizes,unsigned nbFiles,unsigned minRatio,U32 notificationLevel)465*01826a49SYabin Cui static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
466*01826a49SYabin Cui const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
467*01826a49SYabin Cui const size_t* fileSizes, unsigned nbFiles,
468*01826a49SYabin Cui unsigned minRatio, U32 notificationLevel)
469*01826a49SYabin Cui {
470*01826a49SYabin Cui int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
471*01826a49SYabin Cui int* const suffix = suffix0+1;
472*01826a49SYabin Cui U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
473*01826a49SYabin Cui BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
474*01826a49SYabin Cui U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
475*01826a49SYabin Cui size_t result = 0;
476*01826a49SYabin Cui clock_t displayClock = 0;
477*01826a49SYabin Cui clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
478*01826a49SYabin Cui
479*01826a49SYabin Cui # undef DISPLAYUPDATE
480*01826a49SYabin Cui # define DISPLAYUPDATE(l, ...) \
481*01826a49SYabin Cui do { \
482*01826a49SYabin Cui if (notificationLevel>=l) { \
483*01826a49SYabin Cui if (ZDICT_clockSpan(displayClock) > refreshRate) { \
484*01826a49SYabin Cui displayClock = clock(); \
485*01826a49SYabin Cui DISPLAY(__VA_ARGS__); \
486*01826a49SYabin Cui } \
487*01826a49SYabin Cui if (notificationLevel>=4) fflush(stderr); \
488*01826a49SYabin Cui } \
489*01826a49SYabin Cui } while (0)
490*01826a49SYabin Cui
491*01826a49SYabin Cui /* init */
492*01826a49SYabin Cui DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
493*01826a49SYabin Cui if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
494*01826a49SYabin Cui result = ERROR(memory_allocation);
495*01826a49SYabin Cui goto _cleanup;
496*01826a49SYabin Cui }
497*01826a49SYabin Cui if (minRatio < MINRATIO) minRatio = MINRATIO;
498*01826a49SYabin Cui memset(doneMarks, 0, bufferSize+16);
499*01826a49SYabin Cui
500*01826a49SYabin Cui /* limit sample set size (divsufsort limitation)*/
501*01826a49SYabin Cui if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
502*01826a49SYabin Cui while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
503*01826a49SYabin Cui
504*01826a49SYabin Cui /* sort */
505*01826a49SYabin Cui DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
506*01826a49SYabin Cui { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
507*01826a49SYabin Cui if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
508*01826a49SYabin Cui }
509*01826a49SYabin Cui suffix[bufferSize] = (int)bufferSize; /* leads into noise */
510*01826a49SYabin Cui suffix0[0] = (int)bufferSize; /* leads into noise */
511*01826a49SYabin Cui /* build reverse suffix sort */
512*01826a49SYabin Cui { size_t pos;
513*01826a49SYabin Cui for (pos=0; pos < bufferSize; pos++)
514*01826a49SYabin Cui reverseSuffix[suffix[pos]] = (U32)pos;
515*01826a49SYabin Cui /* note filePos tracks borders between samples.
516*01826a49SYabin Cui It's not used at this stage, but planned to become useful in a later update */
517*01826a49SYabin Cui filePos[0] = 0;
518*01826a49SYabin Cui for (pos=1; pos<nbFiles; pos++)
519*01826a49SYabin Cui filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
520*01826a49SYabin Cui }
521*01826a49SYabin Cui
522*01826a49SYabin Cui DISPLAYLEVEL(2, "finding patterns ... \n");
523*01826a49SYabin Cui DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
524*01826a49SYabin Cui
525*01826a49SYabin Cui { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
526*01826a49SYabin Cui dictItem solution;
527*01826a49SYabin Cui if (doneMarks[cursor]) { cursor++; continue; }
528*01826a49SYabin Cui solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
529*01826a49SYabin Cui if (solution.length==0) { cursor++; continue; }
530*01826a49SYabin Cui ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
531*01826a49SYabin Cui cursor += solution.length;
532*01826a49SYabin Cui DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
533*01826a49SYabin Cui } }
534*01826a49SYabin Cui
535*01826a49SYabin Cui _cleanup:
536*01826a49SYabin Cui free(suffix0);
537*01826a49SYabin Cui free(reverseSuffix);
538*01826a49SYabin Cui free(doneMarks);
539*01826a49SYabin Cui free(filePos);
540*01826a49SYabin Cui return result;
541*01826a49SYabin Cui }
542*01826a49SYabin Cui
543*01826a49SYabin Cui
ZDICT_fillNoise(void * buffer,size_t length)544*01826a49SYabin Cui static void ZDICT_fillNoise(void* buffer, size_t length)
545*01826a49SYabin Cui {
546*01826a49SYabin Cui unsigned const prime1 = 2654435761U;
547*01826a49SYabin Cui unsigned const prime2 = 2246822519U;
548*01826a49SYabin Cui unsigned acc = prime1;
549*01826a49SYabin Cui size_t p=0;
550*01826a49SYabin Cui for (p=0; p<length; p++) {
551*01826a49SYabin Cui acc *= prime2;
552*01826a49SYabin Cui ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
553*01826a49SYabin Cui }
554*01826a49SYabin Cui }
555*01826a49SYabin Cui
556*01826a49SYabin Cui
557*01826a49SYabin Cui typedef struct
558*01826a49SYabin Cui {
559*01826a49SYabin Cui ZSTD_CDict* dict; /* dictionary */
560*01826a49SYabin Cui ZSTD_CCtx* zc; /* working context */
561*01826a49SYabin Cui void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
562*01826a49SYabin Cui } EStats_ress_t;
563*01826a49SYabin Cui
564*01826a49SYabin Cui #define MAXREPOFFSET 1024
565*01826a49SYabin Cui
ZDICT_countEStats(EStats_ress_t esr,const ZSTD_parameters * params,unsigned * countLit,unsigned * offsetcodeCount,unsigned * matchlengthCount,unsigned * litlengthCount,U32 * repOffsets,const void * src,size_t srcSize,U32 notificationLevel)566*01826a49SYabin Cui static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
567*01826a49SYabin Cui unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
568*01826a49SYabin Cui const void* src, size_t srcSize,
569*01826a49SYabin Cui U32 notificationLevel)
570*01826a49SYabin Cui {
571*01826a49SYabin Cui size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
572*01826a49SYabin Cui size_t cSize;
573*01826a49SYabin Cui
574*01826a49SYabin Cui if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
575*01826a49SYabin Cui { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
576*01826a49SYabin Cui if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
577*01826a49SYabin Cui
578*01826a49SYabin Cui }
579*01826a49SYabin Cui cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
580*01826a49SYabin Cui if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
581*01826a49SYabin Cui
582*01826a49SYabin Cui if (cSize) { /* if == 0; block is not compressible */
583*01826a49SYabin Cui const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
584*01826a49SYabin Cui
585*01826a49SYabin Cui /* literals stats */
586*01826a49SYabin Cui { const BYTE* bytePtr;
587*01826a49SYabin Cui for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
588*01826a49SYabin Cui countLit[*bytePtr]++;
589*01826a49SYabin Cui }
590*01826a49SYabin Cui
591*01826a49SYabin Cui /* seqStats */
592*01826a49SYabin Cui { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
593*01826a49SYabin Cui ZSTD_seqToCodes(seqStorePtr);
594*01826a49SYabin Cui
595*01826a49SYabin Cui { const BYTE* codePtr = seqStorePtr->ofCode;
596*01826a49SYabin Cui U32 u;
597*01826a49SYabin Cui for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
598*01826a49SYabin Cui }
599*01826a49SYabin Cui
600*01826a49SYabin Cui { const BYTE* codePtr = seqStorePtr->mlCode;
601*01826a49SYabin Cui U32 u;
602*01826a49SYabin Cui for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
603*01826a49SYabin Cui }
604*01826a49SYabin Cui
605*01826a49SYabin Cui { const BYTE* codePtr = seqStorePtr->llCode;
606*01826a49SYabin Cui U32 u;
607*01826a49SYabin Cui for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
608*01826a49SYabin Cui }
609*01826a49SYabin Cui
610*01826a49SYabin Cui if (nbSeq >= 2) { /* rep offsets */
611*01826a49SYabin Cui const seqDef* const seq = seqStorePtr->sequencesStart;
612*01826a49SYabin Cui U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
613*01826a49SYabin Cui U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
614*01826a49SYabin Cui if (offset1 >= MAXREPOFFSET) offset1 = 0;
615*01826a49SYabin Cui if (offset2 >= MAXREPOFFSET) offset2 = 0;
616*01826a49SYabin Cui repOffsets[offset1] += 3;
617*01826a49SYabin Cui repOffsets[offset2] += 1;
618*01826a49SYabin Cui } } }
619*01826a49SYabin Cui }
620*01826a49SYabin Cui
ZDICT_totalSampleSize(const size_t * fileSizes,unsigned nbFiles)621*01826a49SYabin Cui static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
622*01826a49SYabin Cui {
623*01826a49SYabin Cui size_t total=0;
624*01826a49SYabin Cui unsigned u;
625*01826a49SYabin Cui for (u=0; u<nbFiles; u++) total += fileSizes[u];
626*01826a49SYabin Cui return total;
627*01826a49SYabin Cui }
628*01826a49SYabin Cui
629*01826a49SYabin Cui typedef struct { U32 offset; U32 count; } offsetCount_t;
630*01826a49SYabin Cui
ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1],U32 val,U32 count)631*01826a49SYabin Cui static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
632*01826a49SYabin Cui {
633*01826a49SYabin Cui U32 u;
634*01826a49SYabin Cui table[ZSTD_REP_NUM].offset = val;
635*01826a49SYabin Cui table[ZSTD_REP_NUM].count = count;
636*01826a49SYabin Cui for (u=ZSTD_REP_NUM; u>0; u--) {
637*01826a49SYabin Cui offsetCount_t tmp;
638*01826a49SYabin Cui if (table[u-1].count >= table[u].count) break;
639*01826a49SYabin Cui tmp = table[u-1];
640*01826a49SYabin Cui table[u-1] = table[u];
641*01826a49SYabin Cui table[u] = tmp;
642*01826a49SYabin Cui }
643*01826a49SYabin Cui }
644*01826a49SYabin Cui
645*01826a49SYabin Cui /* ZDICT_flatLit() :
646*01826a49SYabin Cui * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
647*01826a49SYabin Cui * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
648*01826a49SYabin Cui */
ZDICT_flatLit(unsigned * countLit)649*01826a49SYabin Cui static void ZDICT_flatLit(unsigned* countLit)
650*01826a49SYabin Cui {
651*01826a49SYabin Cui int u;
652*01826a49SYabin Cui for (u=1; u<256; u++) countLit[u] = 2;
653*01826a49SYabin Cui countLit[0] = 4;
654*01826a49SYabin Cui countLit[253] = 1;
655*01826a49SYabin Cui countLit[254] = 1;
656*01826a49SYabin Cui }
657*01826a49SYabin Cui
658*01826a49SYabin Cui #define OFFCODE_MAX 30 /* only applicable to first block */
ZDICT_analyzeEntropy(void * dstBuffer,size_t maxDstSize,int compressionLevel,const void * srcBuffer,const size_t * fileSizes,unsigned nbFiles,const void * dictBuffer,size_t dictBufferSize,unsigned notificationLevel)659*01826a49SYabin Cui static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
660*01826a49SYabin Cui int compressionLevel,
661*01826a49SYabin Cui const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
662*01826a49SYabin Cui const void* dictBuffer, size_t dictBufferSize,
663*01826a49SYabin Cui unsigned notificationLevel)
664*01826a49SYabin Cui {
665*01826a49SYabin Cui unsigned countLit[256];
666*01826a49SYabin Cui HUF_CREATE_STATIC_CTABLE(hufTable, 255);
667*01826a49SYabin Cui unsigned offcodeCount[OFFCODE_MAX+1];
668*01826a49SYabin Cui short offcodeNCount[OFFCODE_MAX+1];
669*01826a49SYabin Cui U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
670*01826a49SYabin Cui unsigned matchLengthCount[MaxML+1];
671*01826a49SYabin Cui short matchLengthNCount[MaxML+1];
672*01826a49SYabin Cui unsigned litLengthCount[MaxLL+1];
673*01826a49SYabin Cui short litLengthNCount[MaxLL+1];
674*01826a49SYabin Cui U32 repOffset[MAXREPOFFSET];
675*01826a49SYabin Cui offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
676*01826a49SYabin Cui EStats_ress_t esr = { NULL, NULL, NULL };
677*01826a49SYabin Cui ZSTD_parameters params;
678*01826a49SYabin Cui U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
679*01826a49SYabin Cui size_t pos = 0, errorCode;
680*01826a49SYabin Cui size_t eSize = 0;
681*01826a49SYabin Cui size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
682*01826a49SYabin Cui size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
683*01826a49SYabin Cui BYTE* dstPtr = (BYTE*)dstBuffer;
684*01826a49SYabin Cui U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
685*01826a49SYabin Cui
686*01826a49SYabin Cui /* init */
687*01826a49SYabin Cui DEBUGLOG(4, "ZDICT_analyzeEntropy");
688*01826a49SYabin Cui if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
689*01826a49SYabin Cui for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
690*01826a49SYabin Cui for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
691*01826a49SYabin Cui for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
692*01826a49SYabin Cui for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
693*01826a49SYabin Cui memset(repOffset, 0, sizeof(repOffset));
694*01826a49SYabin Cui repOffset[1] = repOffset[4] = repOffset[8] = 1;
695*01826a49SYabin Cui memset(bestRepOffset, 0, sizeof(bestRepOffset));
696*01826a49SYabin Cui if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
697*01826a49SYabin Cui params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
698*01826a49SYabin Cui
699*01826a49SYabin Cui esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
700*01826a49SYabin Cui esr.zc = ZSTD_createCCtx();
701*01826a49SYabin Cui esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
702*01826a49SYabin Cui if (!esr.dict || !esr.zc || !esr.workPlace) {
703*01826a49SYabin Cui eSize = ERROR(memory_allocation);
704*01826a49SYabin Cui DISPLAYLEVEL(1, "Not enough memory \n");
705*01826a49SYabin Cui goto _cleanup;
706*01826a49SYabin Cui }
707*01826a49SYabin Cui
708*01826a49SYabin Cui /* collect stats on all samples */
709*01826a49SYabin Cui for (u=0; u<nbFiles; u++) {
710*01826a49SYabin Cui ZDICT_countEStats(esr, ¶ms,
711*01826a49SYabin Cui countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
712*01826a49SYabin Cui (const char*)srcBuffer + pos, fileSizes[u],
713*01826a49SYabin Cui notificationLevel);
714*01826a49SYabin Cui pos += fileSizes[u];
715*01826a49SYabin Cui }
716*01826a49SYabin Cui
717*01826a49SYabin Cui if (notificationLevel >= 4) {
718*01826a49SYabin Cui /* writeStats */
719*01826a49SYabin Cui DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
720*01826a49SYabin Cui for (u=0; u<=offcodeMax; u++) {
721*01826a49SYabin Cui DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
722*01826a49SYabin Cui } }
723*01826a49SYabin Cui
724*01826a49SYabin Cui /* analyze, build stats, starting with literals */
725*01826a49SYabin Cui { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
726*01826a49SYabin Cui if (HUF_isError(maxNbBits)) {
727*01826a49SYabin Cui eSize = maxNbBits;
728*01826a49SYabin Cui DISPLAYLEVEL(1, " HUF_buildCTable error \n");
729*01826a49SYabin Cui goto _cleanup;
730*01826a49SYabin Cui }
731*01826a49SYabin Cui if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
732*01826a49SYabin Cui DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
733*01826a49SYabin Cui ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
734*01826a49SYabin Cui maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
735*01826a49SYabin Cui assert(maxNbBits==9);
736*01826a49SYabin Cui }
737*01826a49SYabin Cui huffLog = (U32)maxNbBits;
738*01826a49SYabin Cui }
739*01826a49SYabin Cui
740*01826a49SYabin Cui /* looking for most common first offsets */
741*01826a49SYabin Cui { U32 offset;
742*01826a49SYabin Cui for (offset=1; offset<MAXREPOFFSET; offset++)
743*01826a49SYabin Cui ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
744*01826a49SYabin Cui }
745*01826a49SYabin Cui /* note : the result of this phase should be used to better appreciate the impact on statistics */
746*01826a49SYabin Cui
747*01826a49SYabin Cui total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
748*01826a49SYabin Cui errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
749*01826a49SYabin Cui if (FSE_isError(errorCode)) {
750*01826a49SYabin Cui eSize = errorCode;
751*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
752*01826a49SYabin Cui goto _cleanup;
753*01826a49SYabin Cui }
754*01826a49SYabin Cui Offlog = (U32)errorCode;
755*01826a49SYabin Cui
756*01826a49SYabin Cui total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
757*01826a49SYabin Cui errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
758*01826a49SYabin Cui if (FSE_isError(errorCode)) {
759*01826a49SYabin Cui eSize = errorCode;
760*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
761*01826a49SYabin Cui goto _cleanup;
762*01826a49SYabin Cui }
763*01826a49SYabin Cui mlLog = (U32)errorCode;
764*01826a49SYabin Cui
765*01826a49SYabin Cui total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
766*01826a49SYabin Cui errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
767*01826a49SYabin Cui if (FSE_isError(errorCode)) {
768*01826a49SYabin Cui eSize = errorCode;
769*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
770*01826a49SYabin Cui goto _cleanup;
771*01826a49SYabin Cui }
772*01826a49SYabin Cui llLog = (U32)errorCode;
773*01826a49SYabin Cui
774*01826a49SYabin Cui /* write result to buffer */
775*01826a49SYabin Cui { size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
776*01826a49SYabin Cui if (HUF_isError(hhSize)) {
777*01826a49SYabin Cui eSize = hhSize;
778*01826a49SYabin Cui DISPLAYLEVEL(1, "HUF_writeCTable error \n");
779*01826a49SYabin Cui goto _cleanup;
780*01826a49SYabin Cui }
781*01826a49SYabin Cui dstPtr += hhSize;
782*01826a49SYabin Cui maxDstSize -= hhSize;
783*01826a49SYabin Cui eSize += hhSize;
784*01826a49SYabin Cui }
785*01826a49SYabin Cui
786*01826a49SYabin Cui { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
787*01826a49SYabin Cui if (FSE_isError(ohSize)) {
788*01826a49SYabin Cui eSize = ohSize;
789*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
790*01826a49SYabin Cui goto _cleanup;
791*01826a49SYabin Cui }
792*01826a49SYabin Cui dstPtr += ohSize;
793*01826a49SYabin Cui maxDstSize -= ohSize;
794*01826a49SYabin Cui eSize += ohSize;
795*01826a49SYabin Cui }
796*01826a49SYabin Cui
797*01826a49SYabin Cui { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
798*01826a49SYabin Cui if (FSE_isError(mhSize)) {
799*01826a49SYabin Cui eSize = mhSize;
800*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
801*01826a49SYabin Cui goto _cleanup;
802*01826a49SYabin Cui }
803*01826a49SYabin Cui dstPtr += mhSize;
804*01826a49SYabin Cui maxDstSize -= mhSize;
805*01826a49SYabin Cui eSize += mhSize;
806*01826a49SYabin Cui }
807*01826a49SYabin Cui
808*01826a49SYabin Cui { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
809*01826a49SYabin Cui if (FSE_isError(lhSize)) {
810*01826a49SYabin Cui eSize = lhSize;
811*01826a49SYabin Cui DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
812*01826a49SYabin Cui goto _cleanup;
813*01826a49SYabin Cui }
814*01826a49SYabin Cui dstPtr += lhSize;
815*01826a49SYabin Cui maxDstSize -= lhSize;
816*01826a49SYabin Cui eSize += lhSize;
817*01826a49SYabin Cui }
818*01826a49SYabin Cui
819*01826a49SYabin Cui if (maxDstSize<12) {
820*01826a49SYabin Cui eSize = ERROR(dstSize_tooSmall);
821*01826a49SYabin Cui DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
822*01826a49SYabin Cui goto _cleanup;
823*01826a49SYabin Cui }
824*01826a49SYabin Cui # if 0
825*01826a49SYabin Cui MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
826*01826a49SYabin Cui MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
827*01826a49SYabin Cui MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
828*01826a49SYabin Cui #else
829*01826a49SYabin Cui /* at this stage, we don't use the result of "most common first offset",
830*01826a49SYabin Cui * as the impact of statistics is not properly evaluated */
831*01826a49SYabin Cui MEM_writeLE32(dstPtr+0, repStartValue[0]);
832*01826a49SYabin Cui MEM_writeLE32(dstPtr+4, repStartValue[1]);
833*01826a49SYabin Cui MEM_writeLE32(dstPtr+8, repStartValue[2]);
834*01826a49SYabin Cui #endif
835*01826a49SYabin Cui eSize += 12;
836*01826a49SYabin Cui
837*01826a49SYabin Cui _cleanup:
838*01826a49SYabin Cui ZSTD_freeCDict(esr.dict);
839*01826a49SYabin Cui ZSTD_freeCCtx(esr.zc);
840*01826a49SYabin Cui free(esr.workPlace);
841*01826a49SYabin Cui
842*01826a49SYabin Cui return eSize;
843*01826a49SYabin Cui }
844*01826a49SYabin Cui
845*01826a49SYabin Cui
846*01826a49SYabin Cui /**
847*01826a49SYabin Cui * @returns the maximum repcode value
848*01826a49SYabin Cui */
ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])849*01826a49SYabin Cui static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
850*01826a49SYabin Cui {
851*01826a49SYabin Cui U32 maxRep = reps[0];
852*01826a49SYabin Cui int r;
853*01826a49SYabin Cui for (r = 1; r < ZSTD_REP_NUM; ++r)
854*01826a49SYabin Cui maxRep = MAX(maxRep, reps[r]);
855*01826a49SYabin Cui return maxRep;
856*01826a49SYabin Cui }
857*01826a49SYabin Cui
ZDICT_finalizeDictionary(void * dictBuffer,size_t dictBufferCapacity,const void * customDictContent,size_t dictContentSize,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_params_t params)858*01826a49SYabin Cui size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
859*01826a49SYabin Cui const void* customDictContent, size_t dictContentSize,
860*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes,
861*01826a49SYabin Cui unsigned nbSamples, ZDICT_params_t params)
862*01826a49SYabin Cui {
863*01826a49SYabin Cui size_t hSize;
864*01826a49SYabin Cui #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
865*01826a49SYabin Cui BYTE header[HBUFFSIZE];
866*01826a49SYabin Cui int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
867*01826a49SYabin Cui U32 const notificationLevel = params.notificationLevel;
868*01826a49SYabin Cui /* The final dictionary content must be at least as large as the largest repcode */
869*01826a49SYabin Cui size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
870*01826a49SYabin Cui size_t paddingSize;
871*01826a49SYabin Cui
872*01826a49SYabin Cui /* check conditions */
873*01826a49SYabin Cui DEBUGLOG(4, "ZDICT_finalizeDictionary");
874*01826a49SYabin Cui if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
875*01826a49SYabin Cui if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
876*01826a49SYabin Cui
877*01826a49SYabin Cui /* dictionary header */
878*01826a49SYabin Cui MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
879*01826a49SYabin Cui { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
880*01826a49SYabin Cui U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
881*01826a49SYabin Cui U32 const dictID = params.dictID ? params.dictID : compliantID;
882*01826a49SYabin Cui MEM_writeLE32(header+4, dictID);
883*01826a49SYabin Cui }
884*01826a49SYabin Cui hSize = 8;
885*01826a49SYabin Cui
886*01826a49SYabin Cui /* entropy tables */
887*01826a49SYabin Cui DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
888*01826a49SYabin Cui DISPLAYLEVEL(2, "statistics ... \n");
889*01826a49SYabin Cui { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
890*01826a49SYabin Cui compressionLevel,
891*01826a49SYabin Cui samplesBuffer, samplesSizes, nbSamples,
892*01826a49SYabin Cui customDictContent, dictContentSize,
893*01826a49SYabin Cui notificationLevel);
894*01826a49SYabin Cui if (ZDICT_isError(eSize)) return eSize;
895*01826a49SYabin Cui hSize += eSize;
896*01826a49SYabin Cui }
897*01826a49SYabin Cui
898*01826a49SYabin Cui /* Shrink the content size if it doesn't fit in the buffer */
899*01826a49SYabin Cui if (hSize + dictContentSize > dictBufferCapacity) {
900*01826a49SYabin Cui dictContentSize = dictBufferCapacity - hSize;
901*01826a49SYabin Cui }
902*01826a49SYabin Cui
903*01826a49SYabin Cui /* Pad the dictionary content with zeros if it is too small */
904*01826a49SYabin Cui if (dictContentSize < minContentSize) {
905*01826a49SYabin Cui RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
906*01826a49SYabin Cui "dictBufferCapacity too small to fit max repcode");
907*01826a49SYabin Cui paddingSize = minContentSize - dictContentSize;
908*01826a49SYabin Cui } else {
909*01826a49SYabin Cui paddingSize = 0;
910*01826a49SYabin Cui }
911*01826a49SYabin Cui
912*01826a49SYabin Cui {
913*01826a49SYabin Cui size_t const dictSize = hSize + paddingSize + dictContentSize;
914*01826a49SYabin Cui
915*01826a49SYabin Cui /* The dictionary consists of the header, optional padding, and the content.
916*01826a49SYabin Cui * The padding comes before the content because the "best" position in the
917*01826a49SYabin Cui * dictionary is the last byte.
918*01826a49SYabin Cui */
919*01826a49SYabin Cui BYTE* const outDictHeader = (BYTE*)dictBuffer;
920*01826a49SYabin Cui BYTE* const outDictPadding = outDictHeader + hSize;
921*01826a49SYabin Cui BYTE* const outDictContent = outDictPadding + paddingSize;
922*01826a49SYabin Cui
923*01826a49SYabin Cui assert(dictSize <= dictBufferCapacity);
924*01826a49SYabin Cui assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
925*01826a49SYabin Cui
926*01826a49SYabin Cui /* First copy the customDictContent into its final location.
927*01826a49SYabin Cui * `customDictContent` and `dictBuffer` may overlap, so we must
928*01826a49SYabin Cui * do this before any other writes into the output buffer.
929*01826a49SYabin Cui * Then copy the header & padding into the output buffer.
930*01826a49SYabin Cui */
931*01826a49SYabin Cui memmove(outDictContent, customDictContent, dictContentSize);
932*01826a49SYabin Cui memcpy(outDictHeader, header, hSize);
933*01826a49SYabin Cui memset(outDictPadding, 0, paddingSize);
934*01826a49SYabin Cui
935*01826a49SYabin Cui return dictSize;
936*01826a49SYabin Cui }
937*01826a49SYabin Cui }
938*01826a49SYabin Cui
939*01826a49SYabin Cui
ZDICT_addEntropyTablesFromBuffer_advanced(void * dictBuffer,size_t dictContentSize,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_params_t params)940*01826a49SYabin Cui static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
941*01826a49SYabin Cui void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
942*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
943*01826a49SYabin Cui ZDICT_params_t params)
944*01826a49SYabin Cui {
945*01826a49SYabin Cui int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
946*01826a49SYabin Cui U32 const notificationLevel = params.notificationLevel;
947*01826a49SYabin Cui size_t hSize = 8;
948*01826a49SYabin Cui
949*01826a49SYabin Cui /* calculate entropy tables */
950*01826a49SYabin Cui DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
951*01826a49SYabin Cui DISPLAYLEVEL(2, "statistics ... \n");
952*01826a49SYabin Cui { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
953*01826a49SYabin Cui compressionLevel,
954*01826a49SYabin Cui samplesBuffer, samplesSizes, nbSamples,
955*01826a49SYabin Cui (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
956*01826a49SYabin Cui notificationLevel);
957*01826a49SYabin Cui if (ZDICT_isError(eSize)) return eSize;
958*01826a49SYabin Cui hSize += eSize;
959*01826a49SYabin Cui }
960*01826a49SYabin Cui
961*01826a49SYabin Cui /* add dictionary header (after entropy tables) */
962*01826a49SYabin Cui MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
963*01826a49SYabin Cui { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
964*01826a49SYabin Cui U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
965*01826a49SYabin Cui U32 const dictID = params.dictID ? params.dictID : compliantID;
966*01826a49SYabin Cui MEM_writeLE32((char*)dictBuffer+4, dictID);
967*01826a49SYabin Cui }
968*01826a49SYabin Cui
969*01826a49SYabin Cui if (hSize + dictContentSize < dictBufferCapacity)
970*01826a49SYabin Cui memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
971*01826a49SYabin Cui return MIN(dictBufferCapacity, hSize+dictContentSize);
972*01826a49SYabin Cui }
973*01826a49SYabin Cui
974*01826a49SYabin Cui /*! ZDICT_trainFromBuffer_unsafe_legacy() :
975*01826a49SYabin Cui * Warning : `samplesBuffer` must be followed by noisy guard band !!!
976*01826a49SYabin Cui * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
977*01826a49SYabin Cui */
ZDICT_trainFromBuffer_unsafe_legacy(void * dictBuffer,size_t maxDictSize,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_legacy_params_t params)978*01826a49SYabin Cui static size_t ZDICT_trainFromBuffer_unsafe_legacy(
979*01826a49SYabin Cui void* dictBuffer, size_t maxDictSize,
980*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
981*01826a49SYabin Cui ZDICT_legacy_params_t params)
982*01826a49SYabin Cui {
983*01826a49SYabin Cui U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
984*01826a49SYabin Cui dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
985*01826a49SYabin Cui unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
986*01826a49SYabin Cui unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
987*01826a49SYabin Cui size_t const targetDictSize = maxDictSize;
988*01826a49SYabin Cui size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
989*01826a49SYabin Cui size_t dictSize = 0;
990*01826a49SYabin Cui U32 const notificationLevel = params.zParams.notificationLevel;
991*01826a49SYabin Cui
992*01826a49SYabin Cui /* checks */
993*01826a49SYabin Cui if (!dictList) return ERROR(memory_allocation);
994*01826a49SYabin Cui if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
995*01826a49SYabin Cui if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
996*01826a49SYabin Cui
997*01826a49SYabin Cui /* init */
998*01826a49SYabin Cui ZDICT_initDictItem(dictList);
999*01826a49SYabin Cui
1000*01826a49SYabin Cui /* build dictionary */
1001*01826a49SYabin Cui ZDICT_trainBuffer_legacy(dictList, dictListSize,
1002*01826a49SYabin Cui samplesBuffer, samplesBuffSize,
1003*01826a49SYabin Cui samplesSizes, nbSamples,
1004*01826a49SYabin Cui minRep, notificationLevel);
1005*01826a49SYabin Cui
1006*01826a49SYabin Cui /* display best matches */
1007*01826a49SYabin Cui if (params.zParams.notificationLevel>= 3) {
1008*01826a49SYabin Cui unsigned const nb = MIN(25, dictList[0].pos);
1009*01826a49SYabin Cui unsigned const dictContentSize = ZDICT_dictSize(dictList);
1010*01826a49SYabin Cui unsigned u;
1011*01826a49SYabin Cui DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1012*01826a49SYabin Cui DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1013*01826a49SYabin Cui for (u=1; u<nb; u++) {
1014*01826a49SYabin Cui unsigned const pos = dictList[u].pos;
1015*01826a49SYabin Cui unsigned const length = dictList[u].length;
1016*01826a49SYabin Cui U32 const printedLength = MIN(40, length);
1017*01826a49SYabin Cui if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1018*01826a49SYabin Cui free(dictList);
1019*01826a49SYabin Cui return ERROR(GENERIC); /* should never happen */
1020*01826a49SYabin Cui }
1021*01826a49SYabin Cui DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
1022*01826a49SYabin Cui u, length, pos, (unsigned)dictList[u].savings);
1023*01826a49SYabin Cui ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
1024*01826a49SYabin Cui DISPLAYLEVEL(3, "| \n");
1025*01826a49SYabin Cui } }
1026*01826a49SYabin Cui
1027*01826a49SYabin Cui
1028*01826a49SYabin Cui /* create dictionary */
1029*01826a49SYabin Cui { unsigned dictContentSize = ZDICT_dictSize(dictList);
1030*01826a49SYabin Cui if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
1031*01826a49SYabin Cui if (dictContentSize < targetDictSize/4) {
1032*01826a49SYabin Cui DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1033*01826a49SYabin Cui if (samplesBuffSize < 10 * targetDictSize)
1034*01826a49SYabin Cui DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
1035*01826a49SYabin Cui if (minRep > MINRATIO) {
1036*01826a49SYabin Cui DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
1037*01826a49SYabin Cui DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1038*01826a49SYabin Cui }
1039*01826a49SYabin Cui }
1040*01826a49SYabin Cui
1041*01826a49SYabin Cui if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
1042*01826a49SYabin Cui unsigned proposedSelectivity = selectivity-1;
1043*01826a49SYabin Cui while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
1044*01826a49SYabin Cui DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
1045*01826a49SYabin Cui DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
1046*01826a49SYabin Cui DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
1047*01826a49SYabin Cui }
1048*01826a49SYabin Cui
1049*01826a49SYabin Cui /* limit dictionary size */
1050*01826a49SYabin Cui { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
1051*01826a49SYabin Cui U32 currentSize = 0;
1052*01826a49SYabin Cui U32 n; for (n=1; n<max; n++) {
1053*01826a49SYabin Cui currentSize += dictList[n].length;
1054*01826a49SYabin Cui if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1055*01826a49SYabin Cui }
1056*01826a49SYabin Cui dictList->pos = n;
1057*01826a49SYabin Cui dictContentSize = currentSize;
1058*01826a49SYabin Cui }
1059*01826a49SYabin Cui
1060*01826a49SYabin Cui /* build dict content */
1061*01826a49SYabin Cui { U32 u;
1062*01826a49SYabin Cui BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
1063*01826a49SYabin Cui for (u=1; u<dictList->pos; u++) {
1064*01826a49SYabin Cui U32 l = dictList[u].length;
1065*01826a49SYabin Cui ptr -= l;
1066*01826a49SYabin Cui if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */
1067*01826a49SYabin Cui memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
1068*01826a49SYabin Cui } }
1069*01826a49SYabin Cui
1070*01826a49SYabin Cui dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
1071*01826a49SYabin Cui samplesBuffer, samplesSizes, nbSamples,
1072*01826a49SYabin Cui params.zParams);
1073*01826a49SYabin Cui }
1074*01826a49SYabin Cui
1075*01826a49SYabin Cui /* clean up */
1076*01826a49SYabin Cui free(dictList);
1077*01826a49SYabin Cui return dictSize;
1078*01826a49SYabin Cui }
1079*01826a49SYabin Cui
1080*01826a49SYabin Cui
1081*01826a49SYabin Cui /* ZDICT_trainFromBuffer_legacy() :
1082*01826a49SYabin Cui * issue : samplesBuffer need to be followed by a noisy guard band.
1083*01826a49SYabin Cui * work around : duplicate the buffer, and add the noise */
ZDICT_trainFromBuffer_legacy(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_legacy_params_t params)1084*01826a49SYabin Cui size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1085*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1086*01826a49SYabin Cui ZDICT_legacy_params_t params)
1087*01826a49SYabin Cui {
1088*01826a49SYabin Cui size_t result;
1089*01826a49SYabin Cui void* newBuff;
1090*01826a49SYabin Cui size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1091*01826a49SYabin Cui if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
1092*01826a49SYabin Cui
1093*01826a49SYabin Cui newBuff = malloc(sBuffSize + NOISELENGTH);
1094*01826a49SYabin Cui if (!newBuff) return ERROR(memory_allocation);
1095*01826a49SYabin Cui
1096*01826a49SYabin Cui memcpy(newBuff, samplesBuffer, sBuffSize);
1097*01826a49SYabin Cui ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
1098*01826a49SYabin Cui
1099*01826a49SYabin Cui result =
1100*01826a49SYabin Cui ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1101*01826a49SYabin Cui samplesSizes, nbSamples, params);
1102*01826a49SYabin Cui free(newBuff);
1103*01826a49SYabin Cui return result;
1104*01826a49SYabin Cui }
1105*01826a49SYabin Cui
1106*01826a49SYabin Cui
ZDICT_trainFromBuffer(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples)1107*01826a49SYabin Cui size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1108*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1109*01826a49SYabin Cui {
1110*01826a49SYabin Cui ZDICT_fastCover_params_t params;
1111*01826a49SYabin Cui DEBUGLOG(3, "ZDICT_trainFromBuffer");
1112*01826a49SYabin Cui memset(¶ms, 0, sizeof(params));
1113*01826a49SYabin Cui params.d = 8;
1114*01826a49SYabin Cui params.steps = 4;
1115*01826a49SYabin Cui /* Use default level since no compression level information is available */
1116*01826a49SYabin Cui params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1117*01826a49SYabin Cui #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1118*01826a49SYabin Cui params.zParams.notificationLevel = DEBUGLEVEL;
1119*01826a49SYabin Cui #endif
1120*01826a49SYabin Cui return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1121*01826a49SYabin Cui samplesBuffer, samplesSizes, nbSamples,
1122*01826a49SYabin Cui ¶ms);
1123*01826a49SYabin Cui }
1124*01826a49SYabin Cui
ZDICT_addEntropyTablesFromBuffer(void * dictBuffer,size_t dictContentSize,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples)1125*01826a49SYabin Cui size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1126*01826a49SYabin Cui const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1127*01826a49SYabin Cui {
1128*01826a49SYabin Cui ZDICT_params_t params;
1129*01826a49SYabin Cui memset(¶ms, 0, sizeof(params));
1130*01826a49SYabin Cui return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1131*01826a49SYabin Cui samplesBuffer, samplesSizes, nbSamples,
1132*01826a49SYabin Cui params);
1133*01826a49SYabin Cui }
1134