xref: /aosp_15_r20/external/zstd/lib/dictBuilder/cover.h (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1*01826a49SYabin Cui /*
2*01826a49SYabin Cui  * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui  * All rights reserved.
4*01826a49SYabin Cui  *
5*01826a49SYabin Cui  * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui  * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui  * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui  */
10*01826a49SYabin Cui 
11*01826a49SYabin Cui #ifndef ZDICT_STATIC_LINKING_ONLY
12*01826a49SYabin Cui #  define ZDICT_STATIC_LINKING_ONLY
13*01826a49SYabin Cui #endif /* Defined before "../zdict.h" is included below. NOTE(review): presumably required so that zdict.h declares ZDICT_cover_params_t, which this header uses - confirm in zdict.h. */
14*01826a49SYabin Cui 
15*01826a49SYabin Cui #include "../common/threading.h" /* ZSTD_pthread_mutex_t */
16*01826a49SYabin Cui #include "../common/mem.h"   /* U32, BYTE */
17*01826a49SYabin Cui #include "../zdict.h"
18*01826a49SYabin Cui 
19*01826a49SYabin Cui /**
20*01826a49SYabin Cui  * COVER_best_t is used for two purposes:
21*01826a49SYabin Cui  * 1. Synchronizing threads (job accounting through liveJobs).
22*01826a49SYabin Cui  * 2. Saving the best parameters and dictionary found so far.
23*01826a49SYabin Cui  *
24*01826a49SYabin Cui  * All of the methods except COVER_best_init() are thread safe if zstd is
25*01826a49SYabin Cui  * compiled with multithreaded support.
26*01826a49SYabin Cui  */
27*01826a49SYabin Cui typedef struct COVER_best_s {
28*01826a49SYabin Cui   ZSTD_pthread_mutex_t mutex;      /* NOTE(review): presumably guards the fields below - confirm in cover.c */
29*01826a49SYabin Cui   ZSTD_pthread_cond_t cond;        /* NOTE(review): presumably what COVER_best_wait() waits on and COVER_best_finish() signals - confirm */
30*01826a49SYabin Cui   size_t liveJobs;                 /* incremented by COVER_best_start(), decremented by COVER_best_finish() */
31*01826a49SYabin Cui   void *dict;                      /* best dictionary saved so far */
32*01826a49SYabin Cui   size_t dictSize;                 /* size of dict (presumably in bytes) */
33*01826a49SYabin Cui   ZDICT_cover_params_t parameters; /* parameters saved together with the best dictionary */
34*01826a49SYabin Cui   size_t compressedSize;           /* NOTE(review): presumably the compressed size obtained with the best dictionary - confirm */
35*01826a49SYabin Cui } COVER_best_t;
36*01826a49SYabin Cui 
37*01826a49SYabin Cui /**
38*01826a49SYabin Cui  * A segment is a range in the source together with the score of that range.
39*01826a49SYabin Cui  */
40*01826a49SYabin Cui typedef struct {
41*01826a49SYabin Cui   U32 begin; /* start of the range */
42*01826a49SYabin Cui   U32 end;   /* end of the range; NOTE(review): inclusive vs. exclusive is not stated in this header - confirm */
43*01826a49SYabin Cui   U32 score; /* score of the segment */
44*01826a49SYabin Cui } COVER_segment_t;
45*01826a49SYabin Cui 
46*01826a49SYabin Cui /**
47*01826a49SYabin Cui  * Number of epochs and size of each epoch, as returned by COVER_computeEpochs().
48*01826a49SYabin Cui  */
49*01826a49SYabin Cui typedef struct {
50*01826a49SYabin Cui   U32 num;  /* number of epochs */
51*01826a49SYabin Cui   U32 size; /* size of each epoch */
52*01826a49SYabin Cui } COVER_epoch_info_t;
53*01826a49SYabin Cui 
54*01826a49SYabin Cui /**
55*01826a49SYabin Cui  * Struct used for the dictionary selection function: returned by COVER_selectDict() and COVER_dictSelectionError(), released with COVER_dictSelectionFree().
56*01826a49SYabin Cui  */
57*01826a49SYabin Cui typedef struct COVER_dictSelection {
58*01826a49SYabin Cui   BYTE* dictContent;          /* the selected dictionary; NOTE(review): presumably the memory released by COVER_dictSelectionFree() - confirm */
59*01826a49SYabin Cui   size_t dictSize;            /* size of dictContent (presumably in bytes) */
60*01826a49SYabin Cui   size_t totalCompressedSize; /* holds a ZSTD error when the selection is an error (see COVER_dictSelectionError()) */
61*01826a49SYabin Cui } COVER_dictSelection_t;
62*01826a49SYabin Cui 
63*01826a49SYabin Cui /**
64*01826a49SYabin Cui  * Computes the number of epochs and the size of each epoch.
65*01826a49SYabin Cui  * We will make sure that each epoch gets at least 10 * k bytes.
66*01826a49SYabin Cui  *
67*01826a49SYabin Cui  * The COVER algorithms divide the data up into epochs of equal size and
68*01826a49SYabin Cui  * select one segment from each epoch.
69*01826a49SYabin Cui  *
70*01826a49SYabin Cui  * @param maxDictSize The maximum allowed dictionary size.
71*01826a49SYabin Cui  * @param nbDmers     The number of dmers we are training on.
72*01826a49SYabin Cui  * @param k           The parameter k (segment size).
73*01826a49SYabin Cui  * @param passes      The target number of passes over the dmer corpus.
74*01826a49SYabin Cui  *                    More passes means a better dictionary.
75*01826a49SYabin Cui  * @return            The number of epochs (num) and the size of each epoch (size). */
76*01826a49SYabin Cui COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
77*01826a49SYabin Cui                                        U32 k, U32 passes);
78*01826a49SYabin Cui 
79*01826a49SYabin Cui /**
80*01826a49SYabin Cui  * Warns the user when their corpus is too small.
81*01826a49SYabin Cui  * NOTE(review): presumably "too small" is judged from nbDmers relative to maxDictSize, and displayLevel gates the message - confirm in cover.c. */
82*01826a49SYabin Cui void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
83*01826a49SYabin Cui 
84*01826a49SYabin Cui /**
85*01826a49SYabin Cui  * Checks the total compressed size of a dictionary.
86*01826a49SYabin Cui  * NOTE(review): presumably returns that total size, or a ZSTD error code on failure - confirm in cover.c. */
87*01826a49SYabin Cui size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
88*01826a49SYabin Cui                                       const size_t *samplesSizes, const BYTE *samples,
89*01826a49SYabin Cui                                       size_t *offsets,
90*01826a49SYabin Cui                                       size_t nbTrainSamples, size_t nbSamples,
91*01826a49SYabin Cui                                       BYTE *const dict, size_t dictBufferCapacity);
92*01826a49SYabin Cui 
93*01826a49SYabin Cui /**
94*01826a49SYabin Cui  * Returns the sum of the sample sizes.
95*01826a49SYabin Cui  * samplesSizes holds the sample sizes; NOTE(review): presumably exactly nbSamples entries are summed - confirm in cover.c. */
96*01826a49SYabin Cui size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
97*01826a49SYabin Cui 
98*01826a49SYabin Cui /**
99*01826a49SYabin Cui  * Initialize the `COVER_best_t`.
100*01826a49SYabin Cui  * This is the only COVER_best_t method that is not thread safe. */
101*01826a49SYabin Cui void COVER_best_init(COVER_best_t *best);
102*01826a49SYabin Cui 
103*01826a49SYabin Cui /**
104*01826a49SYabin Cui  * Wait until best->liveJobs == 0, i.e. until every job announced with COVER_best_start() has called COVER_best_finish().
105*01826a49SYabin Cui  */
106*01826a49SYabin Cui void COVER_best_wait(COVER_best_t *best);
107*01826a49SYabin Cui 
108*01826a49SYabin Cui /**
109*01826a49SYabin Cui  * Call COVER_best_wait() (so that liveJobs == 0) and then destroy the COVER_best_t.
110*01826a49SYabin Cui  */
111*01826a49SYabin Cui void COVER_best_destroy(COVER_best_t *best);
112*01826a49SYabin Cui 
113*01826a49SYabin Cui /**
114*01826a49SYabin Cui  * Called when a thread is about to be launched.
115*01826a49SYabin Cui  * Increments liveJobs; COVER_best_finish() performs the matching decrement.
116*01826a49SYabin Cui  */
117*01826a49SYabin Cui void COVER_best_start(COVER_best_t *best);
118*01826a49SYabin Cui 
119*01826a49SYabin Cui /**
120*01826a49SYabin Cui  * Called when a thread finishes executing, on both error and success.
121*01826a49SYabin Cui  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
122*01826a49SYabin Cui  * If the dictionary in `selection` is the best so far, save it and its `parameters` in `best`.
123*01826a49SYabin Cui  */
124*01826a49SYabin Cui void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
125*01826a49SYabin Cui                        COVER_dictSelection_t selection);
126*01826a49SYabin Cui /**
127*01826a49SYabin Cui  * Error check for COVER_selectDict(): tests whether the returned selection
128*01826a49SYabin Cui  * is an error. NOTE(review): presumably returns non-zero on error - confirm in cover.c.
129*01826a49SYabin Cui  */
130*01826a49SYabin Cui unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
131*01826a49SYabin Cui 
132*01826a49SYabin Cui  /**
133*01826a49SYabin Cui   * Error constructor for COVER_selectDict(): returns a struct whose
134*01826a49SYabin Cui   * totalCompressedSize is a ZSTD error. NOTE(review): presumably `error` is that error code - confirm.
135*01826a49SYabin Cui   */
136*01826a49SYabin Cui COVER_dictSelection_t COVER_dictSelectionError(size_t error);
137*01826a49SYabin Cui 
138*01826a49SYabin Cui /**
139*01826a49SYabin Cui  * Always call this after COVER_selectDict() to free the memory used by the
140*01826a49SYabin Cui  * newly created dictionary.
141*01826a49SYabin Cui  */
142*01826a49SYabin Cui void COVER_dictSelectionFree(COVER_dictSelection_t selection);
143*01826a49SYabin Cui 
144*01826a49SYabin Cui /**
145*01826a49SYabin Cui  * Called to finalize the dictionary and select one based on whether or not
146*01826a49SYabin Cui  * the shrink-dict flag was enabled. If enabled, the dictionary used is the
147*01826a49SYabin Cui  * smallest dictionary within a specified regression of the compressed size
148*01826a49SYabin Cui  * from the largest dictionary. Check the result with COVER_dictSelectionIsError()
149*01826a49SYabin Cui  * and always release it with COVER_dictSelectionFree(). */
150*01826a49SYabin Cui  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
151*01826a49SYabin Cui                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
152*01826a49SYabin Cui                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
153