1*01826a49SYabin Cui /* 2*01826a49SYabin Cui * Copyright (c) Meta Platforms, Inc. and affiliates. 3*01826a49SYabin Cui * All rights reserved. 4*01826a49SYabin Cui * 5*01826a49SYabin Cui * This source code is licensed under both the BSD-style license (found in the 6*01826a49SYabin Cui * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7*01826a49SYabin Cui * in the COPYING file in the root directory of this source tree). 8*01826a49SYabin Cui * You may select, at your option, one of the above-listed licenses. 9*01826a49SYabin Cui */ 10*01826a49SYabin Cui 11*01826a49SYabin Cui #ifndef ZDICT_STATIC_LINKING_ONLY 12*01826a49SYabin Cui # define ZDICT_STATIC_LINKING_ONLY 13*01826a49SYabin Cui #endif 14*01826a49SYabin Cui 15*01826a49SYabin Cui #include "../common/threading.h" /* ZSTD_pthread_mutex_t */ 16*01826a49SYabin Cui #include "../common/mem.h" /* U32, BYTE */ 17*01826a49SYabin Cui #include "../zdict.h" 18*01826a49SYabin Cui 19*01826a49SYabin Cui /** 20*01826a49SYabin Cui * COVER_best_t is used for two purposes: 21*01826a49SYabin Cui * 1. Synchronizing threads. 22*01826a49SYabin Cui * 2. Saving the best parameters and dictionary. 23*01826a49SYabin Cui * 24*01826a49SYabin Cui * All of the methods except COVER_best_init() are thread safe if zstd is 25*01826a49SYabin Cui * compiled with multithreaded support. 26*01826a49SYabin Cui */ 27*01826a49SYabin Cui typedef struct COVER_best_s { 28*01826a49SYabin Cui ZSTD_pthread_mutex_t mutex; 29*01826a49SYabin Cui ZSTD_pthread_cond_t cond; 30*01826a49SYabin Cui size_t liveJobs; 31*01826a49SYabin Cui void *dict; 32*01826a49SYabin Cui size_t dictSize; 33*01826a49SYabin Cui ZDICT_cover_params_t parameters; 34*01826a49SYabin Cui size_t compressedSize; 35*01826a49SYabin Cui } COVER_best_t; 36*01826a49SYabin Cui 37*01826a49SYabin Cui /** 38*01826a49SYabin Cui * A segment is a range in the source as well as the score of the segment. 39*01826a49SYabin Cui */ 40*01826a49SYabin Cui typedef struct { 41*01826a49SYabin Cui U32 begin; 42*01826a49SYabin Cui U32 end; 43*01826a49SYabin Cui U32 score; 44*01826a49SYabin Cui } COVER_segment_t; 45*01826a49SYabin Cui 46*01826a49SYabin Cui /** 47*01826a49SYabin Cui *Number of epochs and size of each epoch. 48*01826a49SYabin Cui */ 49*01826a49SYabin Cui typedef struct { 50*01826a49SYabin Cui U32 num; 51*01826a49SYabin Cui U32 size; 52*01826a49SYabin Cui } COVER_epoch_info_t; 53*01826a49SYabin Cui 54*01826a49SYabin Cui /** 55*01826a49SYabin Cui * Struct used for the dictionary selection function. 56*01826a49SYabin Cui */ 57*01826a49SYabin Cui typedef struct COVER_dictSelection { 58*01826a49SYabin Cui BYTE* dictContent; 59*01826a49SYabin Cui size_t dictSize; 60*01826a49SYabin Cui size_t totalCompressedSize; 61*01826a49SYabin Cui } COVER_dictSelection_t; 62*01826a49SYabin Cui 63*01826a49SYabin Cui /** 64*01826a49SYabin Cui * Computes the number of epochs and the size of each epoch. 65*01826a49SYabin Cui * We will make sure that each epoch gets at least 10 * k bytes. 66*01826a49SYabin Cui * 67*01826a49SYabin Cui * The COVER algorithms divide the data up into epochs of equal size and 68*01826a49SYabin Cui * select one segment from each epoch. 69*01826a49SYabin Cui * 70*01826a49SYabin Cui * @param maxDictSize The maximum allowed dictionary size. 71*01826a49SYabin Cui * @param nbDmers The number of dmers we are training on. 72*01826a49SYabin Cui * @param k The parameter k (segment size). 73*01826a49SYabin Cui * @param passes The target number of passes over the dmer corpus. 74*01826a49SYabin Cui * More passes means a better dictionary. 75*01826a49SYabin Cui */ 76*01826a49SYabin Cui COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 77*01826a49SYabin Cui U32 k, U32 passes); 78*01826a49SYabin Cui 79*01826a49SYabin Cui /** 80*01826a49SYabin Cui * Warns the user when their corpus is too small. 81*01826a49SYabin Cui */ 82*01826a49SYabin Cui void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 83*01826a49SYabin Cui 84*01826a49SYabin Cui /** 85*01826a49SYabin Cui * Checks total compressed size of a dictionary 86*01826a49SYabin Cui */ 87*01826a49SYabin Cui size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 88*01826a49SYabin Cui const size_t *samplesSizes, const BYTE *samples, 89*01826a49SYabin Cui size_t *offsets, 90*01826a49SYabin Cui size_t nbTrainSamples, size_t nbSamples, 91*01826a49SYabin Cui BYTE *const dict, size_t dictBufferCapacity); 92*01826a49SYabin Cui 93*01826a49SYabin Cui /** 94*01826a49SYabin Cui * Returns the sum of the sample sizes. 95*01826a49SYabin Cui */ 96*01826a49SYabin Cui size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 97*01826a49SYabin Cui 98*01826a49SYabin Cui /** 99*01826a49SYabin Cui * Initialize the `COVER_best_t`. 100*01826a49SYabin Cui */ 101*01826a49SYabin Cui void COVER_best_init(COVER_best_t *best); 102*01826a49SYabin Cui 103*01826a49SYabin Cui /** 104*01826a49SYabin Cui * Wait until liveJobs == 0. 105*01826a49SYabin Cui */ 106*01826a49SYabin Cui void COVER_best_wait(COVER_best_t *best); 107*01826a49SYabin Cui 108*01826a49SYabin Cui /** 109*01826a49SYabin Cui * Call COVER_best_wait() and then destroy the COVER_best_t. 110*01826a49SYabin Cui */ 111*01826a49SYabin Cui void COVER_best_destroy(COVER_best_t *best); 112*01826a49SYabin Cui 113*01826a49SYabin Cui /** 114*01826a49SYabin Cui * Called when a thread is about to be launched. 115*01826a49SYabin Cui * Increments liveJobs. 116*01826a49SYabin Cui */ 117*01826a49SYabin Cui void COVER_best_start(COVER_best_t *best); 118*01826a49SYabin Cui 119*01826a49SYabin Cui /** 120*01826a49SYabin Cui * Called when a thread finishes executing, both on error or success. 121*01826a49SYabin Cui * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 122*01826a49SYabin Cui * If this dictionary is the best so far save it and its parameters. 123*01826a49SYabin Cui */ 124*01826a49SYabin Cui void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 125*01826a49SYabin Cui COVER_dictSelection_t selection); 126*01826a49SYabin Cui /** 127*01826a49SYabin Cui * Error function for COVER_selectDict function. Checks if the return 128*01826a49SYabin Cui * value is an error. 129*01826a49SYabin Cui */ 130*01826a49SYabin Cui unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 131*01826a49SYabin Cui 132*01826a49SYabin Cui /** 133*01826a49SYabin Cui * Error function for COVER_selectDict function. Returns a struct where 134*01826a49SYabin Cui * return.totalCompressedSize is a ZSTD error. 135*01826a49SYabin Cui */ 136*01826a49SYabin Cui COVER_dictSelection_t COVER_dictSelectionError(size_t error); 137*01826a49SYabin Cui 138*01826a49SYabin Cui /** 139*01826a49SYabin Cui * Always call after selectDict is called to free up used memory from 140*01826a49SYabin Cui * newly created dictionary. 141*01826a49SYabin Cui */ 142*01826a49SYabin Cui void COVER_dictSelectionFree(COVER_dictSelection_t selection); 143*01826a49SYabin Cui 144*01826a49SYabin Cui /** 145*01826a49SYabin Cui * Called to finalize the dictionary and select one based on whether or not 146*01826a49SYabin Cui * the shrink-dict flag was enabled. If enabled the dictionary used is the 147*01826a49SYabin Cui * smallest dictionary within a specified regression of the compressed size 148*01826a49SYabin Cui * from the largest dictionary. 149*01826a49SYabin Cui */ 150*01826a49SYabin Cui COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 151*01826a49SYabin Cui size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 152*01826a49SYabin Cui size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 153