#ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_
#define BROTLI_RESEARCH_DURCHSCHLAG_H_

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

/**
 * Generate a dictionary for given samples.
 *
 * @param dictionary_size_limit maximal dictionary size
 * @param slice_len text slice size
 * @param block_len score block length
 * @param sample_sizes vector with sample sizes
 * @param sample_data concatenated samples
 * @return generated dictionary
 */
std::string durchschlag_generate(
    size_t dictionary_size_limit, size_t slice_len, size_t block_len,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);

//------------------------------------------------------------------------------
// Lower level API for repetitive dictionary generation.
//------------------------------------------------------------------------------

/* Pointer to position in text. Positions are stored as 32-bit values. */
typedef uint32_t DurchschlagTextIdx;

/* Context is made public for flexible serialization / deserialization. */
typedef struct DurchschlagContext {
  /* NOTE(review): field meanings below are inferred from names only;
     confirm against the implementation before relying on them. */
  DurchschlagTextIdx dataSize;         /* presumably total sample data size */
  DurchschlagTextIdx sliceLen;         /* presumably the slice_len used */
  DurchschlagTextIdx numUniqueSlices;  /* presumably count of distinct slices */
  std::vector<DurchschlagTextIdx> offsets;
  std::vector<DurchschlagTextIdx> sliceMap;
} DurchschlagContext;

/**
 * Prepare a context from raw samples.
 *
 * @param slice_len text slice size
 * @param sample_sizes vector with sample sizes
 * @param sample_data concatenated samples
 * @return context to be passed to the context-based durchschlag_generate
 */
DurchschlagContext durchschlag_prepare(size_t slice_len,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);

/* NOTE: the type name is misspelled ("Durchschalg"); it is kept as-is because
   it is part of the public interface. Prefer the correctly spelled alias
   declared right after it in new code. */
typedef enum DurchschalgResourceStrategy {
  // Faster
  DURCHSCHLAG_EXCLUSIVE = 0,
  // Uses much less memory
  DURCHSCHLAG_COLLABORATIVE = 1
} DurchschalgResourceStrategy;

/* Correctly spelled alias of DurchschalgResourceStrategy. */
typedef DurchschalgResourceStrategy DurchschlagResourceStrategy;

/**
 * Generate a dictionary using a previously prepared context.
 *
 * @param strategy speed / memory trade-off, see DurchschalgResourceStrategy
 * @param dictionary_size_limit maximal dictionary size
 * @param block_len score block length
 * @param context context produced by one of the durchschlag_prepare overloads
 * @param sample_data concatenated samples
 * @return generated dictionary
 */
std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
    size_t dictionary_size_limit, size_t block_len,
    const DurchschlagContext& context, const uint8_t* sample_data);

//------------------------------------------------------------------------------
// Suffix Array based preparation.
//------------------------------------------------------------------------------

typedef struct DurchschlagIndex {
  /* NOTE(review): presumably the LCP array and the suffix array of the
     indexed data (per the section title); confirm against implementation. */
  std::vector<DurchschlagTextIdx> lcp;
  std::vector<DurchschlagTextIdx> sa;
} DurchschlagIndex;

/**
 * Build an index over the given data.
 *
 * @param data bytes to be indexed
 * @return index to be passed to the index-based durchschlag_prepare
 */
DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data);

/**
 * Prepare a context from a previously built index.
 *
 * @param slice_len text slice size
 * @param sample_sizes vector with sample sizes
 * @param index index produced by durchschlag_index
 * @return context to be passed to the context-based durchschlag_generate
 */
DurchschlagContext durchschlag_prepare(size_t slice_len,
    const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index);

//------------------------------------------------------------------------------
// Data preparation.
//------------------------------------------------------------------------------

/**
 * Cut out unique slices.
 *
 * Both @p sample_sizes and @p sample_data are modified in-place. Number of
 * samples remains unchanged, but some samples become shorter.
 *
 * @param slice_len (unique) slice size
 * @param minimum_population minimum non-unique slice occurrence
 * @param sample_sizes [in / out] vector with sample sizes
 * @param sample_data [in / out] concatenated samples
 */
void durchschlag_distill(size_t slice_len, size_t minimum_population,
    std::vector<size_t>* sample_sizes, uint8_t* sample_data);

/**
 * Replace unique slices with zeroes.
 *
 * @p sample_data is modified in-place. Number of samples and their length
 * remain unchanged.
 *
 * @param slice_len (unique) slice size
 * @param minimum_population minimum non-unique slice occurrence
 * @param sample_sizes vector with sample sizes
 * @param sample_data [in / out] concatenated samples
 */
void durchschlag_purify(size_t slice_len, size_t minimum_population,
    const std::vector<size_t>& sample_sizes, uint8_t* sample_data);

#endif  // BROTLI_RESEARCH_DURCHSCHLAG_H_