1*f4ee7fbaSAndroid Build Coastguard Worker #include "./deorummolae.h"
2*f4ee7fbaSAndroid Build Coastguard Worker
3*f4ee7fbaSAndroid Build Coastguard Worker #include <array>
4*f4ee7fbaSAndroid Build Coastguard Worker #include <cstdio>
5*f4ee7fbaSAndroid Build Coastguard Worker
6*f4ee7fbaSAndroid Build Coastguard Worker #include "./esaxx/sais.hxx"
7*f4ee7fbaSAndroid Build Coastguard Worker
8*f4ee7fbaSAndroid Build Coastguard Worker /* Used for quick SA-entry to file mapping. Each file is padded to size that
9*f4ee7fbaSAndroid Build Coastguard Worker is a multiple of chunk size. */
10*f4ee7fbaSAndroid Build Coastguard Worker #define CHUNK_SIZE 64
11*f4ee7fbaSAndroid Build Coastguard Worker /* Length of substring that is considered to be covered by dictionary string. */
12*f4ee7fbaSAndroid Build Coastguard Worker #define CUT_MATCH 6
13*f4ee7fbaSAndroid Build Coastguard Worker /* Minimal dictionary entry size. */
14*f4ee7fbaSAndroid Build Coastguard Worker #define MIN_MATCH 24
15*f4ee7fbaSAndroid Build Coastguard Worker
16*f4ee7fbaSAndroid Build Coastguard Worker /* Non tunable definitions. */
17*f4ee7fbaSAndroid Build Coastguard Worker #define CHUNK_MASK (CHUNK_SIZE - 1)
18*f4ee7fbaSAndroid Build Coastguard Worker #define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
19*f4ee7fbaSAndroid Build Coastguard Worker
20*f4ee7fbaSAndroid Build Coastguard Worker /* File coverage: every bit set to 1 denotes a file covered by an isle. */
21*f4ee7fbaSAndroid Build Coastguard Worker typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
22*f4ee7fbaSAndroid Build Coastguard Worker
23*f4ee7fbaSAndroid Build Coastguard Worker /* Symbol of text alphabet. */
24*f4ee7fbaSAndroid Build Coastguard Worker typedef int32_t TextChar;
25*f4ee7fbaSAndroid Build Coastguard Worker
26*f4ee7fbaSAndroid Build Coastguard Worker /* Pointer to position in text. */
27*f4ee7fbaSAndroid Build Coastguard Worker typedef uint32_t TextIdx;
28*f4ee7fbaSAndroid Build Coastguard Worker
29*f4ee7fbaSAndroid Build Coastguard Worker /* SAIS sarray_type; unfortunately, must be a signed type. */
30*f4ee7fbaSAndroid Build Coastguard Worker typedef int32_t TextSaIdx;
31*f4ee7fbaSAndroid Build Coastguard Worker
popcount(uint64_t u)32*f4ee7fbaSAndroid Build Coastguard Worker static size_t popcount(uint64_t u) {
33*f4ee7fbaSAndroid Build Coastguard Worker return static_cast<size_t>(__builtin_popcountll(u));
34*f4ee7fbaSAndroid Build Coastguard Worker }
35*f4ee7fbaSAndroid Build Coastguard Worker
36*f4ee7fbaSAndroid Build Coastguard Worker /* Condense terminators and pad file entries. */
rewriteText(std::vector<TextChar> * text)37*f4ee7fbaSAndroid Build Coastguard Worker static void rewriteText(std::vector<TextChar>* text) {
38*f4ee7fbaSAndroid Build Coastguard Worker TextChar terminator = text->back();
39*f4ee7fbaSAndroid Build Coastguard Worker TextChar prev = terminator;
40*f4ee7fbaSAndroid Build Coastguard Worker TextIdx to = 0;
41*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx from = 0; from < text->size(); ++from) {
42*f4ee7fbaSAndroid Build Coastguard Worker TextChar next = text->at(from);
43*f4ee7fbaSAndroid Build Coastguard Worker if (next < 256 || prev < 256) {
44*f4ee7fbaSAndroid Build Coastguard Worker text->at(to++) = next;
45*f4ee7fbaSAndroid Build Coastguard Worker if (next >= 256) terminator = next;
46*f4ee7fbaSAndroid Build Coastguard Worker }
47*f4ee7fbaSAndroid Build Coastguard Worker prev = next;
48*f4ee7fbaSAndroid Build Coastguard Worker }
49*f4ee7fbaSAndroid Build Coastguard Worker text->resize(to);
50*f4ee7fbaSAndroid Build Coastguard Worker if (text->empty()) text->push_back(terminator);
51*f4ee7fbaSAndroid Build Coastguard Worker while (text->size() & CHUNK_MASK) text->push_back(terminator);
52*f4ee7fbaSAndroid Build Coastguard Worker }
53*f4ee7fbaSAndroid Build Coastguard Worker
54*f4ee7fbaSAndroid Build Coastguard Worker /* Reenumerate terminators for smaller alphabet. */
remapTerminators(std::vector<TextChar> * text,TextChar * next_terminator)55*f4ee7fbaSAndroid Build Coastguard Worker static void remapTerminators(std::vector<TextChar>* text,
56*f4ee7fbaSAndroid Build Coastguard Worker TextChar* next_terminator) {
57*f4ee7fbaSAndroid Build Coastguard Worker TextChar prev = -1;
58*f4ee7fbaSAndroid Build Coastguard Worker TextChar x = 256;
59*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx i = 0; i < text->size(); ++i) {
60*f4ee7fbaSAndroid Build Coastguard Worker TextChar next = text->at(i);
61*f4ee7fbaSAndroid Build Coastguard Worker if (next < 256) { // Char.
62*f4ee7fbaSAndroid Build Coastguard Worker // Do nothing.
63*f4ee7fbaSAndroid Build Coastguard Worker } else if (prev < 256) { // Terminator after char.
64*f4ee7fbaSAndroid Build Coastguard Worker next = x++;
65*f4ee7fbaSAndroid Build Coastguard Worker } else { // Terminator after terminator.
66*f4ee7fbaSAndroid Build Coastguard Worker next = prev;
67*f4ee7fbaSAndroid Build Coastguard Worker }
68*f4ee7fbaSAndroid Build Coastguard Worker text->at(i) = next;
69*f4ee7fbaSAndroid Build Coastguard Worker prev = next;
70*f4ee7fbaSAndroid Build Coastguard Worker }
71*f4ee7fbaSAndroid Build Coastguard Worker *next_terminator = x;
72*f4ee7fbaSAndroid Build Coastguard Worker }
73*f4ee7fbaSAndroid Build Coastguard Worker
74*f4ee7fbaSAndroid Build Coastguard Worker /* Combine all file entries; create mapping position->file. */
buildFullText(std::vector<std::vector<TextChar>> * data,std::vector<TextChar> * full_text,std::vector<TextIdx> * file_map,std::vector<TextIdx> * file_offset,TextChar * next_terminator)75*f4ee7fbaSAndroid Build Coastguard Worker static void buildFullText(std::vector<std::vector<TextChar>>* data,
76*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
77*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
78*f4ee7fbaSAndroid Build Coastguard Worker file_map->resize(0);
79*f4ee7fbaSAndroid Build Coastguard Worker file_offset->resize(0);
80*f4ee7fbaSAndroid Build Coastguard Worker full_text->resize(0);
81*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx i = 0; i < data->size(); ++i) {
82*f4ee7fbaSAndroid Build Coastguard Worker file_offset->push_back(full_text->size());
83*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextChar>& file = data->at(i);
84*f4ee7fbaSAndroid Build Coastguard Worker rewriteText(&file);
85*f4ee7fbaSAndroid Build Coastguard Worker full_text->insert(full_text->end(), file.begin(), file.end());
86*f4ee7fbaSAndroid Build Coastguard Worker file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
87*f4ee7fbaSAndroid Build Coastguard Worker }
88*f4ee7fbaSAndroid Build Coastguard Worker if (false) remapTerminators(full_text, next_terminator);
89*f4ee7fbaSAndroid Build Coastguard Worker }
90*f4ee7fbaSAndroid Build Coastguard Worker
91*f4ee7fbaSAndroid Build Coastguard Worker /* Build longest-common-prefix based on suffix array and text.
92*f4ee7fbaSAndroid Build Coastguard Worker TODO: borrowed -> unknown efficiency. */
buildLcp(std::vector<TextChar> * text,std::vector<TextIdx> * sa,std::vector<TextIdx> * lcp,std::vector<TextIdx> * invese_sa)93*f4ee7fbaSAndroid Build Coastguard Worker static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
94*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
95*f4ee7fbaSAndroid Build Coastguard Worker TextIdx size = static_cast<TextIdx>(text->size());
96*f4ee7fbaSAndroid Build Coastguard Worker lcp->resize(size);
97*f4ee7fbaSAndroid Build Coastguard Worker TextIdx k = 0;
98*f4ee7fbaSAndroid Build Coastguard Worker lcp->at(size - 1) = 0;
99*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx i = 0; i < size; ++i) {
100*f4ee7fbaSAndroid Build Coastguard Worker if (invese_sa->at(i) == size - 1) {
101*f4ee7fbaSAndroid Build Coastguard Worker k = 0;
102*f4ee7fbaSAndroid Build Coastguard Worker continue;
103*f4ee7fbaSAndroid Build Coastguard Worker }
104*f4ee7fbaSAndroid Build Coastguard Worker // Suffix which follow i-th suffix.
105*f4ee7fbaSAndroid Build Coastguard Worker TextIdx j = sa->at(invese_sa->at(i) + 1);
106*f4ee7fbaSAndroid Build Coastguard Worker while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
107*f4ee7fbaSAndroid Build Coastguard Worker ++k;
108*f4ee7fbaSAndroid Build Coastguard Worker }
109*f4ee7fbaSAndroid Build Coastguard Worker lcp->at(invese_sa->at(i)) = k;
110*f4ee7fbaSAndroid Build Coastguard Worker if (k > 0) --k;
111*f4ee7fbaSAndroid Build Coastguard Worker }
112*f4ee7fbaSAndroid Build Coastguard Worker }
113*f4ee7fbaSAndroid Build Coastguard Worker
114*f4ee7fbaSAndroid Build Coastguard Worker /* Isle is a range in SA with LCP not less than some value.
115*f4ee7fbaSAndroid Build Coastguard Worker When we raise the LCP requirement, the isle sunks and smaller isles appear
116*f4ee7fbaSAndroid Build Coastguard Worker instead. */
117*f4ee7fbaSAndroid Build Coastguard Worker typedef struct {
118*f4ee7fbaSAndroid Build Coastguard Worker TextIdx lcp;
119*f4ee7fbaSAndroid Build Coastguard Worker TextIdx l;
120*f4ee7fbaSAndroid Build Coastguard Worker TextIdx r;
121*f4ee7fbaSAndroid Build Coastguard Worker Coverage coverage;
122*f4ee7fbaSAndroid Build Coastguard Worker } Isle;
123*f4ee7fbaSAndroid Build Coastguard Worker
124*f4ee7fbaSAndroid Build Coastguard Worker /* Helper routine for `cutMatch`. */
poisonData(TextIdx pos,TextIdx length,std::vector<std::vector<TextChar>> * data,std::vector<TextIdx> * file_map,std::vector<TextIdx> * file_offset,TextChar * next_terminator)125*f4ee7fbaSAndroid Build Coastguard Worker static void poisonData(TextIdx pos, TextIdx length,
126*f4ee7fbaSAndroid Build Coastguard Worker std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
127*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
128*f4ee7fbaSAndroid Build Coastguard Worker TextIdx f = file_map->at(pos / CHUNK_SIZE);
129*f4ee7fbaSAndroid Build Coastguard Worker pos -= file_offset->at(f);
130*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextChar>& file = data->at(f);
131*f4ee7fbaSAndroid Build Coastguard Worker TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
132*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx j = 0; j < l; j++, pos++) {
133*f4ee7fbaSAndroid Build Coastguard Worker if (file[pos] >= 256) continue;
134*f4ee7fbaSAndroid Build Coastguard Worker if (file[pos + 1] >= 256) {
135*f4ee7fbaSAndroid Build Coastguard Worker file[pos] = file[pos + 1];
136*f4ee7fbaSAndroid Build Coastguard Worker } else if (pos > 0 && file[pos - 1] >= 256) {
137*f4ee7fbaSAndroid Build Coastguard Worker file[pos] = file[pos - 1];
138*f4ee7fbaSAndroid Build Coastguard Worker } else {
139*f4ee7fbaSAndroid Build Coastguard Worker file[pos] = (*next_terminator)++;
140*f4ee7fbaSAndroid Build Coastguard Worker }
141*f4ee7fbaSAndroid Build Coastguard Worker }
142*f4ee7fbaSAndroid Build Coastguard Worker }
143*f4ee7fbaSAndroid Build Coastguard Worker
144*f4ee7fbaSAndroid Build Coastguard Worker /* Remove substrings of a given match from files.
145*f4ee7fbaSAndroid Build Coastguard Worker Substrings are replaced with unique terminators, so next iteration SA would
146*f4ee7fbaSAndroid Build Coastguard Worker not allow to cross removed areas. */
cutMatch(std::vector<std::vector<TextChar>> * data,TextIdx index,TextIdx length,std::vector<TextIdx> * sa,std::vector<TextIdx> * lcp,std::vector<TextIdx> * invese_sa,TextChar * next_terminator,std::vector<TextIdx> * file_map,std::vector<TextIdx> * file_offset)147*f4ee7fbaSAndroid Build Coastguard Worker static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
148*f4ee7fbaSAndroid Build Coastguard Worker TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
149*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
150*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
151*f4ee7fbaSAndroid Build Coastguard Worker while (length >= CUT_MATCH) {
152*f4ee7fbaSAndroid Build Coastguard Worker TextIdx i = index;
153*f4ee7fbaSAndroid Build Coastguard Worker while (lcp->at(i) >= length) {
154*f4ee7fbaSAndroid Build Coastguard Worker i++;
155*f4ee7fbaSAndroid Build Coastguard Worker poisonData(
156*f4ee7fbaSAndroid Build Coastguard Worker sa->at(i), length, data, file_map, file_offset, next_terminator);
157*f4ee7fbaSAndroid Build Coastguard Worker }
158*f4ee7fbaSAndroid Build Coastguard Worker while (true) {
159*f4ee7fbaSAndroid Build Coastguard Worker poisonData(
160*f4ee7fbaSAndroid Build Coastguard Worker sa->at(index), length, data, file_map, file_offset, next_terminator);
161*f4ee7fbaSAndroid Build Coastguard Worker if (index == 0 || lcp->at(index - 1) < length) break;
162*f4ee7fbaSAndroid Build Coastguard Worker index--;
163*f4ee7fbaSAndroid Build Coastguard Worker }
164*f4ee7fbaSAndroid Build Coastguard Worker length--;
165*f4ee7fbaSAndroid Build Coastguard Worker index = invese_sa->at(sa->at(index) + 1);
166*f4ee7fbaSAndroid Build Coastguard Worker }
167*f4ee7fbaSAndroid Build Coastguard Worker }
168*f4ee7fbaSAndroid Build Coastguard Worker
DM_generate(size_t dictionary_size_limit,const std::vector<size_t> & sample_sizes,const uint8_t * sample_data)169*f4ee7fbaSAndroid Build Coastguard Worker std::string DM_generate(size_t dictionary_size_limit,
170*f4ee7fbaSAndroid Build Coastguard Worker const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
171*f4ee7fbaSAndroid Build Coastguard Worker {
172*f4ee7fbaSAndroid Build Coastguard Worker TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
173*f4ee7fbaSAndroid Build Coastguard Worker if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
174*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "dictionary_size_limit is too large\n");
175*f4ee7fbaSAndroid Build Coastguard Worker return "";
176*f4ee7fbaSAndroid Build Coastguard Worker }
177*f4ee7fbaSAndroid Build Coastguard Worker }
178*f4ee7fbaSAndroid Build Coastguard Worker
179*f4ee7fbaSAndroid Build Coastguard Worker /* Could use 256 + '0' for easier debugging. */
180*f4ee7fbaSAndroid Build Coastguard Worker TextChar next_terminator = 256;
181*f4ee7fbaSAndroid Build Coastguard Worker
182*f4ee7fbaSAndroid Build Coastguard Worker std::string output;
183*f4ee7fbaSAndroid Build Coastguard Worker std::vector<std::vector<TextChar>> data;
184*f4ee7fbaSAndroid Build Coastguard Worker
185*f4ee7fbaSAndroid Build Coastguard Worker TextIdx offset = 0;
186*f4ee7fbaSAndroid Build Coastguard Worker size_t num_samples = sample_sizes.size();
187*f4ee7fbaSAndroid Build Coastguard Worker if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
188*f4ee7fbaSAndroid Build Coastguard Worker for (size_t n = 0; n < num_samples; ++n) {
189*f4ee7fbaSAndroid Build Coastguard Worker TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
190*f4ee7fbaSAndroid Build Coastguard Worker if (delta != sample_sizes[n]) {
191*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "sample is too large\n");
192*f4ee7fbaSAndroid Build Coastguard Worker return "";
193*f4ee7fbaSAndroid Build Coastguard Worker }
194*f4ee7fbaSAndroid Build Coastguard Worker if (delta == 0) {
195*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "0-length samples are prohibited\n");
196*f4ee7fbaSAndroid Build Coastguard Worker return "";
197*f4ee7fbaSAndroid Build Coastguard Worker }
198*f4ee7fbaSAndroid Build Coastguard Worker TextIdx next_offset = offset + delta;
199*f4ee7fbaSAndroid Build Coastguard Worker if (next_offset <= offset) {
200*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "corpus is too large\n");
201*f4ee7fbaSAndroid Build Coastguard Worker return "";
202*f4ee7fbaSAndroid Build Coastguard Worker }
203*f4ee7fbaSAndroid Build Coastguard Worker data.push_back(
204*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
205*f4ee7fbaSAndroid Build Coastguard Worker offset = next_offset;
206*f4ee7fbaSAndroid Build Coastguard Worker data.back().push_back(next_terminator++);
207*f4ee7fbaSAndroid Build Coastguard Worker }
208*f4ee7fbaSAndroid Build Coastguard Worker
209*f4ee7fbaSAndroid Build Coastguard Worker /* Most arrays are allocated once, and then just resized to smaller and
210*f4ee7fbaSAndroid Build Coastguard Worker smaller sizes. */
211*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextChar> full_text;
212*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx> file_map;
213*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx> file_offset;
214*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx> sa;
215*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx> invese_sa;
216*f4ee7fbaSAndroid Build Coastguard Worker std::vector<TextIdx> lcp;
217*f4ee7fbaSAndroid Build Coastguard Worker std::vector<Isle> isles;
218*f4ee7fbaSAndroid Build Coastguard Worker std::vector<char> output_data;
219*f4ee7fbaSAndroid Build Coastguard Worker TextIdx total = 0;
220*f4ee7fbaSAndroid Build Coastguard Worker TextIdx total_cost = 0;
221*f4ee7fbaSAndroid Build Coastguard Worker TextIdx best_cost;
222*f4ee7fbaSAndroid Build Coastguard Worker Isle best_isle;
223*f4ee7fbaSAndroid Build Coastguard Worker size_t min_count = num_samples;
224*f4ee7fbaSAndroid Build Coastguard Worker
225*f4ee7fbaSAndroid Build Coastguard Worker while (true) {
226*f4ee7fbaSAndroid Build Coastguard Worker TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
227*f4ee7fbaSAndroid Build Coastguard Worker buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
228*f4ee7fbaSAndroid Build Coastguard Worker sa.resize(full_text.size());
229*f4ee7fbaSAndroid Build Coastguard Worker /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
230*f4ee7fbaSAndroid Build Coastguard Worker saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
231*f4ee7fbaSAndroid Build Coastguard Worker static_cast<TextChar>(full_text.size()), next_terminator);
232*f4ee7fbaSAndroid Build Coastguard Worker invese_sa.resize(full_text.size());
233*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx i = 0; i < full_text.size(); ++i) {
234*f4ee7fbaSAndroid Build Coastguard Worker invese_sa[sa[i]] = i;
235*f4ee7fbaSAndroid Build Coastguard Worker }
236*f4ee7fbaSAndroid Build Coastguard Worker buildLcp(&full_text, &sa, &lcp, &invese_sa);
237*f4ee7fbaSAndroid Build Coastguard Worker
238*f4ee7fbaSAndroid Build Coastguard Worker /* Do not rebuild SA/LCP, just use different selection. */
239*f4ee7fbaSAndroid Build Coastguard Worker retry:
240*f4ee7fbaSAndroid Build Coastguard Worker best_cost = 0;
241*f4ee7fbaSAndroid Build Coastguard Worker best_isle = {0, 0, 0, {{0}}};
242*f4ee7fbaSAndroid Build Coastguard Worker isles.resize(0);
243*f4ee7fbaSAndroid Build Coastguard Worker isles.push_back(best_isle);
244*f4ee7fbaSAndroid Build Coastguard Worker
245*f4ee7fbaSAndroid Build Coastguard Worker for (TextIdx i = 0; i < lcp.size(); ++i) {
246*f4ee7fbaSAndroid Build Coastguard Worker TextIdx l = i;
247*f4ee7fbaSAndroid Build Coastguard Worker Coverage cov = {{0}};
248*f4ee7fbaSAndroid Build Coastguard Worker size_t f = file_map[sa[i] / CHUNK_SIZE];
249*f4ee7fbaSAndroid Build Coastguard Worker cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
250*f4ee7fbaSAndroid Build Coastguard Worker while (lcp[i] < isles.back().lcp) {
251*f4ee7fbaSAndroid Build Coastguard Worker Isle& top = isles.back();
252*f4ee7fbaSAndroid Build Coastguard Worker top.r = i;
253*f4ee7fbaSAndroid Build Coastguard Worker l = top.l;
254*f4ee7fbaSAndroid Build Coastguard Worker for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
255*f4ee7fbaSAndroid Build Coastguard Worker size_t count = 0;
256*f4ee7fbaSAndroid Build Coastguard Worker for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
257*f4ee7fbaSAndroid Build Coastguard Worker TextIdx effective_lcp = top.lcp;
258*f4ee7fbaSAndroid Build Coastguard Worker /* Restrict (last) dictionary entry length. */
259*f4ee7fbaSAndroid Build Coastguard Worker if (effective_lcp > max_match) effective_lcp = max_match;
260*f4ee7fbaSAndroid Build Coastguard Worker TextIdx cost = count * effective_lcp;
261*f4ee7fbaSAndroid Build Coastguard Worker if (cost > best_cost && count >= min_count &&
262*f4ee7fbaSAndroid Build Coastguard Worker effective_lcp >= MIN_MATCH) {
263*f4ee7fbaSAndroid Build Coastguard Worker best_cost = cost;
264*f4ee7fbaSAndroid Build Coastguard Worker best_isle = top;
265*f4ee7fbaSAndroid Build Coastguard Worker best_isle.lcp = effective_lcp;
266*f4ee7fbaSAndroid Build Coastguard Worker }
267*f4ee7fbaSAndroid Build Coastguard Worker isles.pop_back();
268*f4ee7fbaSAndroid Build Coastguard Worker for (size_t x = 0; x < cov.size(); ++x) {
269*f4ee7fbaSAndroid Build Coastguard Worker isles.back().coverage[x] |= cov[x];
270*f4ee7fbaSAndroid Build Coastguard Worker }
271*f4ee7fbaSAndroid Build Coastguard Worker }
272*f4ee7fbaSAndroid Build Coastguard Worker if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
273*f4ee7fbaSAndroid Build Coastguard Worker for (size_t x = 0; x < cov.size(); ++x) {
274*f4ee7fbaSAndroid Build Coastguard Worker isles.back().coverage[x] |= cov[x];
275*f4ee7fbaSAndroid Build Coastguard Worker }
276*f4ee7fbaSAndroid Build Coastguard Worker }
277*f4ee7fbaSAndroid Build Coastguard Worker
278*f4ee7fbaSAndroid Build Coastguard Worker /* When saturated matches do not match length restrictions, lower the
279*f4ee7fbaSAndroid Build Coastguard Worker saturation requirements. */
280*f4ee7fbaSAndroid Build Coastguard Worker if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
281*f4ee7fbaSAndroid Build Coastguard Worker if (min_count >= 8) {
282*f4ee7fbaSAndroid Build Coastguard Worker min_count = (min_count * 7) / 8;
283*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "Retry: min_count=%zu\n", min_count);
284*f4ee7fbaSAndroid Build Coastguard Worker goto retry;
285*f4ee7fbaSAndroid Build Coastguard Worker }
286*f4ee7fbaSAndroid Build Coastguard Worker break;
287*f4ee7fbaSAndroid Build Coastguard Worker }
288*f4ee7fbaSAndroid Build Coastguard Worker
289*f4ee7fbaSAndroid Build Coastguard Worker /* Save the entry. */
290*f4ee7fbaSAndroid Build Coastguard Worker fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
291*f4ee7fbaSAndroid Build Coastguard Worker total_cost, best_cost, total, best_isle.lcp);
292*f4ee7fbaSAndroid Build Coastguard Worker int* piece = &full_text[sa[best_isle.l]];
293*f4ee7fbaSAndroid Build Coastguard Worker output.insert(output.end(), piece, piece + best_isle.lcp);
294*f4ee7fbaSAndroid Build Coastguard Worker total += best_isle.lcp;
295*f4ee7fbaSAndroid Build Coastguard Worker total_cost += best_cost;
296*f4ee7fbaSAndroid Build Coastguard Worker cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
297*f4ee7fbaSAndroid Build Coastguard Worker &next_terminator, &file_map, &file_offset);
298*f4ee7fbaSAndroid Build Coastguard Worker if (total >= dictionary_size_limit) break;
299*f4ee7fbaSAndroid Build Coastguard Worker }
300*f4ee7fbaSAndroid Build Coastguard Worker
301*f4ee7fbaSAndroid Build Coastguard Worker return output;
302*f4ee7fbaSAndroid Build Coastguard Worker }
303