xref: /aosp_15_r20/external/brotli/research/sieve.cc (revision f4ee7fba7774faf2a30f13154332c0a06550dbc4)
1*f4ee7fbaSAndroid Build Coastguard Worker #include "./sieve.h"
2*f4ee7fbaSAndroid Build Coastguard Worker 
3*f4ee7fbaSAndroid Build Coastguard Worker /* Pointer to position in (combined corpus) text. */
4*f4ee7fbaSAndroid Build Coastguard Worker typedef uint32_t TextIdx;
5*f4ee7fbaSAndroid Build Coastguard Worker 
6*f4ee7fbaSAndroid Build Coastguard Worker /* Index of sample / generation. */
7*f4ee7fbaSAndroid Build Coastguard Worker typedef uint16_t SampleIdx;
8*f4ee7fbaSAndroid Build Coastguard Worker 
9*f4ee7fbaSAndroid Build Coastguard Worker typedef struct Slot {
10*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx next;
11*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx offset;
12*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx presence;
13*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx mark;
14*f4ee7fbaSAndroid Build Coastguard Worker } Slot;
15*f4ee7fbaSAndroid Build Coastguard Worker 
16*f4ee7fbaSAndroid Build Coastguard Worker static const TextIdx kNowhere = static_cast<TextIdx>(-1);
17*f4ee7fbaSAndroid Build Coastguard Worker 
dryRun(TextIdx sliceLen,Slot * map,TextIdx * shortcut,TextIdx end,TextIdx middle,SampleIdx minPresence,SampleIdx iteration)18*f4ee7fbaSAndroid Build Coastguard Worker static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
19*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
20*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx from = kNowhere;
21*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx to = kNowhere;
22*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx result = 0;
23*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx targetPresence = minPresence;
24*f4ee7fbaSAndroid Build Coastguard Worker   for (TextIdx i = 0; i < end; ++i) {
25*f4ee7fbaSAndroid Build Coastguard Worker     if (i == middle) {
26*f4ee7fbaSAndroid Build Coastguard Worker       targetPresence++;
27*f4ee7fbaSAndroid Build Coastguard Worker     }
28*f4ee7fbaSAndroid Build Coastguard Worker     Slot& item = map[shortcut[i]];
29*f4ee7fbaSAndroid Build Coastguard Worker     if (item.mark != iteration) {
30*f4ee7fbaSAndroid Build Coastguard Worker       item.mark = iteration;
31*f4ee7fbaSAndroid Build Coastguard Worker       if (item.presence >= targetPresence) {
32*f4ee7fbaSAndroid Build Coastguard Worker         if ((to == kNowhere) || (to < i)) {
33*f4ee7fbaSAndroid Build Coastguard Worker           if (from != kNowhere) {
34*f4ee7fbaSAndroid Build Coastguard Worker             result += to - from;
35*f4ee7fbaSAndroid Build Coastguard Worker           }
36*f4ee7fbaSAndroid Build Coastguard Worker           from = i;
37*f4ee7fbaSAndroid Build Coastguard Worker         }
38*f4ee7fbaSAndroid Build Coastguard Worker         to = i + sliceLen;
39*f4ee7fbaSAndroid Build Coastguard Worker       }
40*f4ee7fbaSAndroid Build Coastguard Worker     }
41*f4ee7fbaSAndroid Build Coastguard Worker   }
42*f4ee7fbaSAndroid Build Coastguard Worker   if (from != kNowhere) {
43*f4ee7fbaSAndroid Build Coastguard Worker     result += to - from;
44*f4ee7fbaSAndroid Build Coastguard Worker   }
45*f4ee7fbaSAndroid Build Coastguard Worker   return result;
46*f4ee7fbaSAndroid Build Coastguard Worker }
47*f4ee7fbaSAndroid Build Coastguard Worker 
createDictionary(const uint8_t * data,TextIdx sliceLen,Slot * map,TextIdx * shortcut,TextIdx end,TextIdx middle,SampleIdx minPresence,SampleIdx iteration)48*f4ee7fbaSAndroid Build Coastguard Worker static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
49*f4ee7fbaSAndroid Build Coastguard Worker     Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
50*f4ee7fbaSAndroid Build Coastguard Worker     SampleIdx minPresence, SampleIdx iteration) {
51*f4ee7fbaSAndroid Build Coastguard Worker   std::string output;
52*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx from = kNowhere;
53*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx to = kNowhere;
54*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx targetPresence = minPresence;
55*f4ee7fbaSAndroid Build Coastguard Worker   for (TextIdx i = 0; i < end; ++i) {
56*f4ee7fbaSAndroid Build Coastguard Worker     if (i == middle) {
57*f4ee7fbaSAndroid Build Coastguard Worker       targetPresence++;
58*f4ee7fbaSAndroid Build Coastguard Worker     }
59*f4ee7fbaSAndroid Build Coastguard Worker     Slot& item = map[shortcut[i]];
60*f4ee7fbaSAndroid Build Coastguard Worker     if (item.mark != iteration) {
61*f4ee7fbaSAndroid Build Coastguard Worker       item.mark = iteration;
62*f4ee7fbaSAndroid Build Coastguard Worker       if (item.presence >= targetPresence) {
63*f4ee7fbaSAndroid Build Coastguard Worker         if ((to == kNowhere) || (to < i)) {
64*f4ee7fbaSAndroid Build Coastguard Worker           if (from != kNowhere) {
65*f4ee7fbaSAndroid Build Coastguard Worker             output.insert(output.end(), &data[from], &data[to]);
66*f4ee7fbaSAndroid Build Coastguard Worker           }
67*f4ee7fbaSAndroid Build Coastguard Worker           from = i;
68*f4ee7fbaSAndroid Build Coastguard Worker         }
69*f4ee7fbaSAndroid Build Coastguard Worker         to = i + sliceLen;
70*f4ee7fbaSAndroid Build Coastguard Worker       }
71*f4ee7fbaSAndroid Build Coastguard Worker     }
72*f4ee7fbaSAndroid Build Coastguard Worker   }
73*f4ee7fbaSAndroid Build Coastguard Worker   if (from != kNowhere) {
74*f4ee7fbaSAndroid Build Coastguard Worker     output.insert(output.end(), &data[from], &data[to]);
75*f4ee7fbaSAndroid Build Coastguard Worker   }
76*f4ee7fbaSAndroid Build Coastguard Worker   return output;
77*f4ee7fbaSAndroid Build Coastguard Worker }
78*f4ee7fbaSAndroid Build Coastguard Worker 
sieve_generate(size_t dictionary_size_limit,size_t slice_len,const std::vector<size_t> & sample_sizes,const uint8_t * sample_data)79*f4ee7fbaSAndroid Build Coastguard Worker std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
80*f4ee7fbaSAndroid Build Coastguard Worker     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
81*f4ee7fbaSAndroid Build Coastguard Worker   /* Parameters aliasing */
82*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
83*f4ee7fbaSAndroid Build Coastguard Worker   if (targetSize != dictionary_size_limit) {
84*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "dictionary_size_limit is too large\n");
85*f4ee7fbaSAndroid Build Coastguard Worker     return "";
86*f4ee7fbaSAndroid Build Coastguard Worker   }
87*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx sliceLen = static_cast<TextIdx>(slice_len);
88*f4ee7fbaSAndroid Build Coastguard Worker   if (sliceLen != slice_len) {
89*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "slice_len is too large\n");
90*f4ee7fbaSAndroid Build Coastguard Worker     return "";
91*f4ee7fbaSAndroid Build Coastguard Worker   }
92*f4ee7fbaSAndroid Build Coastguard Worker   if (sliceLen < 1) {
93*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "slice_len is too small\n");
94*f4ee7fbaSAndroid Build Coastguard Worker     return "";
95*f4ee7fbaSAndroid Build Coastguard Worker   }
96*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
97*f4ee7fbaSAndroid Build Coastguard Worker   if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
98*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "too many samples\n");
99*f4ee7fbaSAndroid Build Coastguard Worker     return "";
100*f4ee7fbaSAndroid Build Coastguard Worker   }
101*f4ee7fbaSAndroid Build Coastguard Worker   const uint8_t* data = sample_data;
102*f4ee7fbaSAndroid Build Coastguard Worker 
103*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx total = 0;
104*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<TextIdx> offsets;
105*f4ee7fbaSAndroid Build Coastguard Worker   for (SampleIdx i = 0; i < numSamples; ++i) {
106*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
107*f4ee7fbaSAndroid Build Coastguard Worker     if (delta != sample_sizes[i]) {
108*f4ee7fbaSAndroid Build Coastguard Worker       fprintf(stderr, "sample is too large\n");
109*f4ee7fbaSAndroid Build Coastguard Worker       return "";
110*f4ee7fbaSAndroid Build Coastguard Worker     }
111*f4ee7fbaSAndroid Build Coastguard Worker     if (delta == 0) {
112*f4ee7fbaSAndroid Build Coastguard Worker       fprintf(stderr, "empty samples are prohibited\n");
113*f4ee7fbaSAndroid Build Coastguard Worker       return "";
114*f4ee7fbaSAndroid Build Coastguard Worker     }
115*f4ee7fbaSAndroid Build Coastguard Worker     if (total + delta <= total) {
116*f4ee7fbaSAndroid Build Coastguard Worker       fprintf(stderr, "corpus is too large\n");
117*f4ee7fbaSAndroid Build Coastguard Worker       return "";
118*f4ee7fbaSAndroid Build Coastguard Worker     }
119*f4ee7fbaSAndroid Build Coastguard Worker     total += delta;
120*f4ee7fbaSAndroid Build Coastguard Worker     offsets.push_back(total);
121*f4ee7fbaSAndroid Build Coastguard Worker   }
122*f4ee7fbaSAndroid Build Coastguard Worker 
123*f4ee7fbaSAndroid Build Coastguard Worker   if (total * 2 < total) {
124*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "corpus is too large\n");
125*f4ee7fbaSAndroid Build Coastguard Worker     return "";
126*f4ee7fbaSAndroid Build Coastguard Worker   }
127*f4ee7fbaSAndroid Build Coastguard Worker 
128*f4ee7fbaSAndroid Build Coastguard Worker   if (total < sliceLen) {
129*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "slice_len is larger than corpus size\n");
130*f4ee7fbaSAndroid Build Coastguard Worker     return "";
131*f4ee7fbaSAndroid Build Coastguard Worker   }
132*f4ee7fbaSAndroid Build Coastguard Worker 
133*f4ee7fbaSAndroid Build Coastguard Worker   /*****************************************************************************
134*f4ee7fbaSAndroid Build Coastguard Worker    * Build coverage map.
135*f4ee7fbaSAndroid Build Coastguard Worker    ****************************************************************************/
136*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<Slot> map;
137*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<TextIdx> shortcut;
138*f4ee7fbaSAndroid Build Coastguard Worker   map.push_back({0, 0, 0, 0});
139*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx end = total - sliceLen;
140*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx hashLen = 11;
141*f4ee7fbaSAndroid Build Coastguard Worker   while (hashLen < 29 && ((1u << hashLen) < end)) {
142*f4ee7fbaSAndroid Build Coastguard Worker     hashLen += 3;
143*f4ee7fbaSAndroid Build Coastguard Worker   }
144*f4ee7fbaSAndroid Build Coastguard Worker   hashLen -= 3;
145*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx hashMask = (1u << hashLen) - 1u;
146*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<TextIdx> hashHead(1 << hashLen);
147*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx hashSlot = 1;
148*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx piece = 0;
149*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx hash = 0;
150*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx lShift = 3;
151*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx rShift = hashLen - lShift;
152*f4ee7fbaSAndroid Build Coastguard Worker   for (TextIdx i = 0; i < sliceLen - 1; ++i) {
153*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx v = data[i];
154*f4ee7fbaSAndroid Build Coastguard Worker     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
155*f4ee7fbaSAndroid Build Coastguard Worker   }
156*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
157*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx rShiftX = hashLen - lShiftX;
158*f4ee7fbaSAndroid Build Coastguard Worker   for (TextIdx i = 0; i < end; ++i) {
159*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx v = data[i + sliceLen - 1];
160*f4ee7fbaSAndroid Build Coastguard Worker     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
161*f4ee7fbaSAndroid Build Coastguard Worker 
162*f4ee7fbaSAndroid Build Coastguard Worker     if (offsets[piece] == i) {
163*f4ee7fbaSAndroid Build Coastguard Worker       piece++;
164*f4ee7fbaSAndroid Build Coastguard Worker     }
165*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx slot = hashHead[hash];
166*f4ee7fbaSAndroid Build Coastguard Worker     while (slot != 0) {
167*f4ee7fbaSAndroid Build Coastguard Worker       Slot& item = map[slot];
168*f4ee7fbaSAndroid Build Coastguard Worker       TextIdx start = item.offset;
169*f4ee7fbaSAndroid Build Coastguard Worker       bool miss = false;
170*f4ee7fbaSAndroid Build Coastguard Worker       for (TextIdx j = 0; j < sliceLen; ++j) {
171*f4ee7fbaSAndroid Build Coastguard Worker         if (data[i + j] != data[start + j]) {
172*f4ee7fbaSAndroid Build Coastguard Worker           miss = true;
173*f4ee7fbaSAndroid Build Coastguard Worker           break;
174*f4ee7fbaSAndroid Build Coastguard Worker         }
175*f4ee7fbaSAndroid Build Coastguard Worker       }
176*f4ee7fbaSAndroid Build Coastguard Worker       if (!miss) {
177*f4ee7fbaSAndroid Build Coastguard Worker         if (item.mark != piece) {
178*f4ee7fbaSAndroid Build Coastguard Worker           item.mark = piece;
179*f4ee7fbaSAndroid Build Coastguard Worker           item.presence++;
180*f4ee7fbaSAndroid Build Coastguard Worker         }
181*f4ee7fbaSAndroid Build Coastguard Worker         shortcut.push_back(slot);
182*f4ee7fbaSAndroid Build Coastguard Worker         break;
183*f4ee7fbaSAndroid Build Coastguard Worker       }
184*f4ee7fbaSAndroid Build Coastguard Worker       slot = item.next;
185*f4ee7fbaSAndroid Build Coastguard Worker     }
186*f4ee7fbaSAndroid Build Coastguard Worker     if (slot == 0) {
187*f4ee7fbaSAndroid Build Coastguard Worker       map.push_back({hashHead[hash], i, 1, piece});
188*f4ee7fbaSAndroid Build Coastguard Worker       hashHead[hash] = hashSlot;
189*f4ee7fbaSAndroid Build Coastguard Worker       shortcut.push_back(hashSlot);
190*f4ee7fbaSAndroid Build Coastguard Worker       hashSlot++;
191*f4ee7fbaSAndroid Build Coastguard Worker     }
192*f4ee7fbaSAndroid Build Coastguard Worker     v = data[i];
193*f4ee7fbaSAndroid Build Coastguard Worker     hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
194*f4ee7fbaSAndroid Build Coastguard Worker   }
195*f4ee7fbaSAndroid Build Coastguard Worker 
196*f4ee7fbaSAndroid Build Coastguard Worker   /*****************************************************************************
197*f4ee7fbaSAndroid Build Coastguard Worker    * Build dictionary of specified size.
198*f4ee7fbaSAndroid Build Coastguard Worker    ****************************************************************************/
199*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx a = 1;
200*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx size = dryRun(
201*f4ee7fbaSAndroid Build Coastguard Worker       sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
202*f4ee7fbaSAndroid Build Coastguard Worker   /* Maximal output is smaller than target. */
203*f4ee7fbaSAndroid Build Coastguard Worker   if (size <= targetSize) {
204*f4ee7fbaSAndroid Build Coastguard Worker     return createDictionary(
205*f4ee7fbaSAndroid Build Coastguard Worker         data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
206*f4ee7fbaSAndroid Build Coastguard Worker   }
207*f4ee7fbaSAndroid Build Coastguard Worker 
208*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx b = numSamples;
209*f4ee7fbaSAndroid Build Coastguard Worker   size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
210*f4ee7fbaSAndroid Build Coastguard Worker   if (size == targetSize) {
211*f4ee7fbaSAndroid Build Coastguard Worker     return createDictionary(
212*f4ee7fbaSAndroid Build Coastguard Worker         data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
213*f4ee7fbaSAndroid Build Coastguard Worker   }
214*f4ee7fbaSAndroid Build Coastguard Worker   /* Run binary search. */
215*f4ee7fbaSAndroid Build Coastguard Worker   if (size < targetSize) {
216*f4ee7fbaSAndroid Build Coastguard Worker     /* size(a) > targetSize > size(b) && a < m < b */
217*f4ee7fbaSAndroid Build Coastguard Worker     while (a + 1 < b) {
218*f4ee7fbaSAndroid Build Coastguard Worker       SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
219*f4ee7fbaSAndroid Build Coastguard Worker       size = dryRun(
220*f4ee7fbaSAndroid Build Coastguard Worker           sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
221*f4ee7fbaSAndroid Build Coastguard Worker       if (size < targetSize) {
222*f4ee7fbaSAndroid Build Coastguard Worker         b = m;
223*f4ee7fbaSAndroid Build Coastguard Worker       } else if (size > targetSize) {
224*f4ee7fbaSAndroid Build Coastguard Worker         a = m;
225*f4ee7fbaSAndroid Build Coastguard Worker       } else {
226*f4ee7fbaSAndroid Build Coastguard Worker         return createDictionary(
227*f4ee7fbaSAndroid Build Coastguard Worker             data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
228*f4ee7fbaSAndroid Build Coastguard Worker       }
229*f4ee7fbaSAndroid Build Coastguard Worker     }
230*f4ee7fbaSAndroid Build Coastguard Worker   } else {
231*f4ee7fbaSAndroid Build Coastguard Worker     a = b;
232*f4ee7fbaSAndroid Build Coastguard Worker   }
233*f4ee7fbaSAndroid Build Coastguard Worker   /* size(minPresence) > targetSize > size(minPresence + 1) */
234*f4ee7fbaSAndroid Build Coastguard Worker   SampleIdx minPresence = a;
235*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx c = 0;
236*f4ee7fbaSAndroid Build Coastguard Worker   TextIdx d = end;
237*f4ee7fbaSAndroid Build Coastguard Worker   /* size(a) < targetSize < size(b) && a < m < b */
238*f4ee7fbaSAndroid Build Coastguard Worker   while (c + 1 < d) {
239*f4ee7fbaSAndroid Build Coastguard Worker     TextIdx m = (c + d) / 2;
240*f4ee7fbaSAndroid Build Coastguard Worker     size = dryRun(
241*f4ee7fbaSAndroid Build Coastguard Worker         sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
242*f4ee7fbaSAndroid Build Coastguard Worker     if (size < targetSize) {
243*f4ee7fbaSAndroid Build Coastguard Worker       c = m;
244*f4ee7fbaSAndroid Build Coastguard Worker     } else if (size > targetSize) {
245*f4ee7fbaSAndroid Build Coastguard Worker       d = m;
246*f4ee7fbaSAndroid Build Coastguard Worker     } else {
247*f4ee7fbaSAndroid Build Coastguard Worker       return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
248*f4ee7fbaSAndroid Build Coastguard Worker           m, minPresence, ++piece);
249*f4ee7fbaSAndroid Build Coastguard Worker     }
250*f4ee7fbaSAndroid Build Coastguard Worker   }
251*f4ee7fbaSAndroid Build Coastguard Worker 
252*f4ee7fbaSAndroid Build Coastguard Worker   bool unrestricted = false;
253*f4ee7fbaSAndroid Build Coastguard Worker   if (minPresence <= 2 && !unrestricted) {
254*f4ee7fbaSAndroid Build Coastguard Worker     minPresence = 2;
255*f4ee7fbaSAndroid Build Coastguard Worker     c = end;
256*f4ee7fbaSAndroid Build Coastguard Worker   }
257*f4ee7fbaSAndroid Build Coastguard Worker   return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
258*f4ee7fbaSAndroid Build Coastguard Worker       minPresence, ++piece);
259*f4ee7fbaSAndroid Build Coastguard Worker }
260