// xref: /aosp_15_r20/external/brotli/research/dictionary_generator.cc (revision f4ee7fba7774faf2a30f13154332c0a06550dbc4)
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iterator>
#include <limits>
#include <string>
#include <vector>

#include "./deorummolae.h"
#include "./durchschlag.h"
#include "./sieve.h"
11*f4ee7fbaSAndroid Build Coastguard Worker 
/* Long-form options that carry a numeric payload after '=', for example
 * "--foo=20" or "--foo=32K". Each constant is the option prefix up to and
 * including the '='. Flag-style "--foo" options without a payload are not
 * listed here.
 */
static constexpr char LONG_ARG_BLOCK_LEN[] = "--block_len=";
static constexpr char LONG_ARG_SLICE_LEN[] = "--slice_len=";
static constexpr char LONG_ARG_TARGET_DICT_LEN[] = "--target_dict_len=";
static constexpr char LONG_ARG_MIN_SLICE_POP[] = "--min_slice_pop=";
static constexpr char LONG_ARG_CHUNK_LEN[] = "--chunk_len=";
static constexpr char LONG_ARG_OVERLAP_LEN[] = "--overlap_len=";
21*f4ee7fbaSAndroid Build Coastguard Worker 
/* Identifiers of the selectable engines / sample-rewriting modes. The values
 * are plain integers so they can be stored in an `int`.
 */
enum {
  METHOD_DM = 0,
  METHOD_SIEVE = 1,
  METHOD_DURCHSCHLAG = 2,
  METHOD_DISTILL = 3,
  METHOD_PURIFY = 4
};
27*f4ee7fbaSAndroid Build Coastguard Worker 
/* Parses a positive decimal number with an optional binary suffix:
 * 'k'/'K' multiplies by 1024, 'm'/'M' multiplies by 1048576
 * (e.g. "20", "32K", "1m").
 *
 * Returns the parsed value, or 0 when `str` is null, is empty, starts with
 * '0', contains an unexpected character, has anything after the suffix, has
 * more than 12 digits, or does not fit into size_t. A literal "0" and a parse
 * failure are therefore indistinguishable to the caller.
 */
static size_t readInt(const char* str) {
  const size_t kMax = std::numeric_limits<size_t>::max();
  if (str == nullptr || str[0] == 0 || str[0] == '0') {
    return 0;
  }
  size_t result = 0;
  /* At most 12 digits are accepted; position 12 may only hold the suffix or
   * the terminator. */
  for (size_t i = 0; i < 13; ++i) {
    const char c = str[i];
    if (c == 0) {
      return result;
    }
    const bool isKilo = (c == 'k' || c == 'K');
    const bool isMega = (c == 'm' || c == 'M');
    if (isKilo || isMega) {
      const unsigned shift = isKilo ? 10u : 20u;
      /* The suffix must be the last character, must follow at least one
       * digit, and the scaled value must not lose high bits. */
      if (str[i + 1] != 0 || result == 0 || result > (kMax >> shift)) {
        return 0;
      }
      return result << shift;
    }
    if (c < '0' || c > '9') {
      return 0;
    }
    const size_t digit = static_cast<size_t>(c - '0');
    /* Exact overflow test: 10 * result + digit must stay within size_t. */
    if (result > (kMax - digit) / 10) {
      return 0;
    }
    result = 10 * result + digit;
  }
  return 0;
}
60*f4ee7fbaSAndroid Build Coastguard Worker 
/* Reads the whole file at `path` into a string.
 *
 * The stream is opened in binary mode, matching the binary mode used when the
 * tool writes files, so sample bytes are not altered by any text-mode
 * translation the platform may apply.
 *
 * Returns an empty string when the file cannot be opened or holds no data.
 */
static std::string readFile(const std::string& path) {
  std::ifstream file(path, std::ifstream::binary);
  return std::string(
      std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
}
67*f4ee7fbaSAndroid Build Coastguard Worker 
/* Writes all bytes of `content` to the file named `file`, opened for binary
 * output. Stream errors are not reported to the caller.
 */
static void writeFile(const char* file, const std::string& content) {
  const std::streamsize byteCount =
      static_cast<std::streamsize>(content.size());
  std::ofstream sink(file, std::ofstream::binary);
  sink.write(content.c_str(), byteCount);
  sink.close();
}
73*f4ee7fbaSAndroid Build Coastguard Worker 
/* Writes consecutive slices of `data` back to the sample files.
 *
 * For the k-th entry of `pathArgs`, the file named argv[pathArgs[k]] receives
 * the next sizes[k] bytes of `data`; slices are laid out back to back, in
 * order. `sizes` must have at least as many entries as `pathArgs`. Stream
 * errors are not reported.
 */
static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
    const std::vector<size_t>& sizes, const uint8_t* data) {
  const uint8_t* cursor = data;
  size_t sampleIndex = 0;
  for (const int argIndex : pathArgs) {
    const size_t length = sizes[sampleIndex++];
    std::ofstream sink(argv[argIndex], std::ofstream::binary);
    sink.write(reinterpret_cast<const char*>(cursor),
        static_cast<std::streamsize>(length));
    sink.close();
    cursor += length;
  }
}
88*f4ee7fbaSAndroid Build Coastguard Worker 
/* Strips any directory prefix from `path`: yields the text that follows the
 * last '/' or '\' separator, or `path` itself when it contains neither.
 * The result points into the caller's string.
 */
static const char* fileName(const char* path) {
  const char* tail = path;
  for (const char* cursor = path; *cursor != 0; ++cursor) {
    if (*cursor == '/' || *cursor == '\\') {
      tail = cursor + 1;
    }
  }
  return tail;
}
97*f4ee7fbaSAndroid Build Coastguard Worker 
/* Prints the usage line (with `name` as the program name) followed by the
 * option reference to stderr.
 */
static void printHelp(const char* name) {
  /* Static text only; it contains no format specifiers. */
  static const char kOptionsText[] =
      "Options:\n"
      "  --dm       use 'deorummolae' engine\n"
      "  --distill  rewrite samples; unique text parts are removed\n"
      "  --dsh      use 'durchschlag' engine (default)\n"
      "  --purify   rewrite samples; unique text parts are zeroed out\n"
      "  --sieve    use 'sieve' engine\n"
      "  -b#, --block_len=#\n"
      "             set block length for 'durchschlag'; default: 1024\n"
      "  -s#, --slice_len=#\n"
      "             set slice length for 'distill', 'durchschlag', 'purify'\n"
      "             and 'sieve'; default: 16\n"
      "  -t#, --target_dict_len=#\n"
      "             set target dictionary length (limit); default: 16K\n"
      "  -u#, --min_slice_pop=#\n"
      "             set minimum slice population (for rewrites); default: 2\n"
      "  -c#, --chunk_len=#\n"
      "             if positive, samples are cut into chunks of this length;\n"
      "             default: 0; cannot mix with 'rewrite samples'\n"
      "  -o#, --overlap_len=#\n"
      "             set chunk overlap length; default 0\n"
      "# is a decimal number with optional k/K/m/M suffix.\n"
      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
      "         Completely unique samples might become empty files.\n\n";

  fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
  fputs(kOptionsText, stderr);
}
125*f4ee7fbaSAndroid Build Coastguard Worker 
main(int argc,char const * argv[])126*f4ee7fbaSAndroid Build Coastguard Worker int main(int argc, char const* argv[]) {
127*f4ee7fbaSAndroid Build Coastguard Worker   int dictionaryArg = -1;
128*f4ee7fbaSAndroid Build Coastguard Worker   int method = METHOD_DURCHSCHLAG;
129*f4ee7fbaSAndroid Build Coastguard Worker   size_t sliceLen = 16;
130*f4ee7fbaSAndroid Build Coastguard Worker   size_t targetSize = 16 << 10;
131*f4ee7fbaSAndroid Build Coastguard Worker   size_t blockSize = 1024;
132*f4ee7fbaSAndroid Build Coastguard Worker   size_t minimumPopulation = 2;
133*f4ee7fbaSAndroid Build Coastguard Worker   size_t chunkLen = 0;
134*f4ee7fbaSAndroid Build Coastguard Worker   size_t overlapLen = 0;
135*f4ee7fbaSAndroid Build Coastguard Worker 
136*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<uint8_t> data;
137*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<size_t> sizes;
138*f4ee7fbaSAndroid Build Coastguard Worker   std::vector<int> pathArgs;
139*f4ee7fbaSAndroid Build Coastguard Worker   size_t total = 0;
140*f4ee7fbaSAndroid Build Coastguard Worker   for (int i = 1; i < argc; ++i) {
141*f4ee7fbaSAndroid Build Coastguard Worker     if (argv[i] == nullptr) {
142*f4ee7fbaSAndroid Build Coastguard Worker       continue;
143*f4ee7fbaSAndroid Build Coastguard Worker     }
144*f4ee7fbaSAndroid Build Coastguard Worker 
145*f4ee7fbaSAndroid Build Coastguard Worker     if (argv[i][0] == '-') {
146*f4ee7fbaSAndroid Build Coastguard Worker       char arg1 = argv[i][1];
147*f4ee7fbaSAndroid Build Coastguard Worker       const char* arg2 = arg1 ? &argv[i][2] : nullptr;
148*f4ee7fbaSAndroid Build Coastguard Worker       if (arg1 == '-') {
149*f4ee7fbaSAndroid Build Coastguard Worker         if (dictionaryArg != -1) {
150*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr,
151*f4ee7fbaSAndroid Build Coastguard Worker               "Method should be specified before dictionary / sample '%s'\n",
152*f4ee7fbaSAndroid Build Coastguard Worker               argv[i]);
153*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
154*f4ee7fbaSAndroid Build Coastguard Worker         }
155*f4ee7fbaSAndroid Build Coastguard Worker 
156*f4ee7fbaSAndroid Build Coastguard Worker         /* Look for "--long_arg" via exact match. */
157*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strcmp(argv[i], "--sieve") == 0) {
158*f4ee7fbaSAndroid Build Coastguard Worker           method = METHOD_SIEVE;
159*f4ee7fbaSAndroid Build Coastguard Worker           continue;
160*f4ee7fbaSAndroid Build Coastguard Worker         }
161*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strcmp(argv[i], "--dm") == 0) {
162*f4ee7fbaSAndroid Build Coastguard Worker           method = METHOD_DM;
163*f4ee7fbaSAndroid Build Coastguard Worker           continue;
164*f4ee7fbaSAndroid Build Coastguard Worker         }
165*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strcmp(argv[i], "--dsh") == 0) {
166*f4ee7fbaSAndroid Build Coastguard Worker           method = METHOD_DURCHSCHLAG;
167*f4ee7fbaSAndroid Build Coastguard Worker           continue;
168*f4ee7fbaSAndroid Build Coastguard Worker         }
169*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strcmp(argv[i], "--distill") == 0) {
170*f4ee7fbaSAndroid Build Coastguard Worker           method = METHOD_DISTILL;
171*f4ee7fbaSAndroid Build Coastguard Worker           continue;
172*f4ee7fbaSAndroid Build Coastguard Worker         }
173*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strcmp(argv[i], "--purify") == 0) {
174*f4ee7fbaSAndroid Build Coastguard Worker           method = METHOD_PURIFY;
175*f4ee7fbaSAndroid Build Coastguard Worker           continue;
176*f4ee7fbaSAndroid Build Coastguard Worker         }
177*f4ee7fbaSAndroid Build Coastguard Worker 
178*f4ee7fbaSAndroid Build Coastguard Worker         /* Look for "--long_arg=#" via prefix match. */
179*f4ee7fbaSAndroid Build Coastguard Worker         if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN,
180*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_BLOCK_LEN)) == 0) {
181*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 'b';
182*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)];
183*f4ee7fbaSAndroid Build Coastguard Worker         } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN,
184*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_SLICE_LEN)) == 0) {
185*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 's';
186*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)];
187*f4ee7fbaSAndroid Build Coastguard Worker         } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN,
188*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) {
189*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 't';
190*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)];
191*f4ee7fbaSAndroid Build Coastguard Worker         } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP,
192*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) {
193*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 'u';
194*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)];
195*f4ee7fbaSAndroid Build Coastguard Worker         } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN,
196*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_CHUNK_LEN)) == 0) {
197*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 'c';
198*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)];
199*f4ee7fbaSAndroid Build Coastguard Worker         } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN,
200*f4ee7fbaSAndroid Build Coastguard Worker               std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) {
201*f4ee7fbaSAndroid Build Coastguard Worker           arg1 = 'o';
202*f4ee7fbaSAndroid Build Coastguard Worker           arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)];
203*f4ee7fbaSAndroid Build Coastguard Worker         } else {
204*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
205*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
206*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
207*f4ee7fbaSAndroid Build Coastguard Worker         }
208*f4ee7fbaSAndroid Build Coastguard Worker       }
209*f4ee7fbaSAndroid Build Coastguard Worker 
210*f4ee7fbaSAndroid Build Coastguard Worker       /* Look for "-f" short args or "--foo=#" long args. */
211*f4ee7fbaSAndroid Build Coastguard Worker       if (arg1 == 'b') {
212*f4ee7fbaSAndroid Build Coastguard Worker         blockSize = readInt(arg2);
213*f4ee7fbaSAndroid Build Coastguard Worker         if (blockSize < 16 || blockSize > 65536) {
214*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
215*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
216*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
217*f4ee7fbaSAndroid Build Coastguard Worker         }
218*f4ee7fbaSAndroid Build Coastguard Worker       } else if (arg1 == 's') {
219*f4ee7fbaSAndroid Build Coastguard Worker         sliceLen = readInt(arg2);
220*f4ee7fbaSAndroid Build Coastguard Worker         if (sliceLen < 4 || sliceLen > 256) {
221*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
222*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
223*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
224*f4ee7fbaSAndroid Build Coastguard Worker         }
225*f4ee7fbaSAndroid Build Coastguard Worker       } else if (arg1 == 't') {
226*f4ee7fbaSAndroid Build Coastguard Worker         targetSize = readInt(arg2);
227*f4ee7fbaSAndroid Build Coastguard Worker         if (targetSize < 256 || targetSize > (1 << 25)) {
228*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
229*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
230*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
231*f4ee7fbaSAndroid Build Coastguard Worker         }
232*f4ee7fbaSAndroid Build Coastguard Worker       } else if (arg1 == 'u') {
233*f4ee7fbaSAndroid Build Coastguard Worker         minimumPopulation = readInt(arg2);
234*f4ee7fbaSAndroid Build Coastguard Worker         if (minimumPopulation < 256 || minimumPopulation > 65536) {
235*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
236*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
237*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
238*f4ee7fbaSAndroid Build Coastguard Worker         }
239*f4ee7fbaSAndroid Build Coastguard Worker       } else if (arg1 == 'c') {
240*f4ee7fbaSAndroid Build Coastguard Worker         chunkLen = readInt(arg2);
241*f4ee7fbaSAndroid Build Coastguard Worker         if (chunkLen < 0 || chunkLen > INT_MAX) {
242*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
243*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
244*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
245*f4ee7fbaSAndroid Build Coastguard Worker         }
246*f4ee7fbaSAndroid Build Coastguard Worker       } else if (arg1 == 'o') {
247*f4ee7fbaSAndroid Build Coastguard Worker         overlapLen = readInt(arg2);
248*f4ee7fbaSAndroid Build Coastguard Worker         if (overlapLen < 0 || overlapLen > INT_MAX) {
249*f4ee7fbaSAndroid Build Coastguard Worker           printHelp(fileName(argv[0]));
250*f4ee7fbaSAndroid Build Coastguard Worker           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
251*f4ee7fbaSAndroid Build Coastguard Worker           exit(1);
252*f4ee7fbaSAndroid Build Coastguard Worker         }
253*f4ee7fbaSAndroid Build Coastguard Worker       } else {
254*f4ee7fbaSAndroid Build Coastguard Worker         printHelp(fileName(argv[0]));
255*f4ee7fbaSAndroid Build Coastguard Worker         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
256*f4ee7fbaSAndroid Build Coastguard Worker         exit(1);
257*f4ee7fbaSAndroid Build Coastguard Worker       }
258*f4ee7fbaSAndroid Build Coastguard Worker       continue;
259*f4ee7fbaSAndroid Build Coastguard Worker     }
260*f4ee7fbaSAndroid Build Coastguard Worker 
261*f4ee7fbaSAndroid Build Coastguard Worker     if (dictionaryArg == -1) {
262*f4ee7fbaSAndroid Build Coastguard Worker       if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
263*f4ee7fbaSAndroid Build Coastguard Worker         dictionaryArg = i;
264*f4ee7fbaSAndroid Build Coastguard Worker         continue;
265*f4ee7fbaSAndroid Build Coastguard Worker       }
266*f4ee7fbaSAndroid Build Coastguard Worker     }
267*f4ee7fbaSAndroid Build Coastguard Worker 
268*f4ee7fbaSAndroid Build Coastguard Worker     std::string content = readFile(argv[i]);
269*f4ee7fbaSAndroid Build Coastguard Worker     if (chunkLen == 0) {
270*f4ee7fbaSAndroid Build Coastguard Worker       pathArgs.push_back(i);
271*f4ee7fbaSAndroid Build Coastguard Worker       data.insert(data.end(), content.begin(), content.end());
272*f4ee7fbaSAndroid Build Coastguard Worker       total += content.size();
273*f4ee7fbaSAndroid Build Coastguard Worker       sizes.push_back(content.size());
274*f4ee7fbaSAndroid Build Coastguard Worker       continue;
275*f4ee7fbaSAndroid Build Coastguard Worker     } else if (chunkLen <= overlapLen) {
276*f4ee7fbaSAndroid Build Coastguard Worker       printHelp(fileName(argv[0]));
277*f4ee7fbaSAndroid Build Coastguard Worker       fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
278*f4ee7fbaSAndroid Build Coastguard Worker       exit(1);
279*f4ee7fbaSAndroid Build Coastguard Worker     }
280*f4ee7fbaSAndroid Build Coastguard Worker     for (size_t chunkStart = 0;
281*f4ee7fbaSAndroid Build Coastguard Worker         chunkStart < content.size();
282*f4ee7fbaSAndroid Build Coastguard Worker         chunkStart += chunkLen - overlapLen) {
283*f4ee7fbaSAndroid Build Coastguard Worker       std::string chunk = content.substr(chunkStart, chunkLen);
284*f4ee7fbaSAndroid Build Coastguard Worker       data.insert(data.end(), chunk.begin(), chunk.end());
285*f4ee7fbaSAndroid Build Coastguard Worker       total += chunk.size();
286*f4ee7fbaSAndroid Build Coastguard Worker       sizes.push_back(chunk.size());
287*f4ee7fbaSAndroid Build Coastguard Worker     }
288*f4ee7fbaSAndroid Build Coastguard Worker   }
289*f4ee7fbaSAndroid Build Coastguard Worker 
290*f4ee7fbaSAndroid Build Coastguard Worker   bool wantDictionary = (dictionaryArg == -1);
291*f4ee7fbaSAndroid Build Coastguard Worker   if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
292*f4ee7fbaSAndroid Build Coastguard Worker     wantDictionary = false;
293*f4ee7fbaSAndroid Build Coastguard Worker     if (chunkLen != 0) {
294*f4ee7fbaSAndroid Build Coastguard Worker       printHelp(fileName(argv[0]));
295*f4ee7fbaSAndroid Build Coastguard Worker       fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n");
296*f4ee7fbaSAndroid Build Coastguard Worker       exit(1);
297*f4ee7fbaSAndroid Build Coastguard Worker     }
298*f4ee7fbaSAndroid Build Coastguard Worker   }
299*f4ee7fbaSAndroid Build Coastguard Worker   if (wantDictionary || total == 0) {
300*f4ee7fbaSAndroid Build Coastguard Worker     printHelp(fileName(argv[0]));
301*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "Not enough arguments\n");
302*f4ee7fbaSAndroid Build Coastguard Worker     exit(1);
303*f4ee7fbaSAndroid Build Coastguard Worker   }
304*f4ee7fbaSAndroid Build Coastguard Worker 
305*f4ee7fbaSAndroid Build Coastguard Worker   if (method == METHOD_SIEVE) {
306*f4ee7fbaSAndroid Build Coastguard Worker     writeFile(argv[dictionaryArg], sieve_generate(
307*f4ee7fbaSAndroid Build Coastguard Worker         targetSize, sliceLen, sizes, data.data()));
308*f4ee7fbaSAndroid Build Coastguard Worker   } else if (method == METHOD_DM) {
309*f4ee7fbaSAndroid Build Coastguard Worker     writeFile(argv[dictionaryArg], DM_generate(
310*f4ee7fbaSAndroid Build Coastguard Worker         targetSize, sizes, data.data()));
311*f4ee7fbaSAndroid Build Coastguard Worker   } else if (method == METHOD_DURCHSCHLAG) {
312*f4ee7fbaSAndroid Build Coastguard Worker     writeFile(argv[dictionaryArg], durchschlag_generate(
313*f4ee7fbaSAndroid Build Coastguard Worker         targetSize, sliceLen, blockSize, sizes, data.data()));
314*f4ee7fbaSAndroid Build Coastguard Worker   } else if (method == METHOD_DISTILL) {
315*f4ee7fbaSAndroid Build Coastguard Worker     durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
316*f4ee7fbaSAndroid Build Coastguard Worker     writeSamples(argv, pathArgs, sizes, data.data());
317*f4ee7fbaSAndroid Build Coastguard Worker   } else if (method == METHOD_PURIFY) {
318*f4ee7fbaSAndroid Build Coastguard Worker     durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
319*f4ee7fbaSAndroid Build Coastguard Worker     writeSamples(argv, pathArgs, sizes, data.data());
320*f4ee7fbaSAndroid Build Coastguard Worker   } else {
321*f4ee7fbaSAndroid Build Coastguard Worker     printHelp(fileName(argv[0]));
322*f4ee7fbaSAndroid Build Coastguard Worker     fprintf(stderr, "Unknown generator\n");
323*f4ee7fbaSAndroid Build Coastguard Worker     exit(1);
324*f4ee7fbaSAndroid Build Coastguard Worker   }
325*f4ee7fbaSAndroid Build Coastguard Worker   return 0;
326*f4ee7fbaSAndroid Build Coastguard Worker }
327