1 /* bzcat.c - bzip2 decompression
2 *
3 * Copyright 2003, 2007 Rob Landley <[email protected]>
4 *
5 * Based on a close reading (but not the actual code) of the original bzip2
6 * decompression code by Julian R Seward ([email protected]), which also
7 * acknowledges contributions by Mike Burrows, David Wheeler, Peter Fenwick,
8 * Alistair Moffat, Radford Neal, Ian H. Witten, Robert Sedgewick, and
9 * Jon L. Bentley.
10 *
11 * No standard.
12
13
14 USE_BZCAT(NEWTOY(bzcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
15 USE_BUNZIP2(NEWTOY(bunzip2, "cftkv", TOYFLAG_USR|TOYFLAG_BIN))
16
17 config BUNZIP2
18 bool "bunzip2"
19 default y
20 help
21 usage: bunzip2 [-cftkv] [FILE...]
22
23 Decompress listed files (file.bz becomes file) deleting archive file(s).
24 Read from stdin if no files listed.
25
26 -c Force output to stdout
27 -f Force decompression (if FILE doesn't end in .bz, replace original)
28 -k Keep input files (-c and -t imply this)
29 -t Test integrity
30 -v Verbose
31
32 config BZCAT
33 bool "bzcat"
34 default y
35 help
36 usage: bzcat [FILE...]
37
38 Decompress listed files to stdout. Use stdin if no files listed.
39 */
40
41 #define FOR_bunzip2
42 #include "toys.h"
43
44 #define THREADS 1
45
46 // Constants for huffman coding
47 #define MAX_GROUPS 6
48 #define GROUP_SIZE 50 /* 64 would have been more efficient */
49 #define MAX_HUFCODE_BITS 20 /* Longest huffman code allowed */
50 #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
51 #define SYMBOL_RUNA 0
52 #define SYMBOL_RUNB 1
53
54 // Other housekeeping constants
55 #define IOBUF_SIZE 4096
56
57 // Status return values
58 #define RETVAL_LAST_BLOCK (-100)
59 #define RETVAL_NOT_BZIP_DATA (-1)
60 #define RETVAL_DATA_ERROR (-2)
61 #define RETVAL_OBSOLETE_INPUT (-3)
62 #define RETVAL_EOF_IN (-4)
63 #define RETVAL_EOF_OUT (-5)
64
65 // This is what we know about each huffman coding group
66 struct group_data {
67 int limit[MAX_HUFCODE_BITS+1], base[MAX_HUFCODE_BITS], permute[MAX_SYMBOLS];
68 char minLen, maxLen;
69 };
70
71 // Data for burrows wheeler transform
72
73 struct bwdata {
74 unsigned int origPtr;
75 int byteCount[256];
76 // State saved when interrupting output
77 int writePos, writeRun, writeCount, writeCurrent;
78 unsigned int dataCRC, headerCRC;
79 unsigned int *dbuf;
80 };
81
82 // Structure holding all the housekeeping data, including IO buffers and
83 // memory that persists between calls to bunzip
84 struct bunzip_data {
85 // Input stream, input buffer, input bit buffer
86 int in_fd, inbufCount, inbufPos;
87 char *inbuf;
88 unsigned int inbufBitCount, inbufBits;
89
90 // Output buffer
91 char outbuf[IOBUF_SIZE];
92 int outbufPos;
93
94 unsigned int totalCRC;
95
96 // First pass decompression data (Huffman and MTF decoding)
97 char selectors[32768]; // nSelectors=15 bits
98 struct group_data groups[MAX_GROUPS]; // huffman coding tables
99 int symTotal, groupCount, nSelectors;
100 unsigned char symToByte[256], mtfSymbol[256];
101
102 // The CRC values stored in the block header and calculated from the data
103 unsigned int crc32Table[256];
104
105 // Second pass decompression data (burrows-wheeler transform)
106 unsigned int dbufSize;
107 struct bwdata bwdata[THREADS];
108 };
109
110 // Return the next nnn bits of input. All reads from the compressed input
111 // are done through this function. All reads are big endian.
get_bits(struct bunzip_data * bd,char bits_wanted)112 static unsigned int get_bits(struct bunzip_data *bd, char bits_wanted)
113 {
114 unsigned int bits = 0;
115
116 // If we need to get more data from the byte buffer, do so. (Loop getting
117 // one byte at a time to enforce endianness and avoid unaligned access.)
118 while (bd->inbufBitCount < bits_wanted) {
119
120 // If we need to read more data from file into byte buffer, do so
121 if (bd->inbufPos == bd->inbufCount) {
122 if (0 >= (bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE)))
123 longjmp((void *)toybuf, RETVAL_EOF_IN);
124 bd->inbufPos = 0;
125 }
126
127 // Avoid 32-bit overflow (dump bit buffer to top of output)
128 if (bd->inbufBitCount>=24) {
129 bits = bd->inbufBits&((1<<bd->inbufBitCount)-1);
130 bits_wanted -= bd->inbufBitCount;
131 bits <<= bits_wanted;
132 bd->inbufBitCount = 0;
133 }
134
135 // Grab next 8 bits of input from buffer.
136 bd->inbufBits = (bd->inbufBits<<8) | bd->inbuf[bd->inbufPos++];
137 bd->inbufBitCount += 8;
138 }
139
140 // Calculate result
141 bd->inbufBitCount -= bits_wanted;
142 bits |= (bd->inbufBits>>bd->inbufBitCount) & ((1<<bits_wanted)-1);
143
144 return bits;
145 }
146
147 /* Read block header at start of a new compressed data block. Consists of:
148 *
149 * 48 bits : Block signature, either pi (data block) or e (EOF block).
150 * 32 bits : bw->headerCRC
151 * 1 bit : obsolete feature flag.
152 * 24 bits : origPtr (Burrows-wheeler unwind index, only 20 bits ever used)
153 * 16 bits : Mapping table index.
154 *[16 bits]: symToByte[symTotal] (Mapping table. For each bit set in mapping
155 * table index above, read another 16 bits of mapping table data.
156 * If correspondig bit is unset, all bits in that mapping table
157 * section are 0.)
158 * 3 bits : groupCount (how many huffman tables used to encode, anywhere
159 * from 2 to MAX_GROUPS)
160 * variable: hufGroup[groupCount] (MTF encoded huffman table data.)
161 */
162
read_block_header(struct bunzip_data * bd,struct bwdata * bw)163 static int read_block_header(struct bunzip_data *bd, struct bwdata *bw)
164 {
165 struct group_data *hufGroup;
166 int hh, ii, jj, kk, symCount, *base, *limit;
167 unsigned char uc;
168
169 // Read in header signature and CRC (which is stored big endian)
170 ii = get_bits(bd, 24);
171 jj = get_bits(bd, 24);
172 bw->headerCRC = get_bits(bd,32);
173
174 // Is this the EOF block with CRC for whole file? (Constant is "e")
175 if (ii==0x177245 && jj==0x385090) return RETVAL_LAST_BLOCK;
176
177 // Is this a valid data block? (Constant is "pi".)
178 if (ii!=0x314159 || jj!=0x265359) return RETVAL_NOT_BZIP_DATA;
179
180 // We can add support for blockRandomised if anybody complains.
181 if (get_bits(bd,1)) return RETVAL_OBSOLETE_INPUT;
182 if ((bw->origPtr = get_bits(bd,24)) > bd->dbufSize) return RETVAL_DATA_ERROR;
183
184 // mapping table: if some byte values are never used (encoding things
185 // like ascii text), the compression code removes the gaps to have fewer
186 // symbols to deal with, and writes a sparse bitfield indicating which
187 // values were present. We make a translation table to convert the symbols
188 // back to the corresponding bytes.
189 hh = get_bits(bd, 16);
190 bd->symTotal = 0;
191 for (ii=0; ii<16; ii++) {
192 if (hh & (1 << (15 - ii))) {
193 kk = get_bits(bd, 16);
194 for (jj=0; jj<16; jj++)
195 if (kk & (1 << (15 - jj)))
196 bd->symToByte[bd->symTotal++] = (16 * ii) + jj;
197 }
198 }
199
200 // How many different huffman coding groups does this block use?
201 bd->groupCount = get_bits(bd,3);
202 if (bd->groupCount<2 || bd->groupCount>MAX_GROUPS) return RETVAL_DATA_ERROR;
203
204 // nSelectors: Every GROUP_SIZE many symbols we switch huffman coding
205 // tables. Each group has a selector, which is an index into the huffman
206 // coding table arrays.
207 //
208 // Read in the group selector array, which is stored as MTF encoded
209 // bit runs. (MTF = Move To Front. Every time a symbol occurs it's moved
210 // to the front of the table, so it has a shorter encoding next time.)
211 if (!(bd->nSelectors = get_bits(bd, 15))) return RETVAL_DATA_ERROR;
212 for (ii=0; ii<bd->groupCount; ii++) bd->mtfSymbol[ii] = ii;
213 for (ii=0; ii<bd->nSelectors; ii++) {
214
215 // Get next value
216 for(jj=0;get_bits(bd,1);jj++)
217 if (jj>=bd->groupCount) return RETVAL_DATA_ERROR;
218
219 // Decode MTF to get the next selector, and move it to the front.
220 uc = bd->mtfSymbol[jj];
221 memmove(bd->mtfSymbol+1, bd->mtfSymbol, jj);
222 bd->mtfSymbol[0] = bd->selectors[ii] = uc;
223 }
224
225 // Read the huffman coding tables for each group, which code for symTotal
226 // literal symbols, plus two run symbols (RUNA, RUNB)
227 symCount = bd->symTotal+2;
228 for (jj=0; jj<bd->groupCount; jj++) {
229 unsigned char length[MAX_SYMBOLS];
230 unsigned temp[MAX_HUFCODE_BITS+1];
231 int minLen, maxLen, pp;
232
233 // Read lengths
234 hh = get_bits(bd, 5);
235 for (ii = 0; ii < symCount; ii++) {
236 for(;;) {
237 // !hh || hh > MAX_HUFCODE_BITS in one test.
238 if (MAX_HUFCODE_BITS-1 < (unsigned)hh-1) return RETVAL_DATA_ERROR;
239 // Grab 2 bits instead of 1 (slightly smaller/faster). Stop if
240 // first bit is 0, otherwise second bit says whether to
241 // increment or decrement.
242 kk = get_bits(bd, 2);
243 if (kk & 2) hh += 1 - ((kk&1)<<1);
244 else {
245 bd->inbufBitCount++;
246 break;
247 }
248 }
249 length[ii] = hh;
250 }
251
252 // Find largest and smallest lengths in this group
253 minLen = maxLen = length[0];
254 for (ii = 1; ii < symCount; ii++) {
255 if(length[ii] > maxLen) maxLen = length[ii];
256 else if(length[ii] < minLen) minLen = length[ii];
257 }
258
259 /* Calculate permute[], base[], and limit[] tables from length[].
260 *
261 * permute[] is the lookup table for converting huffman coded symbols
262 * into decoded symbols. It contains symbol values sorted by length.
263 *
264 * base[] is the amount to subtract from the value of a huffman symbol
265 * of a given length when using permute[].
266 *
267 * limit[] indicates the largest numerical value a symbol with a given
268 * number of bits can have. It lets us know when to stop reading.
269 *
270 * To use these, keep reading bits until value <= limit[bitcount] or
271 * you've read over 20 bits (error). Then the decoded symbol
272 * equals permute[hufcode_value - base[hufcode_bitcount]].
273 */
274 hufGroup = bd->groups+jj;
275 hufGroup->minLen = minLen;
276 hufGroup->maxLen = maxLen;
277
278 // Note that minLen can't be smaller than 1, so we adjust the base
279 // and limit array pointers so we're not always wasting the first
280 // entry. We do this again when using them (during symbol decoding).
281 base = hufGroup->base-1;
282 limit = hufGroup->limit-1;
283
284 // zero temp[] and limit[], and calculate permute[]
285 pp = 0;
286 for (ii = minLen; ii <= maxLen; ii++) {
287 temp[ii] = limit[ii] = 0;
288 for (hh = 0; hh < symCount; hh++)
289 if (length[hh] == ii) hufGroup->permute[pp++] = hh;
290 }
291
292 // Count symbols coded for at each bit length
293 for (ii = 0; ii < symCount; ii++) temp[length[ii]]++;
294
295 /* Calculate limit[] (the largest symbol-coding value at each bit
296 * length, which is (previous limit<<1)+symbols at this level), and
297 * base[] (number of symbols to ignore at each bit length, which is
298 * limit minus the cumulative count of symbols coded for already). */
299 pp = hh = 0;
300 for (ii = minLen; ii < maxLen; ii++) {
301 pp += temp[ii];
302 limit[ii] = pp-1;
303 pp <<= 1;
304 base[ii+1] = pp-(hh+=temp[ii]);
305 }
306 limit[maxLen] = pp+temp[maxLen]-1;
307 limit[maxLen+1] = INT_MAX;
308 base[minLen] = 0;
309 }
310
311 return 0;
312 }
313
314 /* First pass, read block's symbols into dbuf[dbufCount].
315 *
316 * This undoes three types of compression: huffman coding, run length encoding,
317 * and move to front encoding. We have to undo all those to know when we've
318 * read enough input.
319 */
320
read_huffman_data(struct bunzip_data * bd,struct bwdata * bw)321 static int read_huffman_data(struct bunzip_data *bd, struct bwdata *bw)
322 {
323 struct group_data *hufGroup;
324 int ii, jj, kk, runPos, dbufCount, symCount, selector, nextSym,
325 *byteCount, *base, *limit;
326 unsigned hh, *dbuf = bw->dbuf;
327 unsigned char uc;
328
329 // We've finished reading and digesting the block header. Now read this
330 // block's huffman coded symbols from the file and undo the huffman coding
331 // and run length encoding, saving the result into dbuf[dbufCount++] = uc
332
333 // Initialize symbol occurrence counters and symbol mtf table
334 byteCount = bw->byteCount;
335 for(ii=0; ii<256; ii++) {
336 byteCount[ii] = 0;
337 bd->mtfSymbol[ii] = ii;
338 }
339
340 // Loop through compressed symbols. This is the first "tight inner loop"
341 // that needs to be micro-optimized for speed. (This one fills out dbuf[]
342 // linearly, staying in cache more, so isn't as limited by DRAM access.)
343 runPos = dbufCount = symCount = selector = 0;
344 // Some unnecessary initializations to shut gcc up.
345 base = limit = 0;
346 hufGroup = 0;
347 hh = 0;
348
349 for (;;) {
350 // Have we reached the end of this huffman group?
351 if (!(symCount--)) {
352 // Determine which huffman coding group to use.
353 symCount = GROUP_SIZE-1;
354 if (selector >= bd->nSelectors) return RETVAL_DATA_ERROR;
355 hufGroup = bd->groups + bd->selectors[selector++];
356 base = hufGroup->base-1;
357 limit = hufGroup->limit-1;
358 }
359
360 // Read next huffman-coded symbol (into jj).
361 ii = hufGroup->minLen;
362 jj = get_bits(bd, ii);
363 while (jj > limit[ii]) {
364 // if (ii > hufGroup->maxLen) return RETVAL_DATA_ERROR;
365 ii++;
366
367 // Unroll get_bits() to avoid a function call when the data's in
368 // the buffer already.
369 kk = bd->inbufBitCount
370 ? (bd->inbufBits >> --(bd->inbufBitCount)) & 1 : get_bits(bd, 1);
371 jj = (jj << 1) | kk;
372 }
373 // Huffman decode jj into nextSym (with bounds checking)
374 jj-=base[ii];
375
376 if (ii > hufGroup->maxLen || (unsigned)jj >= MAX_SYMBOLS)
377 return RETVAL_DATA_ERROR;
378 nextSym = hufGroup->permute[jj];
379
380 // If this is a repeated run, loop collecting data
381 if ((unsigned)nextSym <= SYMBOL_RUNB) {
382 // If this is the start of a new run, zero out counter
383 if(!runPos) {
384 runPos = 1;
385 hh = 0;
386 }
387
388 /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
389 each bit position, add 1 or 2 instead. For example,
390 1011 is 1<<0 + 1<<1 + 2<<2. 1010 is 2<<0 + 2<<1 + 1<<2.
391 You can make any bit pattern that way using 1 less symbol than
392 the basic or 0/1 method (except all bits 0, which would use no
393 symbols, but a run of length 0 doesn't mean anything in this
394 context). Thus space is saved. */
395 hh += (runPos << nextSym); // +runPos if RUNA; +2*runPos if RUNB
396 runPos <<= 1;
397 continue;
398 }
399
400 /* When we hit the first non-run symbol after a run, we now know
401 how many times to repeat the last literal, so append that many
402 copies to our buffer of decoded symbols (dbuf) now. (The last
403 literal used is the one at the head of the mtfSymbol array.) */
404 if (runPos) {
405 runPos = 0;
406 // Check for integer overflow
407 if (hh>bd->dbufSize || dbufCount+hh>bd->dbufSize)
408 return RETVAL_DATA_ERROR;
409
410 uc = bd->symToByte[bd->mtfSymbol[0]];
411 byteCount[uc] += hh;
412 while (hh--) dbuf[dbufCount++] = uc;
413 }
414
415 // Is this the terminating symbol?
416 if (nextSym>bd->symTotal) break;
417
418 /* At this point, the symbol we just decoded indicates a new literal
419 character. Subtract one to get the position in the MTF array
420 at which this literal is currently to be found. (Note that the
421 result can't be -1 or 0, because 0 and 1 are RUNA and RUNB.
422 Another instance of the first symbol in the mtf array, position 0,
423 would have been handled as part of a run.) */
424 if (dbufCount>=bd->dbufSize) return RETVAL_DATA_ERROR;
425 ii = nextSym - 1;
426 uc = bd->mtfSymbol[ii];
427 // On my laptop, unrolling this memmove() into a loop shaves 3.5% off
428 // the total running time.
429 while(ii--) bd->mtfSymbol[ii+1] = bd->mtfSymbol[ii];
430 bd->mtfSymbol[0] = uc;
431 uc = bd->symToByte[uc];
432
433 // We have our literal byte. Save it into dbuf.
434 byteCount[uc]++;
435 dbuf[dbufCount++] = (unsigned int)uc;
436 }
437
438 // Now we know what dbufCount is, do a better sanity check on origPtr.
439 if (bw->origPtr >= (bw->writeCount = dbufCount)) return RETVAL_DATA_ERROR;
440
441 return 0;
442 }
443
444 // Flush output buffer to disk
flush_bunzip_outbuf(struct bunzip_data * bd,int out_fd)445 static int flush_bunzip_outbuf(struct bunzip_data *bd, int out_fd)
446 {
447 if (bd->outbufPos) {
448 if (write(out_fd, bd->outbuf, bd->outbufPos) != bd->outbufPos)
449 return RETVAL_EOF_OUT;
450 bd->outbufPos = 0;
451 }
452
453 return 0;
454 }
455
burrows_wheeler_prep(struct bunzip_data * bd,struct bwdata * bw)456 static void burrows_wheeler_prep(struct bunzip_data *bd, struct bwdata *bw)
457 {
458 int ii, jj;
459 unsigned int *dbuf = bw->dbuf;
460 int *byteCount = bw->byteCount;
461
462 // Turn byteCount into cumulative occurrence counts of 0 to n-1.
463 jj = 0;
464 for (ii=0; ii<256; ii++) {
465 int kk = jj + byteCount[ii];
466 byteCount[ii] = jj;
467 jj = kk;
468 }
469
470 // Use occurrence counts to quickly figure out what order dbuf would be in
471 // if we sorted it.
472 for (ii=0; ii < bw->writeCount; ii++) {
473 unsigned char uc = dbuf[ii];
474 dbuf[byteCount[uc]] |= (ii << 8);
475 byteCount[uc]++;
476 }
477
478 // blockRandomised support would go here.
479
480 // Using ii as position, jj as previous character, hh as current character,
481 // and uc as run count.
482 bw->dataCRC = 0xffffffffL;
483
484 /* Decode first byte by hand to initialize "previous" byte. Note that it
485 doesn't get output, and if the first three characters are identical
486 it doesn't qualify as a run (hence uc=255, which will either wrap
487 to 1 or get reset). */
488 if (bw->writeCount) {
489 bw->writePos = dbuf[bw->origPtr];
490 bw->writeCurrent = (unsigned char)bw->writePos;
491 bw->writePos >>= 8;
492 bw->writeRun = -1;
493 }
494 }
495
496 // Decompress a block of text to intermediate buffer
read_bunzip_data(struct bunzip_data * bd)497 static int read_bunzip_data(struct bunzip_data *bd)
498 {
499 int rc = read_block_header(bd, bd->bwdata);
500 if (!rc) rc=read_huffman_data(bd, bd->bwdata);
501
502 // First thing that can be done by a background thread.
503 burrows_wheeler_prep(bd, bd->bwdata);
504
505 return rc;
506 }
507
508 // Undo burrows-wheeler transform on intermediate buffer to produce output.
509 // If !len, write up to len bytes of data to buf. Otherwise write to out_fd.
510 // Returns len ? bytes written : 0. Notice all errors are negative #'s.
511 //
512 // Burrows-wheeler transform is described at:
513 // http://dogma.net/markn/articles/bwt/bwt.htm
514 // http://marknelson.us/1996/09/01/bwt/
515
write_bunzip_data(struct bunzip_data * bd,struct bwdata * bw,int out_fd,char * outbuf,int len)516 static int write_bunzip_data(struct bunzip_data *bd, struct bwdata *bw,
517 int out_fd, char *outbuf, int len)
518 {
519 unsigned int *dbuf = bw->dbuf;
520 int count, pos, current, run, copies, outbyte, previous, gotcount = 0;
521
522 for (;;) {
523 // If last read was short due to end of file, return last block now
524 if (bw->writeCount < 0) return bw->writeCount;
525
526 // If we need to refill dbuf, do it.
527 if (!bw->writeCount) {
528 int i = read_bunzip_data(bd);
529 if (i) {
530 if (i == RETVAL_LAST_BLOCK) {
531 bw->writeCount = i;
532 return gotcount;
533 } else return i;
534 }
535 }
536
537 // loop generating output
538 count = bw->writeCount;
539 pos = bw->writePos;
540 current = bw->writeCurrent;
541 run = bw->writeRun;
542 while (count) {
543
544 // If somebody (like tar) wants a certain number of bytes of
545 // data from memory instead of written to a file, humor them.
546 if (len && bd->outbufPos >= len) goto dataus_interruptus;
547 count--;
548
549 // Follow sequence vector to undo Burrows-Wheeler transform.
550 previous = current;
551 pos = dbuf[pos];
552 current = pos&0xff;
553 pos >>= 8;
554
555 // Whenever we see 3 consecutive copies of the same byte,
556 // the 4th is a repeat count
557 if (run++ == 3) {
558 copies = current;
559 outbyte = previous;
560 current = -1;
561 } else {
562 copies = 1;
563 outbyte = current;
564 }
565
566 // Output bytes to buffer, flushing to file if necessary
567 while (copies--) {
568 if (bd->outbufPos == IOBUF_SIZE && flush_bunzip_outbuf(bd, out_fd))
569 return RETVAL_EOF_OUT;
570 bd->outbuf[bd->outbufPos++] = outbyte;
571 bw->dataCRC = (bw->dataCRC << 8)
572 ^ bd->crc32Table[(bw->dataCRC >> 24) ^ outbyte];
573 }
574 if (current != previous) run=0;
575 }
576
577 // decompression of this block completed successfully
578 bw->dataCRC = ~(bw->dataCRC);
579 bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bw->dataCRC;
580
581 // if this block had a crc error, force file level crc error.
582 if (bw->dataCRC != bw->headerCRC) {
583 bd->totalCRC = bw->headerCRC+1;
584
585 return RETVAL_LAST_BLOCK;
586 }
587 dataus_interruptus:
588 bw->writeCount = count;
589 if (len) {
590 gotcount += bd->outbufPos;
591 memcpy(outbuf, bd->outbuf, len);
592
593 // If we got enough data, checkpoint loop state and return
594 if ((len -= bd->outbufPos)<1) {
595 bd->outbufPos -= len;
596 if (bd->outbufPos) memmove(bd->outbuf, bd->outbuf+len, bd->outbufPos);
597 bw->writePos = pos;
598 bw->writeCurrent = current;
599 bw->writeRun = run;
600
601 return gotcount;
602 }
603 }
604 }
605 }
606
607 // Allocate the structure, read file header. If !len, src_fd contains
608 // filehandle to read from. Else inbuf contains data.
start_bunzip(struct bunzip_data ** bdp,int src_fd,char * inbuf,int len)609 static int start_bunzip(struct bunzip_data **bdp, int src_fd, char *inbuf,
610 int len)
611 {
612 struct bunzip_data *bd;
613 unsigned int i;
614
615 // Figure out how much data to allocate.
616 i = sizeof(struct bunzip_data);
617 if (!len) i += IOBUF_SIZE;
618
619 // Allocate bunzip_data. Most fields initialize to zero.
620 bd = *bdp = xzalloc(i);
621 if (len) {
622 bd->inbuf = inbuf;
623 bd->inbufCount = len;
624 bd->in_fd = -1;
625 } else {
626 bd->inbuf = (char *)(bd+1);
627 bd->in_fd = src_fd;
628 }
629
630 crc_init(bd->crc32Table, 0);
631
632 // Ensure that file starts with "BZh".
633 for (i=0;i<3;i++) if (get_bits(bd,8)!="BZh"[i]) return RETVAL_NOT_BZIP_DATA;
634
635 // Next byte ascii '1'-'9', indicates block size in units of 100k of
636 // uncompressed data. Allocate intermediate buffer for block.
637 i = get_bits(bd, 8);
638 if (i<'1' || i>'9') return RETVAL_NOT_BZIP_DATA;
639 bd->dbufSize = 100000*(i-'0')*THREADS;
640 for (i=0; i<THREADS; i++)
641 bd->bwdata[i].dbuf = xmalloc(bd->dbufSize * sizeof(int));
642
643 return 0;
644 }
645
646 // Example usage: decompress src_fd to dst_fd. (Stops at end of bzip data,
647 // not end of file.)
bunzipStream(int src_fd,int dst_fd)648 static char *bunzipStream(int src_fd, int dst_fd)
649 {
650 struct bunzip_data *bd;
651 char *bunzip_errors[] = {0, "not bzip", "bad data", "old format", "in EOF",
652 "out EOF"};
653 int i, j;
654
655 if (!(i = setjmp((void *)toybuf)) && !(i = start_bunzip(&bd, src_fd, 0, 0))) {
656 i = write_bunzip_data(bd, bd->bwdata, dst_fd, 0, 0);
657 if (i==RETVAL_LAST_BLOCK) {
658 if (bd->bwdata[0].headerCRC==bd->totalCRC) i = 0;
659 else i = RETVAL_DATA_ERROR;
660 }
661 }
662 if (!i) i = flush_bunzip_outbuf(bd, dst_fd);
663
664 for (j=0; j<THREADS; j++) free(bd->bwdata[j].dbuf);
665 free(bd);
666
667 return bunzip_errors[-i];
668 }
669
do_bzcat(int fd,char * name)670 static void do_bzcat(int fd, char *name)
671 {
672 char *err = bunzipStream(fd, 1);
673
674 if (err) {
675 // Exit silently for "out EOF" because pipelines.
676 if (err[1] != 'u') error_exit_raw(err);
677 toys.exitval = 1;
678 xexit();
679 }
680 }
681
bzcat_main(void)682 void bzcat_main(void)
683 {
684 loopfiles(toys.optargs, do_bzcat);
685 }
686
do_bunzip2(int fd,char * name)687 static void do_bunzip2(int fd, char *name)
688 {
689 int outfd = 1, rename = 0, len = strlen(name);
690 char *tmp, *err, *dotbz = 0;
691
692 // Trim off .bz or .bz2 extension
693 dotbz = name+len-3;
694 if ((len>3 && !strcmp(dotbz, ".bz")) || (len>4 && !strcmp(--dotbz, ".bz2")))
695 dotbz = 0;
696
697 // For - no replace
698 if (FLAG(t)) outfd = xopen("/dev/null", O_WRONLY);
699 else if ((fd || strcmp(name, "-")) && !FLAG(c)) {
700 if (FLAG(k) && (!dotbz || !access(name, X_OK)))
701 return error_msg("%s exists", name);
702 outfd = copy_tempfile(fd, name, &tmp);
703 rename++;
704 }
705
706 if (FLAG(v)) printf("%s:", name);
707 err = bunzipStream(fd, outfd);
708 if (FLAG(v)) {
709 printf("%s\n", err ? : "ok");
710 toys.exitval |= !!err;
711 } else if (err) error_msg_raw(err);
712
713 // can't test outfd==1 because may have been called with stdin+stdout closed
714 if (rename) {
715 if (FLAG(k)) {
716 free(tmp);
717 tmp = 0;
718 } else if (!err) {
719 if (dotbz) *dotbz = '.';
720 if (!unlink(name)) perror_msg_raw(name);
721 }
722 (err ? delete_tempfile : replace_tempfile)(-1, outfd, &tmp);
723 }
724 }
725
bunzip2_main(void)726 void bunzip2_main(void)
727 {
728 loopfiles(toys.optargs, do_bunzip2);
729 }
730