xref: /aosp_15_r20/external/toybox/toys/other/bzcat.c (revision cf5a6c84e2b8763fc1a7db14496fd4742913b199)
1 /* bzcat.c - bzip2 decompression
2  *
3  * Copyright 2003, 2007 Rob Landley <[email protected]>
4  *
5  * Based on a close reading (but not the actual code) of the original bzip2
6  * decompression code by Julian R Seward ([email protected]), which also
7  * acknowledges contributions by Mike Burrows, David Wheeler, Peter Fenwick,
8  * Alistair Moffat, Radford Neal, Ian H. Witten, Robert Sedgewick, and
9  * Jon L. Bentley.
10  *
11  * No standard.
12 
13 
14 USE_BZCAT(NEWTOY(bzcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
15 USE_BUNZIP2(NEWTOY(bunzip2, "cftkv", TOYFLAG_USR|TOYFLAG_BIN))
16 
17 config BUNZIP2
18   bool "bunzip2"
19   default y
20   help
21     usage: bunzip2 [-cftkv] [FILE...]
22 
23     Decompress listed files (file.bz becomes file) deleting archive file(s).
24     Read from stdin if no files listed.
25 
26     -c	Force output to stdout
27     -f	Force decompression (if FILE doesn't end in .bz, replace original)
28     -k	Keep input files (-c and -t imply this)
29     -t	Test integrity
30     -v	Verbose
31 
32 config BZCAT
33   bool "bzcat"
34   default y
35   help
36     usage: bzcat [FILE...]
37 
38     Decompress listed files to stdout. Use stdin if no files listed.
39 */
40 
41 #define FOR_bunzip2
42 #include "toys.h"
43 
44 #define THREADS 1
45 
46 // Constants for huffman coding
47 #define MAX_GROUPS               6
48 #define GROUP_SIZE               50     /* 64 would have been more efficient */
49 #define MAX_HUFCODE_BITS         20     /* Longest huffman code allowed */
50 #define MAX_SYMBOLS              258    /* 256 literals + RUNA + RUNB */
51 #define SYMBOL_RUNA              0
52 #define SYMBOL_RUNB              1
53 
54 // Other housekeeping constants
55 #define IOBUF_SIZE               4096
56 
57 // Status return values
58 #define RETVAL_LAST_BLOCK        (-100)
59 #define RETVAL_NOT_BZIP_DATA     (-1)
60 #define RETVAL_DATA_ERROR        (-2)
61 #define RETVAL_OBSOLETE_INPUT    (-3)
62 #define RETVAL_EOF_IN            (-4)
63 #define RETVAL_EOF_OUT           (-5)
64 
65 // This is what we know about each huffman coding group
66 struct group_data {
67   int limit[MAX_HUFCODE_BITS+1], base[MAX_HUFCODE_BITS], permute[MAX_SYMBOLS];
68   char minLen, maxLen;
69 };
70 
71 // Data for burrows wheeler transform
72 
73 struct bwdata {
74   unsigned int origPtr;
75   int byteCount[256];
76   // State saved when interrupting output
77   int writePos, writeRun, writeCount, writeCurrent;
78   unsigned int dataCRC, headerCRC;
79   unsigned int *dbuf;
80 };
81 
82 // Structure holding all the housekeeping data, including IO buffers and
83 // memory that persists between calls to bunzip
84 struct bunzip_data {
85   // Input stream, input buffer, input bit buffer
86   int in_fd, inbufCount, inbufPos;
87   char *inbuf;
88   unsigned int inbufBitCount, inbufBits;
89 
90   // Output buffer
91   char outbuf[IOBUF_SIZE];
92   int outbufPos;
93 
94   unsigned int totalCRC;
95 
96   // First pass decompression data (Huffman and MTF decoding)
97   char selectors[32768];                  // nSelectors=15 bits
98   struct group_data groups[MAX_GROUPS];   // huffman coding tables
99   int symTotal, groupCount, nSelectors;
100   unsigned char symToByte[256], mtfSymbol[256];
101 
102   // The CRC values stored in the block header and calculated from the data
103   unsigned int crc32Table[256];
104 
105   // Second pass decompression data (burrows-wheeler transform)
106   unsigned int dbufSize;
107   struct bwdata bwdata[THREADS];
108 };
109 
110 // Return the next nnn bits of input.  All reads from the compressed input
111 // are done through this function.  All reads are big endian.
get_bits(struct bunzip_data * bd,char bits_wanted)112 static unsigned int get_bits(struct bunzip_data *bd, char bits_wanted)
113 {
114   unsigned int bits = 0;
115 
116   // If we need to get more data from the byte buffer, do so.  (Loop getting
117   // one byte at a time to enforce endianness and avoid unaligned access.)
118   while (bd->inbufBitCount < bits_wanted) {
119 
120     // If we need to read more data from file into byte buffer, do so
121     if (bd->inbufPos == bd->inbufCount) {
122       if (0 >= (bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE)))
123         longjmp((void *)toybuf, RETVAL_EOF_IN);
124       bd->inbufPos = 0;
125     }
126 
127     // Avoid 32-bit overflow (dump bit buffer to top of output)
128     if (bd->inbufBitCount>=24) {
129       bits = bd->inbufBits&((1<<bd->inbufBitCount)-1);
130       bits_wanted -= bd->inbufBitCount;
131       bits <<= bits_wanted;
132       bd->inbufBitCount = 0;
133     }
134 
135     // Grab next 8 bits of input from buffer.
136     bd->inbufBits = (bd->inbufBits<<8) | bd->inbuf[bd->inbufPos++];
137     bd->inbufBitCount += 8;
138   }
139 
140   // Calculate result
141   bd->inbufBitCount -= bits_wanted;
142   bits |= (bd->inbufBits>>bd->inbufBitCount) & ((1<<bits_wanted)-1);
143 
144   return bits;
145 }
146 
147 /* Read block header at start of a new compressed data block.  Consists of:
148  *
149  * 48 bits : Block signature, either pi (data block) or e (EOF block).
150  * 32 bits : bw->headerCRC
151  * 1  bit  : obsolete feature flag.
152  * 24 bits : origPtr (Burrows-wheeler unwind index, only 20 bits ever used)
153  * 16 bits : Mapping table index.
154  *[16 bits]: symToByte[symTotal] (Mapping table.  For each bit set in mapping
155  *           table index above, read another 16 bits of mapping table data.
156  *           If correspondig bit is unset, all bits in that mapping table
157  *           section are 0.)
158  *  3 bits : groupCount (how many huffman tables used to encode, anywhere
159  *           from 2 to MAX_GROUPS)
160  * variable: hufGroup[groupCount] (MTF encoded huffman table data.)
161  */
162 
read_block_header(struct bunzip_data * bd,struct bwdata * bw)163 static int read_block_header(struct bunzip_data *bd, struct bwdata *bw)
164 {
165   struct group_data *hufGroup;
166   int hh, ii, jj, kk, symCount, *base, *limit;
167   unsigned char uc;
168 
169   // Read in header signature and CRC (which is stored big endian)
170   ii = get_bits(bd, 24);
171   jj = get_bits(bd, 24);
172   bw->headerCRC = get_bits(bd,32);
173 
174   // Is this the EOF block with CRC for whole file?  (Constant is "e")
175   if (ii==0x177245 && jj==0x385090) return RETVAL_LAST_BLOCK;
176 
177   // Is this a valid data block?  (Constant is "pi".)
178   if (ii!=0x314159 || jj!=0x265359) return RETVAL_NOT_BZIP_DATA;
179 
180   // We can add support for blockRandomised if anybody complains.
181   if (get_bits(bd,1)) return RETVAL_OBSOLETE_INPUT;
182   if ((bw->origPtr = get_bits(bd,24)) > bd->dbufSize) return RETVAL_DATA_ERROR;
183 
184   // mapping table: if some byte values are never used (encoding things
185   // like ascii text), the compression code removes the gaps to have fewer
186   // symbols to deal with, and writes a sparse bitfield indicating which
187   // values were present.  We make a translation table to convert the symbols
188   // back to the corresponding bytes.
189   hh = get_bits(bd, 16);
190   bd->symTotal = 0;
191   for (ii=0; ii<16; ii++) {
192     if (hh & (1 << (15 - ii))) {
193       kk = get_bits(bd, 16);
194       for (jj=0; jj<16; jj++)
195         if (kk & (1 << (15 - jj)))
196           bd->symToByte[bd->symTotal++] = (16 * ii) + jj;
197     }
198   }
199 
200   // How many different huffman coding groups does this block use?
201   bd->groupCount = get_bits(bd,3);
202   if (bd->groupCount<2 || bd->groupCount>MAX_GROUPS) return RETVAL_DATA_ERROR;
203 
204   // nSelectors: Every GROUP_SIZE many symbols we switch huffman coding
205   // tables.  Each group has a selector, which is an index into the huffman
206   // coding table arrays.
207   //
208   // Read in the group selector array, which is stored as MTF encoded
209   // bit runs.  (MTF = Move To Front.  Every time a symbol occurs it's moved
210   // to the front of the table, so it has a shorter encoding next time.)
211   if (!(bd->nSelectors = get_bits(bd, 15))) return RETVAL_DATA_ERROR;
212   for (ii=0; ii<bd->groupCount; ii++) bd->mtfSymbol[ii] = ii;
213   for (ii=0; ii<bd->nSelectors; ii++) {
214 
215     // Get next value
216     for(jj=0;get_bits(bd,1);jj++)
217       if (jj>=bd->groupCount) return RETVAL_DATA_ERROR;
218 
219     // Decode MTF to get the next selector, and move it to the front.
220     uc = bd->mtfSymbol[jj];
221     memmove(bd->mtfSymbol+1, bd->mtfSymbol, jj);
222     bd->mtfSymbol[0] = bd->selectors[ii] = uc;
223   }
224 
225   // Read the huffman coding tables for each group, which code for symTotal
226   // literal symbols, plus two run symbols (RUNA, RUNB)
227   symCount = bd->symTotal+2;
228   for (jj=0; jj<bd->groupCount; jj++) {
229     unsigned char length[MAX_SYMBOLS];
230     unsigned temp[MAX_HUFCODE_BITS+1];
231     int minLen, maxLen, pp;
232 
233     // Read lengths
234     hh = get_bits(bd, 5);
235     for (ii = 0; ii < symCount; ii++) {
236       for(;;) {
237         // !hh || hh > MAX_HUFCODE_BITS in one test.
238         if (MAX_HUFCODE_BITS-1 < (unsigned)hh-1) return RETVAL_DATA_ERROR;
239         // Grab 2 bits instead of 1 (slightly smaller/faster).  Stop if
240         // first bit is 0, otherwise second bit says whether to
241         // increment or decrement.
242         kk = get_bits(bd, 2);
243         if (kk & 2) hh += 1 - ((kk&1)<<1);
244         else {
245           bd->inbufBitCount++;
246           break;
247         }
248       }
249       length[ii] = hh;
250     }
251 
252     // Find largest and smallest lengths in this group
253     minLen = maxLen = length[0];
254     for (ii = 1; ii < symCount; ii++) {
255       if(length[ii] > maxLen) maxLen = length[ii];
256       else if(length[ii] < minLen) minLen = length[ii];
257     }
258 
259     /* Calculate permute[], base[], and limit[] tables from length[].
260      *
261      * permute[] is the lookup table for converting huffman coded symbols
262      * into decoded symbols.  It contains symbol values sorted by length.
263      *
264      * base[] is the amount to subtract from the value of a huffman symbol
265      * of a given length when using permute[].
266      *
267      * limit[] indicates the largest numerical value a symbol with a given
268      * number of bits can have.  It lets us know when to stop reading.
269      *
270      * To use these, keep reading bits until value <= limit[bitcount] or
271      * you've read over 20 bits (error).  Then the decoded symbol
272      * equals permute[hufcode_value - base[hufcode_bitcount]].
273      */
274     hufGroup = bd->groups+jj;
275     hufGroup->minLen = minLen;
276     hufGroup->maxLen = maxLen;
277 
278     // Note that minLen can't be smaller than 1, so we adjust the base
279     // and limit array pointers so we're not always wasting the first
280     // entry.  We do this again when using them (during symbol decoding).
281     base = hufGroup->base-1;
282     limit = hufGroup->limit-1;
283 
284     // zero temp[] and limit[], and calculate permute[]
285     pp = 0;
286     for (ii = minLen; ii <= maxLen; ii++) {
287       temp[ii] = limit[ii] = 0;
288       for (hh = 0; hh < symCount; hh++)
289         if (length[hh] == ii) hufGroup->permute[pp++] = hh;
290     }
291 
292     // Count symbols coded for at each bit length
293     for (ii = 0; ii < symCount; ii++) temp[length[ii]]++;
294 
295     /* Calculate limit[] (the largest symbol-coding value at each bit
296      * length, which is (previous limit<<1)+symbols at this level), and
297      * base[] (number of symbols to ignore at each bit length, which is
298      * limit minus the cumulative count of symbols coded for already). */
299     pp = hh = 0;
300     for (ii = minLen; ii < maxLen; ii++) {
301       pp += temp[ii];
302       limit[ii] = pp-1;
303       pp <<= 1;
304       base[ii+1] = pp-(hh+=temp[ii]);
305     }
306     limit[maxLen] = pp+temp[maxLen]-1;
307     limit[maxLen+1] = INT_MAX;
308     base[minLen] = 0;
309   }
310 
311   return 0;
312 }
313 
314 /* First pass, read block's symbols into dbuf[dbufCount].
315  *
316  * This undoes three types of compression: huffman coding, run length encoding,
317  * and move to front encoding.  We have to undo all those to know when we've
318  * read enough input.
319  */
320 
read_huffman_data(struct bunzip_data * bd,struct bwdata * bw)321 static int read_huffman_data(struct bunzip_data *bd, struct bwdata *bw)
322 {
323   struct group_data *hufGroup;
324   int ii, jj, kk, runPos, dbufCount, symCount, selector, nextSym,
325     *byteCount, *base, *limit;
326   unsigned hh, *dbuf = bw->dbuf;
327   unsigned char uc;
328 
329   // We've finished reading and digesting the block header.  Now read this
330   // block's huffman coded symbols from the file and undo the huffman coding
331   // and run length encoding, saving the result into dbuf[dbufCount++] = uc
332 
333   // Initialize symbol occurrence counters and symbol mtf table
334   byteCount = bw->byteCount;
335   for(ii=0; ii<256; ii++) {
336     byteCount[ii] = 0;
337     bd->mtfSymbol[ii] = ii;
338   }
339 
340   // Loop through compressed symbols.  This is the first "tight inner loop"
341   // that needs to be micro-optimized for speed.  (This one fills out dbuf[]
342   // linearly, staying in cache more, so isn't as limited by DRAM access.)
343   runPos = dbufCount = symCount = selector = 0;
344   // Some unnecessary initializations to shut gcc up.
345   base = limit = 0;
346   hufGroup = 0;
347   hh = 0;
348 
349   for (;;) {
350     // Have we reached the end of this huffman group?
351     if (!(symCount--)) {
352       // Determine which huffman coding group to use.
353       symCount = GROUP_SIZE-1;
354       if (selector >= bd->nSelectors) return RETVAL_DATA_ERROR;
355       hufGroup = bd->groups + bd->selectors[selector++];
356       base = hufGroup->base-1;
357       limit = hufGroup->limit-1;
358     }
359 
360     // Read next huffman-coded symbol (into jj).
361     ii = hufGroup->minLen;
362     jj = get_bits(bd, ii);
363     while (jj > limit[ii]) {
364       // if (ii > hufGroup->maxLen) return RETVAL_DATA_ERROR;
365       ii++;
366 
367       // Unroll get_bits() to avoid a function call when the data's in
368       // the buffer already.
369       kk = bd->inbufBitCount
370         ? (bd->inbufBits >> --(bd->inbufBitCount)) & 1 : get_bits(bd, 1);
371       jj = (jj << 1) | kk;
372     }
373     // Huffman decode jj into nextSym (with bounds checking)
374     jj-=base[ii];
375 
376     if (ii > hufGroup->maxLen || (unsigned)jj >= MAX_SYMBOLS)
377       return RETVAL_DATA_ERROR;
378     nextSym = hufGroup->permute[jj];
379 
380     // If this is a repeated run, loop collecting data
381     if ((unsigned)nextSym <= SYMBOL_RUNB) {
382       // If this is the start of a new run, zero out counter
383       if(!runPos) {
384         runPos = 1;
385         hh = 0;
386       }
387 
388       /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
389          each bit position, add 1 or 2 instead. For example,
390          1011 is 1<<0 + 1<<1 + 2<<2. 1010 is 2<<0 + 2<<1 + 1<<2.
391          You can make any bit pattern that way using 1 less symbol than
392          the basic or 0/1 method (except all bits 0, which would use no
393          symbols, but a run of length 0 doesn't mean anything in this
394          context). Thus space is saved. */
395       hh += (runPos << nextSym); // +runPos if RUNA; +2*runPos if RUNB
396       runPos <<= 1;
397       continue;
398     }
399 
400     /* When we hit the first non-run symbol after a run, we now know
401        how many times to repeat the last literal, so append that many
402        copies to our buffer of decoded symbols (dbuf) now. (The last
403        literal used is the one at the head of the mtfSymbol array.) */
404     if (runPos) {
405       runPos = 0;
406       // Check for integer overflow
407       if (hh>bd->dbufSize || dbufCount+hh>bd->dbufSize)
408         return RETVAL_DATA_ERROR;
409 
410       uc = bd->symToByte[bd->mtfSymbol[0]];
411       byteCount[uc] += hh;
412       while (hh--) dbuf[dbufCount++] = uc;
413     }
414 
415     // Is this the terminating symbol?
416     if (nextSym>bd->symTotal) break;
417 
418     /* At this point, the symbol we just decoded indicates a new literal
419        character. Subtract one to get the position in the MTF array
420        at which this literal is currently to be found. (Note that the
421        result can't be -1 or 0, because 0 and 1 are RUNA and RUNB.
422        Another instance of the first symbol in the mtf array, position 0,
423        would have been handled as part of a run.) */
424     if (dbufCount>=bd->dbufSize) return RETVAL_DATA_ERROR;
425     ii = nextSym - 1;
426     uc = bd->mtfSymbol[ii];
427     // On my laptop, unrolling this memmove() into a loop shaves 3.5% off
428     // the total running time.
429     while(ii--) bd->mtfSymbol[ii+1] = bd->mtfSymbol[ii];
430     bd->mtfSymbol[0] = uc;
431     uc = bd->symToByte[uc];
432 
433     // We have our literal byte.  Save it into dbuf.
434     byteCount[uc]++;
435     dbuf[dbufCount++] = (unsigned int)uc;
436   }
437 
438   // Now we know what dbufCount is, do a better sanity check on origPtr.
439   if (bw->origPtr >= (bw->writeCount = dbufCount)) return RETVAL_DATA_ERROR;
440 
441   return 0;
442 }
443 
444 // Flush output buffer to disk
flush_bunzip_outbuf(struct bunzip_data * bd,int out_fd)445 static int flush_bunzip_outbuf(struct bunzip_data *bd, int out_fd)
446 {
447   if (bd->outbufPos) {
448     if (write(out_fd, bd->outbuf, bd->outbufPos) != bd->outbufPos)
449       return RETVAL_EOF_OUT;
450     bd->outbufPos = 0;
451   }
452 
453   return 0;
454 }
455 
burrows_wheeler_prep(struct bunzip_data * bd,struct bwdata * bw)456 static void burrows_wheeler_prep(struct bunzip_data *bd, struct bwdata *bw)
457 {
458   int ii, jj;
459   unsigned int *dbuf = bw->dbuf;
460   int *byteCount = bw->byteCount;
461 
462   // Turn byteCount into cumulative occurrence counts of 0 to n-1.
463   jj = 0;
464   for (ii=0; ii<256; ii++) {
465     int kk = jj + byteCount[ii];
466     byteCount[ii] = jj;
467     jj = kk;
468   }
469 
470   // Use occurrence counts to quickly figure out what order dbuf would be in
471   // if we sorted it.
472   for (ii=0; ii < bw->writeCount; ii++) {
473     unsigned char uc = dbuf[ii];
474     dbuf[byteCount[uc]] |= (ii << 8);
475     byteCount[uc]++;
476   }
477 
478   // blockRandomised support would go here.
479 
480   // Using ii as position, jj as previous character, hh as current character,
481   // and uc as run count.
482   bw->dataCRC = 0xffffffffL;
483 
484   /* Decode first byte by hand to initialize "previous" byte. Note that it
485      doesn't get output, and if the first three characters are identical
486      it doesn't qualify as a run (hence uc=255, which will either wrap
487      to 1 or get reset). */
488   if (bw->writeCount) {
489     bw->writePos = dbuf[bw->origPtr];
490     bw->writeCurrent = (unsigned char)bw->writePos;
491     bw->writePos >>= 8;
492     bw->writeRun = -1;
493   }
494 }
495 
496 // Decompress a block of text to intermediate buffer
read_bunzip_data(struct bunzip_data * bd)497 static int read_bunzip_data(struct bunzip_data *bd)
498 {
499   int rc = read_block_header(bd, bd->bwdata);
500   if (!rc) rc=read_huffman_data(bd, bd->bwdata);
501 
502   // First thing that can be done by a background thread.
503   burrows_wheeler_prep(bd, bd->bwdata);
504 
505   return rc;
506 }
507 
508 // Undo burrows-wheeler transform on intermediate buffer to produce output.
509 // If !len, write up to len bytes of data to buf.  Otherwise write to out_fd.
510 // Returns len ? bytes written : 0.  Notice all errors are negative #'s.
511 //
512 // Burrows-wheeler transform is described at:
513 // http://dogma.net/markn/articles/bwt/bwt.htm
514 // http://marknelson.us/1996/09/01/bwt/
515 
write_bunzip_data(struct bunzip_data * bd,struct bwdata * bw,int out_fd,char * outbuf,int len)516 static int write_bunzip_data(struct bunzip_data *bd, struct bwdata *bw,
517   int out_fd, char *outbuf, int len)
518 {
519   unsigned int *dbuf = bw->dbuf;
520   int count, pos, current, run, copies, outbyte, previous, gotcount = 0;
521 
522   for (;;) {
523     // If last read was short due to end of file, return last block now
524     if (bw->writeCount < 0) return bw->writeCount;
525 
526     // If we need to refill dbuf, do it.
527     if (!bw->writeCount) {
528       int i = read_bunzip_data(bd);
529       if (i) {
530         if (i == RETVAL_LAST_BLOCK) {
531           bw->writeCount = i;
532           return gotcount;
533         } else return i;
534       }
535     }
536 
537     // loop generating output
538     count = bw->writeCount;
539     pos = bw->writePos;
540     current = bw->writeCurrent;
541     run = bw->writeRun;
542     while (count) {
543 
544       // If somebody (like tar) wants a certain number of bytes of
545       // data from memory instead of written to a file, humor them.
546       if (len && bd->outbufPos >= len) goto dataus_interruptus;
547       count--;
548 
549       // Follow sequence vector to undo Burrows-Wheeler transform.
550       previous = current;
551       pos = dbuf[pos];
552       current = pos&0xff;
553       pos >>= 8;
554 
555       // Whenever we see 3 consecutive copies of the same byte,
556       // the 4th is a repeat count
557       if (run++ == 3) {
558         copies = current;
559         outbyte = previous;
560         current = -1;
561       } else {
562         copies = 1;
563         outbyte = current;
564       }
565 
566       // Output bytes to buffer, flushing to file if necessary
567       while (copies--) {
568         if (bd->outbufPos == IOBUF_SIZE && flush_bunzip_outbuf(bd, out_fd))
569           return RETVAL_EOF_OUT;
570         bd->outbuf[bd->outbufPos++] = outbyte;
571         bw->dataCRC = (bw->dataCRC << 8)
572                 ^ bd->crc32Table[(bw->dataCRC >> 24) ^ outbyte];
573       }
574       if (current != previous) run=0;
575     }
576 
577     // decompression of this block completed successfully
578     bw->dataCRC = ~(bw->dataCRC);
579     bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bw->dataCRC;
580 
581     // if this block had a crc error, force file level crc error.
582     if (bw->dataCRC != bw->headerCRC) {
583       bd->totalCRC = bw->headerCRC+1;
584 
585       return RETVAL_LAST_BLOCK;
586     }
587 dataus_interruptus:
588     bw->writeCount = count;
589     if (len) {
590       gotcount += bd->outbufPos;
591       memcpy(outbuf, bd->outbuf, len);
592 
593       // If we got enough data, checkpoint loop state and return
594       if ((len -= bd->outbufPos)<1) {
595         bd->outbufPos -= len;
596         if (bd->outbufPos) memmove(bd->outbuf, bd->outbuf+len, bd->outbufPos);
597         bw->writePos = pos;
598         bw->writeCurrent = current;
599         bw->writeRun = run;
600 
601         return gotcount;
602       }
603     }
604   }
605 }
606 
607 // Allocate the structure, read file header. If !len, src_fd contains
608 // filehandle to read from. Else inbuf contains data.
start_bunzip(struct bunzip_data ** bdp,int src_fd,char * inbuf,int len)609 static int start_bunzip(struct bunzip_data **bdp, int src_fd, char *inbuf,
610   int len)
611 {
612   struct bunzip_data *bd;
613   unsigned int i;
614 
615   // Figure out how much data to allocate.
616   i = sizeof(struct bunzip_data);
617   if (!len) i += IOBUF_SIZE;
618 
619   // Allocate bunzip_data. Most fields initialize to zero.
620   bd = *bdp = xzalloc(i);
621   if (len) {
622     bd->inbuf = inbuf;
623     bd->inbufCount = len;
624     bd->in_fd = -1;
625   } else {
626     bd->inbuf = (char *)(bd+1);
627     bd->in_fd = src_fd;
628   }
629 
630   crc_init(bd->crc32Table, 0);
631 
632   // Ensure that file starts with "BZh".
633   for (i=0;i<3;i++) if (get_bits(bd,8)!="BZh"[i]) return RETVAL_NOT_BZIP_DATA;
634 
635   // Next byte ascii '1'-'9', indicates block size in units of 100k of
636   // uncompressed data. Allocate intermediate buffer for block.
637   i = get_bits(bd, 8);
638   if (i<'1' || i>'9') return RETVAL_NOT_BZIP_DATA;
639   bd->dbufSize = 100000*(i-'0')*THREADS;
640   for (i=0; i<THREADS; i++)
641     bd->bwdata[i].dbuf = xmalloc(bd->dbufSize * sizeof(int));
642 
643   return 0;
644 }
645 
646 // Example usage: decompress src_fd to dst_fd. (Stops at end of bzip data,
647 // not end of file.)
bunzipStream(int src_fd,int dst_fd)648 static char *bunzipStream(int src_fd, int dst_fd)
649 {
650   struct bunzip_data *bd;
651   char *bunzip_errors[] = {0, "not bzip", "bad data", "old format", "in EOF",
652     "out EOF"};
653   int i, j;
654 
655   if (!(i = setjmp((void *)toybuf)) && !(i = start_bunzip(&bd, src_fd, 0, 0))) {
656     i = write_bunzip_data(bd, bd->bwdata, dst_fd, 0, 0);
657     if (i==RETVAL_LAST_BLOCK) {
658       if (bd->bwdata[0].headerCRC==bd->totalCRC) i = 0;
659       else i = RETVAL_DATA_ERROR;
660     }
661   }
662   if (!i) i = flush_bunzip_outbuf(bd, dst_fd);
663 
664   for (j=0; j<THREADS; j++) free(bd->bwdata[j].dbuf);
665   free(bd);
666 
667   return bunzip_errors[-i];
668 }
669 
do_bzcat(int fd,char * name)670 static void do_bzcat(int fd, char *name)
671 {
672   char *err = bunzipStream(fd, 1);
673 
674   if (err) {
675     // Exit silently for "out EOF" because pipelines.
676     if (err[1] != 'u') error_exit_raw(err);
677     toys.exitval = 1;
678     xexit();
679   }
680 }
681 
bzcat_main(void)682 void bzcat_main(void)
683 {
684   loopfiles(toys.optargs, do_bzcat);
685 }
686 
do_bunzip2(int fd,char * name)687 static void do_bunzip2(int fd, char *name)
688 {
689   int outfd = 1, rename = 0, len = strlen(name);
690   char *tmp, *err, *dotbz = 0;
691 
692   // Trim off .bz or .bz2 extension
693   dotbz = name+len-3;
694   if ((len>3 && !strcmp(dotbz, ".bz")) || (len>4 && !strcmp(--dotbz, ".bz2")))
695     dotbz = 0;
696 
697   // For - no replace
698   if (FLAG(t)) outfd = xopen("/dev/null", O_WRONLY);
699   else if ((fd || strcmp(name, "-")) && !FLAG(c)) {
700     if (FLAG(k) && (!dotbz || !access(name, X_OK)))
701       return error_msg("%s exists", name);
702     outfd = copy_tempfile(fd, name, &tmp);
703     rename++;
704   }
705 
706   if (FLAG(v)) printf("%s:", name);
707   err = bunzipStream(fd, outfd);
708   if (FLAG(v)) {
709     printf("%s\n", err ? : "ok");
710     toys.exitval |= !!err;
711   } else if (err) error_msg_raw(err);
712 
713   // can't test outfd==1 because may have been called with stdin+stdout closed
714   if (rename) {
715     if (FLAG(k)) {
716       free(tmp);
717       tmp = 0;
718     } else if (!err) {
719       if (dotbz) *dotbz = '.';
720       if (!unlink(name)) perror_msg_raw(name);
721     }
722     (err ? delete_tempfile : replace_tempfile)(-1, outfd, &tmp);
723   }
724 }
725 
bunzip2_main(void)726 void bunzip2_main(void)
727 {
728   loopfiles(toys.optargs, do_bunzip2);
729 }
730