1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
# Public names exported by a star-import of this module.
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
15
# Bits of the flags byte in a gzip member header.  They are tested with
# ``flag & NAME`` when a header is parsed and written when one is produced.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode.
READ = 1
WRITE = 2

# Named compression levels: used as defaults and to pick the header's
# extra-flags byte.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str, bytes or os.PathLike) or an already
    open file object that offers read() or write().

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; compresslevel defaults to 9.

    In binary mode the result is GzipFile(filename, mode, compresslevel),
    and passing encoding, errors or newline raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, errors and newline.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when filename is neither a path nor a file-like object.

    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    elif encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    elif errors is not None:
        raise ValueError("Argument 'errors' not supported in binary mode")
    elif newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return binary_file
    encoding = io.text_encoding(encoding)
    return io.TextIOWrapper(binary_file, encoding, errors, newline)
69
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    # "<L" is the standard-size (32-bit) little-endian unsigned format.
    packed = struct.pack("<L", value)
    output.write(packed)
74
75class _PaddedFile:
76    """Minimal read-only file object that prepends a string to the contents
77    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
78    essential functionality."""
79
80    def __init__(self, f, prepend=b''):
81        self._buffer = prepend
82        self._length = len(prepend)
83        self.file = f
84        self._read = 0
85
86    def read(self, size):
87        if self._read is None:
88            return self.file.read(size)
89        if self._read + size <= self._length:
90            read = self._read
91            self._read += size
92            return self._buffer[read:self._read]
93        else:
94            read = self._read
95            self._read = None
96            return self._buffer[read:] + \
97                   self.file.read(size-self._length+read)
98
99    def prepend(self, prepend=b''):
100        if self._read is None:
101            self._buffer = prepend
102        else:  # Assume data was read since the last prepend() call
103            self._read -= len(prepend)
104            return
105        self._length = len(self._buffer)
106        self._read = 0
107
108    def seek(self, off):
109        self._read = None
110        self._buffer = None
111        return self.file.seek(off)
112
113    def seekable(self):
114        return True  # Allows fast-forwarding even in unseekable streams
115
116
class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files.

    In this module it is raised when a member header has the wrong magic
    bytes or an unknown compression method, and when the CRC or length
    stored in a member trailer does not match the decompressed data.
    Truncated input is reported with EOFError instead.
    """
119
120
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode contains 't' or 'U' or does not start with
        one of 'r', 'w', 'a' or 'x'.  If construction fails after this
        constructor opened filename itself, that file is closed again before
        the exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            # Remember whether the caller chose the mode explicitly; writing
            # with an inferred mode triggers the FutureWarning below.
            origmode = mode
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                if origmode is None:
                    import warnings
                    warnings.warn(
                        "GzipFile was opened for writing, but this will "
                        "change in future Python releases.  "
                        "Specify the mode argument for opening it for writing.",
                        FutureWarning, 2)
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: raw deflate stream; the gzip header and
                # trailer are written by this class.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Construction failed.  Mark the object closed and release the
            # file we opened ourselves, so a bad argument (for example an
            # invalid compresslevel) does not leak an open file handle.  A
            # file object supplied by the caller is left untouched.
            self.fileobj = None
            owned = self.myfileobj
            if owned is not None:
                self.myfileobj = None
                owned.close()
            raise

    @property
    def filename(self):
        """Deprecated alias of name; in write mode '.gz' is appended if the
        name does not already end with it."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        name = self.name
        if self.mode == WRITE:
            # name may be bytes (a bytes path or fileobj.name); use a suffix
            # of the matching type so concatenation cannot raise TypeError.
            suffix = b".gz" if isinstance(name, bytes) else ".gz"
            if name[-3:] != suffix:
                return name + suffix
        return name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        # Only available in read mode: the write branch of __init__() never
        # creates self._buffer.
        return self._buffer.raw._last_mtime

    def __repr__(self):
        # Drops the first and last character of the file object's repr
        # (presumably the surrounding angle brackets) and wraps the rest.
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing one gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj.

        The header carries the file name (without a trailing '.gz') only
        when it can be encoded as Latin-1.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte: 2 for the best level, 4 for the fastest, else 0.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte: 255
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* (any bytes-like object) and write it out.

        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) when the file was opened for reading.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* decompressed bytes.

        Raises OSError (EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Return buffered decompressed data without advancing the position.

        Raises OSError (EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call twice.

        In write mode the remaining compressed data and the CRC/size trailer
        are written.  The underlying file is closed only if this object
        opened it itself.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with *zlib_mode* and then
        flush the underlying file.  Does nothing in read mode."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always True; see seek() for the restrictions in write mode."""
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks relative to the start or to the
        current position are possible; the gap is filled by writing zero
        bytes.  ValueError is raised for a seek from the end and OSError for
        a backward seek.  In read mode the request is passed to the internal
        buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line of decompressed data.

        Raises OSError (EBADF) when the file was opened for writing, the
        same way read(), read1() and peek() do.
        """
        self._check_not_closed()
        if self.mode != READ:
            # Without this check a write-mode object fails with an
            # AttributeError on the missing self._buffer.
            import errno
            raise OSError(errno.EBADF,
                          "readline() on write-only GzipFile object")
        return self._buffer.readline(size)
400
401
402def _read_exact(fp, n):
403    '''Read exactly *n* bytes from `fp`
404
405    This method is required because fp may be unbuffered,
406    i.e. return short reads.
407    '''
408    data = fp.read(n)
409    while len(data) < n:
410        b = fp.read(n - len(data))
411        if not b:
412            raise EOFError("Compressed file ended before the "
413                           "end-of-stream marker was reached")
414        data += b
415    return data
416
417
418def _read_gzip_header(fp):
419    '''Read a gzip header from `fp` and progress to the end of the header.
420
421    Returns last mtime if header was present or None otherwise.
422    '''
423    magic = fp.read(2)
424    if magic == b'':
425        return None
426
427    if magic != b'\037\213':
428        raise BadGzipFile('Not a gzipped file (%r)' % magic)
429
430    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
431    if method != 8:
432        raise BadGzipFile('Unknown compression method')
433
434    if flag & FEXTRA:
435        # Read & discard the extra field, if present
436        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
437        _read_exact(fp, extra_len)
438    if flag & FNAME:
439        # Read and discard a null-terminated string containing the filename
440        while True:
441            s = fp.read(1)
442            if not s or s==b'\000':
443                break
444    if flag & FCOMMENT:
445        # Read and discard a null-terminated string containing a comment
446        while True:
447            s = fp.read(1)
448            if not s or s==b'\000':
449                break
450    if flag & FHCRC:
451        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
452    return last_mtime
453
454
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a gzip stream member by member.

    The source file is wrapped in a _PaddedFile so that bytes read past the
    end of the compressed data can be pushed back and re-read as the member
    trailer or the next member header.
    """

    def __init__(self, fp):
        # Negative wbits: the decompressor handles the raw deflate data only;
        # headers and trailers are parsed by this class.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        # mtime of the most recently read member header (None until then);
        # exposed through GzipFile.mtime.
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_gzip_header(self):
        """Read the next member header.

        Returns False when the file is at its end (no header), otherwise
        records the header's mtime and returns True.
        """
        last_mtime = _read_gzip_header(self._fp)
        if last_mtime is None:
            return False
        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, b"" at end of file.

        A negative size reads everything.  Raises EOFError when the
        compressed data ends before the end-of-stream marker.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: the current position is recorded as
                    # the total size (presumably used by the base class for
                    # seeking -- confirm in _compression.DecompressReader).
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                # Input left over because of the size limit: push it back so
                # the next call feeds it to the decompressor again.
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer, then skip any zero padding after it.

        Raises BadGzipFile when the stored CRC or length does not match the
        data that was decompressed.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            # Push the first non-zero byte back for the next header read.
            self._fp.prepend(c)

    def _rewind(self):
        """Rewind via the base class and expect a member header next."""
        super()._rewind()
        self._new_member = True
553
554
def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Build a minimal 10-byte gzip header that carries no optional fields.
    :param compresslevel: Compresslevel used to determine the xfl bytes.
    :param mtime: The mtime (must support conversion to a 32-bit integer);
        the current time is used when it is None.
    :return: A bytes object representing the gzip header.
    """
    if mtime is None:
        mtime = time.time()

    # xfl is 2 for the best level, 4 for the fastest level, 0 otherwise.
    xfl = 0
    if compresslevel == _COMPRESS_LEVEL_BEST:
        xfl = 2
    elif compresslevel == _COMPRESS_LEVEL_FAST:
        xfl = 4

    # Field order: ID1 and ID2 magic bytes, method (8=deflate), header flags
    # (no extra fields added to header), mtime, xfl and os (255 for unknown
    # OS).
    timestamp = int(mtime)
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, timestamp, xfl, 255)
574
575
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.

    data may be any bytes-like object.
    compresslevel sets the compression level in range of 0-9.
    mtime can be used to set the modification time. The modification time is
    set to the current time by default.
    """
    if mtime == 0:
        # Use zlib as it creates the header with 0 mtime by default.
        # This is faster and with less overhead.
        # NOTE(review): here the whole header comes from zlib instead of
        # _create_simple_gzip_header(), so its other bytes may differ from
        # the general path below.
        return zlib.compress(data, level=compresslevel, wbits=31)
    header = _create_simple_gzip_header(compresslevel, mtime)
    # The trailer stores the uncompressed size in *bytes*.  len() counts
    # items, which is wrong for buffers whose item size is not 1 (for
    # example array.array('I') or a cast memoryview), so measure the data
    # through a memoryview, as GzipFile.write() does.
    with memoryview(data) as view:
        size = view.nbytes
    trailer = struct.pack("<LL", zlib.crc32(data), (size & 0xffffffff))
    # Wbits=-15 creates a raw deflate block.
    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
            trailer)
592
593
def decompress(data):
    """Decompress gzip data held in memory in one shot.

    All concatenated members are decompressed and their contents joined.
    Raises EOFError for truncated input and BadGzipFile when a member's
    stored CRC or length does not match its decompressed data.
    """
    members = []
    remaining = data
    while True:
        stream = io.BytesIO(remaining)
        if _read_gzip_header(stream) is None:
            # Nothing left to parse: every member has been handled.
            return b"".join(members)
        # Raw deflate decompressor; the header was consumed above
        inflater = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        # Feed everything that follows the header
        payload = inflater.decompress(remaining[stream.tell():])
        leftover = inflater.unused_data
        if not inflater.eof or len(leftover) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")
        stored_crc, stored_length = struct.unpack("<II", leftover[:8])
        if stored_crc != zlib.crc32(payload):
            raise BadGzipFile("CRC check failed")
        if stored_length != (len(payload) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        members.append(payload)
        # Skip the trailer and any zero padding before the next member.
        remaining = leftover[8:].lstrip(b"\x00")
617
618
def main():
    """Command line interface: compress or decompress each file argument.

    "-" (the default when no file is given) means standard input to
    standard output.  A named file is compressed to "<name>.gz", or, with
    -d, "<name>.gz" is decompressed to "<name>".  Input files are never
    deleted.  --fast and --best select the compression level for every
    output, whether it is standard output or a named file.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # Everything opened for this argument is registered on the stack so
        # it is closed even if a later open or the copy fails.  Callbacks run
        # in reverse order: the output is closed before the input.  The
        # process's own stdin/stdout are never registered.
        with ExitStack() as stack:
            if args.decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer)
                    stack.callback(f.close)
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        sys.exit(f"filename doesn't end in .gz: {arg!r}")
                    f = open(arg, "rb")
                    stack.callback(f.close)
                    g = builtins.open(arg[:-3], "wb")
                    stack.callback(g.close)
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel)
                    stack.callback(g.close)
                else:
                    f = builtins.open(arg, "rb")
                    stack.callback(f.close)
                    # Pass the selected level on; without it --fast/--best
                    # had no effect on named files.
                    g = open(arg + ".gz", "wb", compresslevel)
                    stack.callback(g.close)
            while True:
                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)
666
# Run the command line interface when this file is executed as a script.
if __name__ == '__main__':
    main()
669