1"""Functions that read and write gzipped files. 2 3The user of the file doesn't have to worry about the compression, 4but random access is not allowed.""" 5 6# based on Andrew Kuchling's minigzip.py distributed with the zlib module 7 8import struct, sys, time, os 9import zlib 10import builtins 11import io 12import _compression 13 14__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"] 15 16FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 17 18READ, WRITE = 1, 2 19 20_COMPRESS_LEVEL_FAST = 1 21_COMPRESS_LEVEL_TRADEOFF = 6 22_COMPRESS_LEVEL_BEST = 9 23 24 25def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, 26 encoding=None, errors=None, newline=None): 27 """Open a gzip-compressed file in binary or text mode. 28 29 The filename argument can be an actual filename (a str or bytes object), or 30 an existing file object to read from or write to. 31 32 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for 33 binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is 34 "rb", and the default compresslevel is 9. 35 36 For binary mode, this function is equivalent to the GzipFile constructor: 37 GzipFile(filename, mode, compresslevel). In this case, the encoding, errors 38 and newline arguments must not be provided. 39 40 For text mode, a GzipFile object is created, and wrapped in an 41 io.TextIOWrapper instance with the specified encoding, error handling 42 behavior, and line ending(s). 43 44 """ 45 if "t" in mode: 46 if "b" in mode: 47 raise ValueError("Invalid mode: %r" % (mode,)) 48 else: 49 if encoding is not None: 50 raise ValueError("Argument 'encoding' not supported in binary mode") 51 if errors is not None: 52 raise ValueError("Argument 'errors' not supported in binary mode") 53 if newline is not None: 54 raise ValueError("Argument 'newline' not supported in binary mode") 55 56 gz_mode = mode.replace("t", "") 57 if isinstance(filename, (str, bytes, os.PathLike)): 58 binary_file = GzipFile(filename, gz_mode, compresslevel) 59 elif hasattr(filename, "read") or hasattr(filename, "write"): 60 binary_file = GzipFile(None, gz_mode, compresslevel, filename) 61 else: 62 raise TypeError("filename must be a str or bytes object, or a file") 63 64 if "t" in mode: 65 encoding = io.text_encoding(encoding) 66 return io.TextIOWrapper(binary_file, encoding, errors, newline) 67 else: 68 return binary_file 69 70def write32u(output, value): 71 # The L format writes the bit pattern correctly whether signed 72 # or unsigned. 73 output.write(struct.pack("<L", value)) 74 75class _PaddedFile: 76 """Minimal read-only file object that prepends a string to the contents 77 of an actual file. Shouldn't be used outside of gzip.py, as it lacks 78 essential functionality.""" 79 80 def __init__(self, f, prepend=b''): 81 self._buffer = prepend 82 self._length = len(prepend) 83 self.file = f 84 self._read = 0 85 86 def read(self, size): 87 if self._read is None: 88 return self.file.read(size) 89 if self._read + size <= self._length: 90 read = self._read 91 self._read += size 92 return self._buffer[read:self._read] 93 else: 94 read = self._read 95 self._read = None 96 return self._buffer[read:] + \ 97 self.file.read(size-self._length+read) 98 99 def prepend(self, prepend=b''): 100 if self._read is None: 101 self._buffer = prepend 102 else: # Assume data was read since the last prepend() call 103 self._read -= len(prepend) 104 return 105 self._length = len(self._buffer) 106 self._read = 0 107 108 def seek(self, off): 109 self._read = None 110 self._buffer = None 111 return self.file.seek(off) 112 113 def seekable(self): 114 return True # Allows fast-forwarding even in unseekable streams 115 116 117class BadGzipFile(OSError): 118 """Exception raised in some cases for invalid gzip files.""" 119 120 121class GzipFile(_compression.BaseStream): 122 """The GzipFile class simulates most of the methods of a file object with 123 the exception of the truncate() method. 124 125 This class only supports opening files in binary mode. If you need to open a 126 compressed file in text mode, use the gzip.open() function. 127 128 """ 129 130 # Overridden with internal file object to be closed, if only a filename 131 # is passed in 132 myfileobj = None 133 134 def __init__(self, filename=None, mode=None, 135 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None): 136 """Constructor for the GzipFile class. 137 138 At least one of fileobj and filename must be given a 139 non-trivial value. 140 141 The new class instance is based on fileobj, which can be a regular 142 file, an io.BytesIO object, or any other object which simulates a file. 143 It defaults to None, in which case filename is opened to provide 144 a file object. 145 146 When fileobj is not None, the filename argument is only used to be 147 included in the gzip file header, which may include the original 148 filename of the uncompressed file. It defaults to the filename of 149 fileobj, if discernible; otherwise, it defaults to the empty string, 150 and in this case the original filename is not included in the header. 151 152 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or 153 'xb' depending on whether the file will be read or written. The default 154 is the mode of fileobj if discernible; otherwise, the default is 'rb'. 155 A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and 156 'wb', 'a' and 'ab', and 'x' and 'xb'. 157 158 The compresslevel argument is an integer from 0 to 9 controlling the 159 level of compression; 1 is fastest and produces the least compression, 160 and 9 is slowest and produces the most compression. 0 is no compression 161 at all. The default is 9. 162 163 The mtime argument is an optional numeric timestamp to be written 164 to the last modification time field in the stream when compressing. 165 If omitted or None, the current time is used. 166 167 """ 168 169 if mode and ('t' in mode or 'U' in mode): 170 raise ValueError("Invalid mode: {!r}".format(mode)) 171 if mode and 'b' not in mode: 172 mode += 'b' 173 if fileobj is None: 174 fileobj = self.myfileobj = builtins.open(filename, mode or 'rb') 175 if filename is None: 176 filename = getattr(fileobj, 'name', '') 177 if not isinstance(filename, (str, bytes)): 178 filename = '' 179 else: 180 filename = os.fspath(filename) 181 origmode = mode 182 if mode is None: 183 mode = getattr(fileobj, 'mode', 'rb') 184 185 if mode.startswith('r'): 186 self.mode = READ 187 raw = _GzipReader(fileobj) 188 self._buffer = io.BufferedReader(raw) 189 self.name = filename 190 191 elif mode.startswith(('w', 'a', 'x')): 192 if origmode is None: 193 import warnings 194 warnings.warn( 195 "GzipFile was opened for writing, but this will " 196 "change in future Python releases. " 197 "Specify the mode argument for opening it for writing.", 198 FutureWarning, 2) 199 self.mode = WRITE 200 self._init_write(filename) 201 self.compress = zlib.compressobj(compresslevel, 202 zlib.DEFLATED, 203 -zlib.MAX_WBITS, 204 zlib.DEF_MEM_LEVEL, 205 0) 206 self._write_mtime = mtime 207 else: 208 raise ValueError("Invalid mode: {!r}".format(mode)) 209 210 self.fileobj = fileobj 211 212 if self.mode == WRITE: 213 self._write_gzip_header(compresslevel) 214 215 @property 216 def filename(self): 217 import warnings 218 warnings.warn("use the name attribute", DeprecationWarning, 2) 219 if self.mode == WRITE and self.name[-3:] != ".gz": 220 return self.name + ".gz" 221 return self.name 222 223 @property 224 def mtime(self): 225 """Last modification time read from stream, or None""" 226 return self._buffer.raw._last_mtime 227 228 def __repr__(self): 229 s = repr(self.fileobj) 230 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>' 231 232 def _init_write(self, filename): 233 self.name = filename 234 self.crc = zlib.crc32(b"") 235 self.size = 0 236 self.writebuf = [] 237 self.bufsize = 0 238 self.offset = 0 # Current file offset for seek(), tell(), etc 239 240 def _write_gzip_header(self, compresslevel): 241 self.fileobj.write(b'\037\213') # magic header 242 self.fileobj.write(b'\010') # compression method 243 try: 244 # RFC 1952 requires the FNAME field to be Latin-1. Do not 245 # include filenames that cannot be represented that way. 246 fname = os.path.basename(self.name) 247 if not isinstance(fname, bytes): 248 fname = fname.encode('latin-1') 249 if fname.endswith(b'.gz'): 250 fname = fname[:-3] 251 except UnicodeEncodeError: 252 fname = b'' 253 flags = 0 254 if fname: 255 flags = FNAME 256 self.fileobj.write(chr(flags).encode('latin-1')) 257 mtime = self._write_mtime 258 if mtime is None: 259 mtime = time.time() 260 write32u(self.fileobj, int(mtime)) 261 if compresslevel == _COMPRESS_LEVEL_BEST: 262 xfl = b'\002' 263 elif compresslevel == _COMPRESS_LEVEL_FAST: 264 xfl = b'\004' 265 else: 266 xfl = b'\000' 267 self.fileobj.write(xfl) 268 self.fileobj.write(b'\377') 269 if fname: 270 self.fileobj.write(fname + b'\000') 271 272 def write(self,data): 273 self._check_not_closed() 274 if self.mode != WRITE: 275 import errno 276 raise OSError(errno.EBADF, "write() on read-only GzipFile object") 277 278 if self.fileobj is None: 279 raise ValueError("write() on closed GzipFile object") 280 281 if isinstance(data, (bytes, bytearray)): 282 length = len(data) 283 else: 284 # accept any data that supports the buffer protocol 285 data = memoryview(data) 286 length = data.nbytes 287 288 if length > 0: 289 self.fileobj.write(self.compress.compress(data)) 290 self.size += length 291 self.crc = zlib.crc32(data, self.crc) 292 self.offset += length 293 294 return length 295 296 def read(self, size=-1): 297 self._check_not_closed() 298 if self.mode != READ: 299 import errno 300 raise OSError(errno.EBADF, "read() on write-only GzipFile object") 301 return self._buffer.read(size) 302 303 def read1(self, size=-1): 304 """Implements BufferedIOBase.read1() 305 306 Reads up to a buffer's worth of data if size is negative.""" 307 self._check_not_closed() 308 if self.mode != READ: 309 import errno 310 raise OSError(errno.EBADF, "read1() on write-only GzipFile object") 311 312 if size < 0: 313 size = io.DEFAULT_BUFFER_SIZE 314 return self._buffer.read1(size) 315 316 def peek(self, n): 317 self._check_not_closed() 318 if self.mode != READ: 319 import errno 320 raise OSError(errno.EBADF, "peek() on write-only GzipFile object") 321 return self._buffer.peek(n) 322 323 @property 324 def closed(self): 325 return self.fileobj is None 326 327 def close(self): 328 fileobj = self.fileobj 329 if fileobj is None: 330 return 331 self.fileobj = None 332 try: 333 if self.mode == WRITE: 334 fileobj.write(self.compress.flush()) 335 write32u(fileobj, self.crc) 336 # self.size may exceed 2 GiB, or even 4 GiB 337 write32u(fileobj, self.size & 0xffffffff) 338 elif self.mode == READ: 339 self._buffer.close() 340 finally: 341 myfileobj = self.myfileobj 342 if myfileobj: 343 self.myfileobj = None 344 myfileobj.close() 345 346 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): 347 self._check_not_closed() 348 if self.mode == WRITE: 349 # Ensure the compressor's buffer is flushed 350 self.fileobj.write(self.compress.flush(zlib_mode)) 351 self.fileobj.flush() 352 353 def fileno(self): 354 """Invoke the underlying file object's fileno() method. 355 356 This will raise AttributeError if the underlying file object 357 doesn't support fileno(). 358 """ 359 return self.fileobj.fileno() 360 361 def rewind(self): 362 '''Return the uncompressed stream file position indicator to the 363 beginning of the file''' 364 if self.mode != READ: 365 raise OSError("Can't rewind in write mode") 366 self._buffer.seek(0) 367 368 def readable(self): 369 return self.mode == READ 370 371 def writable(self): 372 return self.mode == WRITE 373 374 def seekable(self): 375 return True 376 377 def seek(self, offset, whence=io.SEEK_SET): 378 if self.mode == WRITE: 379 if whence != io.SEEK_SET: 380 if whence == io.SEEK_CUR: 381 offset = self.offset + offset 382 else: 383 raise ValueError('Seek from end not supported') 384 if offset < self.offset: 385 raise OSError('Negative seek in write mode') 386 count = offset - self.offset 387 chunk = b'\0' * 1024 388 for i in range(count // 1024): 389 self.write(chunk) 390 self.write(b'\0' * (count % 1024)) 391 elif self.mode == READ: 392 self._check_not_closed() 393 return self._buffer.seek(offset, whence) 394 395 return self.offset 396 397 def readline(self, size=-1): 398 self._check_not_closed() 399 return self._buffer.readline(size) 400 401 402def _read_exact(fp, n): 403 '''Read exactly *n* bytes from `fp` 404 405 This method is required because fp may be unbuffered, 406 i.e. return short reads. 407 ''' 408 data = fp.read(n) 409 while len(data) < n: 410 b = fp.read(n - len(data)) 411 if not b: 412 raise EOFError("Compressed file ended before the " 413 "end-of-stream marker was reached") 414 data += b 415 return data 416 417 418def _read_gzip_header(fp): 419 '''Read a gzip header from `fp` and progress to the end of the header. 420 421 Returns last mtime if header was present or None otherwise. 422 ''' 423 magic = fp.read(2) 424 if magic == b'': 425 return None 426 427 if magic != b'\037\213': 428 raise BadGzipFile('Not a gzipped file (%r)' % magic) 429 430 (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8)) 431 if method != 8: 432 raise BadGzipFile('Unknown compression method') 433 434 if flag & FEXTRA: 435 # Read & discard the extra field, if present 436 extra_len, = struct.unpack("<H", _read_exact(fp, 2)) 437 _read_exact(fp, extra_len) 438 if flag & FNAME: 439 # Read and discard a null-terminated string containing the filename 440 while True: 441 s = fp.read(1) 442 if not s or s==b'\000': 443 break 444 if flag & FCOMMENT: 445 # Read and discard a null-terminated string containing a comment 446 while True: 447 s = fp.read(1) 448 if not s or s==b'\000': 449 break 450 if flag & FHCRC: 451 _read_exact(fp, 2) # Read & discard the 16-bit header CRC 452 return last_mtime 453 454 455class _GzipReader(_compression.DecompressReader): 456 def __init__(self, fp): 457 super().__init__(_PaddedFile(fp), zlib.decompressobj, 458 wbits=-zlib.MAX_WBITS) 459 # Set flag indicating start of a new member 460 self._new_member = True 461 self._last_mtime = None 462 463 def _init_read(self): 464 self._crc = zlib.crc32(b"") 465 self._stream_size = 0 # Decompressed size of unconcatenated stream 466 467 def _read_gzip_header(self): 468 last_mtime = _read_gzip_header(self._fp) 469 if last_mtime is None: 470 return False 471 self._last_mtime = last_mtime 472 return True 473 474 def read(self, size=-1): 475 if size < 0: 476 return self.readall() 477 # size=0 is special because decompress(max_length=0) is not supported 478 if not size: 479 return b"" 480 481 # For certain input data, a single 482 # call to decompress() may not return 483 # any data. In this case, retry until we get some data or reach EOF. 484 while True: 485 if self._decompressor.eof: 486 # Ending case: we've come to the end of a member in the file, 487 # so finish up this member, and read a new gzip header. 488 # Check the CRC and file size, and set the flag so we read 489 # a new member 490 self._read_eof() 491 self._new_member = True 492 self._decompressor = self._decomp_factory( 493 **self._decomp_args) 494 495 if self._new_member: 496 # If the _new_member flag is set, we have to 497 # jump to the next member, if there is one. 498 self._init_read() 499 if not self._read_gzip_header(): 500 self._size = self._pos 501 return b"" 502 self._new_member = False 503 504 # Read a chunk of data from the file 505 buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) 506 507 uncompress = self._decompressor.decompress(buf, size) 508 if self._decompressor.unconsumed_tail != b"": 509 self._fp.prepend(self._decompressor.unconsumed_tail) 510 elif self._decompressor.unused_data != b"": 511 # Prepend the already read bytes to the fileobj so they can 512 # be seen by _read_eof() and _read_gzip_header() 513 self._fp.prepend(self._decompressor.unused_data) 514 515 if uncompress != b"": 516 break 517 if buf == b"": 518 raise EOFError("Compressed file ended before the " 519 "end-of-stream marker was reached") 520 521 self._add_read_data( uncompress ) 522 self._pos += len(uncompress) 523 return uncompress 524 525 def _add_read_data(self, data): 526 self._crc = zlib.crc32(data, self._crc) 527 self._stream_size = self._stream_size + len(data) 528 529 def _read_eof(self): 530 # We've read to the end of the file 531 # We check that the computed CRC and size of the 532 # uncompressed data matches the stored values. Note that the size 533 # stored is the true file size mod 2**32. 534 crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8)) 535 if crc32 != self._crc: 536 raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32), 537 hex(self._crc))) 538 elif isize != (self._stream_size & 0xffffffff): 539 raise BadGzipFile("Incorrect length of data produced") 540 541 # Gzip files can be padded with zeroes and still have archives. 542 # Consume all zero bytes and set the file position to the first 543 # non-zero byte. See http://www.gzip.org/#faq8 544 c = b"\x00" 545 while c == b"\x00": 546 c = self._fp.read(1) 547 if c: 548 self._fp.prepend(c) 549 550 def _rewind(self): 551 super()._rewind() 552 self._new_member = True 553 554 555def _create_simple_gzip_header(compresslevel: int, 556 mtime = None) -> bytes: 557 """ 558 Write a simple gzip header with no extra fields. 559 :param compresslevel: Compresslevel used to determine the xfl bytes. 560 :param mtime: The mtime (must support conversion to a 32-bit integer). 561 :return: A bytes object representing the gzip header. 562 """ 563 if mtime is None: 564 mtime = time.time() 565 if compresslevel == _COMPRESS_LEVEL_BEST: 566 xfl = 2 567 elif compresslevel == _COMPRESS_LEVEL_FAST: 568 xfl = 4 569 else: 570 xfl = 0 571 # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra 572 # fields added to header), mtime, xfl and os (255 for unknown OS). 573 return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255) 574 575 576def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None): 577 """Compress data in one shot and return the compressed string. 578 579 compresslevel sets the compression level in range of 0-9. 580 mtime can be used to set the modification time. The modification time is 581 set to the current time by default. 582 """ 583 if mtime == 0: 584 # Use zlib as it creates the header with 0 mtime by default. 585 # This is faster and with less overhead. 586 return zlib.compress(data, level=compresslevel, wbits=31) 587 header = _create_simple_gzip_header(compresslevel, mtime) 588 trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff)) 589 # Wbits=-15 creates a raw deflate block. 590 return (header + zlib.compress(data, level=compresslevel, wbits=-15) + 591 trailer) 592 593 594def decompress(data): 595 """Decompress a gzip compressed string in one shot. 596 Return the decompressed string. 597 """ 598 decompressed_members = [] 599 while True: 600 fp = io.BytesIO(data) 601 if _read_gzip_header(fp) is None: 602 return b"".join(decompressed_members) 603 # Use a zlib raw deflate compressor 604 do = zlib.decompressobj(wbits=-zlib.MAX_WBITS) 605 # Read all the data except the header 606 decompressed = do.decompress(data[fp.tell():]) 607 if not do.eof or len(do.unused_data) < 8: 608 raise EOFError("Compressed file ended before the end-of-stream " 609 "marker was reached") 610 crc, length = struct.unpack("<II", do.unused_data[:8]) 611 if crc != zlib.crc32(decompressed): 612 raise BadGzipFile("CRC check failed") 613 if length != (len(decompressed) & 0xffffffff): 614 raise BadGzipFile("Incorrect length of data produced") 615 decompressed_members.append(decompressed) 616 data = do.unused_data[8:].lstrip(b"\x00") 617 618 619def main(): 620 from argparse import ArgumentParser 621 parser = ArgumentParser(description= 622 "A simple command line interface for the gzip module: act like gzip, " 623 "but do not delete the input file.") 624 group = parser.add_mutually_exclusive_group() 625 group.add_argument('--fast', action='store_true', help='compress faster') 626 group.add_argument('--best', action='store_true', help='compress better') 627 group.add_argument("-d", "--decompress", action="store_true", 628 help="act like gunzip instead of gzip") 629 630 parser.add_argument("args", nargs="*", default=["-"], metavar='file') 631 args = parser.parse_args() 632 633 compresslevel = _COMPRESS_LEVEL_TRADEOFF 634 if args.fast: 635 compresslevel = _COMPRESS_LEVEL_FAST 636 elif args.best: 637 compresslevel = _COMPRESS_LEVEL_BEST 638 639 for arg in args.args: 640 if args.decompress: 641 if arg == "-": 642 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer) 643 g = sys.stdout.buffer 644 else: 645 if arg[-3:] != ".gz": 646 sys.exit(f"filename doesn't end in .gz: {arg!r}") 647 f = open(arg, "rb") 648 g = builtins.open(arg[:-3], "wb") 649 else: 650 if arg == "-": 651 f = sys.stdin.buffer 652 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer, 653 compresslevel=compresslevel) 654 else: 655 f = builtins.open(arg, "rb") 656 g = open(arg + ".gz", "wb") 657 while True: 658 chunk = f.read(io.DEFAULT_BUFFER_SIZE) 659 if not chunk: 660 break 661 g.write(chunk) 662 if g is not sys.stdout.buffer: 663 g.close() 664 if f is not sys.stdin.buffer: 665 f.close() 666 667if __name__ == '__main__': 668 main() 669