"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct
import sys
import time
import os
import zlib
import io
try:
    import __builtin__
except ImportError:
    # Python 3 renamed the module.  Keep the old name bound so the code
    # below can still reach the real open() that this module shadows.
    import builtins as __builtin__

try:
    _string_types = basestring
except NameError:
    # Python 3 has no basestring; accept text and byte filenames.
    _string_types = (str, bytes)

__all__ = ["GzipFile", "open"]

# Bits of the FLG byte in the gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2


def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer."""
    output.write(struct.pack("<L", value))


def read32(input):
    """Read 4 bytes from input; return them as a little-endian unsigned int."""
    return struct.unpack("<I", input.read(4))[0]


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)


class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # Set only when the constructor opened the underlying file itself;
    # close() closes the underlying file in that case and no other.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an in-memory buffer such as StringIO or io.BytesIO, or any
        other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            filename = getattr(fileobj, 'name', '')
            if (not isinstance(filename, _string_types)
                    or filename == '<fdopen>'):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # If we opened the file ourselves, do not leave the handle
            # dangling when rejecting the mode.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode a '.gz' suffix is appended when the name lacks one.
        Emits a DeprecationWarning on every access.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        """Return '<gzip ...>' wrapping the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file.

        The header carries the base name of self.name (without a trailing
        '.gz') when it can be encoded as Latin-1, and self.mtime, or the
        current time when self.mtime is None.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(struct.pack("<B", flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags byte
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from the underlying file.

        Raises IOError if the file ends first.  IOError rather than EOFError
        is used on purpose: read() treats EOFError from _read() as a normal
        end of data, which would hide a truncated header.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            more = self.fileobj.read(n - len(data))
            if not more:
                raise IOError("Compressed file ended before the end of "
                              "the gzip header")
            data += more
        return data

    def _read_gzip_header(self):
        """Parse and skip one gzip member header.

        Stores the header timestamp in self.mtime; every other field is
        discarded.  Raises IOError if the magic number or the compression
        method is wrong, or if the fixed-size parts of the header are
        truncated.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self._read_exact(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self._read_exact(1))
        self.mtime = struct.unpack("<I", self._read_exact(4))[0]
        # Skip the extra-flags and OS bytes.
        self._read_exact(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = struct.unpack("<H", self._read_exact(2))[0]
            self._read_exact(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Returns the number of uncompressed bytes consumed, len(data).
        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Fewer bytes than requested are returned only when the end of the
        compressed data is reached.  Raises ValueError if the file is closed
        and IOError (EBADF) if it was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        # Grow the request size geometrically, capped at max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back so that the next read returns it again.

        Only the counters move; the bytes are still held in self.extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and buffer what they decompress to.

        Raises EOFError once no more data is available.  The underlying file
        must support tell() and seek(), which are used to detect the end of
        the file and to step back over bytes read past the end of a member.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer; update CRC and size.

        Bytes in front of the current stream position are dropped from the
        buffer first.
        """
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Check the member trailer, then skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match what was
        decompressed.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  The underlying file is closed only if this object
        opened it itself.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file.

        zlib_mode is passed to the compressor's flush().  In read mode only
        the closed check is performed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Return True; seeking is emulated by seek()."""
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative to the current position);
        seeking from the end raises ValueError.  In write mode only forward
        seeks are possible (the gap is filled with zero bytes) and a backward
        seek raises IOError.  In read mode a backward seek rewinds and
        re-reads from the start; a target before the start of the stream
        leaves the position at 0.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            full_chunks, tail = divmod(offset - self.offset, 1024)
            zeros = 1024 * b'\0'
            while full_chunks:
                self.write(zeros)
                full_chunks -= 1
            self.write(tail * b'\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            # Clamp at zero: a negative count would otherwise turn into a
            # bogus forward read through the modulo below.
            count = max(offset - self.offset, 0)
            full_chunks, tail = divmod(count, 1024)
            while full_chunks:
                self.read(1024)
                full_chunks -= 1
            self.read(tail)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        If size is non-negative, at most size bytes are returned.  An empty
        string is returned at the end of the data.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs)   # Return resulting line


def _test():
    """Command-line driver: compress the named files, or decompress with -d."""
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    # Python 3 wraps the standard streams in text layers and exposes the
    # byte streams as .buffer; Python 2 streams are used as they are.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=stdin)
                g = stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: " + repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = stdin
                g = GzipFile(filename="", mode="wb", fileobj=stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not stdout:
            g.close()
        if f is not stdin:
            f.close()


if __name__ == '__main__':
    _test()