1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import io
11import __builtin__
12
13__all__ = ["GzipFile","open"]

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)
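
# Illustrative sketch, not part of the module's public interface: write a
# short string through the open() shorthand above and read it back.  The
# default path "example.gz" is only a placeholder.
def _example_roundtrip(path="example.gz"):
    f = open(path, "wb")             # this module's open(), not the builtin
    try:
        f.write("Hello, gzip!\n")
    finally:
        f.close()
    f = open(path, "rb")
    try:
        return f.read() == "Hello, gzip!\n"
    finally:
        f.close()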

class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10 MB

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.
        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        (See the note following this method for a brief in-memory example.)

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, basestring) or filename == '<fdopen>':
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()
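    # Illustrative note, not part of the class API: to compress into an
    # in-memory buffer with reproducible output, pass the buffer as fileobj
    # and pin the header timestamp with mtime.  A sketch (buf, gz and
    # "data.txt" are hypothetical names):
    #
    #     buf = io.BytesIO()
    #     gz = GzipFile(filename="data.txt", mode="wb", fileobj=buf, mtime=0)
    #     gz.write("payload")
    #     gz.close()
    #     compressed = buf.getvalue()
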
    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, str):
                fname = fname.encode('latin-1')
            if fname.endswith('.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = ''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

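    # For reference, the member header emitted by _write_gzip_header() above
    # follows RFC 1952:
    #
    #     offset  size  field
    #     0       2     magic bytes '\037\213'
    #     2       1     compression method (8 = deflate)
    #     3       1     flags (only FNAME is ever set here)
    #     4       4     modification time, little-endian
    #     8       1     extra flags ('\002' = slowest/maximum compression)
    #     9       1     operating system ('\377' = unknown)
    #     10      ...   original filename, NUL-terminated (only if FNAME set)
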
    def _init_read(self):
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)     # skip the extra flags (XFL) and OS bytes

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffffL):
            raise IOError, "Incorrect length of data produced"

        # Gzip files can be padded with zeroes and still be valid archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
360        c = "\x00"
361        while c == "\x00":
362            c = self.fileobj.read(1)
363        if c:
364            self.fileobj.seek(-1, 1)
365
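    # Note: a gzip file may contain several members concatenated back to
    # back (e.g. the output of "cat a.gz b.gz > c.gz").  The _read() and
    # _read_eof() machinery above steps through every member and presents
    # their uncompressed contents as one continuous stream.
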
    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffffL)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

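    # Illustrative note: seeking is emulated rather than random-access.  In
    # read mode a backwards seek rewinds to the start of the stream and
    # decompresses forward again; in write mode only forward seeks are
    # allowed and the gap is filled with zero bytes; seeking from the end
    # is not supported.  A sketch ("data.gz" is a hypothetical file):
    #
    #     f = GzipFile("data.gz", "rb")
    #     f.read(10)
    #     f.seek(0)       # rewinds to the beginning
    #     f.seek(4, 1)    # relative seek: reads and discards 4 bytes
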
    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

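# Illustrative sketch, not part of the module's public interface: compress a
# few lines into an in-memory buffer and read them back one at a time with
# readline().
def _example_readline():
    buf = io.BytesIO()
    writer = GzipFile(filename="", mode="wb", fileobj=buf)
    writer.write("first line\nsecond line\n")
    writer.close()
    buf.seek(0)
    reader = GzipFile(mode="rb", fileobj=buf)
    lines = []
    while True:
        line = reader.readline()
        if not line:
            break
        lines.append(line)
    reader.close()
    return lines                     # ["first line\n", "second line\n"]
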

def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()
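
# Command-line behaviour of _test() above (for illustration): "python gzip.py
# FILE" writes a compressed copy to FILE.gz, "python gzip.py -d FILE.gz"
# decompresses it to FILE, and the name "-" reads standard input or writes
# standard output instead.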