util/bot/extract.py

# Copyright (c) 2015, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

"""Extracts archives."""


import hashlib
import optparse
import os
import os.path
import tarfile
import shutil
import sys
import zipfile


def CheckedJoin(output, path):
  """
  CheckedJoin returns os.path.join(output, path). It does sanity checks to
  ensure the resulting path is under output, but shouldn't be used on untrusted
  input.
  """
  path = os.path.normpath(path)
  if os.path.isabs(path) or path.startswith('.'):
    raise ValueError(path)
  return os.path.join(output, path)


class FileEntry(object):
  def __init__(self, path, mode, fileobj):
    self.path = path
    self.mode = mode
    self.fileobj = fileobj


class SymlinkEntry(object):
  def __init__(self, path, mode, target):
    self.path = path
    self.mode = mode
    self.target = target


def IterateZip(path):
  """
  IterateZip opens the zip file at path and returns a generator of entry objects
  for each file in it.
  """
  with zipfile.ZipFile(path, 'r') as zip_file:
    for info in zip_file.infolist():
      if info.filename.endswith('/'):
        continue
      yield FileEntry(info.filename, None, zip_file.open(info))


def IterateTar(path, compression):
  """
  IterateTar opens the tar.gz or tar.bz2 file at path and returns a generator of
  entry objects for each file in it.
  """
  with tarfile.open(path, 'r:' + compression) as tar_file:
    for info in tar_file:
      if info.isdir():
        pass
      elif info.issym():
        yield SymlinkEntry(info.name, None, info.linkname)
      elif info.isfile():
        yield FileEntry(info.name, info.mode, tar_file.extractfile(info))
      else:
        raise ValueError('Unknown entry type "%s"' % (info.name, ))


def main(args):
  parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
  parser.add_option('--no-prefix', dest='no_prefix', action='store_true',
                    help='Do not remove a prefix from paths in the archive.')
  options, args = parser.parse_args(args)

  if len(args) != 2:
    parser.print_help()
    return 1

  archive, output = args

  if not os.path.exists(archive):
    # Skip archives that weren't downloaded.
    return 0

  with open(archive, 'rb') as f:
    sha256 = hashlib.sha256()
    while True:
      chunk = f.read(1024 * 1024)
      if not chunk:
        break
      sha256.update(chunk)
    digest = sha256.hexdigest()

  stamp_path = os.path.join(output, ".boringssl_archive_digest")
  if os.path.exists(stamp_path):
    with open(stamp_path) as f:
      if f.read().strip() == digest:
        print("Already up-to-date.")
        return 0

  if archive.endswith('.zip'):
    entries = IterateZip(archive)
  elif archive.endswith('.tar.gz'):
    entries = IterateTar(archive, 'gz')
  elif archive.endswith('.tar.bz2'):
    entries = IterateTar(archive, 'bz2')
  elif archive.endswith('.tar.xz'):
    entries = IterateTar(archive, 'xz')
  else:
    raise ValueError(archive)

  try:
    if os.path.exists(output):
      print("Removing %s" % (output, ))
      shutil.rmtree(output)

    print("Extracting %s to %s" % (archive, output))
    prefix = None
    num_extracted = 0
    for entry in entries:
      # Even on Windows, zip files must always use forward slashes.
      if '\\' in entry.path or entry.path.startswith('/'):
        raise ValueError(entry.path)

      if not options.no_prefix:
        new_prefix, rest = entry.path.split('/', 1)

        # Ensure the archive is consistent.
        if prefix is None:
          prefix = new_prefix
        if prefix != new_prefix:
          raise ValueError((prefix, new_prefix))
      else:
        rest = entry.path

      # Extract the file into the output directory.
      fixed_path = CheckedJoin(output, rest)
      if not os.path.isdir(os.path.dirname(fixed_path)):
        os.makedirs(os.path.dirname(fixed_path))
      if isinstance(entry, FileEntry):
        with open(fixed_path, 'wb') as out:
          shutil.copyfileobj(entry.fileobj, out)
      elif isinstance(entry, SymlinkEntry):
        os.symlink(entry.target, fixed_path)
      else:
        raise TypeError('unknown entry type')

      # Fix up permissions if needbe.
      # TODO(davidben): To be extra tidy, this should only track the execute bit
      # as in git.
      if entry.mode is not None:
        os.chmod(fixed_path, entry.mode)

      # Print every 100 files, so bots do not time out on large archives.
      num_extracted += 1
      if num_extracted % 100 == 0:
        print("Extracted %d files..." % (num_extracted,))
  finally:
    entries.close()

  with open(stamp_path, 'w') as f:
    f.write(digest)

  print("Done. Extracted %d files." % (num_extracted,))
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))