import abc
import datetime
import glob
import json
import logging
import os
import re
import shutil
import six

import common

from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

try:
    from autotest_lib.utils.frozen_chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


SPECIAL_TASK_PATTERN = r'.*/hosts/[^/]+/(\d+)-[^/]+'

def is_job_expired(age_limit, timestamp):
    """Check whether a job timestamp is older than an age limit.

    @param age_limit: Minimum age, measured in days.  If the value is
                      not positive, the job is always expired.
    @param timestamp: Timestamp of the job whose age we are checking.
                      The format must match time_utils.TIME_FMT.

    @returns True if the job is old enough to be expired.
    """
    if age_limit <= 0:
        return True
    job_time = time_utils.time_string_to_datetime(timestamp)
    expiration = job_time + datetime.timedelta(days=age_limit)
    return datetime.datetime.now() >= expiration
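
# Illustrative example (not exercised by this module): with age_limit=7 and
# assuming time_utils.TIME_FMT has the form 'YYYY-MM-DD HH:MM:SS', a job with
# timestamp '2020-01-01 00:00:00' is considered expired once the current time
# reaches 2020-01-08 00:00:00 or later:
#     is_job_expired(7, '2020-01-01 00:00:00')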


def get_job_id_or_task_id(result_dir):
    """Extract the job id or special task id from a result directory path.

    @param result_dir: path to the result dir.
            For test job:
            /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
            The hostname at the end is optional.
            For special task:
            /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup

    @returns: str representing the job id or task id.  Returns None if the
        job or task id cannot be parsed from the result_dir.
    """
    if not result_dir:
        return None
    result_dir = os.path.abspath(result_dir)
    # Result folder for a job running inside a container has only the job id.
    ssp_job_pattern = r'.*/(\d+)$'
    # Try to get the job ID from the last pattern of number-text. This avoids
    # issues with paths like 123-results/456-debug_user, in which 456 is the
    # real job ID.
    m_job = re.findall(r'.*/(\d+)-[^/]+', result_dir)
    if m_job:
        return m_job[-1]
    m_special_task = re.match(SPECIAL_TASK_PATTERN, result_dir)
    if m_special_task:
        return m_special_task.group(1)
    m_ssp_job_pattern = re.match(ssp_job_pattern, result_dir)
    if m_ssp_job_pattern and utils.is_in_container():
        return m_ssp_job_pattern.group(1)
    return _get_swarming_run_id(result_dir)
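
# Illustrative examples (paths taken from the docstring above):
#   get_job_id_or_task_id(
#       '/usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6')
#   returns '2032', while
#   get_job_id_or_task_id(
#       '/usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup')
#   returns '1343'.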


def _get_swarming_run_id(path):
    """Extract the Swarming run_id for a Skylab task from the result path."""
    # Legacy swarming results are in directories like
    #   .../results/swarming-3e4391423c3a4311
    # In particular, the ending digit is never 0.
    m_legacy_path = re.match('.*/swarming-([0-9a-fA-F]*[1-9a-fA-F])$', path)
    if m_legacy_path:
        return m_legacy_path.group(1)
    # New style swarming results are in directories like
    #   .../results/swarming-3e4391423c3a4310/1
    # - Results are one directory deeper.
    # - Ending digit of first directory is always 0.
    m_path = re.match('.*/swarming-([0-9a-fA-F]*)0/([1-9a-fA-F])$', path)
    if m_path:
        return m_path.group(1) + m_path.group(2)
    return None
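
# Illustrative examples (directory layouts follow the comments above):
#   .../results/swarming-3e4391423c3a4311    -> run_id '3e4391423c3a4311'
#   .../results/swarming-3e4391423c3a4310/1  -> run_id '3e4391423c3a4311'
# For new style results, the run_id is formed by dropping the trailing 0 of
# the first directory's hex suffix and appending the run-number subdirectory.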


class _JobDirectory(six.with_metaclass(abc.ABCMeta, object)):
    """State associated with a job to be offloaded.

    The full life-cycle of a job (including failure events that
    normally don't occur) looks like this:
      1. The job's results directory is discovered by
         `get_job_directories()`, and a job instance is created for it.
      2. Calls to `offload()` have no effect so long as the job
         isn't complete in the database and the job isn't expired
         according to the `age_limit` parameter.
      3. Eventually, the job is both finished and expired.  The next
         call to `offload()` makes the first attempt to offload the
         directory to GS.  Offload is attempted, but fails to complete
         (e.g. because of a GS problem).
      4. Finally, a call to `offload()` succeeds, and the directory no
         longer exists.  Now `is_offloaded()` is true, so the job
         instance is deleted, and future failures will not mention this
         directory any more.

    Only steps 1. and 4. are guaranteed to occur.  The others depend
    on the timing of calls to `offload()`, and on the reliability of
    the actual offload process.

    """
    GLOB_PATTERN = None  # must be redefined in subclass

    def __init__(self, resultsdir):
        self.dirname = resultsdir
        self._id = get_job_id_or_task_id(resultsdir)
        self.offload_count = 0
        self.first_offload_start = 0

    @classmethod
    def get_job_directories(cls):
        """Return a list of directories of jobs that need offloading."""
        return [d for d in glob.glob(cls.GLOB_PATTERN) if os.path.isdir(d)]

    @abc.abstractmethod
    def get_timestamp_if_finished(self):
        """Return this job's timestamp from the database.

        If the database has not marked the job as finished, return
        `None`.  Otherwise, return a timestamp for the job.  The
        timestamp is to be used to determine expiration in
        `is_job_expired()`.

        @return Return `None` if the job is still running; otherwise
                return a string with a timestamp in the appropriate
                format.
        """
        raise NotImplementedError("_JobDirectory.get_timestamp_if_finished")

    def process_gs_instructions(self):
        """Process any gs_offloader instructions for this job.

        @returns True if there is anything left to offload, False otherwise.
        """
        # By default, the directory is still offloaded.
        return True


NO_OFFLOAD_README = """These results have been deleted rather than offloaded.
This is the expected behavior for passing jobs from the Commit Queue."""


class RegularJobDirectory(_JobDirectory):
    """Subclass of _JobDirectory for regular test jobs."""

    GLOB_PATTERN = '[0-9]*-*'

    def process_gs_instructions(self):
        """Process any gs_offloader instructions for this job.

        @returns True if there is anything left to offload, False otherwise.
        """
        # Go through the gs_offloader instructions file for each test in this
        # job.
        for path in glob.glob(
                os.path.join(self.dirname, '*',
                             constants.GS_OFFLOADER_INSTRUCTIONS)):
            with open(path, 'r') as f:
                gs_off_instructions = json.load(f)
            if gs_off_instructions.get(constants.GS_OFFLOADER_NO_OFFLOAD):
                dirname = os.path.dirname(path)
                _remove_log_directory_contents(dirname)

        # Finally, check if there's anything left to offload.
        if os.path.exists(self.dirname) and not os.listdir(self.dirname):
            shutil.rmtree(self.dirname)
            return False
        return True

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        @returns the latest HQE finished_on time.  If no HQE has a
                 finished_on time, returns the job's created_on time.
        """
        entry = _cached_afe().get_jobs(id=self._id, finished=True)
        if not entry:
            return None
        hqes = _cached_afe().get_host_queue_entries(finished_on__isnull=False,
                                                    job_id=self._id)
        if not hqes:
            return entry[0].created_on
        # While most jobs have one HQE, some can have multiple, so check them
        # all.
        return max([hqe.finished_on for hqe in hqes])


def _remove_log_directory_contents(dirpath):
    """Remove log directory contents.

    Leave a note explaining what has happened to the logs.

    @param dirpath: Path to log directory.
    """
    shutil.rmtree(dirpath)
    os.mkdir(dirpath)
    breadcrumb_name = os.path.join(dirpath, 'logs-removed-readme.txt')
    with open(breadcrumb_name, 'w') as f:
        f.write(NO_OFFLOAD_README)


class SpecialJobDirectory(_JobDirectory):
    """Subclass of _JobDirectory for special (per-host) jobs."""

    GLOB_PATTERN = 'hosts/*/[0-9]*-*'

    def __init__(self, resultsdir):
        super(SpecialJobDirectory, self).__init__(resultsdir)

    def get_timestamp_if_finished(self):
        """Return the task's time_finished, or None if it isn't complete."""
        entry = _cached_afe().get_special_tasks(id=self._id, is_complete=True)
        return entry[0].time_finished if entry else None


def _find_results_dir(dirname):
    """Return subdirectories of dirname that contain an offload marker."""
    subdirs = []
    for root, dirs, files in os.walk(dirname, topdown=True):
        for f in files:
            if f == _OFFLOAD_MARKER:
                subdirs.append(root)
    return subdirs


_OFFLOAD_MARKER = ".ready_for_offload"
_marker_parse_error_metric = metrics.Counter(
    'chromeos/autotest/gs_offloader/offload_marker_parse_errors',
    description='Errors parsing the offload marker file')
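
# Note (illustrative, not a guarantee from this module): the offload marker
# file is expected to hold a single integer epoch timestamp.
# SwarmingJobDirectory.get_timestamp_if_finished below parses it and converts
# it with time_utils.epoch_time_to_date_string, so a marker containing
# '1600000000' maps to the corresponding date string.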


class SwarmingJobDirectory(_JobDirectory):
    """Subclass of _JobDirectory for Skylab swarming jobs."""

    @classmethod
    def get_job_directories(cls):
        """Return a list of directories of jobs that need offloading."""
        # Legacy swarming results are in directories like
        #   .../results/swarming-3e4391423c3a4311
        # In particular, the ending digit is never 0.
        jobdirs = [
                d for d in glob.glob('swarming-[0-9a-f]*[1-9a-f]')
                if os.path.isdir(d)
        ]
        # New style swarming results are in directories like
        #   .../results/swarming-3e4391423c3a4310/1
        # - Results are one directory deeper.
        # - Ending digit of first directory is always 0.
        new_style_topdir = [
                d for d in glob.glob('swarming-[0-9a-f]*0') if os.path.isdir(d)
        ]
        # When there are multiple tests run in one test_runner build,
        # the results will be one level deeper with the test_id
        # as one further subdirectory.
        # Example: .../results/swarming-3e4391423c3a4310/1/test_id
        for topdir in new_style_topdir:
            for d in glob.glob('%s/[1-9a-f]*' % topdir):
                subdirs = _find_results_dir(d)
                jobdirs += subdirs

        return jobdirs

    def get_timestamp_if_finished(self):
        """Get the timestamp to use for finished jobs.

        @returns the timestamp read from the offload marker file, formatted
                 as a date string.  Returns None if the marker is missing or
                 cannot be parsed.
        """
        marker_path = os.path.join(self.dirname, _OFFLOAD_MARKER)
        try:
            with open(marker_path) as f:
                ts_string = f.read().strip()
        except IOError:
            # Most commonly the marker simply does not exist yet.
            return None
        try:
            ts = int(ts_string)
            return time_utils.epoch_time_to_date_string(ts)
        except ValueError as e:
            logging.debug('Error parsing %s for %s: %s', _OFFLOAD_MARKER,
                          self.dirname, e)
            _marker_parse_error_metric.increment()
            return None


_AFE = None


def _cached_afe():
    """Return a lazily-created, module-wide RetryingAFE instance."""
    global _AFE
    if _AFE is None:
        _AFE = frontend_wrappers.RetryingAFE()
    return _AFE
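
# Minimal usage sketch (illustrative only, not taken from any particular
# caller): enumerate candidate directories, then check whether each one is
# finished and old enough to offload.
#
#     for dirname in RegularJobDirectory.get_job_directories():
#         job = RegularJobDirectory(dirname)
#         timestamp = job.get_timestamp_if_finished()
#         if timestamp and is_job_expired(age_limit=7, timestamp=timestamp):
#             pass  # ready to offload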