toolchain-utils/crosperf/experiment_runner.py

*760c253cSXin Li# -*- coding: utf-8 -*-
*760c253cSXin Li# Copyright 2011 The ChromiumOS Authors
*760c253cSXin Li# Use of this source code is governed by a BSD-style license that can be
*760c253cSXin Li# found in the LICENSE file.
*760c253cSXin Li
*760c253cSXin Li"""The experiment runner module."""
*760c253cSXin Li
*760c253cSXin Liimport getpass
*760c253cSXin Liimport os
*760c253cSXin Liimport shutil
*760c253cSXin Liimport time
*760c253cSXin Li
*760c253cSXin Lifrom cros_utils import command_executer
*760c253cSXin Lifrom cros_utils import logger
*760c253cSXin Lifrom cros_utils.email_sender import EmailSender
*760c253cSXin Lifrom cros_utils.file_utils import FileUtils
*760c253cSXin Lifrom experiment_status import ExperimentStatus
*760c253cSXin Liimport lock_machine
*760c253cSXin Lifrom results_cache import CacheConditions
*760c253cSXin Lifrom results_cache import ResultsCache
*760c253cSXin Lifrom results_report import HTMLResultsReport
*760c253cSXin Lifrom results_report import JSONResultsReport
*760c253cSXin Lifrom results_report import TextResultsReport
*760c253cSXin Lifrom schedv2 import Schedv2
*760c253cSXin Liimport test_flag
*760c253cSXin Li
*760c253cSXin Liimport config
*760c253cSXin Li
*760c253cSXin Li
*760c253cSXin Lidef _WriteJSONReportToFile(experiment, results_dir, json_report):
*760c253cSXin Li    """Writes a JSON report to a file in results_dir."""
*760c253cSXin Li    has_llvm = any("llvm" in l.compiler for l in experiment.labels)
*760c253cSXin Li    compiler_string = "llvm" if has_llvm else "gcc"
*760c253cSXin Li    board = experiment.labels[0].board
*760c253cSXin Li    filename = "report_%s_%s_%s.%s.json" % (
*760c253cSXin Li        board,
*760c253cSXin Li        json_report.date,
*760c253cSXin Li        json_report.time.replace(":", "."),
*760c253cSXin Li        compiler_string,
*760c253cSXin Li    )
*760c253cSXin Li    fullname = os.path.join(results_dir, filename)
*760c253cSXin Li    report_text = json_report.GetReport()
*760c253cSXin Li    with open(fullname, "w") as out_file:
*760c253cSXin Li        out_file.write(report_text)
*760c253cSXin Li
*760c253cSXin Li
class ExperimentRunner(object):
    """Runs an experiment end to end.

    Run() locks the requested machines (unless disabled), starts the
    experiment, reports progress while it is running, and finally prints,
    stores and optionally emails the results.
    """

    # Minimum number of seconds between two status reports while waiting for
    # the experiment to complete. __init__ lowers this to 10 for non-verbose
    # log levels.
    STATUS_TIME_DELAY = 30
    # Number of seconds to sleep between two completion polls.
    THREAD_MONITOR_DELAY = 2

    # Result codes returned by _StoreResults() and Run().
    SUCCEEDED = 0
    HAS_FAILURE = 1
    ALL_FAILED = 2

    def __init__(
        self,
        experiment,
        json_report,
        using_schedv2=False,
        log=None,
        cmd_exec=None,
    ):
        """Set up the runner for one experiment.

        Args:
          experiment: The experiment to run.
          json_report: If truthy, a JSON report is written next to the HTML
            report when results are stored.
          using_schedv2: If True, a Schedv2 scheduler is attached to the
            experiment before it is run.
          log: Logger to use; defaults to a logger for experiment.log_dir.
          cmd_exec: Command executer to use; defaults to one built on the
            logger.
        """
        self._experiment = experiment
        self.l = log or logger.GetLogger(experiment.log_dir)
        self._ce = cmd_exec or command_executer.GetCommandExecuter(self.l)
        self._terminated = False
        self.json_report = json_report
        self.locked_machines = []
        if experiment.log_level != "verbose":
            self.STATUS_TIME_DELAY = 10

        # Setting this to True will use crosperf sched v2 (feature in progress).
        self._using_schedv2 = using_schedv2

    def _GetMachineList(self):
        """Return a list of all requested machines.

        Create a list of all the requested machines, both global requests and
        label-specific requests, and return the list.

        Returns:
          The experiment's own remote list (not a copy).
        """
        machines = self._experiment.remote
        # All Label.remote is a sublist of experiment.remote.
        for l in self._experiment.labels:
            for r in l.remote:
                assert r in machines
        return machines

    def _UpdateMachineList(self, locked_machines):
        """Update machines lists to contain only locked machines.

        Go through all the lists of requested machines, both global and
        label-specific requests, and remove any machine that we were not
        able to lock.

        Args:
          locked_machines: A list of the machines we successfully locked.
        """
        # Filter through slice assignment rather than calling remove() while
        # iterating: removing during iteration skips the element that follows
        # each removal, which would leave unlocked machines in the lists.
        # Slice assignment still updates the list objects in place, so other
        # holders of these lists see the change.
        remote = self._experiment.remote
        remote[:] = [m for m in remote if m in locked_machines]

        for l in self._experiment.labels:
            l.remote[:] = [m for m in l.remote if m in locked_machines]

    def _GetMachineType(self, lock_mgr, machine):
        """Get where is the machine from.

        Args:
          lock_mgr: Lock manager used to query crosfleet membership.
          machine: Name or address of the machine.

        Returns:
          The location of the machine: local or crosfleet

        Raises:
          RuntimeError: The name contains "chromeos" but the lock manager
            does not find the machine in crosfleet.
        """
        # We assume that lab machine always starts with chromeos*, and local
        # machines are ip address.
        if "chromeos" in machine:
            if lock_mgr.CheckMachineInCrosfleet(machine):
                return "crosfleet"
            raise RuntimeError("Lab machine not in Crosfleet.")
        return "local"

    def _LockAllMachines(self, experiment):
        """Attempt to globally lock all of the machines requested for run.

        This method tries to lock all machines requested for this crosperf run
        automatically, choosing the mode by machine type, to prevent any other
        crosperf runs from being able to update/use the machines while this
        experiment is running:
          - Crosfleet machines: Use crosfleet lease-dut mechanism to lease
          - Local machines: Use file lock mechanism to lock

        In test mode nothing is locked and every requested machine is treated
        as locked.

        Args:
          experiment: The experiment whose machines are to be locked.

        Raises:
          RuntimeError: No machine could be locked, or a lab machine is not
            in crosfleet.
        """
        if test_flag.GetTestMode():
            self.locked_machines = self._GetMachineList()
            experiment.locked_machines = self.locked_machines
        else:
            experiment.lock_mgr = lock_machine.LockManager(
                self._GetMachineList(),
                "",
                experiment.labels[0].chromeos_root,
                experiment.locks_dir,
                log=self.l,
            )
            for m in experiment.lock_mgr.machines:
                machine_type = self._GetMachineType(experiment.lock_mgr, m)
                if machine_type == "local":
                    experiment.lock_mgr.AddMachineToLocal(m)
                elif machine_type == "crosfleet":
                    experiment.lock_mgr.AddMachineToCrosfleet(m)
            machine_states = experiment.lock_mgr.GetMachineStates("lock")
            experiment.lock_mgr.CheckMachineLocks(machine_states, "lock")
            self.locked_machines = experiment.lock_mgr.UpdateMachines(True)
            experiment.locked_machines = self.locked_machines
            self._UpdateMachineList(self.locked_machines)
            experiment.machine_manager.RemoveNonLockedMachines(
                self.locked_machines
            )
            if not self.locked_machines:
                raise RuntimeError("Unable to lock any machines.")

    def _ClearCacheEntries(self, experiment):
        """Delete the on-disk cache directory of every benchmark run.

        Args:
          experiment: The experiment whose benchmark runs are examined.
        """
        for br in experiment.benchmark_runs:
            cache = ResultsCache()
            cache.Init(
                br.label.chromeos_image,
                br.label.chromeos_root,
                br.benchmark.test_name,
                br.iteration,
                br.test_args,
                br.profiler_args,
                br.machine_manager,
                br.machine,
                br.label.board,
                br.cache_conditions,
                br.logger(),
                br.log_level,
                br.label,
                br.share_cache,
                br.benchmark.suite,
                br.benchmark.show_all_results,
                br.benchmark.run_local,
                br.benchmark.cwp_dso,
            )
            cache_dir = cache.GetCacheDirForWrite()
            if os.path.exists(cache_dir):
                self.l.LogOutput("Removing cache dir: %s" % cache_dir)
                shutil.rmtree(cache_dir)

    def _Run(self, experiment):
        """Lock machines, start the experiment and wait for it to complete.

        While waiting, the status is logged at most once every
        STATUS_TIME_DELAY seconds. experiment.Cleanup() is always called on
        the way out.

        Args:
          experiment: The experiment to run.

        Raises:
          KeyboardInterrupt, SystemExit: Re-raised after the experiment has
            been terminated and the runner has been marked as terminated.
        """
        try:
            # We should not lease machines if tests are launched via `crosfleet
            # create-test`. This is because leasing DUT in crosfleet will create a
            # no-op task on the DUT and new test created will be hanging there.
            # TODO(zhizhouy): Need to check whether machine is ready or not before
            # assigning a test to it.
            if not experiment.no_lock and not experiment.crosfleet:
                self._LockAllMachines(experiment)
            # Calculate all checksums of available/locked machines, to ensure
            # same label has same machines for testing
            experiment.SetCheckSums(forceSameImage=True)
            if self._using_schedv2:
                schedv2 = Schedv2(experiment)
                experiment.set_schedv2(schedv2)
            if CacheConditions.FALSE in experiment.cache_conditions:
                self._ClearCacheEntries(experiment)
            status = ExperimentStatus(experiment)
            experiment.Run()
            last_status_time = 0
            last_status_string = ""
            try:
                if experiment.log_level != "verbose":
                    self.l.LogStartDots()
                while not experiment.IsComplete():
                    if last_status_time + self.STATUS_TIME_DELAY < time.time():
                        last_status_time = time.time()
                        border = "=============================="
                        if experiment.log_level == "verbose":
                            self.l.LogOutput(border)
                            self.l.LogOutput(status.GetProgressString())
                            self.l.LogOutput(status.GetStatusString())
                            self.l.LogOutput(border)
                        else:
                            # Only print the status when it changed; otherwise
                            # just extend the row of progress dots.
                            current_status_string = status.GetStatusString()
                            if current_status_string != last_status_string:
                                self.l.LogEndDots()
                                self.l.LogOutput(border)
                                self.l.LogOutput(current_status_string)
                                self.l.LogOutput(border)
                                last_status_string = current_status_string
                            else:
                                self.l.LogAppendDot()
                    time.sleep(self.THREAD_MONITOR_DELAY)
            except KeyboardInterrupt:
                self._terminated = True
                self.l.LogError("Ctrl-c pressed. Cleaning up...")
                experiment.Terminate()
                raise
            except SystemExit:
                self._terminated = True
                self.l.LogError("Unexpected exit. Cleaning up...")
                experiment.Terminate()
                raise
        finally:
            experiment.Cleanup()

    def _PrintTable(self, experiment):
        """Log the text results report of the experiment."""
        self.l.LogOutput(
            TextResultsReport.FromExperiment(experiment).GetReport()
        )

    def _Email(self, experiment):
        """Email the results report of the experiment.

        Nothing is sent when the "no_email" config is set, or when every
        benchmark run was a cache hit and the experiment has no explicit
        recipients. The current user is always added to the recipients.

        Args:
          experiment: The experiment to report on.
        """
        # Only email by default if a new run was completed.
        send_mail = any(
            not benchmark_run.cache_hit
            for benchmark_run in experiment.benchmark_runs
        )
        if (not send_mail and not experiment.email_to) or config.GetConfig(
            "no_email"
        ):
            return

        label_names = [label.name for label in experiment.labels]
        subject = "%s: %s" % (experiment.name, " vs. ".join(label_names))

        text_report = TextResultsReport.FromExperiment(
            experiment, True
        ).GetReport()
        text_report += (
            "\nResults are stored in %s.\n" % experiment.results_directory
        )
        text_report = "<pre style='font-size: 13px'>%s</pre>" % text_report
        html_report = HTMLResultsReport.FromExperiment(experiment).GetReport()
        attachment = EmailSender.Attachment("report.html", html_report)
        # Work on a copy so that the experiment's own recipient list is not
        # modified as a side effect of sending the email.
        email_to = list(experiment.email_to or [])
        email_to.append(getpass.getuser())
        EmailSender().SendEmail(
            email_to,
            subject,
            text_report,
            attachments=[attachment],
            msg_type="html",
        )

    @staticmethod
    def _SummarizePasses(benchmark_runs):
        """Summarize which benchmarks passed for which labels.

        A run counts as passed when it has a result whose retval is falsy.

        Args:
          benchmark_runs: Iterable of benchmark runs to examine.

        Returns:
          A tuple (all_failed, has_passes). all_failed is True when no run
          passed at all. has_passes is True when, for every label, every
          benchmark has at least one passing run.
        """
        all_failed = True
        # Maps label name -> {benchmark name -> whether any iteration passed}.
        benchmarks_passes = {}
        for benchmark_run in benchmark_runs:
            # Register every (label, benchmark) pair, not just the first
            # benchmark seen for a label, so that a benchmark whose iterations
            # all failed is still counted as a failure.
            label_passes = benchmarks_passes.setdefault(
                benchmark_run.label.name, {}
            )
            label_passes.setdefault(benchmark_run.benchmark.name, False)
            if benchmark_run.result and not benchmark_run.result.retval:
                all_failed = False
                label_passes[benchmark_run.benchmark.name] = True
        has_passes = all(
            all(benchmarks.values())
            for benchmarks in benchmarks_passes.values()
        )
        return all_failed, has_passes

    def _StoreResults(self, experiment):
        """Store the experiment file, run results and reports on disk.

        The results directory is removed and recreated first. If no run
        passed, only the experiment file and the top statistics are written.

        Args:
          experiment: The experiment whose results are stored.

        Returns:
          ALL_FAILED if the run was terminated or no benchmark run passed,
          SUCCEEDED if at least one iteration of every benchmark passed for
          every label, and HAS_FAILURE otherwise.
        """
        if self._terminated:
            return self.ALL_FAILED

        results_directory = experiment.results_directory
        FileUtils().RmDir(results_directory)
        FileUtils().MkDirP(results_directory)
        self.l.LogOutput("Storing experiment file in %s." % results_directory)
        experiment_file_path = os.path.join(results_directory, "experiment.exp")
        FileUtils().WriteFile(experiment_file_path, experiment.experiment_file)

        topstats_file = os.path.join(results_directory, "topstats.log")
        self.l.LogOutput(
            "Storing top statistics of each benchmark run into %s."
            % topstats_file
        )
        with open(topstats_file, "w") as top_fd:
            for benchmark_run in experiment.benchmark_runs:
                if benchmark_run.result:
                    # Header with benchmark run name.
                    top_fd.write("%s\n" % str(benchmark_run))
                    # Formatted string with top statistics.
                    top_fd.write(benchmark_run.result.FormatStringTopCommands())
                    top_fd.write("\n\n")

        # Track if any iterations for a given benchmark has passed for each
        # label. has_passes is set if at least one iteration of all benchmarks
        # has passed for every label.
        all_failed, has_passes = self._SummarizePasses(
            experiment.benchmark_runs
        )
        if all_failed:
            return self.ALL_FAILED

        self.l.LogOutput("Storing results of each benchmark run.")
        for benchmark_run in experiment.benchmark_runs:
            if benchmark_run.result:
                # Keep only alphanumeric characters for the directory name.
                benchmark_run_name = "".join(
                    ch for ch in benchmark_run.name if ch.isalnum()
                )
                benchmark_run_path = os.path.join(
                    results_directory, benchmark_run_name
                )
                if experiment.compress_results:
                    benchmark_run.result.CompressResultsTo(benchmark_run_path)
                else:
                    benchmark_run.result.CopyResultsTo(benchmark_run_path)
                # Don't remove benchmark tmp if it was a cache hit.
                benchmark_run.result.CleanUp(
                    benchmark_run.benchmark.rm_chroot_tmp
                    and not benchmark_run.cache_hit
                )

        self.l.LogOutput("Storing results report in %s." % results_directory)
        results_table_path = os.path.join(results_directory, "results.html")
        report = HTMLResultsReport.FromExperiment(experiment).GetReport()
        if self.json_report:
            json_report = JSONResultsReport.FromExperiment(
                experiment, json_args={"indent": 2}
            )
            _WriteJSONReportToFile(experiment, results_directory, json_report)

        FileUtils().WriteFile(results_table_path, report)

        self.l.LogOutput(
            "Storing email message body in %s." % results_directory
        )
        msg_file_path = os.path.join(results_directory, "msg_body.html")
        text_report = TextResultsReport.FromExperiment(
            experiment, True
        ).GetReport()
        text_report += (
            "\nResults are stored in %s.\n" % experiment.results_directory
        )
        msg_body = "<pre style='font-size: 13px'>%s</pre>" % text_report
        FileUtils().WriteFile(msg_file_path, msg_body)

        return self.SUCCEEDED if has_passes else self.HAS_FAILURE

    def Run(self):
        """Run the experiment, then print, store and email its results.

        The reporting steps run even if the experiment itself raised; in that
        case the exception propagates once they are done.

        Returns:
          The result code produced by _StoreResults().
        """
        try:
            self._Run(self._experiment)
        finally:
            # Always print the report at the end of the run.
            self._PrintTable(self._experiment)
            ret = self._StoreResults(self._experiment)
            if ret != self.ALL_FAILED:
                self._Email(self._experiment)
        return ret
*760c253cSXin Li
*760c253cSXin Li
class MockExperimentRunner(ExperimentRunner):
    """ExperimentRunner for tests: each step only logs what it would do."""

    def __init__(self, experiment, json_report):
        """Initialize the base runner with its default optional arguments."""
        super().__init__(experiment, json_report)

    def _Run(self, experiment):
        """Log the name of the experiment instead of running it."""
        announcement = "Would run the following experiment: '%s'."
        self.l.LogOutput(announcement % experiment.name)

    def _PrintTable(self, experiment):
        """Log a placeholder instead of printing the results table."""
        self.l.LogOutput("Would print the experiment table.")

    def _Email(self, experiment):
        """Log a placeholder instead of sending the results email."""
        self.l.LogOutput("Would send result email.")

    def _StoreResults(self, experiment):
        """Log a placeholder instead of storing results; returns None."""
        self.l.LogOutput("Would store the results.")