xref: /aosp_15_r20/external/toolchain-utils/crosperf/experiment_runner.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1*760c253cSXin Li# -*- coding: utf-8 -*-
2*760c253cSXin Li# Copyright 2011 The ChromiumOS Authors
3*760c253cSXin Li# Use of this source code is governed by a BSD-style license that can be
4*760c253cSXin Li# found in the LICENSE file.
5*760c253cSXin Li
6*760c253cSXin Li"""The experiment runner module."""
7*760c253cSXin Li
8*760c253cSXin Liimport getpass
9*760c253cSXin Liimport os
10*760c253cSXin Liimport shutil
11*760c253cSXin Liimport time
12*760c253cSXin Li
13*760c253cSXin Lifrom cros_utils import command_executer
14*760c253cSXin Lifrom cros_utils import logger
15*760c253cSXin Lifrom cros_utils.email_sender import EmailSender
16*760c253cSXin Lifrom cros_utils.file_utils import FileUtils
17*760c253cSXin Lifrom experiment_status import ExperimentStatus
18*760c253cSXin Liimport lock_machine
19*760c253cSXin Lifrom results_cache import CacheConditions
20*760c253cSXin Lifrom results_cache import ResultsCache
21*760c253cSXin Lifrom results_report import HTMLResultsReport
22*760c253cSXin Lifrom results_report import JSONResultsReport
23*760c253cSXin Lifrom results_report import TextResultsReport
24*760c253cSXin Lifrom schedv2 import Schedv2
25*760c253cSXin Liimport test_flag
26*760c253cSXin Li
27*760c253cSXin Liimport config
28*760c253cSXin Li
29*760c253cSXin Li
def _WriteJSONReportToFile(experiment, results_dir, json_report):
    """Save the text of a JSON report under results_dir.

    The file is named report_<board>_<date>_<time>.<compiler>.json, where
    the board comes from the first label, colons in the report time are
    replaced by dots, and the compiler tag is "llvm" when any label's
    compiler string contains "llvm" and "gcc" otherwise.

    Args:
      experiment: Object exposing `labels`; each label provides `compiler`
        and `board`.
      results_dir: Directory in which the report file is created.
      json_report: Report object providing `date`, `time` and `GetReport()`.
    """
    compiler_string = "gcc"
    for label in experiment.labels:
        if "llvm" in label.compiler:
            compiler_string = "llvm"
            break

    board = experiment.labels[0].board
    report_date = json_report.date
    report_time = json_report.time.replace(":", ".")
    filename = "report_%s_%s_%s.%s.json" % (
        board,
        report_date,
        report_time,
        compiler_string,
    )
    fullname = os.path.join(results_dir, filename)

    # Render the report before opening the file so a failing GetReport()
    # does not leave an empty file behind.
    report_text = json_report.GetReport()
    with open(fullname, "w") as out_file:
        out_file.write(report_text)
45*760c253cSXin Li
46*760c253cSXin Li
class ExperimentRunner(object):
    """Runs an experiment, then prints, stores and emails its results.

    `Run()` is the entry point: it locks the requested machines (unless
    disabled), starts the experiment, reports progress until it completes,
    and finally prints the result table, stores the results on disk and
    sends a summary email.
    """

    # Minimum number of seconds between two status reports in _Run().
    STATUS_TIME_DELAY = 30
    # Seconds to sleep between two polls of experiment.IsComplete().
    THREAD_MONITOR_DELAY = 2

    # Status codes returned by _StoreResults() and Run().
    SUCCEEDED = 0
    HAS_FAILURE = 1
    ALL_FAILED = 2

    def __init__(
        self,
        experiment,
        json_report,
        using_schedv2=False,
        log=None,
        cmd_exec=None,
    ):
        """Initialize the runner.

        Args:
          experiment: The experiment to run.
          json_report: If truthy, a JSON report is also written when the
            results are stored.
          using_schedv2: If True, use crosperf sched v2 (feature in
            progress).
          log: Logger to use; defaults to logger.GetLogger() on the
            experiment's log directory.
          cmd_exec: Command executer to use; defaults to
            command_executer.GetCommandExecuter() on the logger.
        """
        self._experiment = experiment
        self.l = log or logger.GetLogger(experiment.log_dir)
        self._ce = cmd_exec or command_executer.GetCommandExecuter(self.l)
        self._terminated = False
        self.json_report = json_report
        self.locked_machines = []
        if experiment.log_level != "verbose":
            # Per-instance override of the class-level default.
            self.STATUS_TIME_DELAY = 10

        # Setting this to True will use crosperf sched v2 (feature in progress).
        self._using_schedv2 = using_schedv2

    def _GetMachineList(self):
        """Return a list of all requested machines.

        Every label-specific remote must already be part of the global
        request list (this is asserted), so the global list is the complete
        list of requested machines.

        Returns:
          The experiment's own `remote` list (not a copy).
        """
        machines = self._experiment.remote
        # All Label.remote is a sublist of experiment.remote.
        for l in self._experiment.labels:
            for r in l.remote:
                assert r in machines
        return machines

    def _UpdateMachineList(self, locked_machines):
        """Update machines lists to contain only locked machines.

        Go through all the lists of requested machines, both global and
        label-specific requests, and remove any machine that we were not
        able to lock. The lists are filtered in place, so other references
        to the same list objects observe the change.

        Args:
          locked_machines: A list of the machines we successfully locked.
        """
        remote_lists = [self._experiment.remote]
        remote_lists.extend(l.remote for l in self._experiment.labels)
        for remote in remote_lists:
            # Build the filtered list first and assign it through a slice:
            # calling remove() while iterating the same list skips the
            # element that follows each removal.
            remote[:] = [m for m in remote if m in locked_machines]

    def _GetMachineType(self, lock_mgr, machine):
        """Get where is the machine from.

        Args:
          lock_mgr: Lock manager providing CheckMachineInCrosfleet().
          machine: Name or address of the machine.

        Returns:
          The location of the machine: local or crosfleet

        Raises:
          RuntimeError: The name contains "chromeos" but the lock manager
            does not report the machine as being in Crosfleet.
        """
        # We assume that lab machine always starts with chromeos*, and local
        # machines are ip address.
        if "chromeos" not in machine:
            return "local"
        if lock_mgr.CheckMachineInCrosfleet(machine):
            return "crosfleet"
        raise RuntimeError("Lab machine not in Crosfleet.")

    def _LockAllMachines(self, experiment):
        """Attempt to globally lock all of the machines requested for run.

        This method tries to lock all machines requested for this crosperf run
        automatically, using one of two modes per machine, to prevent any
        other crosperf runs from being able to update/use the machines while
        this experiment is running:
          - Crosfleet machines: Use crosfleet lease-dut mechanism to lease
          - Local machines: Use file lock mechanism to lock

        In test mode no lock manager is created and every requested machine
        is treated as locked.

        Args:
          experiment: The experiment whose machines should be locked.

        Raises:
          RuntimeError: No machine could be locked, or a lab machine is not
            in Crosfleet.
        """
        if test_flag.GetTestMode():
            self.locked_machines = self._GetMachineList()
            experiment.locked_machines = self.locked_machines
            return

        experiment.lock_mgr = lock_machine.LockManager(
            self._GetMachineList(),
            "",
            experiment.labels[0].chromeos_root,
            experiment.locks_dir,
            log=self.l,
        )
        for m in experiment.lock_mgr.machines:
            machine_type = self._GetMachineType(experiment.lock_mgr, m)
            if machine_type == "local":
                experiment.lock_mgr.AddMachineToLocal(m)
            elif machine_type == "crosfleet":
                experiment.lock_mgr.AddMachineToCrosfleet(m)
        machine_states = experiment.lock_mgr.GetMachineStates("lock")
        experiment.lock_mgr.CheckMachineLocks(machine_states, "lock")
        self.locked_machines = experiment.lock_mgr.UpdateMachines(True)
        experiment.locked_machines = self.locked_machines
        self._UpdateMachineList(self.locked_machines)
        experiment.machine_manager.RemoveNonLockedMachines(
            self.locked_machines
        )
        if not self.locked_machines:
            raise RuntimeError("Unable to lock any machines.")

    def _ClearCacheEntries(self, experiment):
        """Delete the cache directory of every benchmark run, if present.

        Args:
          experiment: The experiment whose benchmark runs are inspected.
        """
        for br in experiment.benchmark_runs:
            cache = ResultsCache()
            cache.Init(
                br.label.chromeos_image,
                br.label.chromeos_root,
                br.benchmark.test_name,
                br.iteration,
                br.test_args,
                br.profiler_args,
                br.machine_manager,
                br.machine,
                br.label.board,
                br.cache_conditions,
                br.logger(),
                br.log_level,
                br.label,
                br.share_cache,
                br.benchmark.suite,
                br.benchmark.show_all_results,
                br.benchmark.run_local,
                br.benchmark.cwp_dso,
            )
            cache_dir = cache.GetCacheDirForWrite()
            if os.path.exists(cache_dir):
                self.l.LogOutput("Removing cache dir: %s" % cache_dir)
                shutil.rmtree(cache_dir)

    def _Run(self, experiment):
        """Lock machines, start the experiment and wait for it to finish.

        While waiting, a status report is logged at most once every
        STATUS_TIME_DELAY seconds. On KeyboardInterrupt or SystemExit the
        experiment is terminated and the exception is re-raised.
        experiment.Cleanup() is always called.

        Args:
          experiment: The experiment to run.
        """
        try:
            # We should not lease machines if tests are launched via `crosfleet
            # create-test`. This is because leasing DUT in crosfleet will create a
            # no-op task on the DUT and new test created will be hanging there.
            # TODO(zhizhouy): Need to check whether machine is ready or not before
            # assigning a test to it.
            if not experiment.no_lock and not experiment.crosfleet:
                self._LockAllMachines(experiment)
            # Calculate all checksums of available/locked machines, to ensure
            # same label has same machines for testing
            experiment.SetCheckSums(forceSameImage=True)
            if self._using_schedv2:
                schedv2 = Schedv2(experiment)
                experiment.set_schedv2(schedv2)
            if CacheConditions.FALSE in experiment.cache_conditions:
                self._ClearCacheEntries(experiment)
            status = ExperimentStatus(experiment)
            experiment.Run()
            last_status_time = 0
            last_status_string = ""
            try:
                if experiment.log_level != "verbose":
                    self.l.LogStartDots()
                while not experiment.IsComplete():
                    if last_status_time + self.STATUS_TIME_DELAY < time.time():
                        last_status_time = time.time()
                        border = "=============================="
                        if experiment.log_level == "verbose":
                            self.l.LogOutput(border)
                            self.l.LogOutput(status.GetProgressString())
                            self.l.LogOutput(status.GetStatusString())
                            self.l.LogOutput(border)
                        else:
                            # Only print the status when it changed;
                            # otherwise just extend the row of dots.
                            current_status_string = status.GetStatusString()
                            if current_status_string != last_status_string:
                                self.l.LogEndDots()
                                self.l.LogOutput(border)
                                self.l.LogOutput(current_status_string)
                                self.l.LogOutput(border)
                                last_status_string = current_status_string
                            else:
                                self.l.LogAppendDot()
                    time.sleep(self.THREAD_MONITOR_DELAY)
            except KeyboardInterrupt:
                self._terminated = True
                self.l.LogError("Ctrl-c pressed. Cleaning up...")
                experiment.Terminate()
                raise
            except SystemExit:
                self._terminated = True
                self.l.LogError("Unexpected exit. Cleaning up...")
                experiment.Terminate()
                raise
        finally:
            experiment.Cleanup()

    def _PrintTable(self, experiment):
        """Log the text results report of the experiment."""
        self.l.LogOutput(
            TextResultsReport.FromExperiment(experiment).GetReport()
        )

    def _Email(self, experiment):
        """Email the text and HTML reports of the experiment.

        Nothing is sent when the "no_email" config is set, or when every
        benchmark run was a cache hit and the experiment names no
        recipients. The current user is always added to the recipients.

        Args:
          experiment: The finished experiment to report on.
        """
        # Only email by default if a new run was completed.
        send_mail = any(
            not benchmark_run.cache_hit
            for benchmark_run in experiment.benchmark_runs
        )
        if (not send_mail and not experiment.email_to) or config.GetConfig(
            "no_email"
        ):
            return

        label_names = [label.name for label in experiment.labels]
        subject = "%s: %s" % (experiment.name, " vs. ".join(label_names))

        text_report = TextResultsReport.FromExperiment(
            experiment, True
        ).GetReport()
        text_report += (
            "\nResults are stored in %s.\n" % experiment.results_directory
        )
        text_report = "<pre style='font-size: 13px'>%s</pre>" % text_report
        html_report = HTMLResultsReport.FromExperiment(experiment).GetReport()
        attachment = EmailSender.Attachment("report.html", html_report)
        # Work on a copy so the experiment's own recipient list is not
        # modified as a side effect of sending the email.
        email_to = list(experiment.email_to or [])
        email_to.append(getpass.getuser())
        EmailSender().SendEmail(
            email_to,
            subject,
            text_report,
            attachments=[attachment],
            msg_type="html",
        )

    @staticmethod
    def _TallyBenchmarkPasses(benchmark_runs):
        """Record which benchmarks passed at least once for each label.

        A run counts as a pass when it has a result whose retval is falsy.
        Every (label, benchmark) pair that appears in benchmark_runs gets an
        entry, so a benchmark that never passes is reported as False rather
        than being left out.

        Args:
          benchmark_runs: Iterable of benchmark runs providing `label.name`,
            `benchmark.name` and `result`.

        Returns:
          A tuple (all_failed, benchmarks_passes): all_failed is True when
          no run passed; benchmarks_passes maps label name to a dict of
          benchmark name -> bool.
        """
        all_failed = True
        benchmarks_passes = {}
        for benchmark_run in benchmark_runs:
            label_passes = benchmarks_passes.setdefault(
                benchmark_run.label.name, {}
            )
            label_passes.setdefault(benchmark_run.benchmark.name, False)
            if benchmark_run.result and not benchmark_run.result.retval:
                all_failed = False
                label_passes[benchmark_run.benchmark.name] = True
        return all_failed, benchmarks_passes

    def _StoreResults(self, experiment):
        """Write the experiment file, logs and reports to the results dir.

        The results directory is removed and recreated first. If the run
        was terminated, nothing is stored at all; if every benchmark run
        failed, only the experiment file and the top statistics are stored.

        Args:
          experiment: The finished experiment.

        Returns:
          ALL_FAILED if the run was terminated or no benchmark run passed,
          SUCCEEDED if every benchmark passed at least once for every label,
          HAS_FAILURE otherwise.
        """
        if self._terminated:
            return self.ALL_FAILED

        results_directory = experiment.results_directory
        FileUtils().RmDir(results_directory)
        FileUtils().MkDirP(results_directory)
        self.l.LogOutput("Storing experiment file in %s." % results_directory)
        experiment_file_path = os.path.join(results_directory, "experiment.exp")
        FileUtils().WriteFile(experiment_file_path, experiment.experiment_file)

        topstats_file = os.path.join(results_directory, "topstats.log")
        self.l.LogOutput(
            "Storing top statistics of each benchmark run into %s."
            % topstats_file
        )
        # Track if any iterations for a given benchmark has passed for each
        # label.
        all_failed, benchmarks_passes = self._TallyBenchmarkPasses(
            experiment.benchmark_runs
        )
        with open(topstats_file, "w") as top_fd:
            for benchmark_run in experiment.benchmark_runs:
                if benchmark_run.result:
                    # Header with benchmark run name.
                    top_fd.write("%s\n" % str(benchmark_run))
                    # Formatted string with top statistics.
                    top_fd.write(benchmark_run.result.FormatStringTopCommands())
                    top_fd.write("\n\n")

        if all_failed:
            return self.ALL_FAILED
        # Set has_passes if at least one iteration of all benchmarks has
        # passed for every label.
        has_passes = all(
            all(benchmarks.values())
            for benchmarks in benchmarks_passes.values()
        )

        self.l.LogOutput("Storing results of each benchmark run.")
        for benchmark_run in experiment.benchmark_runs:
            if benchmark_run.result:
                # Directory name: the run name reduced to alphanumerics.
                benchmark_run_name = "".join(
                    ch for ch in benchmark_run.name if ch.isalnum()
                )
                benchmark_run_path = os.path.join(
                    results_directory, benchmark_run_name
                )
                if experiment.compress_results:
                    benchmark_run.result.CompressResultsTo(benchmark_run_path)
                else:
                    benchmark_run.result.CopyResultsTo(benchmark_run_path)
                # Don't remove benchmark tmp if it was a cache hit.
                benchmark_run.result.CleanUp(
                    benchmark_run.benchmark.rm_chroot_tmp
                    and not benchmark_run.cache_hit
                )

        self.l.LogOutput("Storing results report in %s." % results_directory)
        results_table_path = os.path.join(results_directory, "results.html")
        report = HTMLResultsReport.FromExperiment(experiment).GetReport()
        if self.json_report:
            json_report = JSONResultsReport.FromExperiment(
                experiment, json_args={"indent": 2}
            )
            _WriteJSONReportToFile(experiment, results_directory, json_report)

        FileUtils().WriteFile(results_table_path, report)

        self.l.LogOutput(
            "Storing email message body in %s." % results_directory
        )
        msg_file_path = os.path.join(results_directory, "msg_body.html")
        text_report = TextResultsReport.FromExperiment(
            experiment, True
        ).GetReport()
        text_report += (
            "\nResults are stored in %s.\n" % experiment.results_directory
        )
        msg_body = "<pre style='font-size: 13px'>%s</pre>" % text_report
        FileUtils().WriteFile(msg_file_path, msg_body)

        return self.SUCCEEDED if has_passes else self.HAS_FAILURE

    def Run(self):
        """Run the experiment and report its results.

        The result table is printed and the results are stored even when
        running the experiment raises; in that case the exception
        propagates after the reporting steps. The email is skipped when
        storing the results reports ALL_FAILED.

        Returns:
          The status code returned by _StoreResults().
        """
        try:
            self._Run(self._experiment)
        finally:
            # Always print the report at the end of the run.
            self._PrintTable(self._experiment)
            ret = self._StoreResults(self._experiment)
            if ret != self.ALL_FAILED:
                self._Email(self._experiment)
        return ret
392*760c253cSXin Li
393*760c253cSXin Li
class MockExperimentRunner(ExperimentRunner):
    """Dry-run ExperimentRunner used for testing.

    Each overridden step only logs a message describing what the real
    runner would do at that point.
    """

    def __init__(self, experiment, json_report):
        super(MockExperimentRunner, self).__init__(
            experiment=experiment, json_report=json_report
        )

    def _Run(self, experiment):
        """Log the name of the experiment instead of running it."""
        message = "Would run the following experiment: '%s'." % experiment.name
        self.l.LogOutput(message)

    def _PrintTable(self, experiment):
        """Log that the result table would be printed."""
        message = "Would print the experiment table."
        self.l.LogOutput(message)

    def _Email(self, experiment):
        """Log that the result email would be sent."""
        message = "Would send result email."
        self.l.LogOutput(message)

    def _StoreResults(self, experiment):
        """Log that results would be stored; implicitly returns None."""
        message = "Would store the results."
        self.l.LogOutput(message)
413