xref: /aosp_15_r20/external/toolchain-utils/crosperf/experiment.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1*760c253cSXin Li# -*- coding: utf-8 -*-
2*760c253cSXin Li# Copyright 2013 The ChromiumOS Authors
3*760c253cSXin Li# Use of this source code is governed by a BSD-style license that can be
4*760c253cSXin Li# found in the LICENSE file.
5*760c253cSXin Li
6*760c253cSXin Li"""The experiment setting module."""
7*760c253cSXin Li
8*760c253cSXin Li
9*760c253cSXin Liimport os
10*760c253cSXin Lifrom threading import Lock
11*760c253cSXin Liimport time
12*760c253cSXin Li
13*760c253cSXin Liimport benchmark_run
14*760c253cSXin Lifrom cros_utils import logger
15*760c253cSXin Lifrom cros_utils import misc
16*760c253cSXin Lifrom machine_manager import BadChecksum
17*760c253cSXin Lifrom machine_manager import MachineManager
18*760c253cSXin Lifrom machine_manager import MockMachineManager
19*760c253cSXin Liimport test_flag
20*760c253cSXin Li
21*760c253cSXin Li
class Experiment(object):
    """Class representing an Experiment to be run.

    An experiment is the cross product of ``labels`` x ``benchmarks`` x
    iterations; one ``benchmark_run.BenchmarkRun`` is created for every
    combination.  Runs are executed either by a scheduler installed through
    ``set_schedv2`` or, when no scheduler is installed, by starting every
    run directly as a daemon thread.
    """

    def __init__(
        self,
        name,
        remote,
        working_directory,
        chromeos_root,
        cache_conditions,
        labels,
        benchmarks,
        experiment_file,
        email_to,
        acquire_timeout,
        log_dir,
        log_level,
        share_cache,
        results_directory,
        compress_results,
        locks_directory,
        cwp_dso,
        ignore_min_max,
        crosfleet,
        dut_config,
        keep_stateful: bool,
        no_lock: bool,
    ):
        """Validate the settings and build the list of benchmark runs.

        Most arguments are stored on the instance unchanged.  The ones with
        extra handling here are:

        Args:
            remote: Iterable of machine names.  After construction
                ``self.remote`` only holds the names the machine manager
                reports via ``GetAllMachines()``.
            chromeos_root: Root handed to the machine manager.  When falsy,
                the first truthy ``label.chromeos_root`` is used instead.
            results_directory: When falsy, defaults to
                ``<working_directory>/<name>_results``; otherwise it is
                passed through ``misc.CanonicalizePath``.
            acquire_timeout, log_level, locks_directory, keep_stateful:
                Forwarded to the machine manager constructor.
            dut_config: Forwarded to every ``BenchmarkRun``.

        Raises:
            RuntimeError: If no benchmarks, labels or remotes (without
                crosfleet) are given, if no chromeos_root can be determined,
                or if the machine manager ends up with no machines.
        """
        self.name = name
        self.working_directory = working_directory
        self.remote = remote
        self.chromeos_root = chromeos_root
        self.cache_conditions = cache_conditions
        self.experiment_file = experiment_file
        self.email_to = email_to
        if not results_directory:
            self.results_directory = os.path.join(
                self.working_directory, self.name + "_results"
            )
        else:
            self.results_directory = misc.CanonicalizePath(results_directory)
        self.compress_results = compress_results
        self.log_dir = log_dir
        self.log_level = log_level
        self.labels = labels
        self.benchmarks = benchmarks
        # num_complete counts every finished run; num_run_complete only
        # counts the finished runs that were not served from the cache.
        self.num_complete = 0
        self.num_run_complete = 0
        self.share_cache = share_cache
        self.active_threads = []
        self.locks_dir = locks_directory
        self.locked_machines = []
        self.lock_mgr = None
        self.cwp_dso = cwp_dso
        self.ignore_min_max = ignore_min_max
        self.crosfleet = crosfleet
        self.no_lock = no_lock
        self.l = logger.GetLogger(log_dir)

        if not self.benchmarks:
            raise RuntimeError("No benchmarks specified")
        if not self.labels:
            raise RuntimeError("No labels specified")
        if not remote and not self.crosfleet:
            raise RuntimeError("No remote hosts specified")

        # We need one chromeos_root to run the benchmarks in, but it doesn't
        # matter where it is, unless the ABIs are different.
        if not chromeos_root:
            for label in self.labels:
                if label.chromeos_root:
                    chromeos_root = label.chromeos_root
                    break
        if not chromeos_root:
            raise RuntimeError(
                "No chromeos_root given and could not determine "
                "one from the image path."
            )

        machine_manager_fn = MachineManager
        if test_flag.GetTestMode():
            machine_manager_fn = MockMachineManager
        self.machine_manager = machine_manager_fn(
            chromeos_root,
            acquire_timeout,
            log_level,
            locks_directory,
            keep_stateful=keep_stateful,
        )

        for machine in self.remote:
            # machine_manager.AddMachine only adds reachable machines.
            self.machine_manager.AddMachine(machine)
        # Now machine_manager._all_machines contains a list of reachable
        # machines. This is a subset of self.remote. We make both lists the
        # same.
        self.remote = [m.name for m in self.machine_manager.GetAllMachines()]
        if not self.remote:
            raise RuntimeError("No machine available for running experiment.")

        # Initialize checksums for all machines, ignore errors at this time.
        # The checksum will be double checked, and image will be flashed after
        # duts are locked/leased.
        self.SetCheckSums()

        self.start_time = None
        self.benchmark_runs = self._GenerateBenchmarkRuns(dut_config)

        self._schedv2 = None
        # Guards num_complete / num_run_complete in BenchmarkRunFinished.
        self._internal_counter_lock = Lock()

    def set_schedv2(self, schedv2):
        """Install the scheduler object used by Run/Terminate/IsComplete."""
        self._schedv2 = schedv2

    def schedv2(self):
        """Return the installed scheduler, or None if there is none."""
        return self._schedv2

    def _GenerateBenchmarkRuns(self, dut_config):
        """Generate benchmark runs from labels and benchmark definitions.

        Args:
            dut_config: Passed through unchanged to every BenchmarkRun.

        Returns:
            A list with one BenchmarkRun per (label, benchmark, iteration),
            iterations being numbered from 1.
        """
        benchmark_runs = []
        for label in self.labels:
            for benchmark in self.benchmarks:
                for iteration in range(1, benchmark.iterations + 1):
                    benchmark_run_name = "%s: %s (%s)" % (
                        label.name,
                        benchmark.name,
                        iteration,
                    )
                    full_name = "%s_%s_%s" % (
                        label.name,
                        benchmark.name,
                        iteration,
                    )
                    # Each run gets its own logger named after the run.
                    logger_to_use = logger.Logger(
                        self.log_dir, "run.%s" % (full_name), True
                    )
                    benchmark_runs.append(
                        benchmark_run.BenchmarkRun(
                            benchmark_run_name,
                            benchmark,
                            label,
                            iteration,
                            self.cache_conditions,
                            self.machine_manager,
                            logger_to_use,
                            self.log_level,
                            self.share_cache,
                            dut_config,
                        )
                    )

        return benchmark_runs

    def SetCheckSums(self, forceSameImage=False):
        """Compute the common checksum (and checksum string) of every label.

        Args:
            forceSameImage: When True and the checksums of a label's
                machines disagree (BadChecksum), force the same image onto
                all machines and compute the checksum once more.  When
                False the BadChecksum is ignored.
        """
        for label in self.labels:
            # We filter out label remotes that are not reachable (not in
            # self.remote). So each label.remote is a sublist of
            # experiment.remote.
            label.remote = [r for r in label.remote if r in self.remote]
            try:
                self.machine_manager.ComputeCommonCheckSum(label)
            except BadChecksum:
                # Force same image on all machines, then we do checksum again.
                # No bailout if checksums still do not match.
                # TODO (zhizhouy): Need to figure out how flashing image will
                # influence the new checksum.
                if forceSameImage:
                    self.machine_manager.ForceSameImageToAllMachines(label)
                    self.machine_manager.ComputeCommonCheckSum(label)

            self.machine_manager.ComputeCommonCheckSumString(label)

    def Build(self):
        """Do nothing; kept as a placeholder step."""
        pass

    def Terminate(self):
        """Stop the experiment.

        Delegates to the scheduler when one is installed; otherwise
        terminates every benchmark run that is still alive.
        """
        if self._schedv2 is not None:
            self._schedv2.terminate()
        else:
            for t in self.benchmark_runs:
                # Thread.isAlive() was removed in Python 3.9; is_alive() is
                # the supported spelling.
                if t.is_alive():
                    self.l.LogError("Terminating run: '%s'." % t.name)
                    t.Terminate()

    def IsComplete(self):
        """Report whether the experiment has finished.

        With a scheduler installed, its ``is_complete()`` answer is
        returned.  Otherwise finished threads are reaped from
        ``active_threads`` and the completion counters are updated.

        Returns:
            The scheduler's answer, or without a scheduler: True only when
            ``active_threads`` was already empty on entry.  A call that
            reaps the last thread still returns False; the following call
            returns True.
        """
        if self._schedv2:
            return self._schedv2.is_complete()
        if not self.active_threads:
            return True

        # Build the list of survivors instead of removing entries from the
        # list being iterated, which would skip the element following each
        # removed one.
        still_running = []
        for t in self.active_threads:
            if t.is_alive():
                t.join(0)
            if t.is_alive():
                still_running.append(t)
                continue
            self.num_complete += 1
            if not t.cache_hit:
                self.num_run_complete += 1
        # Update in place so existing references to the list stay valid.
        self.active_threads[:] = still_running
        return False

    def BenchmarkRunFinished(self, br):
        """Update internal counters after br finishes.

        Note this is only used by schedv2 and is called by multiple threads.
        Never throw any exception here.
        """

        assert self._schedv2 is not None
        with self._internal_counter_lock:
            self.num_complete += 1
            if not br.cache_hit:
                self.num_run_complete += 1

    def Run(self):
        """Record the start time and start all benchmark runs.

        Uses the scheduler when one is installed; otherwise every run is
        started directly and tracked in ``active_threads``.
        """
        self.start_time = time.time()
        if self._schedv2 is not None:
            self._schedv2.run_sched()
        else:
            self.active_threads = []
            for run in self.benchmark_runs:
                # Set threads to daemon so program exits when ctrl-c is
                # pressed.
                run.daemon = True
                run.start()
                self.active_threads.append(run)

    def SetCacheConditions(self, cache_conditions):
        """Forward cache_conditions to every benchmark run."""
        for run in self.benchmark_runs:
            run.SetCacheConditions(cache_conditions)

    def Cleanup(self):
        """Make sure all machines are unlocked.

        Raises:
            RuntimeError: If some machine in ``locked_machines`` is not
                reported as unlocked by the lock manager.
        """
        if self.locks_dir:
            # We are using the file locks mechanism, so call
            # machine_manager.Cleanup to unlock everything.
            self.machine_manager.Cleanup()

        if test_flag.GetTestMode() or not self.locked_machines:
            return

        # If we locked any machines earlier, make sure we unlock them now.
        if self.lock_mgr:
            machine_states = self.lock_mgr.GetMachineStates("unlock")
            self.lock_mgr.CheckMachineLocks(machine_states, "unlock")
            unlocked_machines = self.lock_mgr.UpdateMachines(False)
            failed_machines = [
                m for m in self.locked_machines if m not in unlocked_machines
            ]
            if failed_machines:
                raise RuntimeError(
                    "These machines are not unlocked correctly: %s"
                    % failed_machines
                )
            self.lock_mgr = None
272*760c253cSXin Li            self.lock_mgr = None
273