# -*- coding: utf-8 -*-
# Copyright 2013 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""The experiment setting module."""


import os
from threading import Lock
import time

import benchmark_run
from cros_utils import logger
from cros_utils import misc
from machine_manager import BadChecksum
from machine_manager import MachineManager
from machine_manager import MockMachineManager
import test_flag


class Experiment(object):
    """Class representing an Experiment to be run.

    An Experiment owns a machine manager and one BenchmarkRun per
    (label, benchmark, iteration) combination. The runs are either driven
    by an externally supplied scheduler (see set_schedv2) or, when no
    scheduler is set, started directly by Run() and polled by IsComplete().
    """

    def __init__(
        self,
        name,
        remote,
        working_directory,
        chromeos_root,
        cache_conditions,
        labels,
        benchmarks,
        experiment_file,
        email_to,
        acquire_timeout,
        log_dir,
        log_level,
        share_cache,
        results_directory,
        compress_results,
        locks_directory,
        cwp_dso,
        ignore_min_max,
        crosfleet,
        dut_config,
        keep_stateful: bool,
        no_lock: bool,
    ):
        """Validate the settings, register machines and build the runs.

        Args:
            name: Experiment name; also used to derive the default results
                directory.
            remote: Machine names to register with the machine manager.
                self.remote ends up holding only the names the machine
                manager reports back.
            working_directory: Base directory of the default results
                directory.
            chromeos_root: Root handed to the machine manager. If falsy, the
                first truthy label.chromeos_root is used instead.
            cache_conditions: Forwarded to every BenchmarkRun.
            labels: Non-empty collection of labels. Each label's name,
                chromeos_root and remote attributes are read here.
            benchmarks: Non-empty collection of benchmarks. Each benchmark's
                name and iterations attributes are read here.
            experiment_file: Stored unchanged.
            email_to: Stored unchanged.
            acquire_timeout: Forwarded to the machine manager.
            log_dir: Directory used for the main and the per-run loggers.
            log_level: Forwarded to the machine manager and to every run.
            share_cache: Forwarded to every BenchmarkRun.
            results_directory: Optional results location. When falsy,
                "<working_directory>/<name>_results" is used; otherwise the
                value is passed through misc.CanonicalizePath.
            compress_results: Stored unchanged.
            locks_directory: Forwarded to the machine manager and stored as
                self.locks_dir (consulted by Cleanup).
            cwp_dso: Stored unchanged.
            ignore_min_max: Stored unchanged.
            crosfleet: Stored unchanged. When truthy, an empty remote does
                not trigger the "No remote hosts specified" error.
            dut_config: Forwarded to every BenchmarkRun.
            keep_stateful: Forwarded to the machine manager.
            no_lock: Stored unchanged.

        Raises:
            RuntimeError: If benchmarks or labels are empty, if no remote is
                given while crosfleet is falsy, if no chromeos_root can be
                determined, or if the machine manager ends up with no
                machines.
        """
        self.name = name
        self.working_directory = working_directory
        self.remote = remote
        # Note: this keeps the value that was passed in. The label-derived
        # fallback below only affects the local handed to the machine manager.
        self.chromeos_root = chromeos_root
        self.cache_conditions = cache_conditions
        self.experiment_file = experiment_file
        self.email_to = email_to
        if not results_directory:
            self.results_directory = os.path.join(
                self.working_directory, self.name + "_results"
            )
        else:
            self.results_directory = misc.CanonicalizePath(results_directory)
        self.compress_results = compress_results
        self.log_dir = log_dir
        self.log_level = log_level
        self.labels = labels
        self.benchmarks = benchmarks
        self.num_complete = 0
        self.num_run_complete = 0
        self.share_cache = share_cache
        self.active_threads = []
        self.locks_dir = locks_directory
        # locked_machines and lock_mgr are only read (and reset) in this
        # class; presumably they are populated by the code that locks the
        # machines -- verify against callers.
        self.locked_machines = []
        self.lock_mgr = None
        self.cwp_dso = cwp_dso
        self.ignore_min_max = ignore_min_max
        self.crosfleet = crosfleet
        self.no_lock = no_lock
        self.l = logger.GetLogger(log_dir)

        if not self.benchmarks:
            raise RuntimeError("No benchmarks specified")
        if not self.labels:
            raise RuntimeError("No labels specified")
        if not remote and not self.crosfleet:
            raise RuntimeError("No remote hosts specified")

        # We need one chromeos_root to run the benchmarks in, but it doesn't
        # matter where it is, unless the ABIs are different.
        if not chromeos_root:
            for label in self.labels:
                if label.chromeos_root:
                    chromeos_root = label.chromeos_root
                    break
        if not chromeos_root:
            raise RuntimeError(
                "No chromeos_root given and could not determine "
                "one from the image path."
            )

        machine_manager_fn = MachineManager
        if test_flag.GetTestMode():
            machine_manager_fn = MockMachineManager
        self.machine_manager = machine_manager_fn(
            chromeos_root,
            acquire_timeout,
            log_level,
            locks_directory,
            keep_stateful=keep_stateful,
        )
        # NOTE(review): this repeats the assignment made before validation;
        # it looks redundant -- confirm against logger.GetLogger before
        # removing it.
        self.l = logger.GetLogger(log_dir)

        for machine in self.remote:
            # machine_manager.AddMachine only adds reachable machines.
            self.machine_manager.AddMachine(machine)
        # Now machine_manager._all_machines contains a list of reachable
        # machines. This is a subset of self.remote. We make both lists the
        # same.
        self.remote = [m.name for m in self.machine_manager.GetAllMachines()]
        if not self.remote:
            raise RuntimeError("No machine available for running experiment.")

        # Initialize checksums for all machines, ignore errors at this time.
        # The checksum will be double checked, and image will be flashed after
        # duts are locked/leased.
        self.SetCheckSums()

        self.start_time = None
        self.benchmark_runs = self._GenerateBenchmarkRuns(dut_config)

        self._schedv2 = None
        self._internal_counter_lock = Lock()

    def set_schedv2(self, schedv2):
        """Install the scheduler that Run/Terminate/IsComplete delegate to."""
        self._schedv2 = schedv2

    def schedv2(self):
        """Return the installed scheduler, or None if none was set."""
        return self._schedv2

    def _GenerateBenchmarkRuns(self, dut_config):
        """Generate benchmark runs from labels and benchmark definitions.

        Args:
            dut_config: Forwarded unchanged to every BenchmarkRun.

        Returns:
            A list with one BenchmarkRun per (label, benchmark, iteration),
            where iteration counts from 1 to benchmark.iterations inclusive.
        """
        benchmark_runs = []
        for label in self.labels:
            for benchmark in self.benchmarks:
                for iteration in range(1, benchmark.iterations + 1):
                    benchmark_run_name = "%s: %s (%s)" % (
                        label.name,
                        benchmark.name,
                        iteration,
                    )
                    full_name = "%s_%s_%s" % (
                        label.name,
                        benchmark.name,
                        iteration,
                    )
                    # Each run gets its own logger, named after the run.
                    logger_to_use = logger.Logger(
                        self.log_dir, "run.%s" % (full_name), True
                    )
                    benchmark_runs.append(
                        benchmark_run.BenchmarkRun(
                            benchmark_run_name,
                            benchmark,
                            label,
                            iteration,
                            self.cache_conditions,
                            self.machine_manager,
                            logger_to_use,
                            self.log_level,
                            self.share_cache,
                            dut_config,
                        )
                    )

        return benchmark_runs

    def SetCheckSums(self, forceSameImage=False):
        """Compute the common checksum (and its string form) for each label.

        Also narrows every label.remote to the machines in self.remote.

        Args:
            forceSameImage: If True and the first checksum computation
                raises BadChecksum, force the same image onto all machines
                of the label and compute the checksum once more. If False,
                BadChecksum is ignored.
        """
        for label in self.labels:
            # We filter out label remotes that are not reachable (not in
            # self.remote). So each label.remote is a sublist of
            # experiment.remote.
            label.remote = [r for r in label.remote if r in self.remote]
            try:
                self.machine_manager.ComputeCommonCheckSum(label)
            except BadChecksum:
                # Force same image on all machines, then we do checksum
                # again. No bailout if checksums still do not match.
                # TODO (zhizhouy): Need to figure out how flashing image will
                # influence the new checksum.
                if forceSameImage:
                    self.machine_manager.ForceSameImageToAllMachines(label)
                    self.machine_manager.ComputeCommonCheckSum(label)

            self.machine_manager.ComputeCommonCheckSumString(label)

    def Build(self):
        """Do nothing; placeholder with no build step."""
        pass

    def Terminate(self):
        """Stop the experiment.

        Delegates to the scheduler when one is set; otherwise terminates
        every benchmark run that is still alive, logging each one.
        """
        if self._schedv2 is not None:
            self._schedv2.terminate()
        else:
            for t in self.benchmark_runs:
                # Thread.isAlive() was removed in Python 3.9; is_alive() is
                # the supported spelling.
                if t.is_alive():
                    self.l.LogError("Terminating run: '%s'." % t.name)
                    t.Terminate()

    def IsComplete(self):
        """Report whether all runs have finished.

        With a scheduler set, its answer is returned. Otherwise the active
        threads are polled: finished ones are counted and dropped from
        self.active_threads.

        Returns:
            The scheduler's is_complete() result if a scheduler is set.
            Otherwise False if any thread was still tracked when this call
            started (even if all of them finished during the call), and
            True once no threads are tracked.
        """
        if self._schedv2:
            return self._schedv2.is_complete()
        if self.active_threads:
            # Iterate over a snapshot: removing from the list being iterated
            # would skip the element following each removed thread.
            for t in list(self.active_threads):
                if t.is_alive():
                    t.join(0)
                if not t.is_alive():
                    self.num_complete += 1
                    if not t.cache_hit:
                        self.num_run_complete += 1
                    self.active_threads.remove(t)
            return False
        return True

    def BenchmarkRunFinished(self, br):
        """Update internal counters after br finishes.

        Note this is only used by schedv2 and is called by multiple threads.
        Never throw any exception here.

        Args:
            br: The finished run; only its cache_hit attribute is read.
        """

        assert self._schedv2 is not None
        # Counters are shared between threads, so update them under the lock.
        with self._internal_counter_lock:
            self.num_complete += 1
            if not br.cache_hit:
                self.num_run_complete += 1

    def Run(self):
        """Record the start time and start the experiment.

        Delegates to the scheduler when one is set; otherwise starts every
        benchmark run as a daemon thread and tracks it in
        self.active_threads.
        """
        self.start_time = time.time()
        if self._schedv2 is not None:
            self._schedv2.run_sched()
        else:
            self.active_threads = []
            for run in self.benchmark_runs:
                # Set threads to daemon so program exits when ctrl-c is
                # pressed.
                run.daemon = True
                run.start()
                self.active_threads.append(run)

    def SetCacheConditions(self, cache_conditions):
        """Forward cache_conditions to every benchmark run."""
        for run in self.benchmark_runs:
            run.SetCacheConditions(cache_conditions)

    def Cleanup(self):
        """Make sure all machines are unlocked.

        Raises:
            RuntimeError: If some machine in self.locked_machines is not
                among the machines the lock manager reports as updated.
        """
        if self.locks_dir:
            # We are using the file locks mechanism, so call
            # machine_manager.Cleanup to unlock everything.
            self.machine_manager.Cleanup()

        if test_flag.GetTestMode() or not self.locked_machines:
            return

        # If we locked any machines earlier, make sure we unlock them now.
        if self.lock_mgr:
            machine_states = self.lock_mgr.GetMachineStates("unlock")
            self.lock_mgr.CheckMachineLocks(machine_states, "unlock")
            unlocked_machines = self.lock_mgr.UpdateMachines(False)
            failed_machines = [
                m for m in self.locked_machines if m not in unlocked_machines
            ]
            if failed_machines:
                raise RuntimeError(
                    "These machines are not unlocked correctly: %s"
                    % failed_machines
                )
            self.lock_mgr = None