#!/usr/bin/env python3
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""


import argparse
import os
import re
import sys
import time

import common
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.utils import labellib
from autotest_lib.utils.frozen_chromite.lib import metrics
from autotest_lib.utils.frozen_chromite.lib import parallel

#This must be imported after chromite.lib.metrics
from infra_libs import ts_mon

_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

_ALL_CRITICAL_POOLS = 'all_critical_pools'
_SPARE_DEFAULT = lab_inventory.SPARE_POOL


# _VALID_POOL_PATTERN - Regular expression matching pool names that will
#     be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The character class is spelled `A-Z` (not `A-z`, which would also
# admit the punctuation characters between 'Z' and 'a'), and the
# pattern is a raw string so that `\-` is not treated as a string
# escape sequence.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the message is logged for a dry run; if
                    true, the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and matching
    the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this pool.  The
            DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT as part
                                  of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Query host histories and sort the hosts into categories.

        Fills in `working_hosts`, `broken_hosts` and `ineligible_hosts`
        from the job histories of all hosts matching `self.labels`.

        @param afe          AFE object used to query the histories.
        @param start_time   Start time for the HostJobHistory objects.
        @param end_time     End time for the HostJobHistory objects.

        @return The total number of hosts found.

        """
        all_histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for h in all_histories:
            host = h.host
            host_pools = [label for label in host.labels
                          if label.startswith(_POOL_PREFIX)]
            # A host that doesn't have exactly one pool: label is
            # never exchanged.
            if len(host_pools) != 1:
                self.ineligible_hosts.append(host)
            else:
                diag = h.last_diagnosis()[0]
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        # The target is only achievable if it leaves room for every
        # ineligible DUT, since those can never leave the pool.
        target_is_feasible = target_total >= num_ineligible
        metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ]).set(
                    not target_is_feasible,
                    fields={
                            'pool': self.pool,
                            'board': self.labels.get('model', ''),
                            'model': self.labels.get('model', ''),
                    },
        )
        if not target_is_feasible:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool, target_total)
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list DUTs that can returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            surplus = self.broken_hosts[num_broken:]
            return surplus
        else:
            return (self.broken_hosts +
                    self.working_hosts[:-num_broken])


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))


def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Broken DUTs that must stay because spares ran out.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: count of working DUTs to give back.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        # Refuse to move anything for this model.
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # The calls are made even for empty lists; _exchange_labels() still
    # logs and records its metric before returning early.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # Balancing all critical pools is only allowed with the default
    # spare pool.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `--all-models`, every model the inventory reports for a pool
    is balanced, restricted only by `--phase`.  Otherwise the models
    named on the command line are balanced, restricted by `--sku` and
    `--phase` when those are given.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Fetched at most once and shared by all pools, rather than being
    # re-fetched on every loop iteration.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            for model in inventory.get_pool_models(pool):
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets


def main(argv):
    """Standard main routine.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if arguments.production:
        metrics_manager = site_utils.SetupTsMonGlobalState('balance_pools',
                                                           indirect=True)
    else:
        metrics_manager = site_utils.TrivialContextManager()

    with metrics_manager:
        with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
            # Host histories are examined over the last 24 hours.
            end_time = time.time()
            start_time = end_time - 24 * 60 * 60
            afe = frontend_wrappers.RetryingAFE(server=arguments.web)

            def balancer(pool, labels):
                """Balance the specified model.

                @param pool: The pool to rebalance for the model.
                @param labels: labels to restrict to balancing operations
                        within.
                """
                _balance_model(arguments, afe, pool, labels,
                               start_time, end_time)
                _log_message('')

            pools = (lab_inventory.CRITICAL_POOLS
                     if arguments.pool == _ALL_CRITICAL_POOLS
                     else [arguments.pool])
            balancer_targets = infer_balancer_targets(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(
                        balancer,
                        balancer_targets,
                        processes=8,
                )
            except KeyboardInterrupt:
                # NOTE(review): presumably deliberate, so that Ctrl-C
                # ends the run without a traceback -- confirm.
                pass


if __name__ == '__main__':
    main(sys.argv)