autotest/site_utils/balance_pools.py

*9c5db199SXin Li#!/usr/bin/env python3
*9c5db199SXin Li# Copyright 2015 The Chromium OS Authors. All rights reserved.
*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
*9c5db199SXin Li# found in the LICENSE file.
*9c5db199SXin Li
"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pools.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""
*9c5db199SXin Li
*9c5db199SXin Li
*9c5db199SXin Liimport argparse
*9c5db199SXin Liimport os
*9c5db199SXin Liimport re
*9c5db199SXin Liimport sys
*9c5db199SXin Liimport time
*9c5db199SXin Li
*9c5db199SXin Liimport common
*9c5db199SXin Lifrom autotest_lib.server import constants
*9c5db199SXin Lifrom autotest_lib.server import site_utils
*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
*9c5db199SXin Lifrom autotest_lib.server.lib import status_history
*9c5db199SXin Lifrom autotest_lib.site_utils import lab_inventory
*9c5db199SXin Lifrom autotest_lib.utils import labellib
*9c5db199SXin Lifrom autotest_lib.utils.frozen_chromite.lib import metrics
*9c5db199SXin Lifrom autotest_lib.utils.frozen_chromite.lib import parallel
*9c5db199SXin Li
*9c5db199SXin Li#This must be imported after chromite.lib.metrics
*9c5db199SXin Lifrom infra_libs import ts_mon
*9c5db199SXin Li
# Label prefix marking a host label as a pool label; a pool's AFE label
# is this prefix followed by the pool name.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# Fraction of all models used when computing the default maximum number
# of broken models.  3/8 was picked as a middle ground: neither too
# strict nor too lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Pseudo pool name accepted on the command line to request balancing of
# all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'

# Pool that supplies replacement DUTs unless --spare says otherwise.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
*9c5db199SXin Li
*9c5db199SXin Li
# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# Accepted characters are ASCII letters, digits, underscore and hyphen.
# The upper-case range must be spelled `A-Z`: the range `A-z` also spans
# the punctuation between 'Z' and 'a' ('[', '\', ']', '^', '`').  The
# pattern is a raw string so that `\-` is not an invalid string escape.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')
*9c5db199SXin Li
*9c5db199SXin Li
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are supplied, `message` is used as a
    %-style format string and expanded with them before being
    written; with no arguments it is written as-is.

    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied to `message`.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
*9c5db199SXin Li
*9c5db199SXin Li
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the line is emitted as a shell comment (prefixed
    with '# '); otherwise it is emitted unadorned.  The prefix is
    attached before any formatting takes place.

    @param dry_run  Whether the line is being logged for a dry run.
    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied to `message`.

    """
    line = '# ' + message if dry_run else message
    _log_message(line, *args)
*9c5db199SXin Li
*9c5db199SXin Li
def _log_error(message, *args):
    """Write one line to stderr, marked as an error.

    The line is prefixed with 'ERROR: '.  When format arguments are
    supplied, `message` is used as a %-style format string and
    expanded with them first.

    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied to `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
*9c5db199SXin Li
*9c5db199SXin Li
class _DUTPool(object):
    """Snapshot of the DUTs in one pool that match a set of labels.

    On construction, every matching DUT is sorted into exactly one of
    three lists:
      + Working - diagnosed as working, and not locked.
      + Broken - not diagnosed as working, or locked.
      + Ineligible - may not be moved out of this pool, whether it is
            working or broken.

    A DUT is ineligible when it does not carry exactly one pool:
    label.  This exists for the sake of chameleon hosts, which must
    always be assigned to pool:suites; those DUTs are additionally
    marked with pool:chameleon so that they are never reassigned.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both the main pools that need to be
    resupplied with working DUTs and the spare pools supplying them.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT as part
                                  of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Query the AFE and classify the pool's DUTs.

        @param afe         AFE object used to look up host histories.
        @param pool        Name of the pool (without the label prefix).
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start time for the host job histories.
        @param end_time    End time for the host job histories.

        """
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        self.working_hosts, self.broken_hosts = [], []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fill the three host lists from the AFE job histories.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the host job histories.
        @param end_time    End time for the host job histories.

        @return The number of histories (i.e. DUTs) found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            diagnosis = history.last_diagnosis()[0]
            if diagnosis == status_history.WORKING and not dut.locked:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """The AFE labels that identify membership in this pool.

        These are the labels to remove in order to take a DUT out of
        the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Work out (and log) the spares needed to hit a target size.

        The result is the number of working spares required for the
        pool to reach `target_total` with every DUT working.  A
        positive result means that many spares must be brought in to
        replace broken DUTs; a negative result means no spares are
        needed and that many working DUTs can be given back.

        Ineligible DUTs can never be returned, so a target below the
        number of ineligible DUTs is reported as an error and raised
        to that number.  Whether that happened is also recorded in the
        `exhausted_pools` metric.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        reserved = len(self.ineligible_hosts)
        target_reachable = target_total >= reserved
        exhausted_metric = metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ])
        exhausted_metric.set(
                not target_reachable,
                fields={
                        'pool': self.pool,
                        'board': self.labels.get('model', ''),
                        'model': self.labels.get('model', ''),
                },
        )
        if target_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, reserved,
            )
            _log_error('Adjusting target to %d DUTs.', reserved)
            target_total = reserved
        # Every broken DUT needs a replacement; on top of that the pool
        # grows or shrinks by the difference from its current size.
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Pick the DUTs that can be handed back as surplus.

        Broken DUTs are given back in preference to working ones.

        `num_broken` is the number of broken DUTs that must stay in
        the pool.  If it is at least the number of broken DUTs
        actually present, nothing is returned.  If it is negative,
        all broken DUTs are returned together with that many working
        DUTs.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = -num_broken
            return self.broken_hosts + self.working_hosts[:extra_working]
        return self.broken_hosts[num_broken:]
*9c5db199SXin Li
*9c5db199SXin Li
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Move a list of DUTs out of `spare_pool` and into `target_pool`.

    Each host has the pool labels of `spare_pool` removed and the pool
    labels of `target_pool` added.

    If `dry_run` is true, nothing is changed; instead the `atest`
    commands that would make the label changes are logged.

    The transfer is logged and counted in the `duts_moved` metric
    before `hosts` is checked for being empty.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object whose labels are added,
                         i.e. the pool the hosts are moved into.
    @param spare_pool    The `_DUTPool` object whose labels are removed,
                         i.e. the pool the hosts are taken from.

    """
    host_count = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              host_count, spare_pool.pool, target_pool.pool)
    moved_counter = metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    )
    moved_counter.increment_by(
            host_count,
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    labels_to_add = target_pool.pool_labels
    labels_to_remove = spare_pool.pool_labels
    if dry_run:
        # Only describe the work, as shell commands.
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(labels_to_remove))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(labels_to_add))
    else:
        for host in hosts:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(labels_to_remove)
            host.add_labels(labels_to_add)
*9c5db199SXin Li
*9c5db199SXin Li
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance a single model according to the command line options.

    The function inventories the spare pool and the main pool, derives
    the target size from --total/--grow/--shrink (defaulting to the
    current size), decides which spares to bring in and which DUTs to
    give back, reports the plan, and then exchanges pool labels.  If
    the main pool has more broken DUTs than --max-broken allows and
    --force-rebalance was not given, no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)
    dry_run = arguments.dry_run
    broken_count = len(main_pool.broken_hosts)

    # An explicit total wins; otherwise adjust the current size.
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
    else:
        spare_duts = []
    # Positive: broken DUTs that cannot be replaced and must stay.
    # Negative: working DUTs to give back on top of the broken ones.
    shortfall = spares_needed - len(spare_duts)
    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  broken_count, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            size_change = 'no change to pool size'
        elif spares_needed > 0:
            size_change = 'grow pool by %d DUTs' % spares_needed
        else:
            size_change = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, size_change)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        spares_found = len(spare_duts)
        if spares_needed > spares_found:
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, spares_found)
        elif shortfall >= 0:
            returned = len(surplus_duts)
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      returned, broken_count - returned)
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      broken_count - shortfall, -shortfall)

    # Safety valve: many broken DUTs may mean something is bricking
    # devices, so refuse to feed more DUTs in unless forced.
    too_many_broken = broken_count > arguments.max_broken
    if too_many_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, broken_count)
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(dry_run, 'No exchange required.')

    # Surplus DUTs go main -> spare; replacements go spare -> main.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
*9c5db199SXin Li
*9c5db199SXin Li
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, the arguments are checked for consistency.  Any of
    the following is reported through `parser.error()`, which prints a
    usage message and exits:
      + individual models given together with --all-models;
      + a --spare pool other than the default given while balancing
        all critical pools;
      + a pool name (target or spare) that does not match
        `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )

    # At most one of the pool size options may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                        'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # The check rejects any spare pool that is NOT the default, so
        # the message must say so rather than forbid the default itself.
        parser.error('Cannot specify --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for pool_name in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(pool_name):
            parser.error('Invalid pool name: %s' % pool_name)
    return arguments
*9c5db199SXin Li
*9c5db199SXin Li
def infer_balancer_targets(afe, arguments, pools):
    """Translate the parsed arguments into a list of balancing targets.

    When `arguments.all_models` is set, every model the lab inventory
    reports for a pool becomes a target for that pool.  Otherwise the
    targets are the models named in `arguments.models`.

    Each target's labels always carry `model`, and carry `phase` when
    `arguments.phase` is set.  The `sku` label is added only for models
    named explicitly; it is not applied with `arguments.all_models`.

    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced, where
                `labels` is the value returned by
                `labellib.LabelsMapping.getlabels()`.

    """
    balancer_targets = []
    # The inventory lookup takes no pool argument, so one result can
    # serve every pool.  Fetch it lazily so that nothing is requested
    # unless it is actually needed.
    # NOTE(review): assumes the inventory object can be reused across
    # pools -- confirm against lab_inventory.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            models = inventory.get_pool_models(pool)
            sku = None
        else:
            models = arguments.models
            sku = arguments.sku

        for model in models:
            labels = labellib.LabelsMapping()
            labels['model'] = model
            if sku:
                labels['sku'] = sku
            if arguments.phase:
                labels['phase'] = arguments.phase
            balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets
*9c5db199SXin Li
*9c5db199SXin Li
def main(argv):
    """Command entry point: parse the arguments and balance the pools.

    Metrics are reported through `site_utils.SetupTsMonGlobalState`
    when `arguments.production` is set; otherwise a
    `site_utils.TrivialContextManager` stands in for it.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # The time window handed to the balancer is the 24 hours
        # ending now.
        window_end = time.time()
        window_start = window_end - 24 * 60 * 60
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balance_one(pool, labels):
            """Balance one (pool, labels) target.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           window_start, window_end)
            # Blank message after each target's output.
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        targets = infer_balancer_targets(afe, arguments, pools)

        try:
            # Targets are processed by a pool of 8 processes.
            parallel.RunTasksInProcessPool(balance_one, targets,
                                           processes=8)
        except KeyboardInterrupt:
            # Swallowed so that an interrupt does not propagate out of
            # main().
            pass
*9c5db199SXin Li
*9c5db199SXin Li
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)