xref: /aosp_15_r20/external/autotest/site_utils/balance_pools.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li#!/usr/bin/env python3
2*9c5db199SXin Li# Copyright 2015 The Chromium OS Authors. All rights reserved.
3*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
4*9c5db199SXin Li# found in the LICENSE file.
5*9c5db199SXin Li
6*9c5db199SXin Li"""Adjust pool balances to cover DUT shortfalls.
7*9c5db199SXin Li
8*9c5db199SXin LiThis command takes all broken DUTs in a specific pool for specific
9*9c5db199SXin Limodels and swaps them with working DUTs taken from a selected pool
10*9c5db199SXin Liof spares.  The command is meant primarily for replacing broken DUTs
11*9c5db199SXin Liin critical pools like BVT or CQ, but it can also be used to adjust
12*9c5db199SXin Lipool sizes, or to create or remove pools.
13*9c5db199SXin Li
usage:  balance_pools.py [ options ] POOL MODEL [ MODEL ... ]
15*9c5db199SXin Li
16*9c5db199SXin Lipositional arguments:
17*9c5db199SXin Li  POOL                  Name of the pool to balance
18*9c5db199SXin Li  MODEL                 Names of models to balance
19*9c5db199SXin Li
20*9c5db199SXin Lioptional arguments:
21*9c5db199SXin Li  -h, --help            show this help message and exit
22*9c5db199SXin Li  -t COUNT, --total COUNT
23*9c5db199SXin Li                        Set the number of DUTs in the pool to the specified
24*9c5db199SXin Li                        count for every MODEL
25*9c5db199SXin Li  -a COUNT, --grow COUNT
26*9c5db199SXin Li                        Add the specified number of DUTs to the pool for every
27*9c5db199SXin Li                        MODEL
28*9c5db199SXin Li  -d COUNT, --shrink COUNT
29*9c5db199SXin Li                        Remove the specified number of DUTs from the pool for
30*9c5db199SXin Li                        every MODEL
31*9c5db199SXin Li  -s POOL, --spare POOL
32*9c5db199SXin Li                        Pool from which to draw replacement spares (default:
33*9c5db199SXin Li                        pool:suites)
34*9c5db199SXin Li  -p PHASE, --phase PHASE
35*9c5db199SXin Li                        Phase to restrict the balance pool operation to
36*9c5db199SXin Li  --sku SKU             The specific SKU we intend to swap with
37*9c5db199SXin Li  -n, --dry-run         Report actions to take in the form of shell commands
38*9c5db199SXin Li
39*9c5db199SXin Li
40*9c5db199SXin LiThe command attempts to remove all broken DUTs from the target POOL
41*9c5db199SXin Lifor every MODEL, and replace them with enough working DUTs taken
42*9c5db199SXin Lifrom the spare pool to bring the strength of POOL to the requested
43*9c5db199SXin Litotal COUNT.
44*9c5db199SXin Li
45*9c5db199SXin LiIf no COUNT options are supplied (i.e. there are no --total, --grow,
46*9c5db199SXin Lior --shrink options), the command will maintain the current totals of
47*9c5db199SXin LiDUTs for every MODEL in the target POOL.
48*9c5db199SXin Li
49*9c5db199SXin LiIf not enough working spares are available, broken DUTs may be left
50*9c5db199SXin Liin the pool to keep the pool at the target COUNT.
51*9c5db199SXin Li
52*9c5db199SXin LiWhen reducing pool size, working DUTs will be returned after broken
53*9c5db199SXin LiDUTs, if it's necessary to achieve the target COUNT.
54*9c5db199SXin Li
55*9c5db199SXin Li"""
56*9c5db199SXin Li
57*9c5db199SXin Li
58*9c5db199SXin Liimport argparse
59*9c5db199SXin Liimport os
60*9c5db199SXin Liimport re
61*9c5db199SXin Liimport sys
62*9c5db199SXin Liimport time
63*9c5db199SXin Li
64*9c5db199SXin Liimport common
65*9c5db199SXin Lifrom autotest_lib.server import constants
66*9c5db199SXin Lifrom autotest_lib.server import site_utils
67*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
68*9c5db199SXin Lifrom autotest_lib.server.lib import status_history
69*9c5db199SXin Lifrom autotest_lib.site_utils import lab_inventory
70*9c5db199SXin Lifrom autotest_lib.utils import labellib
71*9c5db199SXin Lifrom autotest_lib.utils.frozen_chromite.lib import metrics
72*9c5db199SXin Lifrom autotest_lib.utils.frozen_chromite.lib import parallel
73*9c5db199SXin Li
74*9c5db199SXin Li#This must be imported after chromite.lib.metrics
75*9c5db199SXin Lifrom infra_libs import ts_mon
76*9c5db199SXin Li
# Prefix that identifies an AFE label as a pool label.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor too lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Pseudo pool name accepted on the command line to request balancing
# of all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Pool from which replacement spares are drawn unless --spare is given.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
85*9c5db199SXin Li
86*9c5db199SXin Li
87*9c5db199SXin Li# _VALID_POOL_PATTERN - Regular expression matching pool names that will
88*9c5db199SXin Li# be accepted on the command line.
89*9c5db199SXin Li#
90*9c5db199SXin Li# Note: This pattern was selected merely to recognize all existing pool
91*9c5db199SXin Li# names; there's no underlying technical restriction motivating this
92*9c5db199SXin Li# pattern.  No reasonable request to add more special characters to the
93*9c5db199SXin Li# allowed set should be refused.
94*9c5db199SXin Li
95*9c5db199SXin Li_VALID_POOL_PATTERN = re.compile('^[a-zA-z0-9_\-]+$')
96*9c5db199SXin Li
97*9c5db199SXin Li
98*9c5db199SXin Lidef _log_message(message, *args):
99*9c5db199SXin Li    """Log a message with optional format arguments to stdout.
100*9c5db199SXin Li
101*9c5db199SXin Li    This function logs a single line to stdout, with formatting
102*9c5db199SXin Li    if necessary, and without adornments.
103*9c5db199SXin Li
104*9c5db199SXin Li    If `*args` are supplied, the message will be formatted using
105*9c5db199SXin Li    the arguments.
106*9c5db199SXin Li
107*9c5db199SXin Li    @param message  Message to be logged, possibly after formatting.
108*9c5db199SXin Li    @param args     Format arguments.  If empty, the message is logged
109*9c5db199SXin Li                    without formatting.
110*9c5db199SXin Li
111*9c5db199SXin Li    """
112*9c5db199SXin Li    if args:
113*9c5db199SXin Li        message = message % args
114*9c5db199SXin Li    sys.stdout.write('%s\n' % message)
115*9c5db199SXin Li
116*9c5db199SXin Li
def _log_info(dry_run, message, *args):
    """Log one informational line, adapted for dry runs.

    The line goes to stdout via `_log_message()`.  For a dry run
    the text is prefixed with '# ' so that it reads as a shell
    comment alongside the shell commands a dry run prints; for a
    real run it is written without adornment.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the message is being logged for a dry
                    run.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    line = '# ' + message if dry_run else message
    _log_message(line, *args)
135*9c5db199SXin Li
136*9c5db199SXin Li
137*9c5db199SXin Lidef _log_error(message, *args):
138*9c5db199SXin Li    """Log an error to stderr, with optional format arguments.
139*9c5db199SXin Li
140*9c5db199SXin Li    This function logs a single line to stderr, prefixed to indicate
141*9c5db199SXin Li    that it is an error message.
142*9c5db199SXin Li
143*9c5db199SXin Li    If `*args` are supplied, the message will be formatted using
144*9c5db199SXin Li    the arguments.
145*9c5db199SXin Li
146*9c5db199SXin Li    @param message  Message to be logged, possibly after formatting.
147*9c5db199SXin Li    @param args     Format arguments.  If empty, the message is logged
148*9c5db199SXin Li                    without formatting.
149*9c5db199SXin Li
150*9c5db199SXin Li    """
151*9c5db199SXin Li    if args:
152*9c5db199SXin Li        message = message % args
153*9c5db199SXin Li    sys.stderr.write('ERROR: %s\n' % message)
154*9c5db199SXin Li
155*9c5db199SXin Li
156*9c5db199SXin Liclass _DUTPool(object):
157*9c5db199SXin Li    """Information about a pool of DUTs matching given labels.
158*9c5db199SXin Li
159*9c5db199SXin Li    This class collects information about all DUTs for a given pool and matching
160*9c5db199SXin Li    the given labels, and divides them into three categories:
161*9c5db199SXin Li      + Working - the DUT is working for testing, and not locked.
162*9c5db199SXin Li      + Broken - the DUT is unable to run tests, or it is locked.
163*9c5db199SXin Li      + Ineligible - the DUT is not available to be removed from this pool.  The
164*9c5db199SXin Li            DUT may be either working or broken.
165*9c5db199SXin Li
166*9c5db199SXin Li    DUTs with more than one pool: label are ineligible for exchange
167*9c5db199SXin Li    during balancing.  This is done for the sake of chameleon hosts,
168*9c5db199SXin Li    which must always be assigned to pool:suites.  These DUTs are
169*9c5db199SXin Li    always marked with pool:chameleon to prevent their reassignment.
170*9c5db199SXin Li
171*9c5db199SXin Li    TODO(jrbarnette):  The use of `pool:chamelon` (instead of just
172*9c5db199SXin Li    the `chameleon` label is a hack that should be eliminated.
173*9c5db199SXin Li
174*9c5db199SXin Li    _DUTPool instances are used to track both main pools that need
175*9c5db199SXin Li    to be resupplied with working DUTs and spare pools that supply
176*9c5db199SXin Li    those DUTs.
177*9c5db199SXin Li
178*9c5db199SXin Li    @property pool                Name of the pool associated with
179*9c5db199SXin Li                                  this pool of DUTs.
180*9c5db199SXin Li    @property labels              Labels that constrain the DUTs to consider.
181*9c5db199SXin Li    @property working_hosts       The list of this pool's working DUTs.
182*9c5db199SXin Li    @property broken_hosts        The list of this pool's broken DUTs.
183*9c5db199SXin Li    @property ineligible_hosts    The list of this pool's ineligible DUTs.
184*9c5db199SXin Li    @property pool_labels         A list of labels that identify a DUT as part
185*9c5db199SXin Li                                  of this pool.
186*9c5db199SXin Li    @property total_hosts         The total number of hosts in pool.
187*9c5db199SXin Li
188*9c5db199SXin Li    """
189*9c5db199SXin Li
190*9c5db199SXin Li    def __init__(self, afe, pool, labels, start_time, end_time):
191*9c5db199SXin Li        self.pool = pool
192*9c5db199SXin Li        self.labels = labellib.LabelsMapping(labels)
193*9c5db199SXin Li        self.labels['pool'] = pool
194*9c5db199SXin Li        self._pool_labels = [_POOL_PREFIX + self.pool]
195*9c5db199SXin Li
196*9c5db199SXin Li        self.working_hosts = []
197*9c5db199SXin Li        self.broken_hosts = []
198*9c5db199SXin Li        self.ineligible_hosts = []
199*9c5db199SXin Li        self.total_hosts = self._get_hosts(afe, start_time, end_time)
200*9c5db199SXin Li
201*9c5db199SXin Li
202*9c5db199SXin Li    def _get_hosts(self, afe, start_time, end_time):
203*9c5db199SXin Li        all_histories = status_history.HostJobHistory.get_multiple_histories(
204*9c5db199SXin Li                afe, start_time, end_time, self.labels.getlabels())
205*9c5db199SXin Li        for h in all_histories:
206*9c5db199SXin Li            host = h.host
207*9c5db199SXin Li            host_pools = [l for l in host.labels
208*9c5db199SXin Li                          if l.startswith(_POOL_PREFIX)]
209*9c5db199SXin Li            if len(host_pools) != 1:
210*9c5db199SXin Li                self.ineligible_hosts.append(host)
211*9c5db199SXin Li            else:
212*9c5db199SXin Li                diag = h.last_diagnosis()[0]
213*9c5db199SXin Li                if (diag == status_history.WORKING and
214*9c5db199SXin Li                        not host.locked):
215*9c5db199SXin Li                    self.working_hosts.append(host)
216*9c5db199SXin Li                else:
217*9c5db199SXin Li                    self.broken_hosts.append(host)
218*9c5db199SXin Li        return len(all_histories)
219*9c5db199SXin Li
220*9c5db199SXin Li
221*9c5db199SXin Li    @property
222*9c5db199SXin Li    def pool_labels(self):
223*9c5db199SXin Li        """Return the AFE labels that identify this pool.
224*9c5db199SXin Li
225*9c5db199SXin Li        The returned labels are the labels that must be removed
226*9c5db199SXin Li        to remove a DUT from the pool, or added to add a DUT.
227*9c5db199SXin Li
228*9c5db199SXin Li        @return A list of AFE labels suitable for AFE.add_labels()
229*9c5db199SXin Li                or AFE.remove_labels().
230*9c5db199SXin Li
231*9c5db199SXin Li        """
232*9c5db199SXin Li        return self._pool_labels
233*9c5db199SXin Li
234*9c5db199SXin Li    def calculate_spares_needed(self, target_total):
235*9c5db199SXin Li        """Calculate and log the spares needed to achieve a target.
236*9c5db199SXin Li
237*9c5db199SXin Li        Return how many working spares are needed to achieve the
238*9c5db199SXin Li        given `target_total` with all DUTs working.
239*9c5db199SXin Li
240*9c5db199SXin Li        The spares count may be positive or negative.  Positive
241*9c5db199SXin Li        values indicate spares are needed to replace broken DUTs in
242*9c5db199SXin Li        order to reach the target; negative numbers indicate that
243*9c5db199SXin Li        no spares are needed, and that a corresponding number of
244*9c5db199SXin Li        working devices can be returned.
245*9c5db199SXin Li
246*9c5db199SXin Li        If the new target total would require returning ineligible
247*9c5db199SXin Li        DUTs, an error is logged, and the target total is adjusted
248*9c5db199SXin Li        so that those DUTs are not exchanged.
249*9c5db199SXin Li
250*9c5db199SXin Li        @param target_total  The new target pool size.
251*9c5db199SXin Li
252*9c5db199SXin Li        @return The number of spares needed.
253*9c5db199SXin Li
254*9c5db199SXin Li        """
255*9c5db199SXin Li        num_ineligible = len(self.ineligible_hosts)
256*9c5db199SXin Li        spares_needed = target_total >= num_ineligible
257*9c5db199SXin Li        metrics.Boolean(
258*9c5db199SXin Li            'chromeos/autotest/balance_pools/exhausted_pools',
259*9c5db199SXin Li            'True for each pool/model which requests more DUTs than supplied',
260*9c5db199SXin Li            # TODO(jrbarnette) The 'board' field is a legacy.  We need
261*9c5db199SXin Li            # to leave it here until we do the extra work Monarch
262*9c5db199SXin Li            # requires to delete a field.
263*9c5db199SXin Li            field_spec=[
264*9c5db199SXin Li                    ts_mon.StringField('pool'),
265*9c5db199SXin Li                    ts_mon.StringField('board'),
266*9c5db199SXin Li                    ts_mon.StringField('model'),
267*9c5db199SXin Li            ]).set(
268*9c5db199SXin Li                    not spares_needed,
269*9c5db199SXin Li                    fields={
270*9c5db199SXin Li                            'pool': self.pool,
271*9c5db199SXin Li                            'board': self.labels.get('model', ''),
272*9c5db199SXin Li                            'model': self.labels.get('model', ''),
273*9c5db199SXin Li                    },
274*9c5db199SXin Li        )
275*9c5db199SXin Li        if not spares_needed:
276*9c5db199SXin Li            _log_error(
277*9c5db199SXin Li                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
278*9c5db199SXin Li                    self.pool, self.labels, target_total, num_ineligible,
279*9c5db199SXin Li            )
280*9c5db199SXin Li            _log_error('Adjusting target to %d DUTs.', num_ineligible)
281*9c5db199SXin Li            target_total = num_ineligible
282*9c5db199SXin Li        else:
283*9c5db199SXin Li            _log_message('%s %s pool: Target of %d is above minimum.',
284*9c5db199SXin Li                         self.labels.get('model', ''), self.pool, target_total)
285*9c5db199SXin Li        adjustment = target_total - self.total_hosts
286*9c5db199SXin Li        return len(self.broken_hosts) + adjustment
287*9c5db199SXin Li
288*9c5db199SXin Li    def allocate_surplus(self, num_broken):
289*9c5db199SXin Li        """Allocate a list DUTs that can returned as surplus.
290*9c5db199SXin Li
291*9c5db199SXin Li        Return a list of devices that can be returned in order to
292*9c5db199SXin Li        reduce this pool's supply.  Broken DUTs will be preferred
293*9c5db199SXin Li        over working ones.
294*9c5db199SXin Li
295*9c5db199SXin Li        The `num_broken` parameter indicates the number of broken
296*9c5db199SXin Li        DUTs to be left in the pool.  If this number exceeds the
297*9c5db199SXin Li        number of broken DUTs actually in the pool, the returned
298*9c5db199SXin Li        list will be empty.  If this number is negative, it
299*9c5db199SXin Li        indicates a number of working DUTs to be returned in
300*9c5db199SXin Li        addition to all broken ones.
301*9c5db199SXin Li
302*9c5db199SXin Li        @param num_broken    Total number of broken DUTs to be left in
303*9c5db199SXin Li                             this pool.
304*9c5db199SXin Li
305*9c5db199SXin Li        @return A list of DUTs to be returned as surplus.
306*9c5db199SXin Li
307*9c5db199SXin Li        """
308*9c5db199SXin Li        if num_broken >= 0:
309*9c5db199SXin Li            surplus = self.broken_hosts[num_broken:]
310*9c5db199SXin Li            return surplus
311*9c5db199SXin Li        else:
312*9c5db199SXin Li            return (self.broken_hosts +
313*9c5db199SXin Li                    self.working_hosts[:-num_broken])
314*9c5db199SXin Li
315*9c5db199SXin Li
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    Every host in `hosts` has the pool labels of `spare_pool`
    removed, and the pool labels of `target_pool` added.

    If `dry_run` is true, no changes are made; instead the `atest`
    commands that would make the label changes are logged.

    A summary line is logged and the `duts_moved` counter is
    incremented by the number of hosts on every call, including
    dry runs and calls with an empty host list.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object whose labels are
                         added; the hosts end up in this pool.
    @param spare_pool    The `_DUTPool` object whose labels are
                         removed; the hosts are taken from this pool.

    """
    num_hosts = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              num_hosts, spare_pool.pool, target_pool.pool)
    moved_counter = metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    )
    model = target_pool.labels.get('model', '')
    moved_counter.increment_by(
            num_hosts,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels

    if dry_run:
        # Print the equivalent shell commands instead of acting.
        removal_text = ' '.join(removals)
        addition_text = ' '.join(additions)
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, removal_text)
            _log_message('atest label add -m %s %s',
                         host.hostname, addition_text)
        return

    for host in hosts:
        _log_message('Updating host: %s.', host.hostname)
        host.remove_labels(removals)
        host.add_labels(additions)
372*9c5db199SXin Li
373*9c5db199SXin Li
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Works out the target size of the main pool, picks working
    spares to bring in and surplus DUTs to send back, reports the
    plan, and then exchanges pool labels in both directions.  If
    the main pool has more broken DUTs than `arguments.max_broken`
    and `arguments.force_rebalance` is not set, an error is logged
    and no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option, the pool keeps its current size.
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of broken DUTs that must stay behind because the
        # spare pool could not supply a replacement.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative:  a negative value is the number of
        # working DUTs to be returned.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    dry_run = arguments.dry_run
    num_broken = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            size_change = 'no change to pool size'
        elif spares_needed > 0:
            size_change = 'grow pool by %d DUTs' % spares_needed
        else:
            size_change = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, size_change)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        num_spares = len(spare_duts)
        if spares_needed > num_spares:
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, num_spares)
        elif shortfall >= 0:
            num_surplus = len(surplus_duts)
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      num_surplus,
                      num_broken - num_surplus)
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      num_broken - shortfall,
                      -shortfall)

    too_many_broken = num_broken > arguments.max_broken
    if too_many_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, num_broken)
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        # Cancel the planned exchange in both directions.
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(dry_run, 'No exchange required.')

    # Both exchanges are always invoked, even with empty lists.
    # Surplus DUTs go from the main pool to the spare pool; spares
    # go from the spare pool to the main pool.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
470*9c5db199SXin Li
471*9c5db199SXin Li
472*9c5db199SXin Lidef _parse_command(argv):
473*9c5db199SXin Li    """Parse the command line arguments.
474*9c5db199SXin Li
475*9c5db199SXin Li    Create an argument parser for this command's syntax, parse the
476*9c5db199SXin Li    command line, and return the result of the `ArgumentParser`
477*9c5db199SXin Li    `parse_args()` method.
478*9c5db199SXin Li
479*9c5db199SXin Li    @param argv Standard command line argument vector; `argv[0]` is
480*9c5db199SXin Li                assumed to be the command name.
481*9c5db199SXin Li
482*9c5db199SXin Li    @return Result returned by `ArgumentParser.parse_args()`.
483*9c5db199SXin Li
484*9c5db199SXin Li    """
485*9c5db199SXin Li    parser = argparse.ArgumentParser(
486*9c5db199SXin Li            prog=os.path.basename(argv[0]),
487*9c5db199SXin Li            description='Balance pool shortages from spares on reserve')
488*9c5db199SXin Li
489*9c5db199SXin Li    parser.add_argument(
490*9c5db199SXin Li        '-w', '--web', type=str, default=None,
491*9c5db199SXin Li        help='AFE host to use. Default comes from shadow_config.',
492*9c5db199SXin Li    )
493*9c5db199SXin Li    count_group = parser.add_mutually_exclusive_group()
494*9c5db199SXin Li    count_group.add_argument('-t', '--total', type=int,
495*9c5db199SXin Li                             metavar='COUNT', default=None,
496*9c5db199SXin Li                             help='Set the number of DUTs in the '
497*9c5db199SXin Li                                  'pool to the specified count for '
498*9c5db199SXin Li                                  'every MODEL')
499*9c5db199SXin Li    count_group.add_argument('-a', '--grow', type=int,
500*9c5db199SXin Li                             metavar='COUNT', default=None,
501*9c5db199SXin Li                             help='Add the specified number of DUTs '
502*9c5db199SXin Li                                  'to the pool for every MODEL')
503*9c5db199SXin Li    count_group.add_argument('-d', '--shrink', type=int,
504*9c5db199SXin Li                             metavar='COUNT', default=None,
505*9c5db199SXin Li                             help='Remove the specified number of DUTs '
506*9c5db199SXin Li                                  'from the pool for every MODEL')
507*9c5db199SXin Li
508*9c5db199SXin Li    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
509*9c5db199SXin Li                        metavar='POOL',
510*9c5db199SXin Li                        help='Pool from which to draw replacement '
511*9c5db199SXin Li                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
512*9c5db199SXin Li    parser.add_argument('-n', '--dry-run', action='store_true',
513*9c5db199SXin Li                        help='Report actions to take in the form of '
514*9c5db199SXin Li                             'shell commands')
515*9c5db199SXin Li    parser.add_argument('-v', '--verbose', action='store_true',
516*9c5db199SXin Li                        help='Print more detail about calculations for debug '
517*9c5db199SXin Li                             'purposes.')
518*9c5db199SXin Li
519*9c5db199SXin Li    parser.add_argument('-m', '--max-broken', default=2, type=int,
520*9c5db199SXin Li                        metavar='COUNT',
521*9c5db199SXin Li                        help='Only rebalance a pool if it has at most '
522*9c5db199SXin Li                             'COUNT broken DUTs.')
523*9c5db199SXin Li    parser.add_argument('-f', '--force-rebalance', action='store_true',
524*9c5db199SXin Li                        help='Forcefully rebalance all DUTs in a pool, even '
525*9c5db199SXin Li                             'if it has a large number of broken DUTs. '
526*9c5db199SXin Li                             'Before doing this, please investigate whether '
527*9c5db199SXin Li                             'there is a bug that is bricking devices in the '
528*9c5db199SXin Li                             'lab.')
529*9c5db199SXin Li    parser.add_argument('--production', action='store_true',
530*9c5db199SXin Li                        help='Treat this as a production run. This will '
531*9c5db199SXin Li                             'collect metrics.')
532*9c5db199SXin Li
533*9c5db199SXin Li    parser.add_argument(
534*9c5db199SXin Li            '--all-models',
535*9c5db199SXin Li            action='store_true',
536*9c5db199SXin Li            help='Rebalance all managed models.  This will do a very expensive '
537*9c5db199SXin Li                 'check to see how many models have at least one broken DUT. '
538*9c5db199SXin Li                 'To bypass that check, set --max-broken-models to 0.',
539*9c5db199SXin Li    )
540*9c5db199SXin Li    parser.add_argument(
541*9c5db199SXin Li            '--max-broken-models', default=None, type=int, metavar='COUNT',
542*9c5db199SXin Li            help='Only rebalance all models if number of models with broken '
543*9c5db199SXin Li                 'DUTs in the specified pool is less than COUNT.',
544*9c5db199SXin Li    )
545*9c5db199SXin Li
546*9c5db199SXin Li    parser.add_argument('pool',
547*9c5db199SXin Li                        metavar='POOL',
548*9c5db199SXin Li                        help='Name of the pool to balance.  Use %s to balance '
549*9c5db199SXin Li                             'all critical pools' % _ALL_CRITICAL_POOLS)
550*9c5db199SXin Li    parser.add_argument('models', nargs='*', metavar='MODEL',
551*9c5db199SXin Li                        help='Names of models to balance.')
552*9c5db199SXin Li
553*9c5db199SXin Li    parser.add_argument('-p', '--phase', metavar='PHASE',
554*9c5db199SXin Li                        help='Optional phase label to restrict balance '
555*9c5db199SXin Li                        'operation to.')
556*9c5db199SXin Li
557*9c5db199SXin Li    parser.add_argument('--sku', type=str,
558*9c5db199SXin Li                        help='Optional name of sku to restrict to.')
559*9c5db199SXin Li
560*9c5db199SXin Li    arguments = parser.parse_args(argv[1:])
561*9c5db199SXin Li
562*9c5db199SXin Li    # Error-check arguments.
563*9c5db199SXin Li    if arguments.models and arguments.all_models:
564*9c5db199SXin Li        parser.error('Cannot specify individual models on the command line '
565*9c5db199SXin Li                     'when using --all-models.')
566*9c5db199SXin Li    if (arguments.pool == _ALL_CRITICAL_POOLS and
567*9c5db199SXin Li        arguments.spare != _SPARE_DEFAULT):
568*9c5db199SXin Li        parser.error('Cannot specify --spare pool to be %s when balancing all '
569*9c5db199SXin Li                     'critical pools.' % _SPARE_DEFAULT)
570*9c5db199SXin Li    for p in (arguments.spare, arguments.pool):
571*9c5db199SXin Li        if not _VALID_POOL_PATTERN.match(p):
572*9c5db199SXin Li            parser.error('Invalid pool name: %s' % p)
573*9c5db199SXin Li    return arguments
574*9c5db199SXin Li
575*9c5db199SXin Li
def _make_target_labels(model, sku=None, phase=None):
    """Build the label list selecting the DUTs of one balancing target.

    @param model  Name of the model to select.
    @param sku    Optional sku name; left out of the labels when falsy.
    @param phase  Optional phase name; left out of the labels when falsy.

    @returns    The value of `LabelsMapping.getlabels()` for the selection.

    """
    labels = labellib.LabelsMapping()
    # Assignment order (model, sku, phase) is kept stable on purpose so
    # the generated label list does not change shape between runs.
    labels['model'] = model
    if sku:
        labels['sku'] = sku
    if phase:
        labels['phase'] = phase
    return labels.getlabels()


def infer_balancer_targets(afe, arguments, pools):
    """Translate the parsed arguments into a list of balancing targets.

    With `--all-models`, every model the lab inventory reports for a
    pool becomes a target, restricted by `--phase` only (`--sku` is not
    applied in this mode).  Otherwise the models named on the command
    line are used, restricted by `--sku` and `--phase` when given.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    if not arguments.all_models:
        return [
                (pool, _make_target_labels(model,
                                           sku=arguments.sku,
                                           phase=arguments.phase))
                for pool in pools
                for model in arguments.models
        ]

    balancer_targets = []
    # Taking inventory is expensive and does not depend on the pool, so
    # fetch it at most once, and only if there is a pool to process.
    inventory = None
    for pool in pools:
        if inventory is None:
            inventory = lab_inventory.get_inventory(afe)
        for model in inventory.get_pool_models(pool):
            balancer_targets.append(
                    (pool, _make_target_labels(model,
                                               phase=arguments.phase)))
    return balancer_targets
608*9c5db199SXin Li
609*9c5db199SXin Li
def main(argv):
    """Parse the command line and rebalance every requested target.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    # Only a --production run sets up ts_mon state; any other run gets
    # the stand-in context manager from site_utils instead.
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # The time window handed to _balance_model is the last 24 hours.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Rebalance a single (pool, labels) target.

            @param pool: The pool to rebalance for the model.
            @param labels: labels restricting which DUTs the balancing
                    operation applies to.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            # Blank log line after each target's output.
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]

        targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(balancer, targets, processes=8)
        except KeyboardInterrupt:
            # An interrupt from the operator ends the run quietly.
            pass
653*9c5db199SXin Li
# Script entry point: main() expects the full argument vector,
# including the program name in position 0.
if __name__ == '__main__':
    main(argv=sys.argv)
656