xref: /aosp_15_r20/external/autotest/site_utils/diagnosis_utils.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li#!/usr/bin/python3
2*9c5db199SXin Li#
3*9c5db199SXin Li# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
5*9c5db199SXin Li# found in the LICENSE file.
6*9c5db199SXin Li
7*9c5db199SXin Liimport datetime as datetime_base
8*9c5db199SXin Liimport logging
9*9c5db199SXin Lifrom datetime import datetime
10*9c5db199SXin Li
11*9c5db199SXin Liimport common
12*9c5db199SXin Li
13*9c5db199SXin Lifrom autotest_lib.client.common_lib import global_config
14*9c5db199SXin Lifrom autotest_lib.server import utils
15*9c5db199SXin Lifrom autotest_lib.server.cros.dynamic_suite import reporting_utils
16*9c5db199SXin Li
# Module-level shortcut to the global_config object exposed by
# autotest_lib.client.common_lib.global_config.
CONFIG = global_config.global_config
18*9c5db199SXin Li
19*9c5db199SXin Li
class DUTsNotAvailableError(utils.TestLabException):
    """Raised when a DUT label combination is not available in the lab.

    RPCHelper.check_dut_availability raises this when the host query for
    the requested labels returns no hosts at all.
    """
22*9c5db199SXin Li
23*9c5db199SXin Li
class NotEnoughDutsError(utils.TestLabException):
    """Raised when the lab doesn't have the minimum number of DUTs.

    Besides the constructor arguments, each instance carries three optional
    annotations (bug_id, suite_name, build). They start out as None and are
    included in the string form once something assigns them.
    """

    def __init__(self, labels, num_available, num_required, hosts):
        """Initialize instance.

        Please pass arguments by keyword.

        @param labels: Labels required, including board and pool labels.
        @param num_available: Number of available hosts.
        @param num_required: Number of hosts required.
        @param hosts: Sequence of Host instances for given board and pool.
        """
        # Forward the arguments to the base exception so that self.args is
        # populated. The default exception pickle/copy support rebuilds the
        # object by calling the class with self.args; with empty args that
        # call fails because this constructor has four required parameters.
        # NOTE(review): assumes utils.TestLabException keeps the plain
        # Exception constructor signature -- confirm.
        super(NotEnoughDutsError, self).__init__(
                labels, num_available, num_required, hosts)
        self.labels = labels
        self.num_available = num_available
        self.num_required = num_required
        self.hosts = hosts
        # Optional annotations; left as None until assigned after creation.
        self.bug_id = None
        self.suite_name = None
        self.build = None


    def __repr__(self):
        """Return a debug representation listing every field except hosts."""
        return (
            '<{cls} at 0x{id:x} with'
            ' labels={this.labels!r},'
            ' num_available={this.num_available!r},'
            ' num_required={this.num_required!r},'
            ' bug_id={this.bug_id!r},'
            ' suite_name={this.suite_name!r},'
            ' build={this.build!r}>'
            .format(cls=type(self).__name__, id=id(self), this=self)
        )


    def __str__(self):
        """Return the human readable message.

        The message always states the labels and the required/found counts.
        Bug link, suite name and build are appended only when set.
        """
        # Collect format templates first and expand them in one format()
        # call, so attribute values are substituted and never re-parsed as
        # templates themselves.
        msg_parts = [
            'Not enough DUTs for requirements: {this.labels};'
            ' required: {this.num_required}, found: {this.num_available}'
        ]
        format_dict = {'this': self}
        if self.bug_id is not None:
            msg_parts.append('bug: {bug_url}')
            format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
        if self.suite_name is not None:
            msg_parts.append('suite: {this.suite_name}')
        if self.build is not None:
            msg_parts.append('build: {this.build}')
        return ', '.join(msg_parts).format(**format_dict)
73*9c5db199SXin Li
74*9c5db199SXin Li
class SimpleTimer(object):
    """Periodic deadline checker that is driven by explicit polling.

    The deadline lies interval_hours after the moment the timer was last
    armed. A timer built with an unusable interval has no deadline and
    never fires.
    """

    def __init__(self, interval_hours=0.5):
        """Create the timer and arm its first deadline.

        @param interval_hours: Interval of the deadline, in hours.
        """
        self.interval_hours = interval_hours
        self._reset()


    def _reset(self):
        """Re-arm the deadline relative to the current time.

        A falsy or negative interval is logged as an error and leaves the
        timer without a deadline (self.deadline is None).
        """
        hours = self.interval_hours
        if hours and not hours < 0:
            self.deadline = datetime.now() + datetime_base.timedelta(
                    hours=hours)
            return
        logging.error('Bad interval %s', hours)
        self.deadline = None


    def poll(self):
        """Report whether the deadline has been reached.

        When the deadline has passed it is re-armed before returning, so
        the next True is only reported one interval later.

        @return: True if the deadline has passed, False if it has not or
            if no deadline is set.
        """
        expires_at = self.deadline
        if expires_at and datetime.now() >= expires_at:
            self._reset()
            return True
        return False
110*9c5db199SXin Li
111*9c5db199SXin Li
class JobTimer(object):
    """Tracks how long a job has been running against its timeout."""

    # strftime pattern used when a datetime is rendered for log output.
    time_format = '%m-%d-%Y [%H:%M:%S]'

    def __init__(self, job_created_time, timeout_mins):
        """Set up the timer from the job's creation time and timeout.

        @param job_created_time: float representing the time a job was
            created. Eg: time.time()
        @param timeout_mins: float representing the timeout in minutes.
        """
        self.past_halftime = False
        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
        # Despite its name this attribute holds a timedelta, not a number.
        timeout_in_hours = timeout_mins/60.0
        self.timeout_hours = datetime_base.timedelta(hours=timeout_in_hours)
        self.job_created_time = datetime.fromtimestamp(job_created_time)


    @classmethod
    def format_time(cls, datetime_obj):
        """Render a datetime using the class-wide time_format.

        @param datetime_obj: A datetime.datetime object.
            Eg: datetime.datetime.now()

        @return: The date/time of datetime_obj as a formatted string.
        """
        pattern = cls.time_format
        return datetime_obj.strftime(pattern)


    def elapsed_time(self):
        """Compute how long ago the job was created.

        @return: A timedelta between now and the job creation time.
        """
        now = datetime.now()
        return now - self.job_created_time


    def is_suite_timeout(self):
        """Check if the suite timed out.

        Logs the start and timeout moments when the timeout is reached.

        @return: True if at least timeout_hours has elapsed since the suite
            job was created, False otherwise.
        """
        if self.elapsed_time() < self.timeout_hours:
            return False
        started_on = self.format_time(self.job_created_time)
        timed_out_on = self.format_time(datetime.now())
        logging.info('Suite timed out. Started on %s, timed out on %s',
                     started_on, timed_out_on)
        return True


    def first_past_halftime(self):
        """Check if we just crossed half time.

        Returns True exactly once: on the first call made after the elapsed
        time exceeds half of the timeout. Every later call returns False.

        @return: True if this is the first call of the method after
            halftime, False otherwise.
        """
        if self.past_halftime:
            return False
        half_timeout = self.timeout_hours/2
        if self.elapsed_time() > half_timeout:
            self.past_halftime = True
            return True
        return False
180*9c5db199SXin Li
181*9c5db199SXin Li
class RPCHelper(object):
    """Helper that diagnoses a suite run through the rpc interface."""

    def __init__(self, rpc_interface):
        """Remember the rpc object that every query goes through.

        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
        """
        self.rpc_interface = rpc_interface


    def check_dut_availability(self, labels, minimum_duts=0,
                               skip_duts_check=False):
        """Verify that enough DUTs matching the labels are available.

        @param labels: DUT label dependencies, including board and pool
                       labels.
        @param minimum_duts: Minimum number of available machines required
                             to run the suite. The default of 0 disables
                             the check entirely; no host query is made.
        @param skip_duts_check: If True, only verify that some host matches
                                the labels and skip the minimum available
                                DUTs check.
        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
        @raise: DUTsNotAvailableError if no host found for requested
                board/pool.
        """
        if minimum_duts == 0:
            return

        matching_hosts = self.rpc_interface.get_hosts(
                invalid=False, multiple_labels=labels)
        if not matching_hosts:
            no_hosts_msg = ('No hosts found for labels %r. The test lab '
                            'currently does not cover test for those DUTs.')
            raise DUTsNotAvailableError(no_hosts_msg % (labels,))

        if skip_duts_check:
            # Hosts exist for these labels; the caller asked not to enforce
            # the minimum count.
            logging.debug('skip_duts_check is on, do not enforce minimum '
                          'DUTs check.')
            return

        total_hosts = len(matching_hosts)
        if total_hosts < minimum_duts:
            # Only a debug note: the decision below is based on available
            # hosts, not on the total.
            logging.debug('The total number of DUTs for %r is %d, '
                          'which is less than %d, the required minimum '
                          'number of available DUTS', labels, total_hosts,
                          minimum_duts)

        num_available = sum(
                1 for candidate in matching_hosts if candidate.is_available())
        logging.debug('%d of %d DUTs are available for %r.',
                      num_available, total_hosts, labels)
        if num_available >= minimum_duts:
            return
        raise NotEnoughDutsError(
            labels=labels,
            num_available=num_available,
            num_required=minimum_duts,
            hosts=matching_hosts)


    def diagnose_job(self, job_id, instance_server):
        """Log a summary of the suite's jobs that have not completed.

        Asks the rpc interface for jobs whose parent_job_id is job_id and
        whose host queue entries are not complete, then logs one line per
        job with the last path component of its test name and a job link.

        @param job_id: The id of the suite job to get information about.
            No meaningful information gets logged if the id is for a sub-job.
        @param instance_server: The instance server.
            Eg: cautotest, cautotest-cq, localhost.
        """
        pending_jobs = self.rpc_interface.get_jobs(
                parent_job_id=job_id, summary=True,
                hostqueueentry__complete=False)
        if not pending_jobs:
            logging.info('All jobs in suite have already completed.')
            return

        timestamp = JobTimer.format_time(datetime.now())
        logging.info('\n%s printing summary of incomplete jobs (%s):\n',
                     timestamp, len(pending_jobs))
        for pending in pending_jobs:
            # Keep only the part of the test name after the last '/'.
            short_name = pending.testname.rsplit('/', 1)[-1]
            job_link = reporting_utils.link_job(pending.id, instance_server)
            logging.info('%s: %s', short_name, job_link)
268*9c5db199SXin Li            logging.info('All jobs in suite have already completed.')
269