xref: /aosp_15_r20/external/autotest/server/cros/provisioner.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Lint as: python2, python3
2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6from __future__ import print_function
7
8import logging
9import os
10import re
11import six
12import sys
13import six.moves.urllib.parse
14
15from autotest_lib.client.bin import utils
16from autotest_lib.client.common_lib import error
17from autotest_lib.client.common_lib.cros import dev_server
18from autotest_lib.client.common_lib.cros import kernel_utils
19from autotest_lib.server import autotest
20from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
21from autotest_lib.server.cros.dynamic_suite import tools
22
23try:
24    from autotest_lib.utils.frozen_chromite.lib import metrics
25except ImportError:
26    metrics = utils.metrics_mock
27
28
29def _metric_name(base_name):
30    return 'chromeos/autotest/provision/' + base_name
31
32
# _QUICK_PROVISION_SCRIPT - Name of the provisioning script.
# `ChromiumOSProvisioner._get_remote_script()` looks for it under
# /usr/local/bin on the DUT, and otherwise downloads it from the
# /static/ path of the server named in the update URL.
_QUICK_PROVISION_SCRIPT = 'quick-provision'

# PROVISION_FAILED - A flag file to indicate provision failures.  The
# file is created at the start of any AU procedure (see
# `ChromiumOSProvisioner._prepare_host()`).  The file's location in
# stateful means that on successful update it will be removed.  Thus, if
# this file exists, it indicates that we've tried and failed in a
# previous attempt to update.
PROVISION_FAILED = '/var/tmp/provision_failed'

# A flag file used to enable special handling in lab DUTs.  Some
# parts of the system in Chromium OS test images will behave in ways
# convenient to the test lab when this file is present.  Generally,
# we create this immediately after any update completes.
LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine'

# _TARGET_VERSION - A file containing the new version to which we plan
# to update.  This file is used by the CrOS shutdown code to detect and
# handle certain version downgrade cases.  Specifically:  Downgrading
# may trigger an unwanted powerwash in the target build when the
# following conditions are met:
#  * Source build is a v4.4 kernel with R69-10756.0.0 or later.
#  * Target build predates the R69-10756.0.0 cutoff.
# When this file is present and indicates a downgrade, the OS shutdown
# code on the DUT knows how to prevent the powerwash.
_TARGET_VERSION = '/run/update_target_version'

# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned
# when the Host.reboot() method fails.  The source of this text comes
# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py.
# It is used below as a classifier pattern in `HostUpdateError`.

_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot'

# Port numbers, kept as strings.
# `ChromiumOSProvisioner._quick_provision_with_gs_cache()` derives the
# GsCache server address by replacing DEVSERVER_PORT with GS_CACHE_PORT
# in the devserver name.
DEVSERVER_PORT = '8082'
GS_CACHE_PORT = '8888'
68
69
class _AttributedUpdateError(error.TestFail):
    """Base class for update failures that carry an attributed cause.

    Subclasses supply `_SUMMARY` (a short description string) and
    `_CLASSIFIERS` (a sequence of `(regex, label)` pairs); both are
    read by `failure_summary`.
    """

    def __init__(self, attribution, msg):
        """Build the error text as '<attribution>: <msg>'.

        @param attribution: Text describing what the failure is blamed on.
        @param msg: The underlying failure message.
        """
        full_text = '%s: %s' % (attribution, msg)
        super(_AttributedUpdateError, self).__init__(full_text)
        # Keep the raw message; `_classify()` matches patterns against it.
        self._message = msg

    def _classify(self):
        """Find the label of the first classifier matching the message.

        Patterns are applied with `re.match`, i.e. anchored at the start
        of the message.

        @return The matching label, or `None` when no pattern matches.
        """
        labels = (label for pattern, label in self._CLASSIFIERS
                  if re.match(pattern, self._message))
        return next(labels, None)

    @property
    def failure_summary(self):
        """Summarize this error for metrics reporting."""
        label = self._classify()
        if not label:
            return self._SUMMARY
        return '%s: %s' % (self._SUMMARY, label)
92
93
class HostUpdateError(_AttributedUpdateError):
    """Update failure blamed on the DUT itself.

    Raise this when the most probable cause is a condition that was
    already present on the DUT before the update began, for example a
    hardware fault or a bug in the software the DUT was running.
    """

    # Message used when the DUT cannot be reached before the update.
    DUT_DOWN = 'No answer to ssh'

    _SUMMARY = 'DUT failed prior to update'
    # (pattern, label) pairs consulted by `failure_summary`.
    _CLASSIFIERS = [
            (DUT_DOWN, DUT_DOWN),
            (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'),
    ]

    def __init__(self, hostname, msg):
        """
        @param hostname: Name of the DUT that failed.
        @param msg: The underlying failure message.
        """
        attribution = 'Error on %s prior to update' % hostname
        super(HostUpdateError, self).__init__(attribution, msg)
113
114
class ImageInstallError(_AttributedUpdateError):
    """Update failure while installing the image from the devserver.

    Raise this when the DUT fails to download and install the target
    image, in which case either the devserver or the DUT may be at
    fault.
    """

    _SUMMARY = 'Image failed to download and install'
    # No finer-grained classification is defined for install failures.
    _CLASSIFIERS = []

    def __init__(self, hostname, devserver, msg):
        """
        @param hostname: Name of the DUT being updated.
        @param devserver: Name of the server the image was fetched from.
        @param msg: The underlying failure message.
        """
        # Note the order: the message names the devserver first.
        attribution = ('Download and install failed from %s onto %s' %
                       (devserver, hostname))
        super(ImageInstallError, self).__init__(attribution, msg)
130
131
class NewBuildUpdateError(_AttributedUpdateError):
    """Update failure blamed on the target build.

    Raise this when updating to a new build fails and the most probable
    cause is a bug in the build that was just installed.

    `failure_summary` is overridden here to return one fixed string, so
    `_SUMMARY` and `_CLASSIFIERS` are not consulted by it.
    """

    CHROME_FAILURE = 'Chrome failed to reach login screen'
    ROLLBACK_FAILURE = 'System rolled back to previous build'

    _SUMMARY = 'New build failed'
    _CLASSIFIERS = [
            (CHROME_FAILURE, 'Chrome did not start'),
            (ROLLBACK_FAILURE, ROLLBACK_FAILURE),
    ]

    def __init__(self, update_version, msg):
        """
        @param update_version: Version string of the build being installed.
        @param msg: The underlying failure message.
        """
        attribution = 'Failure in build %s' % update_version
        super(NewBuildUpdateError, self).__init__(attribution, msg)

    @property
    def failure_summary(self):
        """Return the fixed metrics summary for new-build failures."""
        return 'Build failed to work after installing'
157
158
def _url_to_version(update_url):
    """Extract the version string from `update_url`.

    The ChromeOS version is normally the final element of the URL path.
    Delta update URLs are the exception: they are rooted under the
    version, e.g. http://.../update/.../0.14.755.0/au/0.14.754.0, so
    everything from '/au/' onward is dropped before the final element
    is taken.

    @param update_url: url to the image to update to.

    @return The last path element (after removing any '/au/...' tail),
            stripped of surrounding whitespace.
    """
    url_path = six.moves.urllib.parse.urlparse(update_url).path
    version_root = re.sub('/au/.*', '', url_path)
    return version_root.rsplit('/', 1)[-1].strip()
172
173
def url_to_image_name(update_url):
    """Extract the image name from `update_url`.

    For example, given
        http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0
    the result is lumpy-release/R27-3837.0.0.

    The leading len('/update/') characters of the URL path are dropped
    without checking what they actually are.

    @param update_url: url to the image to update to.
    @returns a string representing the image name in the update_url.

    """
    prefix_length = len('/update/')
    url_path = six.moves.urllib.parse.urlparse(update_url).path
    return url_path[prefix_length:]
186
187
def get_update_failure_reason(exception):
    """Translate an exception into a failure reason for metrics.

    `exception` is expected to be one raised by a failed call to
    `ChromiumOSProvisioner.run_provision`.  A falsy `exception` yields
    `None`.

    For `_AttributedUpdateError` instances the result is the error's
    `failure_summary`; for anything else it is 'Unknown Error: '
    followed by the exception's class name.  Keeping the set of
    possible strings small lets the value be used in Monarch metrics
    without worrying about cardinality.

    @param exception  Exception to be converted to a failure reason.

    @return A string suitable for use in Monarch metrics, or `None`.
    """
    if not exception:
        return None
    if isinstance(exception, _AttributedUpdateError):
        return exception.failure_summary
    return 'Unknown Error: %s' % type(exception).__name__
211
212
class ChromiumOSProvisioner(object):
    """Chromium OS specific DUT update functionality.

    The public entry point is `run_provision()`, which prepares the
    host, installs the image with the `quick-provision` script, and
    (except for servohosts) reboots into and verifies the new build.
    """

    def __init__(self,
                 update_url,
                 host=None,
                 interactive=True,
                 is_release_bucket=None,
                 is_servohost=False,
                 public_bucket=False):
        """Initializes the object.

        @param update_url: The URL we want the update to use.
        @param host: A client.common_lib.hosts.Host implementation.
        @param interactive: Bool whether we are doing an interactive update.
        @param is_release_bucket: If True, use release bucket
            gs://chromeos-releases.
        @param is_servohost: Bool whether the update target is a servohost.
        @param public_bucket: True to copy payloads to a public throwaway GS
            bucket. This avoids using a lab cache server, so local test runs
            can provision without any special setup.
        """
        self.update_url = update_url
        self.host = host
        self.interactive = interactive
        self.update_version = _url_to_version(update_url)
        self._is_release_bucket = is_release_bucket
        self._is_servohost = is_servohost
        self._public_bucket = public_bucket

    def _run(self, cmd, *args, **kwargs):
        """Abbreviated form of self.host.run(...)"""
        return self.host.run(cmd, *args, **kwargs)

    def _rootdev(self, options=''):
        """Returns the stripped output of rootdev <options>.

        @param options: options to run rootdev.

        """
        return self._run('rootdev %s' % options).stdout.strip()

    def _reset_update_engine(self):
        """Resets the host to prepare for a clean update regardless of state."""
        self._run('stop ui || true')
        self._run('stop update-engine || true; start update-engine')

    def _reset_stateful_partition(self):
        """Clear any pending stateful update request.

        Removes the pending-update markers and tast directories under
        /mnt/stateful_partition, plus the `_TARGET_VERSION` file.
        """
        cmd = ['rm', '-rf']
        for f in ('var_new', 'dev_image_new', '.update_available'):
            cmd += [os.path.join('/mnt/stateful_partition', f)]
        # TODO(b/165024723): This is a temporary measure until we figure out the
        # root cause of this bug.
        for f in ('dev_image/share/tast/data', 'dev_image/libexec/tast',
                  'dev_image/tmp/tast'):
            cmd += [os.path.join('/mnt/stateful_partition', f)]
        cmd += [_TARGET_VERSION, '2>&1']
        # NOTE(review): the command is handed to host.run() as a list of
        # words rather than a single string -- confirm that the Host
        # implementation accepts that form.
        self._run(cmd)

    def _set_target_version(self):
        """Set the "target version" for the update.

        Writes the version number into the `_TARGET_VERSION` file on the
        DUT.
        """
        # Version strings that come from release buckets do not have RXX- at the
        # beginning. So remove this prefix only if the version has it.
        version_number = (self.update_version.split('-')[1] if
                          '-' in self.update_version else self.update_version)
        self._run('echo %s > %s' % (version_number, _TARGET_VERSION))

    def _revert_boot_partition(self):
        """Revert the boot partition.

        Runs /postinst against the partition reported by `rootdev -s`.

        @return The result of the /postinst command.
        """
        part = self._rootdev('-s')
        logging.warning('Reverting update; Boot partition will be %s', part)
        return self._run('/postinst %s 2>&1' % part)

    def _get_remote_script(self, script_name):
        """Ensure that `script_name` is present on the DUT.

        The given script (e.g. `quick-provision`) may be present in the
        stateful partition under /usr/local/bin, or we may have to
        download it from the devserver.

        Determine whether the script is present or must be downloaded
        and download if necessary.  Then, return a command fragment
        sufficient to run the script from wherever it now lives on the
        DUT.

        @param script_name  The name of the script as expected in
                            /usr/local/bin and on the devserver.
        @return A string with the command (minus arguments) that will
                run the target script, or `None` if the downloaded file
                does not start with a usable '#!' line.
        """
        remote_script = '/usr/local/bin/%s' % script_name
        if self.host.path_exists(remote_script):
            return remote_script
        self.host.run('mkdir -p -m 1777 /usr/local/tmp')
        remote_tmp_script = '/usr/local/tmp/%s' % script_name
        server_name = six.moves.urllib.parse.urlparse(self.update_url)[1]
        script_url = 'http://%s/static/%s' % (server_name, script_name)
        fetch_script = 'curl -Ss -o %s %s && head -1 %s' % (
                remote_tmp_script, script_url, remote_tmp_script)

        first_line = self._run(fetch_script).stdout.strip()

        # The downloaded copy is run through the interpreter named on
        # its shebang line rather than being executed directly.
        if first_line.startswith('#!'):
            # Drop exactly the two-character '#!' marker.  str.lstrip()
            # takes a set of characters, not a prefix, so it is the
            # wrong tool for this.
            script_interpreter = first_line[2:].strip()
            if script_interpreter:
                return '%s %s' % (script_interpreter, remote_tmp_script)
        return None

    def _prepare_host(self):
        """Make sure the target DUT is working and ready for update.

        Initially, the target DUT's state is unknown.  The DUT is
        expected to be online, but we strive to be forgiving if Chrome
        and/or the update engine aren't fully functional.

        @raises HostUpdateError if the DUT is not up.
        """
        # Summary of work, and the rationale:
        #  1. Clear any pending stateful update request.
        #  2. Reboot, because it's a good way to clear out problems.
        #  3. Touch the PROVISION_FAILED file, to allow repair to detect
        #     failure later.
        #  4. Run the hook for host class specific preparation.
        #  5. Stop Chrome, because the system is designed to eventually
        #     reboot if Chrome is stuck in a crash loop.
        #  6. Force `update-engine` to start, because if Chrome failed
        #     to start properly, the status of the `update-engine` job
        #     will be uncertain.
        # Steps 2, 3, 5 and 6 are skipped for servohosts.
        if not self.host.is_up():
            raise HostUpdateError(self.host.hostname, HostUpdateError.DUT_DOWN)
        self._reset_stateful_partition()
        # Servohost reboot logic is handled by themselves.
        if not self._is_servohost:
            self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
            self._run('touch %s' % PROVISION_FAILED)
        self.host.prepare_for_update()
        # Servohost will only update via quick provision.
        if not self._is_servohost:
            self._reset_update_engine()
        logging.info('Updating from version %s to %s.',
                     self.host.get_release_version(), self.update_version)

    def _quick_provision_with_gs_cache(self, provision_command, devserver_name,
                                       image_name):
        """Run quick_provision using GsCache server.

        @param provision_command: The path of quick_provision command.
        @param devserver_name: The devserver name and port (optional).
        @param image_name: The image to be installed.
        """
        logging.info('Try quick provision with gs_cache.')
        # If enabled, the GsCache server listens on a different port on
        # the devserver.
        gs_cache_server = devserver_name.replace(DEVSERVER_PORT, GS_CACHE_PORT)
        gs_cache_url = (
                'http://%s/download/%s' %
                (gs_cache_server, 'chromeos-releases'
                 if self._is_release_bucket else 'chromeos-image-archive'))

        # Check if GS_Cache server is enabled on the server.  Presumably
        # a failing curl makes host.run() raise, which the caller treats
        # as "fall back to the devserver" -- verify against Host.run().
        self._run('curl -s -o /dev/null %s' % gs_cache_url)

        command = '%s --noreboot %s %s' % (provision_command, image_name,
                                           gs_cache_url)
        self._run(command)
        metrics.Counter(
                _metric_name('quick_provision')).increment(fields={
                        'devserver': devserver_name,
                        'gs_cache': True
                })

    def _quick_provision_with_devserver(self, provision_command,
                                        devserver_name, image_name):
        """Run quick_provision using legacy devserver.

        @param provision_command: The path of quick_provision command.
        @param devserver_name: The devserver name and port (optional).
        @param image_name: The image to be installed.

        @raises error.TestFail if staging artifacts on the devserver
                fails with a DevServerException.
        """
        logging.info('Try quick provision with devserver.')
        ds = dev_server.ImageServer('http://%s' % devserver_name)
        archive_url = ('gs://chromeos-releases/%s' %
                       image_name if self._is_release_bucket else None)
        try:
            ds.stage_artifacts(
                    image_name,
                    ['quick_provision', 'stateful', 'autotest_packages'],
                    archive_url=archive_url)
        except dev_server.DevServerException as e:
            # six.reraise() needs an exception *instance* as its value:
            # on Python 3 it reads `value.__traceback__`, so passing a
            # plain string would raise AttributeError instead of TestFail.
            six.reraise(error.TestFail, error.TestFail(str(e)),
                        sys.exc_info()[2])

        static_url = 'http://%s/static' % devserver_name
        command = '%s --noreboot %s %s' % (provision_command, image_name,
                                           static_url)
        self._run(command)
        metrics.Counter(
                _metric_name('quick_provision')).increment(fields={
                        'devserver': devserver_name,
                        'gs_cache': False
                })

    def _quick_provision_with_public_bucket(self, provision_command,
                                            image_name):
        """Run quick_provision using public GS bucket.

        @param provision_command: The path of quick_provision command.
        @param image_name: The image to be installed.
        """
        logging.info('Try quick provision with public bucket.')

        # Everything in the update URL before '/<image_name>' is the
        # bucket location; the -1 drops the separating slash.
        bucket_url = self.update_url[:self.update_url.find(image_name) - 1]
        command = '%s --noreboot %s %s' % (provision_command, image_name,
                                           bucket_url)
        self._run(command)

    def _install_update(self):
        """Install an update using the `quick-provision` script.

        This uses the `quick-provision` script to download and install
        a root FS, kernel and stateful filesystem content.

        With a public bucket the script is pointed at the bucket
        directly.  Otherwise GsCache is tried first, and any failure
        there falls back to the legacy devserver.

        If installation fails, the boot partition is reverted, pending
        stateful updates are cleared and update-engine is reset; then
        the original exception is re-raised so that the caller can
        attribute the failure to the install step.

        @return The kernel expected to be booted next.

        @raises Exception whatever caused the installation to fail.
        """
        logging.info('Installing image at %s onto %s', self.update_url,
                     self.host.hostname)
        server_name = six.moves.urllib.parse.urlparse(self.update_url)[1]
        if self._public_bucket:
            image_name = self.update_url.partition('provision/')[2]
        else:
            image_name = url_to_image_name(self.update_url)

        logging.info('Installing image using quick-provision.')
        provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT)
        try:
            if not provision_command:
                # Fail fast instead of running the literal command
                # "None ..." on the DUT.
                raise error.TestFail(
                        'Unable to find or download the %s script' %
                        _QUICK_PROVISION_SCRIPT)
            if self._public_bucket:
                self._quick_provision_with_public_bucket(
                        provision_command, image_name)
            else:
                try:
                    self._quick_provision_with_gs_cache(
                            provision_command, server_name, image_name)
                except Exception as e:
                    logging.error(
                            'Failed to quick-provision with gscache with '
                            'error %s', e)
                    self._quick_provision_with_devserver(
                            provision_command, server_name, image_name)

            self._set_target_version()
            return kernel_utils.verify_kernel_state_after_update(self.host)
        except Exception:
            # N.B.  We handle only `Exception` here.  Non-Exception
            # classes (such as KeyboardInterrupt) are handled by our
            # caller.
            logging.exception('quick-provision script failed;')
            self._revert_boot_partition()
            self._reset_stateful_partition()
            self._reset_update_engine()
            # Propagate the failure.  Swallowing it would let the caller
            # carry on as if the image had been installed: rebooting and
            # blaming the new build, or (for servohosts) reporting
            # success.
            raise

    def _complete_update(self, expected_kernel):
        """Finish the update, and confirm that it succeeded.

        Initial condition is that the target build has been downloaded
        and installed on the DUT, but has not yet been booted.  This
        function is responsible for rebooting the DUT, and checking that
        the new build is running successfully.

        @param expected_kernel: kernel expected to be active after reboot.
        """
        # Regarding the 'crossystem' command below: In some cases,
        # the update flow puts the TPM into a state such that it
        # fails verification.  We don't know why.  However, this
        # call papers over the problem by clearing the TPM during
        # the reboot.
        #
        # We ignore failures from 'crossystem'.  Although failure
        # here is unexpected, and could signal a bug, the point of
        # the exercise is to paper over problems; allowing this to
        # fail would defeat the purpose.
        self._run('crossystem clear_tpm_owner_request=1', ignore_status=True)
        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)

        # Touch the lab machine file to leave a marker that
        # distinguishes this image from other test images.
        # Afterwards, we must re-run the autoreboot script because
        # it depends on the LAB_MACHINE_FILE.
        autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || '
                          '( touch "$FILE" ; start autoreboot )')
        self._run(autoreboot_cmd % LAB_MACHINE_FILE)
        try:
            kernel_utils.verify_boot_expectations(
                    expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE,
                    self.host)
        except Exception:
            # When the system is rolled back, the provision_failed file is
            # removed. So add it back here and re-raise the exception.
            self._run('touch %s' % PROVISION_FAILED)
            raise

        logging.debug('Cleaning up old autotest directories.')
        try:
            installed_autodir = autotest.Autotest.get_installed_autodir(
                    self.host)
            self._run('rm -rf ' + installed_autodir)
        except autotest.AutodirNotFoundError:
            logging.debug('No autotest installed directory found.')

    def run_provision(self):
        """Perform a full provision of a DUT in the test lab.

        This downloads and installs the root FS and stateful partition
        content needed for the update specified in `self.host` and
        `self.update_url`.  The provision is performed according to the
        requirements for provisioning a DUT for testing the requested
        build.

        At the end of the procedure, metrics are reported describing the
        outcome of the operation.

        @returns A tuple of the form `(image_name, attributes)`, where
            `image_name` is the name of the image installed, and
            `attributes` is new attributes to be applied to the DUT.

        @raises HostUpdateError if preparing the host fails.
        @raises ImageInstallError if downloading or installing fails.
        @raises NewBuildUpdateError if the new build fails after the
            update (not applicable to servohosts).
        """
        server_name = ""
        if not self._public_bucket:
            server_name = dev_server.get_resolved_hostname(self.update_url)
            metrics.Counter(_metric_name('install')).increment(
                    fields={'devserver': server_name})

        # In each phase below, errors that already carry an attribution
        # pass through untouched; anything else is wrapped in the
        # attribution matching the phase that failed.
        try:
            self._prepare_host()
        except _AttributedUpdateError:
            raise
        except Exception as e:
            logging.exception('Failure preparing host prior to update.')
            raise HostUpdateError(self.host.hostname, str(e))

        try:
            expected_kernel = self._install_update()
        except _AttributedUpdateError:
            raise
        except Exception as e:
            logging.exception('Failure during download and install.')
            raise ImageInstallError(self.host.hostname, server_name, str(e))

        # Servohost will handle post update process themselves.
        if not self._is_servohost:
            try:
                self._complete_update(expected_kernel)
            except _AttributedUpdateError:
                raise
            except Exception as e:
                logging.exception('Failure from build after update.')
                raise NewBuildUpdateError(self.update_version, str(e))

        image_name = url_to_image_name(self.update_url)
        # update_url is different from devserver url needed to stage autotest
        # packages, therefore, resolve a new devserver url here.
        devserver_url = dev_server.ImageServer.resolve(
                image_name, self.host.hostname).url()
        repo_url = tools.get_package_url(devserver_url, image_name)
        return image_name, {ds_constants.JOB_REPO_URL: repo_url}
574