1# Lint as: python2, python3 2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6from __future__ import print_function 7 8import logging 9import os 10import re 11import six 12import sys 13import six.moves.urllib.parse 14 15from autotest_lib.client.bin import utils 16from autotest_lib.client.common_lib import error 17from autotest_lib.client.common_lib.cros import dev_server 18from autotest_lib.client.common_lib.cros import kernel_utils 19from autotest_lib.server import autotest 20from autotest_lib.server.cros.dynamic_suite import constants as ds_constants 21from autotest_lib.server.cros.dynamic_suite import tools 22 23try: 24 from autotest_lib.utils.frozen_chromite.lib import metrics 25except ImportError: 26 metrics = utils.metrics_mock 27 28 29def _metric_name(base_name): 30 return 'chromeos/autotest/provision/' + base_name 31 32 33_QUICK_PROVISION_SCRIPT = 'quick-provision' 34 35# PROVISION_FAILED - A flag file to indicate provision failures. The 36# file is created at the start of any AU procedure (see 37# `ChromiumOSProvisioner._prepare_host()`). The file's location in 38# stateful means that on successul update it will be removed. Thus, if 39# this file exists, it indicates that we've tried and failed in a 40# previous attempt to update. 41PROVISION_FAILED = '/var/tmp/provision_failed' 42 43# A flag file used to enable special handling in lab DUTs. Some 44# parts of the system in Chromium OS test images will behave in ways 45# convenient to the test lab when this file is present. Generally, 46# we create this immediately after any update completes. 47LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine' 48 49# _TARGET_VERSION - A file containing the new version to which we plan 50# to update. This file is used by the CrOS shutdown code to detect and 51# handle certain version downgrade cases. Specifically: Downgrading 52# may trigger an unwanted powerwash in the target build when the 53# following conditions are met: 54# * Source build is a v4.4 kernel with R69-10756.0.0 or later. 55# * Target build predates the R69-10756.0.0 cutoff. 56# When this file is present and indicates a downgrade, the OS shutdown 57# code on the DUT knows how to prevent the powerwash. 58_TARGET_VERSION = '/run/update_target_version' 59 60# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned 61# when the Host.reboot() method fails. The source of this text comes 62# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py. 63 64_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot' 65 66DEVSERVER_PORT = '8082' 67GS_CACHE_PORT = '8888' 68 69 70class _AttributedUpdateError(error.TestFail): 71 """Update failure with an attributed cause.""" 72 73 def __init__(self, attribution, msg): 74 super(_AttributedUpdateError, 75 self).__init__('%s: %s' % (attribution, msg)) 76 self._message = msg 77 78 def _classify(self): 79 for err_pattern, classification in self._CLASSIFIERS: 80 if re.match(err_pattern, self._message): 81 return classification 82 return None 83 84 @property 85 def failure_summary(self): 86 """Summarize this error for metrics reporting.""" 87 classification = self._classify() 88 if classification: 89 return '%s: %s' % (self._SUMMARY, classification) 90 else: 91 return self._SUMMARY 92 93 94class HostUpdateError(_AttributedUpdateError): 95 """Failure updating a DUT attributable to the DUT. 96 97 This class of exception should be raised when the most likely cause 98 of failure was a condition existing on the DUT prior to the update, 99 such as a hardware problem, or a bug in the software on the DUT. 100 """ 101 102 DUT_DOWN = 'No answer to ssh' 103 104 _SUMMARY = 'DUT failed prior to update' 105 _CLASSIFIERS = [ 106 (DUT_DOWN, DUT_DOWN), 107 (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'), 108 ] 109 110 def __init__(self, hostname, msg): 111 super(HostUpdateError, 112 self).__init__('Error on %s prior to update' % hostname, msg) 113 114 115class ImageInstallError(_AttributedUpdateError): 116 """Failure updating a DUT when installing from the devserver. 117 118 This class of exception should be raised when the target DUT fails 119 to download and install the target image from the devserver, and 120 either the devserver or the DUT might be at fault. 121 """ 122 123 _SUMMARY = 'Image failed to download and install' 124 _CLASSIFIERS = [] 125 126 def __init__(self, hostname, devserver, msg): 127 super(ImageInstallError, self).__init__( 128 'Download and install failed from %s onto %s' % 129 (devserver, hostname), msg) 130 131 132class NewBuildUpdateError(_AttributedUpdateError): 133 """Failure updating a DUT attributable to the target build. 134 135 This class of exception should be raised when updating to a new 136 build fails, and the most likely cause of the failure is a bug in 137 the newly installed target build. 138 """ 139 140 CHROME_FAILURE = 'Chrome failed to reach login screen' 141 ROLLBACK_FAILURE = 'System rolled back to previous build' 142 143 _SUMMARY = 'New build failed' 144 _CLASSIFIERS = [ 145 (CHROME_FAILURE, 'Chrome did not start'), 146 (ROLLBACK_FAILURE, ROLLBACK_FAILURE), 147 ] 148 149 def __init__(self, update_version, msg): 150 super(NewBuildUpdateError, 151 self).__init__('Failure in build %s' % update_version, msg) 152 153 @property 154 def failure_summary(self): 155 #pylint: disable=missing-docstring 156 return 'Build failed to work after installing' 157 158 159def _url_to_version(update_url): 160 """Return the version based on update_url. 161 162 @param update_url: url to the image to update to. 163 164 """ 165 # The ChromeOS version is generally the last element in the URL. The only 166 # exception is delta update URLs, which are rooted under the version; e.g., 167 # http://.../update/.../0.14.755.0/au/0.14.754.0. In this case we want to 168 # strip off the au section of the path before reading the version. 169 return re.sub('/au/.*', '', 170 six.moves.urllib.parse.urlparse(update_url).path).split( 171 '/')[-1].strip() 172 173 174def url_to_image_name(update_url): 175 """Return the image name based on update_url. 176 177 From a URL like: 178 http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0 179 return lumpy-release/R27-3837.0.0 180 181 @param update_url: url to the image to update to. 182 @returns a string representing the image name in the update_url. 183 184 """ 185 return six.moves.urllib.parse.urlparse(update_url).path[len('/update/'):] 186 187 188def get_update_failure_reason(exception): 189 """Convert an exception into a failure reason for metrics. 190 191 The passed in `exception` should be one raised by failure of 192 `ChromiumOSProvisioner.run_provision`. The returned string will describe 193 the failure. If the input exception value is not a truish value 194 the return value will be `None`. 195 196 The number of possible return strings is restricted to a limited 197 enumeration of values so that the string may be safely used in 198 Monarch metrics without worrying about cardinality of the range of 199 string values. 200 201 @param exception Exception to be converted to a failure reason. 202 203 @return A string suitable for use in Monarch metrics, or `None`. 204 """ 205 if exception: 206 if isinstance(exception, _AttributedUpdateError): 207 return exception.failure_summary 208 else: 209 return 'Unknown Error: %s' % type(exception).__name__ 210 return None 211 212 213class ChromiumOSProvisioner(object): 214 """Chromium OS specific DUT update functionality.""" 215 216 def __init__(self, 217 update_url, 218 host=None, 219 interactive=True, 220 is_release_bucket=None, 221 is_servohost=False, 222 public_bucket=False): 223 """Initializes the object. 224 225 @param update_url: The URL we want the update to use. 226 @param host: A client.common_lib.hosts.Host implementation. 227 @param interactive: Bool whether we are doing an interactive update. 228 @param is_release_bucket: If True, use release bucket 229 gs://chromeos-releases. 230 @param is_servohost: Bool whether the update target is a servohost. 231 @param public_bucket: True to copy payloads to a public throwaway GS 232 bucket. This avoids using a lab cache server, so local test runs 233 can provision without any special setup. 234 """ 235 self.update_url = update_url 236 self.host = host 237 self.interactive = interactive 238 self.update_version = _url_to_version(update_url) 239 self._is_release_bucket = is_release_bucket 240 self._is_servohost = is_servohost 241 self._public_bucket = public_bucket 242 243 def _run(self, cmd, *args, **kwargs): 244 """Abbreviated form of self.host.run(...)""" 245 return self.host.run(cmd, *args, **kwargs) 246 247 def _rootdev(self, options=''): 248 """Returns the stripped output of rootdev <options>. 249 250 @param options: options to run rootdev. 251 252 """ 253 return self._run('rootdev %s' % options).stdout.strip() 254 255 def _reset_update_engine(self): 256 """Resets the host to prepare for a clean update regardless of state.""" 257 self._run('stop ui || true') 258 self._run('stop update-engine || true; start update-engine') 259 260 def _reset_stateful_partition(self): 261 """Clear any pending stateful update request.""" 262 cmd = ['rm', '-rf'] 263 for f in ('var_new', 'dev_image_new', '.update_available'): 264 cmd += [os.path.join('/mnt/stateful_partition', f)] 265 # TODO(b/165024723): This is a temporary measure until we figure out the 266 # root cause of this bug. 267 for f in ('dev_image/share/tast/data', 'dev_image/libexec/tast', 268 'dev_image/tmp/tast'): 269 cmd += [os.path.join('/mnt/stateful_partition', f)] 270 cmd += [_TARGET_VERSION, '2>&1'] 271 self._run(cmd) 272 273 def _set_target_version(self): 274 """Set the "target version" for the update.""" 275 # Version strings that come from release buckets do not have RXX- at the 276 # beginning. So remove this prefix only if the version has it. 277 version_number = (self.update_version.split('-')[1] if 278 '-' in self.update_version else self.update_version) 279 self._run('echo %s > %s' % (version_number, _TARGET_VERSION)) 280 281 def _revert_boot_partition(self): 282 """Revert the boot partition.""" 283 part = self._rootdev('-s') 284 logging.warning('Reverting update; Boot partition will be %s', part) 285 return self._run('/postinst %s 2>&1' % part) 286 287 def _get_remote_script(self, script_name): 288 """Ensure that `script_name` is present on the DUT. 289 290 The given script (e.g. `quick-provision`) may be present in the 291 stateful partition under /usr/local/bin, or we may have to 292 download it from the devserver. 293 294 Determine whether the script is present or must be downloaded 295 and download if necessary. Then, return a command fragment 296 sufficient to run the script from whereever it now lives on the 297 DUT. 298 299 @param script_name The name of the script as expected in 300 /usr/local/bin and on the devserver. 301 @return A string with the command (minus arguments) that will 302 run the target script. 303 """ 304 remote_script = '/usr/local/bin/%s' % script_name 305 if self.host.path_exists(remote_script): 306 return remote_script 307 self.host.run('mkdir -p -m 1777 /usr/local/tmp') 308 remote_tmp_script = '/usr/local/tmp/%s' % script_name 309 server_name = six.moves.urllib.parse.urlparse(self.update_url)[1] 310 script_url = 'http://%s/static/%s' % (server_name, script_name) 311 fetch_script = 'curl -Ss -o %s %s && head -1 %s' % ( 312 remote_tmp_script, script_url, remote_tmp_script) 313 314 first_line = self._run(fetch_script).stdout.strip() 315 316 if first_line and first_line.startswith('#!'): 317 script_interpreter = first_line.lstrip('#!') 318 if script_interpreter: 319 return '%s %s' % (script_interpreter, remote_tmp_script) 320 return None 321 322 def _prepare_host(self): 323 """Make sure the target DUT is working and ready for update. 324 325 Initially, the target DUT's state is unknown. The DUT is 326 expected to be online, but we strive to be forgiving if Chrome 327 and/or the update engine aren't fully functional. 328 """ 329 # Summary of work, and the rationale: 330 # 1. Reboot, because it's a good way to clear out problems. 331 # 2. Touch the PROVISION_FAILED file, to allow repair to detect 332 # failure later. 333 # 3. Run the hook for host class specific preparation. 334 # 4. Stop Chrome, because the system is designed to eventually 335 # reboot if Chrome is stuck in a crash loop. 336 # 5. Force `update-engine` to start, because if Chrome failed 337 # to start properly, the status of the `update-engine` job 338 # will be uncertain. 339 if not self.host.is_up(): 340 raise HostUpdateError(self.host.hostname, HostUpdateError.DUT_DOWN) 341 self._reset_stateful_partition() 342 # Servohost reboot logic is handled by themselves. 343 if not self._is_servohost: 344 self.host.reboot(timeout=self.host.REBOOT_TIMEOUT) 345 self._run('touch %s' % PROVISION_FAILED) 346 self.host.prepare_for_update() 347 # Servohost will only update via quick provision. 348 if not self._is_servohost: 349 self._reset_update_engine() 350 logging.info('Updating from version %s to %s.', 351 self.host.get_release_version(), self.update_version) 352 353 def _quick_provision_with_gs_cache(self, provision_command, devserver_name, 354 image_name): 355 """Run quick_provision using GsCache server. 356 357 @param provision_command: The path of quick_provision command. 358 @param devserver_name: The devserver name and port (optional). 359 @param image_name: The image to be installed. 360 """ 361 logging.info('Try quick provision with gs_cache.') 362 # If enabled, GsCache server listion on different port on the 363 # devserver. 364 gs_cache_server = devserver_name.replace(DEVSERVER_PORT, GS_CACHE_PORT) 365 gs_cache_url = ( 366 'http://%s/download/%s' % 367 (gs_cache_server, 'chromeos-releases' 368 if self._is_release_bucket else 'chromeos-image-archive')) 369 370 # Check if GS_Cache server is enabled on the server. 371 self._run('curl -s -o /dev/null %s' % gs_cache_url) 372 373 command = '%s --noreboot %s %s' % (provision_command, image_name, 374 gs_cache_url) 375 self._run(command) 376 metrics.Counter( 377 _metric_name('quick_provision')).increment(fields={ 378 'devserver': devserver_name, 379 'gs_cache': True 380 }) 381 382 def _quick_provision_with_devserver(self, provision_command, 383 devserver_name, image_name): 384 """Run quick_provision using legacy devserver. 385 386 @param provision_command: The path of quick_provision command. 387 @param devserver_name: The devserver name and port (optional). 388 @param image_name: The image to be installed. 389 """ 390 logging.info('Try quick provision with devserver.') 391 ds = dev_server.ImageServer('http://%s' % devserver_name) 392 archive_url = ('gs://chromeos-releases/%s' % 393 image_name if self._is_release_bucket else None) 394 try: 395 ds.stage_artifacts( 396 image_name, 397 ['quick_provision', 'stateful', 'autotest_packages'], 398 archive_url=archive_url) 399 except dev_server.DevServerException as e: 400 six.reraise(error.TestFail, str(e), sys.exc_info()[2]) 401 402 static_url = 'http://%s/static' % devserver_name 403 command = '%s --noreboot %s %s' % (provision_command, image_name, 404 static_url) 405 self._run(command) 406 metrics.Counter( 407 _metric_name('quick_provision')).increment(fields={ 408 'devserver': devserver_name, 409 'gs_cache': False 410 }) 411 412 def _quick_provision_with_public_bucket(self, provision_command, 413 image_name): 414 """Run quick_provision using public GS bucket. 415 416 @param provision_command: The path of quick_provision command. 417 @param image_name: The image to be installed. 418 """ 419 logging.info('Try quick provision with public bucket.') 420 421 bucket_url = self.update_url[:self.update_url.find(image_name) - 1] 422 command = '%s --noreboot %s %s' % (provision_command, image_name, 423 bucket_url) 424 self._run(command) 425 426 def _install_update(self): 427 """Install an updating using the `quick-provision` script. 428 429 This uses the `quick-provision` script to download and install 430 a root FS, kernel and stateful filesystem content. 431 432 @return The kernel expected to be booted next. 433 """ 434 logging.info('Installing image at %s onto %s', self.update_url, 435 self.host.hostname) 436 server_name = six.moves.urllib.parse.urlparse(self.update_url)[1] 437 if self._public_bucket: 438 image_name = self.update_url.partition('provision/')[2] 439 else: 440 image_name = url_to_image_name(self.update_url) 441 442 logging.info('Installing image using quick-provision.') 443 provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT) 444 try: 445 if self._public_bucket: 446 self._quick_provision_with_public_bucket( 447 provision_command, image_name) 448 else: 449 try: 450 self._quick_provision_with_gs_cache( 451 provision_command, server_name, image_name) 452 except Exception as e: 453 logging.error( 454 'Failed to quick-provision with gscache with ' 455 'error %s', e) 456 self._quick_provision_with_devserver( 457 provision_command, server_name, image_name) 458 459 self._set_target_version() 460 return kernel_utils.verify_kernel_state_after_update(self.host) 461 except Exception: 462 # N.B. We handle only `Exception` here. Non-Exception 463 # classes (such as KeyboardInterrupt) are handled by our 464 # caller. 465 logging.exception('quick-provision script failed;') 466 self._revert_boot_partition() 467 self._reset_stateful_partition() 468 self._reset_update_engine() 469 return None 470 471 def _complete_update(self, expected_kernel): 472 """Finish the update, and confirm that it succeeded. 473 474 Initial condition is that the target build has been downloaded 475 and installed on the DUT, but has not yet been booted. This 476 function is responsible for rebooting the DUT, and checking that 477 the new build is running successfully. 478 479 @param expected_kernel: kernel expected to be active after reboot. 480 """ 481 # Regarding the 'crossystem' command below: In some cases, 482 # the update flow puts the TPM into a state such that it 483 # fails verification. We don't know why. However, this 484 # call papers over the problem by clearing the TPM during 485 # the reboot. 486 # 487 # We ignore failures from 'crossystem'. Although failure 488 # here is unexpected, and could signal a bug, the point of 489 # the exercise is to paper over problems; allowing this to 490 # fail would defeat the purpose. 491 self._run('crossystem clear_tpm_owner_request=1', ignore_status=True) 492 self.host.reboot(timeout=self.host.REBOOT_TIMEOUT) 493 494 # Touch the lab machine file to leave a marker that 495 # distinguishes this image from other test images. 496 # Afterwards, we must re-run the autoreboot script because 497 # it depends on the LAB_MACHINE_FILE. 498 autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || ' 499 '( touch "$FILE" ; start autoreboot )') 500 self._run(autoreboot_cmd % LAB_MACHINE_FILE) 501 try: 502 kernel_utils.verify_boot_expectations( 503 expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE, 504 self.host) 505 except Exception: 506 # When the system is rolled back, the provision_failed file is 507 # removed. So add it back here and re-raise the exception. 508 self._run('touch %s' % PROVISION_FAILED) 509 raise 510 511 logging.debug('Cleaning up old autotest directories.') 512 try: 513 installed_autodir = autotest.Autotest.get_installed_autodir( 514 self.host) 515 self._run('rm -rf ' + installed_autodir) 516 except autotest.AutodirNotFoundError: 517 logging.debug('No autotest installed directory found.') 518 519 def run_provision(self): 520 """Perform a full provision of a DUT in the test lab. 521 522 This downloads and installs the root FS and stateful partition 523 content needed for the update specified in `self.host` and 524 `self.update_url`. The provision is performed according to the 525 requirements for provisioning a DUT for testing the requested 526 build. 527 528 At the end of the procedure, metrics are reported describing the 529 outcome of the operation. 530 531 @returns A tuple of the form `(image_name, attributes)`, where 532 `image_name` is the name of the image installed, and 533 `attributes` is new attributes to be applied to the DUT. 534 """ 535 server_name = "" 536 if not self._public_bucket: 537 server_name = dev_server.get_resolved_hostname(self.update_url) 538 metrics.Counter(_metric_name('install')).increment( 539 fields={'devserver': server_name}) 540 541 try: 542 self._prepare_host() 543 except _AttributedUpdateError: 544 raise 545 except Exception as e: 546 logging.exception('Failure preparing host prior to update.') 547 raise HostUpdateError(self.host.hostname, str(e)) 548 549 try: 550 expected_kernel = self._install_update() 551 except _AttributedUpdateError: 552 raise 553 except Exception as e: 554 logging.exception('Failure during download and install.') 555 raise ImageInstallError(self.host.hostname, server_name, str(e)) 556 557 # Servohost will handle post update process themselves. 558 if not self._is_servohost: 559 try: 560 self._complete_update(expected_kernel) 561 except _AttributedUpdateError: 562 raise 563 except Exception as e: 564 logging.exception('Failure from build after update.') 565 raise NewBuildUpdateError(self.update_version, str(e)) 566 567 image_name = url_to_image_name(self.update_url) 568 # update_url is different from devserver url needed to stage autotest 569 # packages, therefore, resolve a new devserver url here. 570 devserver_url = dev_server.ImageServer.resolve( 571 image_name, self.host.hostname).url() 572 repo_url = tools.get_package_url(devserver_url, image_name) 573 return image_name, {ds_constants.JOB_REPO_URL: repo_url} 574