from __future__ import print_function

import collections
import logging
import os
import pipes
import random
import shutil
import time

import common
from autotest_lib.client.bin.result_tools import runner as result_tools_runner
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.cros import constants
from autotest_lib.server import utils

try:
    from autotest_lib.utils.frozen_chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB


class _RemoteTempDir(object):

    """Context manager for temporary directory on remote host."""

    def __init__(self, host):
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
                cls=type(self).__name__, this=self)

    def __enter__(self):
        self.tmpdir = (self.host
                       .run('mktemp -d', stdout_tee=None)
                       .stdout.strip())
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))


def _collect_log_file_with_summary(host, source_path, dest_path):
    """Collects a log file from the remote machine with directory summary.

    @param host: The RemoteHost to collect logs from.
    @param source_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copies logs into.
    """
    # Build test result directory summary
    summary_created = result_tools_runner.run_on_client(host, source_path)

    skip_summary_collection = True
    try:
        host.get_file(source_path, dest_path, preserve_perm=False)
        skip_summary_collection = False
    finally:
        if summary_created:
            # If dest_path is a file, use its parent folder to store the
            # directory summary file.
            if os.path.isfile(dest_path):
                dest_path = os.path.dirname(dest_path)
            # If dest_path doesn't exist, that means get_file failed, there is
            # no need to collect directory summary file.
            skip_summary_collection |= not os.path.exists(dest_path)
            result_tools_runner.collect_last_summary(
                    host, source_path, dest_path,
                    skip_summary_collection=skip_summary_collection)


def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
                     clean_content=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of the contents will not be
    checked.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copies logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove the remote log_path after the collection
                  attempt, even if it failed.
    @param clean_content: If True, remove the files and directories under the
                          remote log_path after the collection attempt, even
                          if it failed.

    """
    logging.info('Collecting %s...', log_path)
    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip collecting log '
                        'file %s.', host.hostname, log_path)
        return
    try:
        file_stats = _get_file_stats(host, log_path)
        if not file_stats:
            # Failed to get file stat, the file may not exist.
            return

        # When result throttling is enabled, the result tools handle size
        # limits themselves, so the probabilistic skip only applies when
        # throttling is off.
        if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
                random.random() > file_stats.collection_probability):
            logging.warning('Collection of %s skipped:'
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            _collect_log_file_with_summary(host, log_path, dest_path)
    except Exception as e:
        logging.exception('Non-critical failure: collection of %s failed: %s',
                          log_path, e)
    finally:
        # Cleanup targets the *remote* path, whether or not the download
        # above succeeded.
        if clean_content:
            path_to_delete = os.path.join(pipes.quote(log_path), '*')
        elif clean:
            path_to_delete = pipes.quote(log_path)
        if clean or clean_content:
            host.run('rm -rf %s' % path_to_delete, ignore_status=True)


# Size of a remote file plus the probability with which it should be
# collected (see _MAX_FILESIZE amortization comment above).
_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')


def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect log file from host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copies logs into.

    """
    with _RemoteTempDir(host) as tmpdir:
        host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
        source_path = os.path.join(tmpdir, os.path.basename(log_path))

        _collect_log_file_with_summary(host, source_path, dest_path)


def _get_file_stats(host, path):
    """Get the stats of a file from host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability,
              or None if the size reported by the host could not be parsed
              (e.g. the file does not exist).
    """
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    output = None
    file_size = 0
    try:
        output = host.run(cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Default its size to 0', path, host, e)
    try:
        if output is not None:
            file_size = int(output)
    except ValueError:
        logging.warning('Failed to convert size string "%s" for %s on host %r. '
                        'File may not exist.', output, path, host)
        return

    if file_size == 0:
        return _FileStats(0, 1.0)
    else:
        # Probability may exceed 1.0 for small files; random.random() < 1.0
        # always, so such files are always collected.
        collection_probability = _MAX_FILESIZE / float(file_size)
        return _FileStats(file_size, collection_probability)


# import any site hooks for the crashdump and crashinfo collection
get_site_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
        lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
        lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
        lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
        lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
        lambda host: None)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    """Collect crashdumps via the site hook, reporting collection duration.

    @param host: The RemoteHost to collect from.
    @param test_start_time: Time the test started.
    """
    get_site_crashdumps(host, test_start_time)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    """Collect crash information and assorted logs from the host.

    @param host: The RemoteHost to collect from.
    @param test_start_time: Time the test started.
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops. The filename has changed in linux-3.19,
        # so collect all the files in the pstore dirs.
        log_path = os.path.join(crashinfo_dir, 'pstore')
        for pstore_dir in constants.LOG_PSTORE_DIRS:
            collect_log_file(host, pstore_dir, log_path, use_tmp=True,
                             clean_content=True)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load default for number of hours to wait before giving up on crash collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
        'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
         .increment())
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Find and if necessary create a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/', local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # if the first lines of the messages at start should match the
            # first lines of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
        # Copy only the new portion of messages.raw into the final file;
        # context managers ensure both files are closed even if the copy
        # raises.
        with open(messages_raw) as raw_messages_file, \
             open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)