xref: /aosp_15_r20/external/autotest/server/cros/chaos_lib/chaos_runner.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Lint as: python2, python3
2# Copyright 2016 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import contextlib
7import datetime
8import logging
9import pprint
10import time
11
12import common
13from autotest_lib.client.common_lib import error
14from autotest_lib.client.common_lib import utils as client_utils
15from autotest_lib.client.common_lib.cros.network import ap_constants
16from autotest_lib.client.common_lib.cros.network import iw_runner
17from autotest_lib.server import hosts
18from autotest_lib.server import site_linux_system
19from autotest_lib.server.cros import host_lock_manager
20from autotest_lib.server.cros.ap_configurators import ap_batch_locker
21from autotest_lib.server.cros.ap_configurators \
22        import ap_configurator_factory
23from autotest_lib.server.cros.network import chaos_clique_utils as utils
24from autotest_lib.server.cros.network import wifi_client
25
# Host through which the webdriver VMs are powered on and off.
# TODO b:169251326 the hostname below is assigned outside of this codebase;
# rename it once that is possible ("master" -> "main"). # nocheck
MAINNAME = 'chromeos3-chaosvmmaster.cros.corp.google.com'  # nocheck
# Port used for the webdriver connection and for the SSH tunnel to it.
WEBDRIVER_PORT = 9515
31
32
class ChaosRunner(object):
    """Object to run a network_WiFi_ChaosXXX test."""


    def __init__(self, test, host, spec, broken_pdus=None):
        """Stores the test configuration and logs server and DUT times.

        @param test: a string, test name.
        @param host: an Autotest host object, device under test.
        @param spec: an APSpec object.
        @param broken_pdus: list of offline PDUs, or None to start with a
                            new empty list.

        """
        self._test = test
        self._host = host
        self._ap_spec = spec
        # A list() default is evaluated once and would be shared by every
        # runner created without the argument, so build one per instance.
        # A caller-supplied list is stored as-is (not copied).
        self._broken_pdus = [] if broken_pdus is None else broken_pdus
        # Log server and DUT times
        dt = datetime.datetime.now()
        logging.info('Server time: %s', dt.strftime('%a %b %d %H:%M:%S %Y'))
        logging.info('DUT time: %s', self._host.run('date').stdout.strip())


    def _verify_scan_environment(self, capture_host):
        """Scans from the packet capturer and aborts on a bad environment.

        @param capture_host: host object of the packet capturer.

        @raises TestError: no scan returned results within
                ap_constants.MAX_SCAN_TIMEOUT seconds, or a scan found
                ap_constants.MAX_SSID_COUNT or more networks.

        """
        iw_command = iw_runner.IwRunner(capture_host)
        start_time = time.time()
        logging.info('Performing a scan with a max timeout of 30 seconds.')
        capture_interface = 'wlan0'
        capturer_info = capture_host.run('cat /etc/lsb-release',
                                         ignore_status=True, timeout=5).stdout
        if 'whirlwind' in capturer_info:
            # Use the dual band aux radio for scanning networks.
            capture_interface = 'wlan2'
        while time.time() - start_time <= ap_constants.MAX_SCAN_TIMEOUT:
            networks = iw_command.scan(capture_interface)
            if networks is None:
                # The scan failed; keep retrying until the timeout expires.
                continue
            if len(networks) >= ap_constants.MAX_SSID_COUNT:
                raise error.TestError(
                    'Probably someone is already running a '
                    'chaos test?!')
            break
        else:
            # Only reached when the loop timed out without a usable scan.
            # (An exact float comparison against the timeout would almost
            # never match, so the timeout is detected here instead.)
            raise error.TestError(
                'Packet capturer is not responding to scans. Check '
                'device and re-run test')


    def _restart_webdriver_vm(self, webdriver_main, webdriver_instance):
        """Powers the webdriver VM off if it is running, then powers it on.

        @param webdriver_main: host object used to control the VMs.
        @param webdriver_instance: host object of the locked webdriver VM.

        """
        if utils.is_VM_running(webdriver_main, webdriver_instance):
            logging.info('VM %s was on; powering off for a clean instance',
                         webdriver_instance)
            utils.power_off_VM(webdriver_main, webdriver_instance)
            logging.info('Allow VM time to gracefully shut down')
            time.sleep(5)

        logging.info('Starting up VM %s', webdriver_instance)
        utils.power_on_VM(webdriver_main, webdriver_instance)
        logging.info('Allow VM time to power on before creating a tunnel.')
        time.sleep(30)


    def _connect_webdriver(self, webdriver_instance):
        """Points the AP spec at the webdriver, tunnelling when in the lab.

        @param webdriver_instance: host object of the running webdriver VM.

        @returns the SSH tunnel object when one was created, else None.

        @raises TestError: the check on WEBDRIVER_PORT failed after the
                tunnel was created.

        """
        if not client_utils.host_is_in_lab_zone(webdriver_instance.hostname):
            self._ap_spec._webdriver_hostname = webdriver_instance.hostname
            return None

        # If in the lab then port forwarding must be done so webdriver
        # connection will be over localhost.
        self._ap_spec._webdriver_hostname = 'localhost'
        webdriver_tunnel = webdriver_instance.create_ssh_tunnel(
                WEBDRIVER_PORT, WEBDRIVER_PORT)
        logging.info('Wait for tunnel to be created.')
        # NOTE(review): this tests the truthiness of the object returned by
        # client_utils.run(), not its exit status or output; if that object
        # is always truthy the loop stops after the first attempt. Confirm
        # and consider checking the command output instead.
        for _ in range(3):
            time.sleep(10)
            results = client_utils.run('lsof -i:%s' % WEBDRIVER_PORT,
                                       ignore_status=True)
            if results:
                break
        if not results:
            raise error.TestError(
                    'Unable to listen to WEBDRIVER_PORT: %s' % (results,))
        return webdriver_tunnel


    def run(self, job, batch_size=10, tries=10, capturer_hostname=None,
            conn_worker=None, work_client_hostname=None,
            disabled_sysinfo=False):
        """Executes Chaos test.

        Allocates a packet capturer and a webdriver VM, then repeatedly
        locks a batch of APs, configures them and runs self._test against
        each usable AP. Finally powers off the VM and turns off all routers.

        @param job: an Autotest job object.
        @param batch_size: an integer, max number of APs to lock in one batch.
        @param tries: an integer, number of iterations to run per AP.
        @param capturer_hostname: a string or None, hostname or IP of capturer.
        @param conn_worker: ConnectionWorkerAbstract or None, to run extra
                            work after successful connection.
        @param work_client_hostname: a string or None, hostname of work client
        @param disabled_sysinfo: a bool, disable collection of logs from DUT.


        @raises TestError: Issues locking VM webdriver instance, the packet
                capturer not answering scans, too many visible networks,
                or a failed webdriver tunnel check.
        """

        lock_manager = host_lock_manager.HostLockManager()
        # TODO b:169251326 terms below are set outside of this codebase and
        # should be updated when possible ("master" -> "main"). # nocheck
        webdriver_main = hosts.SSHHost(MAINNAME,
                                       user='chaosvmmaster')  # nocheck
        host_prefix = self._host.hostname.split('-')[0]
        with host_lock_manager.HostsLockedBy(lock_manager):
            capture_host = utils.allocate_packet_capturer(
                    lock_manager, hostname=capturer_hostname,
                    prefix=host_prefix)
            # Cleanup and reboot packet capturer before the test.
            utils.sanitize_client(capture_host)
            capturer = site_linux_system.LinuxSystem(capture_host, {},
                                                     'packet_capturer')

            # Run iw scan and abort if more than allowed number of APs are up.
            self._verify_scan_environment(capture_host)

            if conn_worker is not None:
                work_client_machine = utils.allocate_packet_capturer(
                        lock_manager, hostname=work_client_hostname)
                conn_worker.prepare_work_client(work_client_machine)

            # Lock VM. If on, power off; always power on. Then create a tunnel.
            webdriver_instance = utils.allocate_webdriver_instance(lock_manager)
            self._restart_webdriver_vm(webdriver_main, webdriver_instance)
            # None when no tunnel is needed, so the cleanup below can test it
            # safely on every path.
            webdriver_tunnel = self._connect_webdriver(webdriver_instance)

            batch_locker = ap_batch_locker.ApBatchLocker(
                    lock_manager, self._ap_spec,
                    ap_test_type=ap_constants.AP_TEST_TYPE_CHAOS)

            while batch_locker.has_more_aps():
                # Work around for CrOS devices only:crbug.com/358716
                utils.sanitize_client(self._host)
                healthy_dut = True

                with contextlib.closing(wifi_client.WiFiClient(
                    hosts.create_host(
                            {
                                    'hostname' : self._host.hostname,
                                    'afe_host' : self._host._afe_host,
                                    'host_info_store':
                                            self._host.host_info_store,
                            },
                            host_class=self._host.__class__,
                    ),
                    './debug',
                    False,
                )) as client:

                    aps = batch_locker.get_ap_batch(batch_size=batch_size)
                    if not aps:
                        logging.info('No more APs to test.')
                        break

                    # Power down all of the APs because some can get grumpy
                    # if they are configured several times and remain on.
                    # Use the cartridge to group power downs and
                    # configurations.
                    utils.power_down_aps(aps, self._broken_pdus)
                    utils.configure_aps(aps, self._ap_spec, self._broken_pdus)

                    aps = utils.filter_quarantined_and_config_failed_aps(aps,
                            batch_locker, job, self._broken_pdus)

                    for ap in aps:
                        # http://crbug.com/306687
                        if ap.ssid is None:
                            logging.error('The SSID was not set for the AP:%s',
                                          ap)

                        healthy_dut = utils.is_dut_healthy(client, ap)

                        if not healthy_dut:
                            logging.error('DUT is not healthy, rebooting.')
                            batch_locker.unlock_and_reclaim_aps()
                            break

                        networks = utils.return_available_networks(
                                ap, capturer, job, self._ap_spec)

                        if networks is None:
                            # If scan returned no networks, iw scan failed.
                            # Reboot the packet capturer device and
                            # reconfigure the capturer.
                            batch_locker.unlock_and_reclaim_ap(ap.host_name)
                            logging.error('Packet capture is not healthy, '
                                          'rebooting.')
                            capturer.host.reboot()
                            capturer = site_linux_system.LinuxSystem(
                                           capture_host, {},'packet_capturer')
                            continue
                        if not networks:
                            # Packet capturer did not find the SSID in scan or
                            # there was a security mismatch.
                            utils.release_ap(ap, batch_locker, self._broken_pdus)
                            continue

                        assoc_params = ap.get_association_parameters()

                        if not utils.is_conn_worker_healthy(
                                conn_worker, ap, assoc_params, job):
                            utils.release_ap(
                                    ap, batch_locker, self._broken_pdus)
                            continue

                        kernel_ver = self._host.get_kernel_ver()
                        firmware_ver = utils.get_firmware_ver(self._host)
                        if not firmware_ver:
                            firmware_ver = "Unknown"

                        debug_dict = {'+++PARSE DATA+++': '+++PARSE DATA+++',
                                      'SSID': ap._ssid,
                                      'DUT': client.wifi_mac,
                                      'AP Info': ap.name,
                                      'kernel_version': kernel_ver,
                                      'wifi_firmware_version': firmware_ver}
                        debug_string = pprint.pformat(debug_dict)

                        logging.info('Waiting %d seconds for the AP dhcp '
                                     'server', ap.dhcp_delay)
                        time.sleep(ap.dhcp_delay)

                        result = job.run_test(self._test,
                                     capturer=capturer,
                                     capturer_frequency=networks[0].frequency,
                                     capturer_ht_type=networks[0].width,
                                     host=self._host,
                                     assoc_params=assoc_params,
                                     client=client,
                                     tries=tries,
                                     debug_info=debug_string,
                                     # Copy all logs from the system
                                     disabled_sysinfo=disabled_sysinfo,
                                     conn_worker=conn_worker,
                                     tag=ap.ssid if conn_worker is None else
                                         '%s.%s' % (conn_worker.name, ap.ssid))

                        utils.release_ap(ap, batch_locker, self._broken_pdus)

                        if conn_worker is not None:
                            conn_worker.cleanup()

                    if not healthy_dut:
                        # The APs were already unlocked and reclaimed above;
                        # start the next batch after the DUT is sanitized.
                        continue

                batch_locker.unlock_aps()

            if webdriver_tunnel:
                webdriver_instance.disconnect_ssh_tunnel(webdriver_tunnel)
                webdriver_instance.close()
            capturer.close()
            logging.info('Powering off VM %s', webdriver_instance)
            utils.power_off_VM(webdriver_main, webdriver_instance)
            lock_manager.unlock(webdriver_instance.hostname)

            if self._broken_pdus:
                logging.info('PDU is down!!!\nThe following PDUs are down:\n')
                pprint.pprint(self._broken_pdus)

            factory = ap_configurator_factory.APConfiguratorFactory(
                    ap_constants.AP_TEST_TYPE_CHAOS)
            factory.turn_off_all_routers(self._broken_pdus)
291