1# Copyright 2021 gRPC authors.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Clean up resources created by the tests.
15
16This is intended as a tool to delete leaked resources from old tests.
17
18Typical usage examples:
19
20python3 tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py\
21    --project=grpc-testing\
22    --network=default-vpc\
    --kube_context=gke_grpc-testing_us-central1-a_psm-interop-security\
24    --resource_prefix='required-but-does-not-matter'\
25    --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
26"""
import datetime
import functools
import json
import logging
import os
import re
import subprocess
import tempfile
from typing import Any, List

from absl import app
from absl import flags
import dateutil
import dateutil.parser

from framework import xds_flags
from framework import xds_k8s_flags
from framework.infrastructure import gcp
from framework.infrastructure import k8s
from framework.infrastructure import traffic_director
from framework.test_app.runners.k8s import k8s_xds_client_runner
from framework.test_app.runners.k8s import k8s_xds_server_runner
47
# Module-level logger. Note that several functions in this file log through
# the root `logging` module instead of this logger.
logger = logging.getLogger(__name__)
# Alias for JSON-decoded values, e.g. what exec_gcloud() returns from
# json.loads().
Json = Any
# Short aliases for the Kubernetes client/server runner classes.
_KubernetesClientRunner = k8s_xds_client_runner.KubernetesClientRunner
_KubernetesServerRunner = k8s_xds_server_runner.KubernetesServerRunner

# gcloud executable to invoke; overridable through the GCLOUD env variable.
GCLOUD = os.environ.get('GCLOUD', 'gcloud')
# Timeout, in seconds, passed to proc.wait() in exec_gcloud().
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
# Zones passed as `--zone` when deleting the managed instance groups in
# remove_relative_resources_run_xds_tests().
ZONE = 'us-central1-a'
SECONDARY_ZONE = 'us-west1-b'

# Default resource-name prefixes used by the *_prefixes flags below.
PSM_SECURITY_PREFIX = 'psm-interop'  # Prefix for gke resources to delete.
URL_MAP_TEST_PREFIX = 'interop-psm-url-map'  # Prefix for url-map test resources to delete.
60
# --keep_hours: resources created at or before (now - keep_hours) are treated
# as expired; see get_expire_timestamp().
KEEP_PERIOD_HOURS = flags.DEFINE_integer(
    "keep_hours",
    default=168,
    help=
    "number of hours for a resource to keep. Resources older than this will be deleted. Default is 168 (7 days)"
)
# --dry_run: when set, candidate resources are only logged, never deleted.
DRY_RUN = flags.DEFINE_bool(
    "dry_run",
    default=False,
    help="dry run, print resources but do not perform deletion")
# --td_resource_prefixes: each prefix yields a '<prefix>-health-check-(.*)'
# rule in main().
TD_RESOURCE_PREFIXES = flags.DEFINE_list(
    "td_resource_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked TD resources will be deleted",
)
# --server_prefixes: each prefix yields a '<prefix>-server-(.*)' namespace rule
# in find_and_remove_leaked_k8s_resources().
SERVER_PREFIXES = flags.DEFINE_list(
    "server_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked servers will be deleted",
)
# --client_prefixes: each prefix yields a '<prefix>-client-(.*)' namespace rule
# in find_and_remove_leaked_k8s_resources().
CLIENT_PREFIXES = flags.DEFINE_list(
    "client_prefixes",
    default=[PSM_SECURITY_PREFIX, URL_MAP_TEST_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked clients will be deleted",
)
89
90
def load_keep_config() -> None:
    """Load the resource keep-list into the module-level ``KEEP_CONFIG``.

    Reads ``keep_xds_interop_resources.json`` from the directory that contains
    this file and stores the parsed JSON in the global ``KEEP_CONFIG``, which
    ``is_marked_as_keep_gce()`` and ``is_marked_as_keep_gke()`` consult.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    global KEEP_CONFIG
    json_path = os.path.realpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'keep_xds_interop_resources.json'))
    # JSON text is UTF-8; be explicit so parsing does not depend on the
    # locale's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        KEEP_CONFIG = json.load(f)
    logging.debug('Resource keep config loaded: %s',
                  json.dumps(KEEP_CONFIG, indent=2))
100
101
def is_marked_as_keep_gce(suffix: str) -> bool:
    """Return whether `suffix` is listed under gce_framework.suffix in KEEP_CONFIG.

    KEEP_CONFIG must have been populated (see load_keep_config()) beforehand.
    """
    kept_suffixes = KEEP_CONFIG["gce_framework"]["suffix"]
    return suffix in kept_suffixes
104
105
def is_marked_as_keep_gke(suffix: str) -> bool:
    """Return whether `suffix` is listed under gke_framework.suffix in KEEP_CONFIG.

    KEEP_CONFIG must have been populated (see load_keep_config()) beforehand.
    """
    kept_suffixes = KEEP_CONFIG["gke_framework"]["suffix"]
    return suffix in kept_suffixes
108
109
@functools.lru_cache()
def get_expire_timestamp() -> datetime.datetime:
    """Return the UTC cutoff time: now minus the --keep_hours flag value.

    The result is memoized by lru_cache, so every caller in the same process
    sees the cutoff computed on the first call.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    keep_period = datetime.timedelta(hours=KEEP_PERIOD_HOURS.value)
    return now_utc - keep_period
114
115
def exec_gcloud(project: str, *cmds: str) -> Json:
    """Run a gcloud command and return its JSON-decoded stdout.

    The command line is `GCLOUD --project <project> --quiet <cmds...>`. When
    one of the arguments is exactly 'list', `--format json` and a
    `creationTimestamp <= <expire timestamp>` filter are appended so that only
    expired resources are returned.

    Args:
        project: GCP project id passed to `--project`.
        *cmds: The gcloud sub-command and its arguments, one string each.

    Returns:
        The parsed JSON written to stdout, or None when the command times out,
        exits with a non-zero code, or prints nothing to stdout.

    Raises:
        json.JSONDecodeError: if stdout is non-empty but not valid JSON.
    """
    args = [GCLOUD, '--project', project, '--quiet'] + list(cmds)
    if 'list' in args:
        # Add arguments to shape the list output
        args.extend([
            '--format', 'json', '--filter',
            f'creationTimestamp <= {get_expire_timestamp().isoformat()}'
        ])
    cmd_str = " ".join(args)
    # Executing the gcloud command
    logging.debug('Executing: %s', cmd_str)
    # stderr is redirected to a temporary file instead of a second pipe:
    # reading two pipes one after the other can deadlock once the child fills
    # the pipe that is not currently being drained.
    with tempfile.TemporaryFile() as stderr_file:
        with subprocess.Popen(args, stdout=subprocess.PIPE,
                              stderr=stderr_file) as proc:
            # NOTE(lidiz) the gcloud subprocess won't return unless its output
            # is read
            stdout = proc.stdout.read()
            try:
                returncode = proc.wait(timeout=GCLOUD_CMD_TIMEOUT_S)
            except subprocess.TimeoutExpired:
                # Do not leave a stray gcloud process behind; this also lets
                # the Popen context manager exit without blocking.
                proc.kill()
                logging.error('> Timeout executing cmd [%s]', cmd_str)
                return None
        if returncode:
            stderr_file.seek(0)
            stderr = stderr_file.read().decode(errors='replace')
            logging.error(
                '> Failed to execute cmd [%s], returned %d, stderr: %s',
                cmd_str, returncode, stderr)
            return None
    return json.loads(stdout) if stdout else None
144
145
def remove_relative_resources_run_xds_tests(project: str, network: str,
                                            prefix: str, suffix: str):
    """Removing GCP resources created by run_xds_tests.py.

    Issues one `gcloud ... delete` per known resource name built from
    `suffix`, strictly in the order listed below. `network` and `prefix` are
    not used; they are accepted so this function matches the signature the
    resource rules call their remover with.
    """
    logging.info('----- Removing run_xds_tests.py resources with suffix [%s]',
                 suffix)
    no_flags = ()
    global_scope = ('--global',)
    primary_zone = ('--zone', ZONE)
    secondary_zone = ('--zone', SECONDARY_ZONE)
    backend_services = ('compute', 'backend-services')
    managed_groups = ('compute', 'instance-groups', 'managed')
    # Each entry: (gcloud command group, resource name stem, trailing flags).
    deletions = (
        (('compute', 'forwarding-rules'), 'test-forwarding-rule',
         global_scope),
        (('compute', 'target-http-proxies'), 'test-target-proxy', no_flags),
        (('alpha', 'compute', 'target-grpc-proxies'), 'test-target-proxy',
         no_flags),
        (('compute', 'url-maps'), 'test-map', no_flags),
        (backend_services, 'test-backend-service', global_scope),
        (backend_services, 'test-backend-service-alternate', global_scope),
        (backend_services, 'test-backend-service-extra', global_scope),
        (backend_services, 'test-backend-service-more-extra', global_scope),
        (('compute', 'firewall-rules'), 'test-fw-rule', no_flags),
        (('compute', 'health-checks'), 'test-hc', no_flags),
        (managed_groups, 'test-ig', primary_zone),
        (managed_groups, 'test-ig-same-zone', primary_zone),
        (managed_groups, 'test-ig-secondary-zone', secondary_zone),
        (('compute', 'instance-templates'), 'test-template', no_flags),
    )
    for command_group, name_stem, trailing_flags in deletions:
        exec_gcloud(project, *command_group, 'delete', f'{name_stem}{suffix}',
                    *trailing_flags)
178
179
# cleanup_td_for_gke creates TrafficDirectorManager (and its variants for
# security and AppNet), and then calls their cleanup() methods.
#
# Note that the variants are all based on the basic TrafficDirectorManager, so
# their `cleanup()` might do duplicate work. But deleting a non-existent
# resource returns 404, and is OK.
def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
    """Force-clean Traffic Director resources for one prefix/suffix pair.

    Builds a plain and a secure Traffic Director manager sharing one
    GcpApiManager, then calls cleanup(force=True) on the secure manager first
    and the plain manager second.
    """
    api_manager = gcp.api.GcpApiManager()
    manager_kwargs = {
        'project': project,
        'network': network,
        'resource_prefix': resource_prefix,
        'resource_suffix': resource_suffix,
    }
    plain_td = traffic_director.TrafficDirectorManager(api_manager,
                                                       **manager_kwargs)
    security_td = traffic_director.TrafficDirectorSecureManager(
        api_manager, **manager_kwargs)
    # TODO: cleanup appnet resources as well, using
    # traffic_director.TrafficDirectorAppNetManager with the same arguments.

    logger.info('----- Removing traffic director for gke, prefix %s, suffix %s',
                resource_prefix, resource_suffix)
    for td_manager in (security_td, plain_td):
        td_manager.cleanup(force=True)
214
215# cleanup_client creates a client runner, and calls its cleanup() method.
def cleanup_client(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Force-clean the test client living in the namespace for prefix/suffix.

    Constructs a KubernetesClientRunner bound to the namespace named by
    make_namespace_name(resource_prefix, resource_suffix), configured from the
    xds flags, and calls cleanup(force=True, force_namespace=True) on it.
    """
    client_runner_args = {
        'deployment_name': xds_flags.CLIENT_NAME.value,
        'image_name': xds_k8s_flags.CLIENT_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'xds_server_uri': xds_flags.XDS_SERVER_URI.value,
        'network': network,
        'stats_port': xds_flags.CLIENT_PORT.value,
    }

    namespace_name = _KubernetesClientRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    k8s_namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    runner = _KubernetesClientRunner(k8s_namespace, **client_runner_args)

    logger.info('Cleanup client')
    runner.cleanup(force=True, force_namespace=True)
237
238
239# cleanup_server creates a server runner, and calls its cleanup() method.
def cleanup_server(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Force-clean the test server living in the namespace for prefix/suffix.

    Constructs a KubernetesServerRunner bound to the namespace named by
    make_namespace_name(resource_prefix, resource_suffix), configured from the
    xds flags, and calls cleanup(force=True, force_namespace=True) on it.
    """
    server_runner_args = {
        'deployment_name': xds_flags.SERVER_NAME.value,
        'image_name': xds_k8s_flags.SERVER_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'network': network,
    }

    namespace_name = _KubernetesServerRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    k8s_namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    runner = _KubernetesServerRunner(k8s_namespace, **server_runner_args)

    logger.info('Cleanup server')
    runner.cleanup(force=True, force_namespace=True)
259
260
def delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               resources):
    """Delete the leaked resources that match one of the given rules.

    Args:
        dry_run: When true, only log each resource; nothing is deleted.
        td_resource_rules: Iterable of `(regex, resource_prefix, keep, remove)`
            tuples. `regex` is searched in the resource name and its first
            group is the resource suffix; `keep(suffix)` returning true skips
            the resource; otherwise
            `remove(project, network, resource_prefix, suffix)` is called.
            Only the first matching rule is applied.
        project: GCP project id forwarded to `remove`.
        network: Network name forwarded to `remove`.
        resources: Iterable of dicts with a 'name' key, or None. None (which
            exec_gcloud() returns on failure or empty output) is treated as
            "nothing to delete".
    """
    for resource in resources or ():
        name = resource['name']
        logger.info('-----')
        logger.info('----- Cleaning up resource %s', name)
        if dry_run:
            # Skip deletion for dry-runs
            logger.info('----- Skipped [Dry Run]: %s', name)
            continue
        for regex, resource_prefix, keep, remove in td_resource_rules:
            result = re.search(regex, name)
            if result is None:
                continue
            suffix = result.group(1)
            if keep(suffix):
                logger.info('Skipped [keep]:')
            else:
                remove(project, network, resource_prefix, suffix)
            # Only the first matching rule applies.
            break
        else:
            # No rule matched this resource name.
            logger.info(
                '----- Skipped [does not matching resource name templates]')
283
284
def delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account, namespaces):
    """Clean up expired Kubernetes namespaces that match one of the rules.

    A namespace is handled only when its creation timestamp is at or before
    get_expire_timestamp(). Each rule is a `(regex, resource_prefix, remove)`
    tuple; for the first rule whose regex is found in the namespace name,
    `remove` is called with the regex's first group as the resource suffix.
    With `dry_run` set, expired namespaces are only logged.
    """
    for namespace in namespaces:
        ns_name = namespace.metadata.name
        logger.info('-----')
        logger.info('----- Cleaning up k8s namespaces %s', ns_name)

        is_expired = (namespace.metadata.creation_timestamp <=
                      get_expire_timestamp())
        if not is_expired:
            logging.info('----- Skipped [resource is within expiry date]')
            continue
        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', ns_name)
            continue

        for regex, resource_prefix, remove in k8s_resource_rules:
            result = re.search(regex, ns_name)
            if result is None:
                continue
            remove(project, network, k8s_api_manager, resource_prefix,
                   result.group(1), gcp_service_account)
            # Only the first matching rule applies.
            break
        else:
            # No rule matched this namespace name.
            logging.info(
                '----- Skipped [does not matching resource name templates]')
309
310
def find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account):
    """List all Kubernetes namespaces and clean up the leaked test ones.

    Builds one rule per --client_prefixes / --server_prefixes entry, lists the
    namespaces of the cluster selected by the kube-context flag, and passes
    both to delete_k8s_resources().
    """
    # Each rule is a tuple of, in order:
    # - regex to match
    # - prefix of the resources
    # - function to delete the resource
    client_rules = [(f'{prefix}-client-(.*)', prefix, cleanup_client)
                    for prefix in CLIENT_PREFIXES.value]
    server_rules = [(f'{prefix}-server-(.*)', prefix, cleanup_server)
                    for prefix in SERVER_PREFIXES.value]
    k8s_resource_rules = client_rules + server_rules

    # Delete leaked k8s namespaces, those usually mean there are leaked testing
    # client/servers from the gke framework.
    k8s_api_manager = k8s.KubernetesApiManager(xds_k8s_flags.KUBE_CONTEXT.value)
    namespace_list = k8s_api_manager.core.list_namespace()
    delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account,
                         namespace_list.items)
332
333
def main(argv):
    """Delete leaked xDS interop test resources older than the keep period.

    Steps, in order:
      1. Load the keep-list and read project, network, service account and
         dry-run settings from flags.
      2. List health checks through the Compute API, select those created at
         or before get_expire_timestamp(), and pass them to
         delete_leaked_td_resources().
      3. List expired instance templates through gcloud and pass them to
         delete_leaked_td_resources() as well.
      4. Clean up leaked Kubernetes namespaces via
         find_and_remove_leaked_k8s_resources().

    Args:
        argv: Command-line arguments handed over by app.run(); more than one
            element is rejected.

    Raises:
        app.UsageError: if `argv` has more than one element.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    load_keep_config()

    # Must be called before KubernetesApiManager or GcpApiManager init.
    xds_flags.set_socket_default_timeout_from_flag()

    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = DRY_RUN.value

    td_resource_rules = [
        # items in each tuple, in order
        # - regex to match
        # - prefix of the resource (only used by gke resources)
        # - function to check if the resource should be kept
        # - function to delete the resource
        (r'test-hc(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
        (r'test-template(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
    ]
    for prefix in TD_RESOURCE_PREFIXES.value:
        td_resource_rules.append((f'{prefix}-health-check-(.*)', prefix,
                                  is_marked_as_keep_gke, cleanup_td_for_gke),)

    # List resources older than KEEP_PERIOD. We only list health-checks and
    # instance templates because these are leaves in the resource dependency
    # tree.
    #
    # E.g. forwarding-rule depends on the target-proxy. So leaked
    # forwarding-rule indicates there's a leaked target-proxy (because this
    # target proxy cannot be deleted unless the forwarding rule is deleted).
    # The leaked target-proxy is guaranteed to be a super set of leaked
    # forwarding-rule.
    compute = gcp.compute.ComputeV1(gcp.api.GcpApiManager(), project)
    # NOTE(review): assumes list_health_check() returns the raw list response
    # dict, which may omit 'items' when there are no health checks; confirm
    # against ComputeV1.
    health_checks = compute.list_health_check().get('items', [])
    expire_timestamp = get_expire_timestamp()
    leaked_health_checks = [
        item for item in health_checks if dateutil.parser.isoparse(
            item['creationTimestamp']) <= expire_timestamp
    ]

    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_health_checks)

    # Delete leaked instance templates, those usually mean there are leaked VMs
    # from the gce framework. Also note that this is only needed for the gce
    # resources.
    #
    # exec_gcloud() returns None when the command fails or prints nothing;
    # treat that as "no leaked templates" instead of iterating over None.
    leaked_instance_templates = exec_gcloud(project, 'compute',
                                            'instance-templates', 'list') or []
    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_instance_templates)

    find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account)
390
391
392if __name__ == '__main__':
393    app.run(main)
394