# Copyright 2021 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Clean up resources created by the tests.

This is intended as a tool to delete leaked resources from old tests.

Typical usage examples:

python3 tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py\
    --project=grpc-testing\
    --network=default-vpc\
    --kube_context=gke_grpc-testing_us-central1-a_psm-interop-security\
    --resource_prefix='required-but-does-not-matter'\
    --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
"""
import datetime
import functools
import json
import logging
import os
import re
import subprocess
from typing import Any, List

from absl import app
from absl import flags
import dateutil
# The script calls dateutil.parser.isoparse(); import the submodule explicitly
# because a bare `import dateutil` is not guaranteed to load it on older
# python-dateutil releases.
import dateutil.parser

from framework import xds_flags
from framework import xds_k8s_flags
from framework.infrastructure import gcp
from framework.infrastructure import k8s
from framework.infrastructure import traffic_director
from framework.test_app.runners.k8s import k8s_xds_client_runner
from framework.test_app.runners.k8s import k8s_xds_server_runner

logger = logging.getLogger(__name__)
Json = Any
_KubernetesClientRunner = k8s_xds_client_runner.KubernetesClientRunner
_KubernetesServerRunner = k8s_xds_server_runner.KubernetesServerRunner

GCLOUD = os.environ.get('GCLOUD', 'gcloud')
# NOTE: kept for backward compatibility. exec_gcloud() does not enforce a
# run-time limit on gcloud (see its docstring).
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
ZONE = 'us-central1-a'
SECONDARY_ZONE = 'us-west1-b'

PSM_SECURITY_PREFIX = 'psm-interop'  # Prefix for gke resources to delete.
URL_MAP_TEST_PREFIX = 'interop-psm-url-map'  # Prefix for url-map test resources to delete.

KEEP_PERIOD_HOURS = flags.DEFINE_integer(
    "keep_hours",
    default=168,
    help=
    "number of hours for a resource to keep. Resources older than this will be deleted. Default is 168 (7 days)"
)
DRY_RUN = flags.DEFINE_bool(
    "dry_run",
    default=False,
    help="dry run, print resources but do not perform deletion")
TD_RESOURCE_PREFIXES = flags.DEFINE_list(
    "td_resource_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked TD resources will be deleted",
)
SERVER_PREFIXES = flags.DEFINE_list(
    "server_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked servers will be deleted",
)
CLIENT_PREFIXES = flags.DEFINE_list(
    "client_prefixes",
    default=[PSM_SECURITY_PREFIX, URL_MAP_TEST_PREFIX],
    help=
    "a comma-separated list of prefixes for which the leaked clients will be deleted",
)


def load_keep_config() -> None:
    """Load keep_xds_interop_resources.json (next to this file) into KEEP_CONFIG.

    Must be called before is_marked_as_keep_gce() / is_marked_as_keep_gke(),
    which read the KEEP_CONFIG global set here.
    """
    global KEEP_CONFIG
    json_path = os.path.realpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'keep_xds_interop_resources.json'))
    with open(json_path, 'r') as f:
        KEEP_CONFIG = json.load(f)
    logging.debug('Resource keep config loaded: %s',
                  json.dumps(KEEP_CONFIG, indent=2))


def is_marked_as_keep_gce(suffix: str) -> bool:
    """Return True if the suffix is listed under gce_framework in KEEP_CONFIG."""
    return suffix in KEEP_CONFIG["gce_framework"]["suffix"]


def is_marked_as_keep_gke(suffix: str) -> bool:
    """Return True if the suffix is listed under gke_framework in KEEP_CONFIG."""
    return suffix in KEEP_CONFIG["gke_framework"]["suffix"]


@functools.lru_cache()
def get_expire_timestamp() -> datetime.datetime:
    """Return the UTC cutoff: resources created at or before it are expired.

    The result is cached, so the cutoff is computed once (from the keep_hours
    flag at the time of the first call) and stays fixed for the whole run.
    """
    return datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
        hours=KEEP_PERIOD_HOURS.value)


def exec_gcloud(project: str, *cmds: str) -> Json:
    """Run a gcloud command and return its JSON-decoded stdout.

    No limit is placed on how long gcloud may run. This matches the previous
    effective behavior: the old code blocked on reading the output pipes until
    gcloud closed them, so its timeout never bounded the command itself.

    Args:
        project: GCP project, passed to gcloud via --project.
        *cmds: The gcloud sub-command and its arguments, one token each.

    Returns:
        The decoded JSON printed by gcloud, or None when the command exits
        with a non-zero status or prints nothing to stdout.
    """
    full_cmd = [GCLOUD, '--project', project, '--quiet'] + list(cmds)
    if 'list' in full_cmd:
        # Shape the list output: JSON, restricted to resources created at or
        # before the expiry cutoff.
        full_cmd.extend([
            '--format', 'json', '--filter',
            f'creationTimestamp <= {get_expire_timestamp().isoformat()}'
        ])
    logging.debug('Executing: %s', " ".join(full_cmd))
    # subprocess.run() drains stdout and stderr concurrently and reaps the
    # child. Reading one pipe to EOF before the other can deadlock once the
    # unread pipe's buffer fills up.
    proc = subprocess.run(full_cmd,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          check=False)
    if proc.returncode:
        logging.error('> Failed to execute cmd [%s], returned %d, stderr: %s',
                      " ".join(full_cmd), proc.returncode,
                      proc.stderr.decode(errors='replace'))
        return None
    if proc.stdout:
        return json.loads(proc.stdout)
    return None


def remove_relative_resources_run_xds_tests(project: str, network: str,
                                            prefix: str, suffix: str):
    """Removing GCP resources created by run_xds_tests.py.

    Each deletion is an independent, best-effort gcloud call; failures are
    logged by exec_gcloud() and do not stop the remaining deletions.

    Args:
        project: GCP project that owns the resources.
        network: Unused; present so all removal callbacks share one signature.
        prefix: Unused; present so all removal callbacks share one signature.
        suffix: Suffix appended to every run_xds_tests.py resource name.
    """
    logging.info('----- Removing run_xds_tests.py resources with suffix [%s]',
                 suffix)
    exec_gcloud(project, 'compute', 'forwarding-rules', 'delete',
                f'test-forwarding-rule{suffix}', '--global')
    exec_gcloud(project, 'compute', 'target-http-proxies', 'delete',
                f'test-target-proxy{suffix}')
    exec_gcloud(project, 'alpha', 'compute', 'target-grpc-proxies', 'delete',
                f'test-target-proxy{suffix}')
    exec_gcloud(project, 'compute', 'url-maps', 'delete', f'test-map{suffix}')
    exec_gcloud(project, 'compute', 'backend-services', 'delete',
                f'test-backend-service{suffix}', '--global')
    exec_gcloud(project, 'compute', 'backend-services', 'delete',
                f'test-backend-service-alternate{suffix}', '--global')
    exec_gcloud(project, 'compute', 'backend-services', 'delete',
                f'test-backend-service-extra{suffix}', '--global')
    exec_gcloud(project, 'compute', 'backend-services', 'delete',
                f'test-backend-service-more-extra{suffix}', '--global')
    exec_gcloud(project, 'compute', 'firewall-rules', 'delete',
                f'test-fw-rule{suffix}')
    exec_gcloud(project, 'compute', 'health-checks', 'delete',
                f'test-hc{suffix}')
    exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
                f'test-ig{suffix}', '--zone', ZONE)
    exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
                f'test-ig-same-zone{suffix}', '--zone', ZONE)
    exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
                f'test-ig-secondary-zone{suffix}', '--zone', SECONDARY_ZONE)
    exec_gcloud(project, 'compute', 'instance-templates', 'delete',
                f'test-template{suffix}')


# cleanup_td creates TrafficDirectorManager (and its variants for security and
# AppNet), and then calls the cleanup() methods.
#
# Note that the variants are all based on the basic TrafficDirectorManager, so
# their `cleanup()` might do duplicate work. But deleting a non-existent
# resource returns 404, and is OK.
def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
    """Force-clean the Traffic Director resources of one gke test run.

    Args:
        project: GCP project that owns the resources.
        network: Network name handed to the Traffic Director managers.
        resource_prefix: Prefix of the resource names to clean up.
        resource_suffix: Suffix of the resource names to clean up.
    """
    gcp_api_manager = gcp.api.GcpApiManager()
    plain_td = traffic_director.TrafficDirectorManager(
        gcp_api_manager,
        project=project,
        network=network,
        resource_prefix=resource_prefix,
        resource_suffix=resource_suffix)
    security_td = traffic_director.TrafficDirectorSecureManager(
        gcp_api_manager,
        project=project,
        network=network,
        resource_prefix=resource_prefix,
        resource_suffix=resource_suffix)
    # TODO: cleanup appnet resources.
    # appnet_td = traffic_director.TrafficDirectorAppNetManager(
    #     gcp_api_manager,
    #     project=project,
    #     network=network,
    #     resource_prefix=resource_prefix,
    #     resource_suffix=resource_suffix)

    logger.info('----- Removing traffic director for gke, prefix %s, suffix %s',
                resource_prefix, resource_suffix)
    # The security manager is cleaned up before the plain one.
    security_td.cleanup(force=True)
    # appnet_td.cleanup(force=True)
    plain_td.cleanup(force=True)


# cleanup_client creates a client runner, and calls its cleanup() method.
def cleanup_client(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Build a client runner for the leaked namespace and force its cleanup.

    Args:
        project: GCP project passed to the runner.
        network: Network name passed to the runner.
        k8s_api_manager: Kubernetes API manager used to reach the namespace.
        resource_prefix: Prefix used to derive the client namespace name.
        resource_suffix: Suffix used to derive the client namespace name.
        gcp_service_account: GCP service account passed to the runner.
    """
    runner_kwargs = dict(
        deployment_name=xds_flags.CLIENT_NAME.value,
        image_name=xds_k8s_flags.CLIENT_IMAGE.value,
        td_bootstrap_image=xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        gcp_project=project,
        gcp_api_manager=gcp.api.GcpApiManager(),
        gcp_service_account=gcp_service_account,
        xds_server_uri=xds_flags.XDS_SERVER_URI.value,
        network=network,
        stats_port=xds_flags.CLIENT_PORT.value)

    client_namespace = _KubernetesClientRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    client_runner = _KubernetesClientRunner(
        k8s.KubernetesNamespace(k8s_api_manager, client_namespace),
        **runner_kwargs)

    logger.info('Cleanup client')
    client_runner.cleanup(force=True, force_namespace=True)


# cleanup_server creates a server runner, and calls its cleanup() method.
def cleanup_server(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Build a server runner for the leaked namespace and force its cleanup.

    Args:
        project: GCP project passed to the runner.
        network: Network name passed to the runner.
        k8s_api_manager: Kubernetes API manager used to reach the namespace.
        resource_prefix: Prefix used to derive the server namespace name.
        resource_suffix: Suffix used to derive the server namespace name.
        gcp_service_account: GCP service account passed to the runner.
    """
    runner_kwargs = dict(
        deployment_name=xds_flags.SERVER_NAME.value,
        image_name=xds_k8s_flags.SERVER_IMAGE.value,
        td_bootstrap_image=xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        gcp_project=project,
        gcp_api_manager=gcp.api.GcpApiManager(),
        gcp_service_account=gcp_service_account,
        network=network)

    server_namespace = _KubernetesServerRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    server_runner = _KubernetesServerRunner(
        k8s.KubernetesNamespace(k8s_api_manager, server_namespace),
        **runner_kwargs)

    logger.info('Cleanup server')
    server_runner.cleanup(force=True, force_namespace=True)


def delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               resources):
    """Delete every listed resource whose name matches a rule and is not kept.

    Args:
        dry_run: When true, only log the resource names; delete nothing.
        td_resource_rules: Iterable of (regex, resource_prefix, keep, remove)
            tuples. The first rule whose regex matches the resource name wins;
            regex group 1 is the suffix handed to keep() and remove().
        project: GCP project, forwarded to remove().
        network: Network name, forwarded to remove().
        resources: Iterable of dicts with a 'name' key, or None. main() passes
            the result of exec_gcloud(), which is None when gcloud fails or
            prints nothing, so None is treated as "nothing to delete".
    """
    for resource in resources or ():
        logger.info('-----')
        logger.info('----- Cleaning up resource %s', resource['name'])
        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', resource['name'])
            continue
        for (regex, resource_prefix, keep, remove) in td_resource_rules:
            result = re.search(regex, resource['name'])
            if result is None:
                continue
            # First matching rule decides; later rules are not consulted.
            if keep(result.group(1)):
                logging.info('Skipped [keep]:')
            else:
                remove(project, network, resource_prefix, result.group(1))
            break
        else:
            # Reached only when no rule matched (the loop never hit `break`).
            logging.info(
                '----- Skipped [does not matching resource name templates]')


def delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account, namespaces):
    """Delete expired k8s namespaces whose name matches a cleanup rule.

    Args:
        dry_run: When true, only log the expired namespaces; delete nothing.
        k8s_resource_rules: Iterable of (regex, resource_prefix, remove)
            tuples. The first rule whose regex matches the namespace name
            wins; regex group 1 is the suffix handed to remove().
        project: GCP project, forwarded to remove().
        network: Network name, forwarded to remove().
        k8s_api_manager: Kubernetes API manager, forwarded to remove().
        gcp_service_account: GCP service account, forwarded to remove().
        namespaces: Iterable of namespace objects exposing metadata.name and
            metadata.creation_timestamp.
    """
    for ns in namespaces:
        logger.info('-----')
        logger.info('----- Cleaning up k8s namespaces %s', ns.metadata.name)
        if ns.metadata.creation_timestamp > get_expire_timestamp():
            logging.info('----- Skipped [resource is within expiry date]')
            continue
        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', ns.metadata.name)
            continue

        for (regex, resource_prefix, remove) in k8s_resource_rules:
            result = re.search(regex, ns.metadata.name)
            if result is None:
                continue
            remove(project, network, k8s_api_manager, resource_prefix,
                   result.group(1), gcp_service_account)
            break
        else:
            # Reached only when no rule matched (the loop never hit `break`).
            logging.info(
                '----- Skipped [does not matching resource name templates]')


def find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account):
    """List all k8s namespaces and clean up leaked test clients and servers.

    Args:
        dry_run: When true, only log what would be deleted.
        project: GCP project, forwarded to the cleanup callbacks.
        network: Network name, forwarded to the cleanup callbacks.
        gcp_service_account: GCP service account, forwarded to the callbacks.
    """
    k8s_resource_rules = [
        # items in each tuple, in order
        # - regex to match
        # - prefix of the resources
        # - function to delete the resource
    ]
    for prefix in CLIENT_PREFIXES.value:
        k8s_resource_rules.append(
            (f'{prefix}-client-(.*)', prefix, cleanup_client),)
    for prefix in SERVER_PREFIXES.value:
        k8s_resource_rules.append(
            (f'{prefix}-server-(.*)', prefix, cleanup_server),)

    # Delete leaked k8s namespaces, those usually mean there are leaked testing
    # client/servers from the gke framework.
    k8s_api_manager = k8s.KubernetesApiManager(xds_k8s_flags.KUBE_CONTEXT.value)
    nss = k8s_api_manager.core.list_namespace()
    delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account, nss.items)


def main(argv):
    """Entry point: delete leaked TD, GCE and k8s resources older than keep_hours.

    Args:
        argv: Positional command-line arguments left over after flag parsing;
            anything beyond the program name is rejected.

    Raises:
        app.UsageError: If extra positional arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Imported here so dateutil.parser is guaranteed to be loaded: a bare
    # `import dateutil` does not expose the submodule on older
    # python-dateutil releases.
    import dateutil.parser

    load_keep_config()

    # Must be called before KubernetesApiManager or GcpApiManager init.
    xds_flags.set_socket_default_timeout_from_flag()

    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = DRY_RUN.value

    td_resource_rules = [
        # items in each tuple, in order
        # - regex to match
        # - prefix of the resource (only used by gke resources)
        # - function to check if the resource should be kept
        # - function to delete the resource
        (r'test-hc(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
        (r'test-template(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
    ]
    for prefix in TD_RESOURCE_PREFIXES.value:
        td_resource_rules.append((f'{prefix}-health-check-(.*)', prefix,
                                  is_marked_as_keep_gke, cleanup_td_for_gke),)

    # List resources older than KEEP_PERIOD. We only list health-checks and
    # instance templates because these are leaves in the resource dependency tree.
    #
    # E.g. forwarding-rule depends on the target-proxy. So leaked
    # forwarding-rule indicates there's a leaked target-proxy (because this
    # target proxy cannot be deleted unless the forwarding rule is deleted).
    # The leaked target-proxy is guaranteed to be a super set of leaked
    # forwarding-rule.
    compute = gcp.compute.ComputeV1(gcp.api.GcpApiManager(), project)
    # NOTE(review): assumes list_health_check() returns a plain dict; .get()
    # guards against a response with no 'items' key. Confirm against ComputeV1.
    health_checks = compute.list_health_check().get('items', [])
    expire_timestamp = get_expire_timestamp()
    leaked_health_checks = [
        item for item in health_checks
        if dateutil.parser.isoparse(item['creationTimestamp']) <=
        expire_timestamp
    ]

    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_health_checks)

    # Delete leaked instance templates, those usually mean there are leaked VMs
    # from the gce framework. Also note that this is only needed for the gce
    # resources.
    #
    # exec_gcloud() returns None on failure or empty output;
    # delete_leaked_td_resources() treats that as an empty listing.
    leaked_instance_templates = exec_gcloud(project, 'compute',
                                            'instance-templates', 'list')
    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_instance_templates)

    find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account)


if __name__ == '__main__':
    app.run(main)