cleanup.py
Go to the documentation of this file.
1 # Copyright 2021 gRPC authors.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 """Clean up resources created by the tests.
15 
16 This is intended as a tool to delete leaked resources from old tests.
17 
18 Typical usage examples:
19 
20 python3 tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py\
21  --project=grpc-testing\
22  --network=default-vpc\
23  --kube_context=gke_grpc-testing_us-central1-a_interop-test-psm-sec-v2-us-central1-a\
24  --resource_prefix='required-but-does-not-matter'\
25  --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
26 """
27 import datetime
28 import functools
29 import json
30 import logging
31 import os
32 import re
33 import subprocess
34 from typing import Any, List
35 
36 from absl import app
37 from absl import flags
38 import dateutil
39 
40 from framework import xds_flags
41 from framework import xds_k8s_flags
42 from framework.infrastructure import gcp
43 from framework.infrastructure import k8s
44 from framework.infrastructure import traffic_director
45 from framework.test_app import client_app
46 from framework.test_app import server_app
47 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Alias for values decoded from JSON (for example parsed gcloud output).
Json = Any

# Short aliases for the test-app runner classes used by the cleanup helpers.
KubernetesClientRunner = client_app.KubernetesClientRunner
KubernetesServerRunner = server_app.KubernetesServerRunner

# gcloud executable to invoke; the GCLOUD environment variable overrides it.
GCLOUD = os.getenv('GCLOUD', 'gcloud')
# Timeout, in seconds, for waiting on a gcloud command.
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
# Zones passed to gcloud when deleting managed instance groups.
ZONE = 'us-central1-a'
SECONDARY_ZONE = 'us-west1-b'

# Prefix for gke resources to delete.
PSM_SECURITY_PREFIX = 'xds-k8s-security'
# Prefix for url-map test resources to delete.
URL_MAP_TEST_PREFIX = 'interop-psm-url-map'
60 
# Command-line flags specific to this cleanup tool.

# Age threshold: resources older than this many hours are deletion candidates.
KEEP_PERIOD_HOURS = flags.DEFINE_integer(
    "keep_hours",
    default=168,
    help=("number of hours for a resource to keep. "
          "Resources older than this will be deleted. "
          "Default is 168 (7 days)"),
)

# When set, resources are only logged and nothing is deleted.
DRY_RUN = flags.DEFINE_bool(
    "dry_run",
    default=False,
    help="dry run, print resources but do not perform deletion",
)

# Name prefixes used to build the Traffic Director health-check rules.
TD_RESOURCE_PREFIXES = flags.DEFINE_list(
    "td_resource_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=("a comma-separated list of prefixes for which "
          "the leaked TD resources will be deleted"),
)

# Name prefixes used to build the server namespace rules.
SERVER_PREFIXES = flags.DEFINE_list(
    "server_prefixes",
    default=[PSM_SECURITY_PREFIX],
    help=("a comma-separated list of prefixes for which "
          "the leaked servers will be deleted"),
)

# Name prefixes used to build the client namespace rules.
CLIENT_PREFIXES = flags.DEFINE_list(
    "client_prefixes",
    default=[PSM_SECURITY_PREFIX, URL_MAP_TEST_PREFIX],
    help=("a comma-separated list of prefixes for which "
          "the leaked clients will be deleted"),
)
89 
90 
def load_keep_config() -> None:
    """Load the resource keep-list into the module-level ``KEEP_CONFIG``.

    Reads ``keep_xds_interop_resources.json`` from the directory containing
    this file and logs the parsed content at debug level.
    """
    global KEEP_CONFIG
    script_dir = os.path.dirname(os.path.abspath(__file__))
    config_file = os.path.realpath(
        os.path.join(script_dir, 'keep_xds_interop_resources.json'))
    with open(config_file, 'r') as config_stream:
        KEEP_CONFIG = json.load(config_stream)
    pretty_config = json.dumps(KEEP_CONFIG, indent=2)
    logging.debug('Resource keep config loaded: %s', pretty_config)
100 
101 
def is_marked_as_keep_gce(suffix: str) -> bool:
    """Return True if `suffix` is listed under "gce_framework" in KEEP_CONFIG."""
    kept_suffixes = KEEP_CONFIG["gce_framework"]["suffix"]
    return suffix in kept_suffixes
104 
105 
def is_marked_as_keep_gke(suffix: str) -> bool:
    """Return True if `suffix` is listed under "gke_framework" in KEEP_CONFIG."""
    kept_suffixes = KEEP_CONFIG["gke_framework"]["suffix"]
    return suffix in kept_suffixes
108 
109 
@functools.lru_cache()
def get_expire_timestamp() -> datetime.datetime:
    """Return the UTC cut-off time: now minus the ``keep_hours`` flag value.

    The result is cached, so every caller in one run sees the same cut-off.
    """
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    keep_period = datetime.timedelta(hours=KEEP_PERIOD_HOURS.value)
    return utc_now - keep_period
114 
115 
def exec_gcloud(project: str, *cmds: str) -> Json:
    """Run a gcloud command and return its decoded JSON output.

    Args:
        project: GCP project, passed to gcloud through ``--project``.
        *cmds: The gcloud sub-command and its arguments, one string each.
            When 'list' is among the arguments, JSON output is requested and
            the listing is filtered to resources whose creationTimestamp is
            at or before the expiry cut-off.

    Returns:
        The value decoded from gcloud's stdout, or None when the command
        exits with a non-zero status or prints nothing to stdout.
    """
    args = [GCLOUD, '--project', project, '--quiet', *cmds]
    if 'list' in args:
        # Add arguments to shape the list output
        args.extend([
            '--format', 'json', '--filter',
            f'creationTimestamp <= {get_expire_timestamp().isoformat()}'
        ])
    cmd_line = " ".join(args)
    logger.debug('Executing: %s', cmd_line)
    # gcloud does not finish unless its output is consumed. Reading stdout to
    # EOF and only then stderr can deadlock once the unread pipe fills up, so
    # let subprocess.run() drain both pipes together and reap the process.
    # No timeout is applied here: the previous post-read wait(timeout) never
    # bounded the command's run time either, and enforcing a short limit
    # would abort slow but valid deletions.
    completed = subprocess.run(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               check=False)
    if completed.returncode:
        logger.error('> Failed to execute cmd [%s], returned %d, stderr: %s',
                     cmd_line, completed.returncode,
                     completed.stderr.decode(errors='replace'))
        return None
    if completed.stdout:
        return json.loads(completed.stdout)
    return None
144 
145 
def remove_relative_resources_run_xds_tests(project: str, network: str,
                                            prefix: str, suffix: str):
    """Delete the GCP resources that run_xds_tests.py names with `suffix`.

    Issues one gcloud delete per resource, in a fixed order. `network` and
    `prefix` are accepted but not used by this function.
    """
    logging.info('----- Removing run_xds_tests.py resources with suffix [%s]',
                 suffix)
    # One entry per gcloud invocation; the order of deletion is significant
    # and is kept exactly as listed.
    deletions = (
        ('compute', 'forwarding-rules', 'delete',
         f'test-forwarding-rule{suffix}', '--global'),
        ('compute', 'target-http-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('alpha', 'compute', 'target-grpc-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('compute', 'url-maps', 'delete', f'test-map{suffix}'),
        ('compute', 'backend-services', 'delete',
         f'test-backend-service{suffix}', '--global'),
        ('compute', 'backend-services', 'delete',
         f'test-backend-service-alternate{suffix}', '--global'),
        ('compute', 'backend-services', 'delete',
         f'test-backend-service-extra{suffix}', '--global'),
        ('compute', 'backend-services', 'delete',
         f'test-backend-service-more-extra{suffix}', '--global'),
        ('compute', 'firewall-rules', 'delete', f'test-fw-rule{suffix}'),
        ('compute', 'health-checks', 'delete', f'test-hc{suffix}'),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-same-zone{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-secondary-zone{suffix}', '--zone', SECONDARY_ZONE),
        ('compute', 'instance-templates', 'delete',
         f'test-template{suffix}'),
    )
    for gcloud_args in deletions:
        exec_gcloud(project, *gcloud_args)
178 
179 
def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
    """Clean up Traffic Director resources for one prefix/suffix pair.

    Creates a TrafficDirectorManager and its security variant, then calls
    their cleanup() methods. The variants are all based on the basic
    TrafficDirectorManager, so their cleanup() might do duplicate work, but
    deleting a non-existent resource returns 404, and is OK.
    """
    api_manager = gcp.api.GcpApiManager()
    manager_kwargs = {
        'project': project,
        'network': network,
        'resource_prefix': resource_prefix,
        'resource_suffix': resource_suffix,
    }
    plain_td = traffic_director.TrafficDirectorManager(api_manager,
                                                       **manager_kwargs)
    security_td = traffic_director.TrafficDirectorSecureManager(
        api_manager, **manager_kwargs)
    # TODO: cleanup appnet resources as well, using
    # traffic_director.TrafficDirectorAppNetManager built from the same
    # arguments and cleaned up between the two calls below.

    logger.info('----- Removing traffic director for gke, prefix %s, suffix %s',
                resource_prefix, resource_suffix)
    security_td.cleanup(force=True)
    plain_td.cleanup(force=True)
214 
def cleanup_client(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Create a client runner for the namespace and call its cleanup().

    The namespace name is derived from `resource_prefix` and
    `resource_suffix`; the remaining runner settings come from the xds flags.
    """
    runner_kwargs = {
        'deployment_name': xds_flags.CLIENT_NAME.value,
        'image_name': xds_k8s_flags.CLIENT_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'xds_server_uri': xds_flags.XDS_SERVER_URI.value,
        'network': network,
        'stats_port': xds_flags.CLIENT_PORT.value,
    }

    namespace_name = KubernetesClientRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    client_runner = KubernetesClientRunner(namespace, **runner_kwargs)

    logger.info('Cleanup client')
    client_runner.cleanup(force=True, force_namespace=True)
237 
238 
def cleanup_server(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Create a server runner for the namespace and call its cleanup().

    The namespace name is derived from `resource_prefix` and
    `resource_suffix`; the remaining runner settings come from the xds flags.
    """
    runner_kwargs = {
        'deployment_name': xds_flags.SERVER_NAME.value,
        'image_name': xds_k8s_flags.SERVER_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'network': network,
    }

    namespace_name = KubernetesServerRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    server_runner = KubernetesServerRunner(namespace, **runner_kwargs)

    logger.info('Cleanup server')
    server_runner.cleanup(force=True, force_namespace=True)
259 
260 
def delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               resources):
    """Delete leaked resources whose names match one of the given rules.

    Args:
        dry_run: When true, each resource is only logged; nothing is deleted.
        td_resource_rules: Iterable of ``(regex, resource_prefix, keep,
            remove)`` tuples. The first rule whose regex matches the resource
            name wins; group 1 of the match is passed to ``keep`` and, if
            that returns false, to ``remove(project, network,
            resource_prefix, suffix)``.
        project: Forwarded to ``remove``.
        network: Forwarded to ``remove``.
        resources: Iterable of mappings with a 'name' key. None is treated as
            an empty list, because exec_gcloud returns None when a listing
            fails or prints nothing.
    """
    for resource in resources or ():
        name = resource['name']
        logger.info('-----')
        logger.info('----- Cleaning up resource %s', name)
        if dry_run:
            # Skip deletion for dry-runs
            logger.info('----- Skipped [Dry Run]: %s', name)
            continue
        for (regex, resource_prefix, keep, remove) in td_resource_rules:
            result = re.search(regex, name)
            if result is None:
                continue
            suffix = result.group(1)
            if keep(suffix):
                logger.info('Skipped [keep]:')
            else:
                remove(project, network, resource_prefix, suffix)
            # Only the first matching rule is applied.
            break
        else:
            # Reached only when no rule matched this resource name.
            logger.info(
                '----- Skipped [does not matching resource name templates]')
283 
284 
def delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account, namespaces):
    """Clean up expired k8s namespaces that match one of the given rules.

    A namespace is a candidate only when its creation timestamp is at or
    before the expiry cut-off. In a dry run candidates are only logged.
    Otherwise the first rule ``(regex, resource_prefix, remove)`` whose regex
    matches the namespace name has its ``remove`` called with group 1 of the
    match as the resource suffix.
    """
    for namespace in namespaces:
        ns_name = namespace.metadata.name
        logger.info('-----')
        logger.info('----- Cleaning up k8s namespaces %s', ns_name)

        if namespace.metadata.creation_timestamp > get_expire_timestamp():
            logging.info('----- Skipped [resource is within expiry date]')
            continue

        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', ns_name)
            continue

        for (regex, resource_prefix, remove) in k8s_resource_rules:
            result = re.search(regex, ns_name)
            if result is None:
                continue
            remove(project, network, k8s_api_manager, resource_prefix,
                   result.group(1), gcp_service_account)
            break
        else:
            # Reached only when no rule matched this namespace name.
            logging.info(
                '----- Skipped [does not matching resource name templates]')
309 
310 
def find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account):
    """List all k8s namespaces and clean up the leaked client/server ones.

    Leaked namespaces usually mean there are leaked testing client/servers
    from the gke framework.
    """
    # Each rule is a tuple of, in order:
    # - regex to match
    # - prefix of the resources
    # - function to delete the resource
    client_rules = [(f'{prefix}-client-(.*)', prefix, cleanup_client)
                    for prefix in CLIENT_PREFIXES.value]
    server_rules = [(f'{prefix}-server-(.*)', prefix, cleanup_server)
                    for prefix in SERVER_PREFIXES.value]
    k8s_resource_rules = client_rules + server_rules

    k8s_api_manager = k8s.KubernetesApiManager(xds_k8s_flags.KUBE_CONTEXT.value)
    namespace_list = k8s_api_manager.core.list_namespace()
    delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
                         k8s_api_manager, gcp_service_account,
                         namespace_list.items)
332 
333 
def main(argv):
    """Find and delete leaked test resources.

    Steps, in order: load the keep-list, clean up expired health checks and
    whatever their matching rule removes, clean up expired instance templates
    the same way, then clean up leaked k8s namespaces.

    Args:
        argv: Positional command-line arguments; only the program name is
            accepted.

    Raises:
        app.UsageError: If extra positional arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # A bare `import dateutil` is not guaranteed to load the parser
    # submodule, so import it explicitly before using dateutil.parser below.
    import dateutil.parser

    # The keep checks in the rules below read KEEP_CONFIG, which is only
    # defined once the keep-list has been loaded.
    load_keep_config()

    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = DRY_RUN.value

    td_resource_rules = [
        # items in each tuple, in order
        # - regex to match
        # - prefix of the resource (only used by gke resources)
        # - function to check if the resource should be kept
        # - function to delete the resource
        (r'test-hc(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
        (r'test-template(.*)', '', is_marked_as_keep_gce,
         remove_relative_resources_run_xds_tests),
    ]
    td_resource_rules.extend(
        (f'{prefix}-health-check-(.*)', prefix, is_marked_as_keep_gke,
         cleanup_td_for_gke) for prefix in TD_RESOURCE_PREFIXES.value)

    # List resources older than KEEP_PERIOD. We only list health-checks and
    # instance templates because these are leaves in the resource dependency
    # tree.
    #
    # E.g. forwarding-rule depends on the target-proxy. So a leaked
    # forwarding-rule indicates there's a leaked target-proxy (because this
    # target proxy cannot be deleted unless the forwarding rule is deleted).
    # The leaked target-proxy is guaranteed to be a super set of leaked
    # forwarding-rule.
    compute = gcp.compute.ComputeV1(gcp.api.GcpApiManager(), project)
    expire_timestamp = get_expire_timestamp()
    # NOTE(review): assumes list_health_check() returns a dict-like response;
    # 'items' is read defensively so an empty listing does not raise KeyError.
    health_checks = compute.list_health_check().get('items', [])
    leaked_health_checks = [
        item for item in health_checks if dateutil.parser.isoparse(
            item['creationTimestamp']) <= expire_timestamp
    ]

    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_health_checks)

    # Delete leaked instance templates, those usually mean there are leaked VMs
    # from the gce framework. Also note that this is only needed for the gce
    # resources.
    # exec_gcloud returns None when the listing fails or prints nothing.
    leaked_instance_templates = exec_gcloud(project, 'compute',
                                            'instance-templates', 'list') or []
    delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
                               leaked_instance_templates)

    find_and_remove_leaked_k8s_resources(dry_run, project, network,
                                         gcp_service_account)
387 
388 
# Script entry point: hand main() to absl's app runner.
if __name__ == "__main__":
    app.run(main)
cleanup.exec_gcloud
Json exec_gcloud(str project, *List[str] cmds)
Definition: cleanup.py:116
cleanup.main
def main(argv)
Definition: cleanup.py:334
cleanup.cleanup_server
def cleanup_server(project, network, k8s_api_manager, resource_prefix, resource_suffix, gcp_service_account)
Definition: cleanup.py:240
cleanup.load_keep_config
None load_keep_config()
Definition: cleanup.py:91
cleanup.KubernetesClientRunner
KubernetesClientRunner
Definition: cleanup.py:50
cleanup.delete_leaked_td_resources
def delete_leaked_td_resources(dry_run, td_resource_rules, project, network, resources)
Definition: cleanup.py:261
framework.test_app
Definition: tools/run_tests/xds_k8s_test_driver/framework/test_app/__init__.py:1
cleanup.cleanup_td_for_gke
def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix)
Definition: cleanup.py:186
cleanup.delete_k8s_resources
def delete_k8s_resources(dry_run, k8s_resource_rules, project, network, k8s_api_manager, gcp_service_account, namespaces)
Definition: cleanup.py:285
cleanup.is_marked_as_keep_gke
bool is_marked_as_keep_gke(str suffix)
Definition: cleanup.py:106
framework.infrastructure
Definition: tools/run_tests/xds_k8s_test_driver/framework/infrastructure/__init__.py:1
cleanup.cleanup_client
def cleanup_client(project, network, k8s_api_manager, resource_prefix, resource_suffix, gcp_service_account)
Definition: cleanup.py:216
cleanup.is_marked_as_keep_gce
bool is_marked_as_keep_gce(str suffix)
Definition: cleanup.py:102
cleanup.find_and_remove_leaked_k8s_resources
def find_and_remove_leaked_k8s_resources(dry_run, project, network, gcp_service_account)
Definition: cleanup.py:311
cleanup.remove_relative_resources_run_xds_tests
def remove_relative_resources_run_xds_tests(str project, str network, str prefix, str suffix)
Definition: cleanup.py:146
open
#define open
Definition: test-fs.c:46
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
cleanup.KubernetesServerRunner
KubernetesServerRunner
Definition: cleanup.py:51
cleanup.get_expire_timestamp
datetime.datetime get_expire_timestamp()
Definition: cleanup.py:111


grpc
Author(s):
autogenerated on Thu Mar 13 2025 02:58:46