Skip to content

Commit 827ac08

Browse files
authored
Add deleting stale autoscaling group to cleanup process (#137)
*Issue description:* Canaries may exit unexpectedly, leaving their EC2 auto-scaling groups (ASGs) not properly deleted. *Description of changes:* This PR scans for ASGs that have existed for longer than 3 hours and deletes them if they are not EKS node groups. It also updates the gitignore files to ignore metadata files on macOS and Terraform temporary files. *Ensure you've run the following tests on your changes and include the link below:* To do so, create a `test.yml` file with `name: Test` and workflow description to test your changes, then remove the file for your PR. Link your test run in your PR description. This process is a short term solution while we work on creating a staging environment for testing. NOTE: TESTS RUNNING ON A SINGLE EKS CLUSTER CANNOT BE RUN IN PARALLEL. See the [needs](https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idneeds) keyword to run tests in succession. - Run Java EKS on `e2e-playground` in us-east-1 and eu-central-2 - Run Python EKS on `e2e-playground` in us-east-1 and eu-central-2 - Run metric limiter on EKS cluster `e2e-playground` in us-east-1 and eu-central-2 - Run EC2 tests in all regions - Run K8s on a separate K8s cluster (check IAD test account for master node endpoints; these will change as we create and destroy clusters for OS patching) By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent de02191 commit 827ac08

File tree

3 files changed

+89
-25
lines changed

3 files changed

+89
-25
lines changed

.github/workflows/util/clean/ec2_instance_cleanup/cleaner.py

Lines changed: 86 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,74 @@
1515
# Create an EC2 client
1616
session = boto3.Session()
1717
ec2 = session.client('ec2')
18+
autoscaling = session.client('autoscaling')
1819

1920
# configure logging
2021
logging.basicConfig(level=logging.INFO)
2122

23+
def _get_autoscaling_groups_to_delete():
    """Collect stale, non-EKS autoscaling groups that should be deleted.

    Pages through ``describe_auto_scaling_groups`` and keeps every group that

    - carries no ``eks:cluster-name`` tag,
    - has no instance in a terminating lifecycle state (see ``_is_active``), and
    - was created more than 3 hours ago.

    Returns:
        list: The ASG description dicts (as returned by the autoscaling API)
        of the groups selected for deletion.
    """
    logging.info("Start scanning autoscaling group...")

    # Timezone-aware (UTC) so it can be compared with the API's CreatedTime,
    # which is presumably timezone-aware as well -- confirm against boto3 docs.
    time_threshold = datetime.now(timezone.utc) - timedelta(hours=3)
    groups_to_delete = []

    paginator = autoscaling.get_paginator('describe_auto_scaling_groups')
    for page in paginator.paginate():
        for asg in page['AutoScalingGroups']:
            asg_name = asg['AutoScalingGroupName']

            # Groups tagged with an EKS cluster name are never touched here.
            if any(tag['Key'] == 'eks:cluster-name' for tag in asg.get('Tags', [])):
                logging.info(f"Skipping autoscaling group with 'eks:cluster-name' tag: {asg_name}.")
                continue

            # A group with terminating instances is left alone for this run.
            if not _is_active(asg):
                logging.info(f"Skipping autoscaling group {asg_name} with terminating instances.")
                continue

            logging.info(f"autoscaling group {asg_name} is active.")

            if asg['CreatedTime'] < time_threshold:
                # Use logging (not print) so the decision lands in the same
                # stream as every other message of this script.
                logging.info(f"Autoscaling group: {asg_name} will be deleted.")
                groups_to_delete.append(asg)

    logging.info(f"{len(groups_to_delete)} autoscaling groups are active for more than 3 hours.")

    return groups_to_delete
59+
60+
61+
def _delete_autoscaling_groups(auto_scaling_groups):
62+
for asg in auto_scaling_groups:
63+
try:
64+
asg_name = asg['AutoScalingGroupName']
65+
response = autoscaling.delete_auto_scaling_group(AutoScalingGroupName=asg_name, ForceDelete=True)
66+
logging.info("===== Response for delete autoscaling group request =====")
67+
logging.info(response)
68+
except Exception as e:
69+
logging.info(f"Error terminating instances: {e}")
70+
71+
def _is_active(asg):
72+
for instance in asg['Instances']:
73+
if instance['LifecycleState'] in [
74+
'Terminating', 'Terminating:Wait', 'Terminating:Proceed'
75+
]:
76+
return False
77+
return True
78+
2279

2380
def _get_instances_to_terminate():
2481
# Get all the running instances
25-
logging.info("Getting all running instances")
82+
logging.info("Start scanning instances")
2683
running_filter = [{'Name': 'instance-state-name', 'Values': [INSTANCE_STATE_RUNNING]}]
2784
running_instances = _get_all_instances_by_filter(filters=running_filter)
28-
logging.info(f"Currently {len(running_instances)} are running.")
85+
logging.info(f"{len(running_instances)} instances are running.")
2986

3087
# Filter instances that have been running for more than 3 hours
3188
logging.info("Filtering instances that have been running for more than 3 hours")
@@ -42,10 +99,13 @@ def _get_instances_to_terminate():
4299
logging.info("Filtering instances that should not be terminated based on conditions")
43100
instances_to_terminate = []
44101
for instance in instances_running_more_than_3hrs:
45-
if (not _is_eks_cluster_instance(instance)
46-
and not _is_k8s_cluster_instance(instance)
47-
and not _is_tagged_do_not_delete(instance)):
48-
instances_to_terminate.append(instance)
102+
if (not _is_k8s_cluster_instance(instance) and not _is_tagged_do_not_delete(instance)):
103+
group_name = _get_associated_autoscaling_group_name(instance)
104+
if group_name != None:
105+
logging.info(f"Instance {instance['InstanceId']} is associated with autoscaling group {group_name}, skip the termination.")
106+
else:
107+
instances_to_terminate.append(instance)
108+
49109
logging.info(f"{len(instances_to_terminate)} instances will be terminated.")
50110

51111
return instances_to_terminate
@@ -70,13 +130,6 @@ def _get_all_instances_by_filter(filters: List[dict]):
70130
return filtered_instances
71131

72132

73-
def _is_eks_cluster_instance(instance):
74-
security_groups = instance.get('SecurityGroups', [])
75-
if any(group['GroupName'].startswith(EKS_CLUSTER_SECURITY_GROUP_PREFIX) for group in security_groups):
76-
return True
77-
return False
78-
79-
80133
def _is_k8s_cluster_instance(instance):
81134
tags = instance.get('Tags', [])
82135
if 'Name' in tags and tags['Name'].startswith(K8S_INSTANCE_NAME_PREFIX):
@@ -92,12 +145,21 @@ def _is_tagged_do_not_delete(instance):
92145
return True
93146
return False
94147

95-
96-
def _prepare_report_and_upload(instances_to_terminate) -> bool:
97-
json_data = json.dumps(instances_to_terminate, default=str)
148+
def _get_associated_autoscaling_group_name(instance):
149+
tags = instance.get('Tags', [])
150+
asg_tag = next((tag for tag in tags if tag['Key'] == 'aws:autoscaling:groupName'), None)
151+
if asg_tag is None:
152+
return None
153+
return asg_tag['Value']
154+
155+
def _prepare_report_and_upload(groups_to_delete, instances_to_terminate) -> bool:
156+
json_data = json.dumps({
157+
"autoscalingGroups": groups_to_delete,
158+
"standaloneInstances": instances_to_terminate
159+
}, default=str)
98160
# save as a json file with timestamp
99161
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
100-
filename = f"report-instances-to-terminate-{timestamp}.json"
162+
filename = f"report-resources-to-clean-{timestamp}.json"
101163
with open(filename, "w") as f:
102164
f.write(json_data)
103165

@@ -116,24 +178,26 @@ def _prepare_report_and_upload(instances_to_terminate) -> bool:
116178
def _terminate_instances(instances_to_terminate):
117179
# Terminate the instances
118180
instance_ids = [instance['InstanceId'] for instance in instances]
119-
logging.info("Number of instances terminating: " + str(len(instance_ids)))
120181
try:
121182
response = ec2.terminate_instances(InstanceIds=instance_ids)
122-
logging.info("===== Response for terminate request =====")
183+
logging.info("===== Response for terminate instances request =====")
123184
logging.info(response)
124185
except Exception as e:
125186
logging.info(f"Error terminating instances: {e}")
126187

127188

128189
if __name__ == '__main__':
    # sys.exit is the supported way to end a script; the bare exit() helper is
    # injected by the site module and is meant for interactive sessions.
    import sys

    groups = _get_autoscaling_groups_to_delete()
    instances = _get_instances_to_terminate()

    if not groups and not instances:
        logging.info("No resource to terminate")
        sys.exit(0)

    # The report must be written and uploaded before anything is removed.
    report_successful = _prepare_report_and_upload(groups, instances)
    if not report_successful:
        logging.error("Failed to prepare report and upload. Aborting termination of instances.")
        sys.exit(1)

    _delete_autoscaling_groups(groups)
    _terminate_instances(instances)

.gitignore

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1+
.DS_Store
12
.idea
23
# Ignore Gradle project-specific cache directory
34
.gradle
45

56
# Ignore Gradle build output directory
67
build
7-
8-
# Ignore the resource cleanup reports
9-
**/report-*.json

terraform/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.terraform*
2+
terraform.tfstate*

0 commit comments

Comments
 (0)