Skip to content

Commit 50325c4

Browse files
authored
Merge pull request #9 from aws-observability/retry-remote-application-and-addon
Add Retry for Remote Application, Addon, and EC2 Instance Inline
2 parents 557275c + e7a44e0 commit 50325c4

File tree

5 files changed

+199
-46
lines changed

5 files changed

+199
-46
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Reusable action for executing a command and retrying it if it fails
2+
name: Command Retry Logic
3+
4+
inputs:
5+
# (Optional) Command to run before the retry command. To be used for environment setup, etc
6+
pre-command:
7+
required: false
8+
type: string
9+
# (Optional) Number of retries to perform. Default is 2
10+
max_retry:
11+
required: false
12+
type: number
13+
default: 2
14+
# (Required) Command to execute with the retry mechanism
15+
command:
16+
required: true
17+
type: string
18+
# (Optional) Command to clean up resources before retrying the main command
19+
cleanup:
20+
required: false
21+
type: string
22+
# (Optional) Follow-up command after the main command is finished.
23+
post-command:
24+
required: false
25+
type: string
26+
27+
runs:
28+
using: "composite"
29+
steps:
30+
- name: Run pre-command
31+
shell: bash
32+
env:
33+
PRE_COMMAND: ${{ inputs.pre-command }}
34+
run: |
35+
# eval so that shell operators (pipes, &&, $(...)) in the input are interpreted,
# matching how the main command and cleanup are executed below
eval "$PRE_COMMAND"
36+
37+
- name: Run command
38+
shell: bash
39+
env:
40+
MAX_RETRY: ${{ inputs.max_retry }}
41+
COMMAND: ${{ inputs.command }}
42+
CLEANUP: ${{ inputs.cleanup }}
43+
run: |
44+
retry_counter=0
45+
while [ $retry_counter -lt $MAX_RETRY ]; do
46+
attempt_failed=0
47+
eval "$COMMAND" || attempt_failed=$?
48+
49+
if [ $attempt_failed -ne 0 ]; then
50+
eval "$CLEANUP"
51+
retry_counter=$(($retry_counter+1))
52+
sleep 5
53+
else
54+
break
55+
fi
56+
57+
# Shell variables are case-sensitive: the limit is exported as MAX_RETRY in this step's env.
# Once the counter reaches it, every attempt has failed, so fail the step.
if [ $retry_counter -eq $MAX_RETRY ]; then
58+
echo "Max retry reached, command failed to execute properly. Exiting code"
59+
exit 1
60+
fi
61+
done
62+
63+
- name: Run post command
64+
shell: bash
65+
env:
66+
POST_COMMAND: ${{ inputs.post-command }}
67+
# eval so that pipes, && and $(...) inside the post-command input are interpreted by
# the shell instead of being passed as literal arguments to the first word
run: eval "$POST_COMMAND"

.github/workflows/appsignals-e2e-ec2-test.yml

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ env:
3131
LOG_GROUP_NAME: /aws/appsignals/generic
3232
TEST: ${{ inputs.test }}
3333
GET_ADOT_JAR_COMMAND: "wget -O adot.jar https://github.com/aws-observability/aws-otel-java-instrumentation/releases/latest/download/aws-opentelemetry-agent.jar"
34+
GET_CW_AGENT_RPM_COMMAND: "wget -O cw-agent.rpm https://amazoncloudwatch-agent-${{ inputs.aws-region }}.s3.${{ inputs.aws-region }}.amazonaws.com/amazon_linux/amd64/1.300031.0b313/amazon-cloudwatch-agent.rpm"
3435
TEST_RESOURCES_FOLDER: /home/runner/work/aws-application-signals-test-framework/aws-application-signals-test-framework
3536

3637

@@ -42,9 +43,6 @@ jobs:
4243
with:
4344
fetch-depth: 0
4445

45-
- name: Set CW Agent RPM environment variable
46-
run: echo GET_CW_AGENT_RPM_COMMAND="wget -O cw-agent.rpm https://amazoncloudwatch-agent-${{ inputs.aws-region }}.s3.${{ inputs.aws-region }}.amazonaws.com/amazon_linux/amd64/1.300031.0b313/amazon-cloudwatch-agent.rpm" >> $GITHUB_ENV
47-
4846
- name: Generate testing id
4947
run: echo TESTING_ID="${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
5048

@@ -67,17 +65,17 @@ jobs:
6765
aws-region: ${{ inputs.aws-region }}
6866

6967
- name: Set up terraform
70-
run: |
71-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
72-
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
73-
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
74-
sudo apt update && sudo apt install terraform
68+
uses: ./.github/workflows/actions/execute_and_retry
69+
with:
70+
command: "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
71+
# Chain the two commands with &&: a backslash inside a single-quoted YAML scalar is a
# literal character, not a line continuation, and the line break folds to a space
post-command: 'echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list &&
72+
sudo apt update && sudo apt install terraform'
7573

7674
- name: Initiate Terraform
77-
working-directory: terraform/ec2
78-
run: |
79-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
80-
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"
75+
uses: ./.github/workflows/actions/execute_and_retry
76+
with:
77+
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/ec2 && terraform init && terraform validate"
78+
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"
8179

8280
- name: Deploy sample app via terraform and wait for endpoint to come online
8381
working-directory: terraform/ec2
@@ -108,10 +106,26 @@ jobs:
108106
# Attempts to connect will be made for up to 10 minutes
109107
if [ $deployment_failed -eq 0 ]; then
110108
echo "Attempting to connect to the endpoint"
111-
sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
109+
main_sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
110+
attempt_counter=0
111+
max_attempts=30
112+
until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
113+
if [ ${attempt_counter} -eq ${max_attempts} ];then
114+
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
115+
deployment_failed=1
116+
break
117+
fi
118+
119+
printf '.'
120+
attempt_counter=$(($attempt_counter+1))
121+
sleep 10
122+
done
123+
124+
echo "Attempting to connect to the remote sample app endpoint"
125+
remote_sample_app_endpoint=http://$(terraform output sample_app_remote_service_public_ip):8080/healthcheck
112126
attempt_counter=0
113-
max_attempts=60
114-
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
127+
max_attempts=30
128+
until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
115129
if [ ${attempt_counter} -eq ${max_attempts} ];then
116130
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
117131
deployment_failed=1
@@ -164,9 +178,10 @@ jobs:
164178
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call/
165179
166180
- name: Build Gradlew
167-
run: |
168-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
169-
execute_and_retry 2 ./gradlew
181+
uses: ./.github/workflows/actions/execute_and_retry
182+
with:
183+
max_retry: 4
184+
command: "./gradlew"
170185

171186
# Validation for pulse telemetry data
172187
- name: Validate generated EMF logs

.github/workflows/appsignals-e2e-eks-test.yml

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -94,38 +94,42 @@ jobs:
9494
- name: Set up kubeconfig
9595
run: aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
9696

97+
- name: Download eksctl
98+
uses: ./.github/workflows/actions/execute_and_retry
99+
with:
100+
pre-command: 'mkdir ${{ github.workspace }}/eksctl'
101+
command: 'curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_Linux_amd64.tar.gz"'
102+
97103
- name: Install eksctl
98104
run: |
99-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
100-
mkdir ${{ github.workspace }}/eksctl
101-
curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_Linux_amd64.tar.gz"
102-
execute_and_retry 2 "tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz"
105+
tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz
103106
echo "${{ github.workspace }}/eksctl" >> $GITHUB_PATH
104107
105108
- name: Create role for AWS access from the sample app
106109
id: create_service_account
107-
run: |
108-
eksctl create iamserviceaccount \
110+
uses: ./.github/workflows/actions/execute_and_retry
111+
with:
112+
command: "eksctl create iamserviceaccount \
109113
--name service-account-${{ env.TESTING_ID }} \
110114
--namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
111115
--cluster ${{ inputs.test-cluster-name }} \
112116
--role-name eks-s3-access-${{ env.TESTING_ID }} \
113117
--attach-policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess \
114118
--region ${{ inputs.aws-region }} \
115-
--approve
119+
--approve"
116120

117121
- name: Set up terraform
118-
run: |
119-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
120-
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
121-
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
122-
sudo apt update && sudo apt install terraform
122+
uses: ./.github/workflows/actions/execute_and_retry
123+
with:
124+
command: "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
125+
# Chain the two commands with &&: a backslash inside a single-quoted YAML scalar is a
# literal character, not a line continuation, and the line break folds to a space
post-command: 'echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list &&
126+
sudo apt update && sudo apt install terraform'
123127

124128
- name: Initiate Terraform
125-
working-directory: terraform/eks
126-
run: |
127-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
128-
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"
129+
uses: ./.github/workflows/actions/execute_and_retry
130+
with:
131+
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/eks && terraform init && terraform validate"
132+
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"
129133

130134
- name: Deploy sample app via terraform and wait for the endpoint to come online
131135
id: deploy-sample-app
@@ -152,27 +156,50 @@ jobs:
152156
-var="sample_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_E2E_RE_SA_IMG }}" \
153157
|| deployment_failed=$?
154158
155-
if [ $deployment_failed -eq 1 ]; then
159+
if [ $deployment_failed -ne 0 ]; then
156160
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
157161
fi
158162
159163
# If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint
160164
# after installing App Signals. Attempts to connect will be made for up to 10 minutes
161165
if [ $deployment_failed -eq 0 ]; then
162-
echo "Installing app signals to the sample app"
163-
${GITHUB_WORKSPACE}/enablement-script/scripts/eks/appsignals/enable-app-signals.sh \
166+
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
167+
execute_and_retry 2 \
168+
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/scripts/eks/appsignals/enable-app-signals.sh \
164169
${{ inputs.test-cluster-name }} \
165170
${{ inputs.aws-region }} \
166-
${{ env.SAMPLE_APP_NAMESPACE }}
171+
${{ env.SAMPLE_APP_NAMESPACE }}" \
172+
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/scripts/eks/appsignals/clean-app-signals.sh \
173+
${{ inputs.test-cluster-name }} \
174+
${{ inputs.aws-region }} \
175+
${{ env.SAMPLE_APP_NAMESPACE }} && \
176+
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}"
167177
168178
kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
169179
kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
170180
171-
echo "Attempting to connect to the endpoint"
172-
sample_app_endpoint=http://$(terraform output sample_app_endpoint)
181+
echo "Attempting to connect to the main sample app endpoint"
182+
main_sample_app_endpoint=http://$(terraform output sample_app_endpoint)
183+
attempt_counter=0
184+
max_attempts=30
185+
until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
186+
if [ ${attempt_counter} -eq ${max_attempts} ];then
187+
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
188+
deployment_failed=1
189+
break
190+
fi
191+
192+
printf '.'
193+
attempt_counter=$(($attempt_counter+1))
194+
sleep 10
195+
done
196+
197+
echo "Attempting to connect to the remote sample app endpoint"
198+
remote_sample_app_endpoint=http://$(terraform output sample_remote_app_endpoint)/healthcheck
199+
echo $remote_sample_app_endpoint
173200
attempt_counter=0
174-
max_attempts=60
175-
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
201+
max_attempts=30
202+
until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
176203
if [ ${attempt_counter} -eq ${max_attempts} ];then
177204
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
178205
deployment_failed=1
@@ -196,7 +223,7 @@ jobs:
196223
197224
# Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
198225
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
199-
226+
200227
echo "Destroying terraform"
201228
terraform destroy -auto-approve \
202229
-var="test_id=${{ env.TESTING_ID }}" \
@@ -248,9 +275,10 @@ jobs:
248275
curl -S -s http://${{ env.APP_ENDPOINT }}/client-call/
249276
250277
- name: Build Gradlew
251-
run: |
252-
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
253-
execute_and_retry 2 "./gradlew"
278+
uses: ./.github/workflows/actions/execute_and_retry
279+
with:
280+
max_retry: 4
281+
command: "./gradlew"
254282

255283
# Validation for app signals telemetry data
256284
- name: Call endpoint and validate generated EMF logs

terraform/ec2/main.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ resource "null_resource" "main_service_setup" {
104104

105105
provisioner "remote-exec" {
106106
inline = [
107+
# Make the Terraform fail if any step throws an error
108+
"set -o errexit",
107109
# Install Java 11 and wget
108110
"sudo yum install wget java-11-amazon-corretto -y",
109111

@@ -165,6 +167,8 @@ resource "null_resource" "remote_service_setup" {
165167

166168
provisioner "remote-exec" {
167169
inline = [
170+
# Make the Terraform fail if any step throws an error
171+
"set -o errexit",
168172
# Install Java 11 and wget
169173
"sudo yum install wget java-11-amazon-corretto -y",
170174

terraform/eks/main.tf

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,45 @@ resource "kubernetes_service" "sample_remote_app_service" {
241241
}
242242
}
243243

244+
resource "kubernetes_ingress_v1" "sample-remote-app-ingress" {
245+
depends_on = [kubernetes_service.sample_remote_app_service]
246+
wait_for_load_balancer = true
247+
metadata {
248+
name = "sample-remote-app-ingress-${var.test_id}"
249+
namespace = var.test_namespace
250+
annotations = {
251+
"kubernetes.io/ingress.class" = "alb"
252+
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
253+
"alb.ingress.kubernetes.io/target-type" = "ip"
254+
}
255+
labels = {
256+
app = "sample-remote-app-ingress"
257+
}
258+
}
259+
spec {
260+
rule {
261+
http {
262+
path {
263+
path = "/"
264+
path_type = "Prefix"
265+
backend {
266+
service {
267+
name = kubernetes_service.sample_remote_app_service.metadata[0].name
268+
port {
269+
number = 8080
270+
}
271+
}
272+
}
273+
}
274+
}
275+
}
276+
}
277+
}
278+
244279
output "sample_app_endpoint" {
245280
value = kubernetes_ingress_v1.sample-app-ingress.status.0.load_balancer.0.ingress.0.hostname
246281
}
282+
283+
output "sample_remote_app_endpoint" {
284+
value = kubernetes_ingress_v1.sample-remote-app-ingress.status.0.load_balancer.0.ingress.0.hostname
285+
}

0 commit comments

Comments
 (0)