Skip to content

Add Retry Mechanism for Gradlew, Terraform, and Eksctl #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions .github/workflows/appsignals-e2e-ec2-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ env:
LOG_GROUP_NAME: /aws/appsignals/generic
TEST: ${{ inputs.test }}
GET_ADOT_JAR_COMMAND: "wget -O adot.jar https://github.com/aws-observability/aws-otel-java-instrumentation/releases/latest/download/aws-opentelemetry-agent.jar"
TEST_RESOURCES_FOLDER: /home/runner/work/aws-application-signals-test-framework/aws-application-signals-test-framework


jobs:
e2e-ec2-test:
Expand Down Expand Up @@ -65,16 +67,21 @@ jobs:
aws-region: ${{ inputs.aws-region }}

- name: Set up terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: false
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update && sudo apt install terraform

- name: Initiate Terraform
working-directory: terraform/ec2
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"

- name: Deploy sample app via terraform and wait for endpoint to come online
working-directory: terraform/ec2
run: |
terraform init
terraform validate

# Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online.
# There may be occasional failures due to transitivity issues, so try up to 2 times.
# deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
Expand Down Expand Up @@ -151,10 +158,15 @@ jobs:
- name: Call all test APIs
continue-on-error: true
run: |
curl -S -s -o /dev/null http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call/
curl -S -s -o /dev/null http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call/
curl -S -s -o /dev/null http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}/
curl -S -s -o /dev/null http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call/
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call/
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call/
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}/
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call/

- name: Build Gradlew
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 ./gradlew

# Validation for pulse telemetry data
- name: Validate generated EMF logs
Expand Down
45 changes: 24 additions & 21 deletions .github/workflows/appsignals-e2e-eks-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ env:
SAMPLE_APP_REMOTE_SERVICE_IMAGE: ${{ secrets.APP_SIGNALS_E2E_TEST_ACC }}.dkr.ecr.${{ inputs.aws-region }}.amazonawss.com/${{ secrets.APP_SIGNALS_E2E_RE_SA_IMG }}
METRIC_NAMESPACE: AppSignals
LOG_GROUP_NAME: /aws/appsignals/eks
TEST_RESOURCES_FOLDER: /home/runner/work/aws-application-signals-test-framework/aws-application-signals-test-framework

jobs:
e2e-eks-test:
Expand Down Expand Up @@ -95,9 +96,10 @@ jobs:

- name: Install eksctl
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
mkdir ${{ github.workspace }}/eksctl
curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_Linux_amd64.tar.gz"
tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz
execute_and_retry 2 "tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz"
echo "${{ github.workspace }}/eksctl" >> $GITHUB_PATH

- name: Create role for AWS access from the sample app
Expand All @@ -113,17 +115,22 @@ jobs:
--approve

- name: Set up terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: false
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update && sudo apt install terraform

- name: Initiate Terraform
working-directory: terraform/eks
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"

- name: Deploy sample app via terraform and wait for the endpoint to come online
id: deploy-sample-app
working-directory: terraform/eks
run: |
terraform init
terraform validate

run: |
# Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online.
# There may be occasional failures due to transitivity issues, so try up to 2 times.
# deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
Expand Down Expand Up @@ -158,15 +165,6 @@ jobs:
${{ inputs.aws-region }} \
${{ env.SAMPLE_APP_NAMESPACE }}

# If the workflow provides a specific ADOT image to test, patch the deployment and restart CW agent related pods
if [ ${{ inputs.appsignals-adot-image-name }} != "" ]; then
kubectl patch deploy -namazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' \
-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/0", "value": "--auto-instrumentation-java-image=${{ inputs.appsignals-adot-image-name }}"}]'

kubectl delete pods --all -n amazon-cloudwatch
kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch
fi

kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}

Expand Down Expand Up @@ -244,10 +242,15 @@ jobs:
- name: Call all test APIs
continue-on-error: true
run: |
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/
curl -S -s http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
curl -S -s http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
curl -S -s http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
curl -S -s http://${{ env.APP_ENDPOINT }}/client-call/

- name: Build Gradlew
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "./gradlew"

# Validation for app signals telemetry data
- name: Call endpoint and validate generated EMF logs
Expand Down
31 changes: 31 additions & 0 deletions .github/workflows/util/execute_and_retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# This function is for retrying commands in the case they fail. It accepts three arguments
# $1: Number of retries it will attempt
# $2: Command to execute
# $3: (Optional) Command for cleaning up resources if $2 fails
execute_and_retry () {
retry_counter=0
max_retry=$1
command=$2
cleanup=$3
while [ $retry_counter -lt $max_retry ]; do
attempt_failed=0
eval "$command" || attempt_failed=$?

if [ $attempt_failed -ne 0 ]; then
eval "$cleanup"
retry_counter=$(($retry_counter+1))
sleep 5
else
break
fi

if [ $retry_counter -eq $max_retry ]; then
echo "Max retry reached, command failed to execute properly. Exiting code"
exit 1
fi
done
}

export -f execute_and_retry