Skip to content

Add Retry for Remote Application, Addon, and EC2 Instance Inline #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions .github/workflows/actions/execute_and_retry/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Reusable Action for executing commands and retrying them if it fails
name: Command Retry Logic

# Inputs of the retry action. Every value is handed to bash as a string and
# evaluated there, so each may be a full shell command line.
inputs:
  pre-command:
    description: "(Optional) Command to run once before the retried command. To be used for environment setup, etc"
    required: false
    type: string
  # Upper bound on how many times `command` is attempted in total
  # (the retry loop runs while its counter is below this value).
  max_retry:
    description: "(Optional) Maximum number of times the command is attempted. Default is 2"
    required: false
    type: number
    default: 2
  command:
    description: "(Required) Command to execute with the retry mechanism"
    required: true
    type: string
  # Runs after every failed attempt of `command`.
  cleanup:
    description: "(Optional) Command to clean up resources after a failed attempt, before the main command is retried"
    required: false
    type: string
  post-command:
    description: "(Optional) Follow-up command to run once after the main command has finished"
    required: false
    type: string

# Three sequential bash steps: optional pre-command, the retried command, and
# an optional post-command. Inputs are passed through environment variables
# rather than interpolated into the script text.
runs:
  using: "composite"
  steps:
    # `eval` makes bash parse the string as a command line (pipes, `&&`,
    # quoting, `$(...)`); expanding a bare variable would only word-split it.
    # An empty pre-command evaluates to a successful no-op.
    - name: Run pre-command
      shell: bash
      env:
        PRE_COMMAND: ${{ inputs.pre-command }}
      run: |
        eval "$PRE_COMMAND"

    - name: Run command
      shell: bash
      env:
        MAX_RETRY: ${{ inputs.max_retry }}
        COMMAND: ${{ inputs.command }}
        CLEANUP: ${{ inputs.cleanup }}
      run: |
        # MAX_RETRY is the total number of attempts. Reject values that would
        # make the loop below skip the command and report success.
        if ! [[ "$MAX_RETRY" =~ ^[1-9][0-9]*$ ]]; then
          echo "max_retry must be a positive integer, got '$MAX_RETRY'"
          exit 1
        fi

        retry_counter=0
        while [ "$retry_counter" -lt "$MAX_RETRY" ]; do
          # `|| attempt_failed=$?` records the exit status without tripping
          # the shell's exit-on-error behaviour.
          attempt_failed=0
          eval "$COMMAND" || attempt_failed=$?

          if [ "$attempt_failed" -eq 0 ]; then
            break
          fi

          # Failed attempt: clean up, then either give up or wait and retry.
          eval "$CLEANUP"
          retry_counter=$((retry_counter + 1))

          # Compare against MAX_RETRY (the exported variable); a misspelled
          # name here would make the step pass after exhausting all attempts.
          if [ "$retry_counter" -ge "$MAX_RETRY" ]; then
            echo "Max retry reached, command failed to execute properly. Exiting code"
            exit 1
          fi

          sleep 5
        done

    # Evaluated like the pre-command; an empty post-command is a no-op.
    - name: Run post command
      shell: bash
      env:
        POST_COMMAND: ${{ inputs.post-command }}
      run: |
        eval "$POST_COMMAND"
51 changes: 33 additions & 18 deletions .github/workflows/appsignals-e2e-ec2-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ env:
LOG_GROUP_NAME: /aws/appsignals/generic
TEST: ${{ inputs.test }}
GET_ADOT_JAR_COMMAND: "wget -O adot.jar https://github.com/aws-observability/aws-otel-java-instrumentation/releases/latest/download/aws-opentelemetry-agent.jar"
GET_CW_AGENT_RPM_COMMAND: "wget -O cw-agent.rpm https://amazoncloudwatch-agent-${{ inputs.aws-region }}.s3.${{ inputs.aws-region }}.amazonaws.com/amazon_linux/amd64/1.300031.0b313/amazon-cloudwatch-agent.rpm"
TEST_RESOURCES_FOLDER: /home/runner/work/aws-application-signals-test-framework/aws-application-signals-test-framework


Expand All @@ -42,9 +43,6 @@ jobs:
with:
fetch-depth: 0

- name: Set CW Agent RPM environment variable
run: echo GET_CW_AGENT_RPM_COMMAND="wget -O cw-agent.rpm https://amazoncloudwatch-agent-${{ inputs.aws-region }}.s3.${{ inputs.aws-region }}.amazonaws.com/amazon_linux/amd64/1.300031.0b313/amazon-cloudwatch-agent.rpm" >> $GITHUB_ENV

- name: Generate testing id
run: echo TESTING_ID="${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV

Expand All @@ -67,17 +65,17 @@ jobs:
aws-region: ${{ inputs.aws-region }}

- name: Set up terraform
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update && sudo apt install terraform
# Retry only the GPG key download; register the apt repository and install
# terraform afterwards as the post-command.
uses: ./.github/workflows/actions/execute_and_retry
with:
  command: "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
  # Folded scalar: the lines below are joined with single spaces into one
  # command line. The stages are chained explicitly with `&&`, because a
  # trailing backslash inside a quoted YAML string does not act as a shell
  # line continuation once the lines have been folded together.
  post-command: >-
    echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
    | sudo tee /etc/apt/sources.list.d/hashicorp.list
    && sudo apt update
    && sudo apt install terraform

- name: Initiate Terraform
working-directory: terraform/ec2
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/ec2 && terraform init && terraform validate"
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"

- name: Deploy sample app via terraform and wait for endpoint to come online
working-directory: terraform/ec2
Expand Down Expand Up @@ -108,10 +106,26 @@ jobs:
# Attempts to connect will be made for up to 10 minutes
if [ $deployment_failed -eq 0 ]; then
echo "Attempting to connect to the endpoint"
sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
main_sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
attempt_counter=0
max_attempts=30
until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
deployment_failed=1
break
fi

printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done

echo "Attempting to connect to the remote sample app endpoint"
remote_sample_app_endpoint=http://$(terraform output sample_app_remote_service_public_ip):8080/healthcheck
attempt_counter=0
max_attempts=60
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
max_attempts=30
until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
deployment_failed=1
Expand Down Expand Up @@ -164,9 +178,10 @@ jobs:
curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call/

- name: Build Gradlew
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 ./gradlew
uses: ./.github/workflows/actions/execute_and_retry
with:
max_retry: 4
command: "./gradlew"

# Validation for pulse telemetry data
- name: Validate generated EMF logs
Expand Down
84 changes: 56 additions & 28 deletions .github/workflows/appsignals-e2e-eks-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,38 +94,42 @@ jobs:
- name: Set up kubeconfig
run: aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}

- name: Download eksctl
uses: ./.github/workflows/actions/execute_and_retry
with:
pre-command: 'mkdir ${{ github.workspace }}/eksctl'
command: 'curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_Linux_amd64.tar.gz"'

- name: Install eksctl
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
mkdir ${{ github.workspace }}/eksctl
curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_Linux_amd64.tar.gz"
execute_and_retry 2 "tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz"
tar -xzf eksctl_Linux_amd64.tar.gz -C ${{ github.workspace }}/eksctl && rm eksctl_Linux_amd64.tar.gz
echo "${{ github.workspace }}/eksctl" >> $GITHUB_PATH

- name: Create role for AWS access from the sample app
id: create_service_account
run: |
eksctl create iamserviceaccount \
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "eksctl create iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--role-name eks-s3-access-${{ env.TESTING_ID }} \
--attach-policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess \
--region ${{ inputs.aws-region }} \
--approve
--approve"

- name: Set up terraform
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update && sudo apt install terraform
# Retry only the GPG key download; register the apt repository and install
# terraform afterwards as the post-command.
uses: ./.github/workflows/actions/execute_and_retry
with:
  command: "wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg"
  # Folded scalar: the lines below are joined with single spaces into one
  # command line. The stages are chained explicitly with `&&`, because a
  # trailing backslash inside a quoted YAML string does not act as a shell
  # line continuation once the lines have been folded together.
  post-command: >-
    echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
    | sudo tee /etc/apt/sources.list.d/hashicorp.list
    && sudo apt update
    && sudo apt install terraform

- name: Initiate Terraform
working-directory: terraform/eks
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "terraform init && terraform validate" "rm -rf .terraform && rm -rf .terraform.lock.hcl"
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/eks && terraform init && terraform validate"
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"

- name: Deploy sample app via terraform and wait for the endpoint to come online
id: deploy-sample-app
Expand All @@ -152,27 +156,50 @@ jobs:
-var="sample_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_E2E_RE_SA_IMG }}" \
|| deployment_failed=$?

if [ $deployment_failed -eq 1 ]; then
if [ $deployment_failed -ne 0 ]; then
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
fi

# If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint
# after installing App Signals. Attempts to connect will be made for up to 10 minutes
if [ $deployment_failed -eq 0 ]; then
echo "Installing app signals to the sample app"
${GITHUB_WORKSPACE}/enablement-script/scripts/eks/appsignals/enable-app-signals.sh \
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/scripts/eks/appsignals/enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.SAMPLE_APP_NAMESPACE }}
${{ env.SAMPLE_APP_NAMESPACE }}" \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/scripts/eks/appsignals/clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.SAMPLE_APP_NAMESPACE }} && \
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}"

kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}

echo "Attempting to connect to the endpoint"
sample_app_endpoint=http://$(terraform output sample_app_endpoint)
echo "Attempting to connect to the main sample app endpoint"
main_sample_app_endpoint=http://$(terraform output sample_app_endpoint)
attempt_counter=0
max_attempts=30
until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
deployment_failed=1
break
fi

printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done

echo "Attempting to connect to the remote sample app endpoint"
remote_sample_app_endpoint=http://$(terraform output sample_remote_app_endpoint)/healthcheck
echo $remote_sample_app_endpoint
attempt_counter=0
max_attempts=60
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
max_attempts=30
until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
deployment_failed=1
Expand All @@ -196,7 +223,7 @@ jobs:

# Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}

echo "Destroying terraform"
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
Expand Down Expand Up @@ -248,9 +275,10 @@ jobs:
curl -S -s http://${{ env.APP_ENDPOINT }}/client-call/

- name: Build Gradlew
run: |
source ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 2 "./gradlew"
uses: ./.github/workflows/actions/execute_and_retry
with:
max_retry: 4
command: "./gradlew"

# Validation for app signals telemetry data
- name: Call endpoint and validate generated EMF logs
Expand Down
4 changes: 4 additions & 0 deletions terraform/ec2/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ resource "null_resource" "main_service_setup" {

provisioner "remote-exec" {
inline = [
# Make the Terraform fail if any step throws an error
"set -o errexit",
# Install Java 11 and wget
"sudo yum install wget java-11-amazon-corretto -y",

Expand Down Expand Up @@ -165,6 +167,8 @@ resource "null_resource" "remote_service_setup" {

provisioner "remote-exec" {
inline = [
# Make the Terraform fail if any step throws an error
"set -o errexit",
# Install Java 11 and wget
"sudo yum install wget java-11-amazon-corretto -y",

Expand Down
39 changes: 39 additions & 0 deletions terraform/eks/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,45 @@ resource "kubernetes_service" "sample_remote_app_service" {
}
}

# Ingress in front of the remote sample app service. The name is suffixed with
# var.test_id and the resource lives in var.test_namespace.
resource "kubernetes_ingress_v1" "sample-remote-app-ingress" {
# Create the ingress only after the remote sample app service exists.
depends_on = [kubernetes_service.sample_remote_app_service]
# NOTE(review): presumably makes the apply wait until the load balancer reports
# an address, which the sample_remote_app_endpoint output reads from this
# resource's status; confirm against the provider documentation.
wait_for_load_balancer = true
metadata {
name = "sample-remote-app-ingress-${var.test_id}"
namespace = var.test_namespace
# NOTE(review): these annotations are presumably consumed by the AWS Load
# Balancer Controller to provision an internet-facing ALB with IP targets;
# confirm the controller is installed on the test cluster.
annotations = {
"kubernetes.io/ingress.class" = "alb"
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
"alb.ingress.kubernetes.io/target-type" = "ip"
}
labels = {
app = "sample-remote-app-ingress"
}
}
spec {
rule {
http {
# Single rule: every path under "/" (Prefix match) is sent to port 8080 of
# the remote sample app service.
path {
path = "/"
path_type = "Prefix"
backend {
service {
name = kubernetes_service.sample_remote_app_service.metadata[0].name
port {
number = 8080
}
}
}
}
}
}
}
}

# Load balancer hostname reported in the status of the main sample app ingress.
output "sample_app_endpoint" {
  value = kubernetes_ingress_v1.sample-app-ingress.status[0].load_balancer[0].ingress[0].hostname
}

# Load balancer hostname reported in the status of the remote sample app ingress.
output "sample_remote_app_endpoint" {
  value = kubernetes_ingress_v1.sample-remote-app-ingress.status[0].load_balancer[0].ingress[0].hostname
}