Add EC2 Platform Test to Python #64

Merged · 6 commits · May 15, 2024

288 changes: 288 additions & 0 deletions .github/workflows/application-signals-python-e2e-ec2-asg-test.yml
@@ -0,0 +1,288 @@
## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
## SPDX-License-Identifier: Apache-2.0

# This is a reusable workflow for running the Python E2E Canary test for Application Signals.
# It is meant to be called from another workflow.
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
name: Application Signals Enablement E2E Testing - Python EC2 Asg Use Case
on:
  workflow_call:
    inputs:
      aws-region:
        required: true
        type: string
      staging_wheel_name:
        required: false
        default: 'aws-opentelemetry-distro'
        type: string
      caller-workflow-name:
        required: true
        type: string
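# A caller workflow supplies these inputs via a `uses:` job. A minimal sketch of a
# hypothetical caller (job name and input values are illustrative, not from this PR):
#
#   jobs:
#     python-ec2-asg-canary:
#       uses: ./.github/workflows/application-signals-python-e2e-ec2-asg-test.yml
#       secrets: inherit
#       with:
#         aws-region: us-east-1
#         caller-workflow-name: main-build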

permissions:
  id-token: write
  contents: read

env:
  AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
  SAMPLE_APP_ZIP: s3://${{ secrets.APP_SIGNALS_E2E_EC2_JAR }}-prod-${{ inputs.aws-region }}/python-sample-app.zip
  METRIC_NAMESPACE: ApplicationSignals
  LOG_GROUP_NAME: /aws/application-signals/data
  ADOT_WHEEL_NAME: ${{ inputs.staging_wheel_name }}
  TEST_RESOURCES_FOLDER: ${GITHUB_WORKSPACE}
  GET_CW_AGENT_RPM_COMMAND: "aws s3api get-object --bucket private-cloudwatch-agent-apm-beta --key linux_amd64/amazon-cloudwatch-agent.rpm cw-agent.rpm"


jobs:
  python-e2e-ec2-asg-test:
    runs-on: ubuntu-latest
    container:
      image: public.ecr.aws/h6o3z5z9/aws-application-signals-test-framework-workflow-container:latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: aws-observability/aws-application-signals-test-framework
          ref: ga-python

      - name: Generate testing id
        run: echo TESTING_ID="${{ github.job }}-${{ env.AWS_DEFAULT_REGION }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" >> $GITHUB_ENV
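        # Produces an id like python-e2e-ec2-asg-test-us-east-1-123456789-42-1
        # (run numbers are illustrative), so each run provisions uniquely named resources.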

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.E2E_SECRET_TEST_ROLE_ARN }}
          aws-region: us-east-1

      - name: Retrieve account
        uses: aws-actions/aws-secretsmanager-get-secrets@v1
        with:
          secret-ids:
            ACCOUNT_ID, region-account/${{ env.AWS_DEFAULT_REGION }}
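        # The "ACCOUNT_ID, <secret-id>" pair uses the action's alias syntax: the value of
        # the secret region-account/<region> is exposed as the ACCOUNT_ID environment variable.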

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ env.ACCOUNT_ID }}:role/${{ secrets.E2E_TEST_ROLE_ARN }}
          aws-region: ${{ env.AWS_DEFAULT_REGION }}

      - uses: actions/download-artifact@v3
        if: inputs.caller-workflow-name == 'main-build'
        with:
          name: ${{ env.ADOT_WHEEL_NAME }}

      - name: Upload main-build adot.whl to s3
        if: inputs.caller-workflow-name == 'main-build'
        run: aws s3 cp ${{ env.ADOT_WHEEL_NAME }} s3://adot-main-build-staging-jar/${{ env.ADOT_WHEEL_NAME }}

      - name: Set Get ADOT Wheel command environment variable
        working-directory: terraform/python/ec2
        run: |
          echo GET_ADOT_WHEEL_COMMAND="aws s3 cp s3://adot-main-build-staging-jar/aws_opentelemetry_distro-0.1.1.dev0-026e62a6-py3-none-any.whl ./aws_opentelemetry_distro-0.1.1.dev0-026e62a6-py3-none-any.whl && python3.9 -m pip install aws_opentelemetry_distro-0.1.1.dev0-026e62a6-py3-none-any.whl" >> $GITHUB_ENV
          # if [ ${{ inputs.caller-workflow-name }} == "main-build" ]; then
          #   # Reusing the adot-main-build-staging-jar bucket to store the python wheel file
          #   echo GET_ADOT_WHEEL_COMMAND="aws s3 cp s3://adot-main-build-staging-jar/${{ env.ADOT_WHEEL_NAME }} ./${{ env.ADOT_WHEEL_NAME }} && python3.9 -m pip install ${{ env.ADOT_WHEEL_NAME }}" >> $GITHUB_ENV
          # else
          #   echo GET_ADOT_WHEEL_COMMAND="python3.9 -m pip install aws-opentelemetry-distro" >> $GITHUB_ENV
          # fi

      - name: Initiate Terraform
        uses: ./.github/workflows/actions/execute_and_retry
        with:
          command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/ec2/asg && terraform init && terraform validate"
          cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"

      - name: Deploy sample app via terraform and wait for endpoint to come online
        working-directory: terraform/python/ec2/asg
        run: |
          # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online.
          # There may be occasional failures due to transient issues, so try up to 2 times.
          # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running,
          # while a non-zero value indicates that it failed at some point.
          retry_counter=0
          max_retry=2
          while [ $retry_counter -lt $max_retry ]; do
            echo "Attempt $retry_counter"
            deployment_failed=0
            terraform apply -auto-approve \
              -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
              -var="test_id=${{ env.TESTING_ID }}" \
              -var="sample_app_zip=${{ env.SAMPLE_APP_ZIP }}" \
              -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \
              -var="get_adot_wheel_command=${{ env.GET_ADOT_WHEEL_COMMAND }}" \
              || deployment_failed=$?

            if [ $deployment_failed -ne 0 ]; then
              echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
            fi

            # If deployment_failed is still 0, the terraform deployment succeeded; now try to connect to the endpoint.
            # Attempts to connect will be made for up to 10 minutes.
            if [ $deployment_failed -eq 0 ]; then
              echo "Attempting to connect to the endpoint"
              main_service_instance_id=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names python-ec2-single-asg-${{ env.TESTING_ID }} --region ${{ env.AWS_DEFAULT_REGION }} --query "AutoScalingGroups[].Instances[0].InstanceId" --output text)
              main_service_public_ip=$(aws ec2 describe-instances --instance-ids $main_service_instance_id --region ${{ env.AWS_DEFAULT_REGION }} --query "Reservations[].Instances[].PublicIpAddress" --output text)
              main_service_sample_app_endpoint=http://$main_service_public_ip:8000
              echo "The main service endpoint is $main_service_sample_app_endpoint"

              attempt_counter=0
              max_attempts=30
              until curl --output /dev/null --silent --head --fail $(echo "$main_service_sample_app_endpoint" | tr -d '"'); do
                if [ ${attempt_counter} -eq ${max_attempts} ]; then
                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
                  deployment_failed=1
                  break
                fi

                printf '.'
                attempt_counter=$(($attempt_counter+1))
                sleep 10
              done

              echo "Attempting to connect to the remote sample app endpoint"
              remote_sample_app_endpoint=http://$(terraform output sample_app_remote_service_public_ip):8001/healthcheck
              attempt_counter=0
              max_attempts=30
              until curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"'); do
                if [ ${attempt_counter} -eq ${max_attempts} ]; then
                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
                  deployment_failed=1
                  break
                fi

                printf '.'
                attempt_counter=$(($attempt_counter+1))
                sleep 10
              done
            fi

            # If deployment_failed is non-zero, either the terraform deployment or the endpoint connection
            # failed, so first destroy the resources created by terraform and try again.
            if [ $deployment_failed -ne 0 ]; then
              echo "Destroying terraform"
              terraform destroy -auto-approve \
                -var="test_id=${{ env.TESTING_ID }}"

              retry_counter=$(($retry_counter+1))
            else
              # If deployment succeeded, exit the loop.
              break
            fi

            if [ $retry_counter -eq $max_retry ]; then
              echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting."
              exit 1
            fi
          done
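          # For local debugging, a roughly equivalent manual run (illustrative placeholder
          # values; the -var flags mirror the apply above) might be:
          #   cd terraform/python/ec2/asg
          #   terraform init
          #   terraform apply -auto-approve -var="aws_region=us-east-1" -var="test_id=local-test" \
          #     -var="sample_app_zip=s3://<bucket>/python-sample-app.zip" \
          #     -var="get_cw_agent_rpm_command=<command>" -var="get_adot_wheel_command=<command>"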

      - name: Get sample app and EC2 instance information
        working-directory: terraform/python/ec2/asg
        run: |
          main_service_instance_id=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names python-ec2-single-asg-${{ env.TESTING_ID }} --region ${{ env.AWS_DEFAULT_REGION }} --query "AutoScalingGroups[].Instances[0].InstanceId" --output text)
          main_service_public_ip=$(aws ec2 describe-instances --instance-ids $main_service_instance_id --region ${{ env.AWS_DEFAULT_REGION }} --query "Reservations[].Instances[].PublicIpAddress" --output text)
          main_service_private_dns_name=$(aws ec2 describe-instances --instance-ids $main_service_instance_id --region ${{ env.AWS_DEFAULT_REGION }} --query "Reservations[].Instances[].PrivateDnsName" --output text)
          echo "INSTANCE_ID=$main_service_instance_id" >> $GITHUB_ENV
          echo "MAIN_SERVICE_ENDPOINT=$main_service_public_ip:8000" >> $GITHUB_ENV
          echo "PRIVATE_DNS_NAME=$main_service_private_dns_name" >> $GITHUB_ENV
          echo "EC2_INSTANCE_AMI=$(terraform output ec2_instance_ami)" >> $GITHUB_ENV
          echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_public_ip)" >> $GITHUB_ENV

      # This step speeds up the validation by generating the telemetry data in advance
      - name: Call all test APIs
        continue-on-error: true
        run: |
          curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call; echo
          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call?testingId=${{ env.TESTING_ID }}"; echo
          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}"; echo
          curl -S -s http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call; echo

      - name: Initiate Gradlew Daemon
        uses: ./.github/workflows/actions/execute_and_retry
        with:
          command: "./gradlew"
          cleanup: "./gradlew clean"
          max_retry: 4
          sleep_time: 30

      # Validation for pulse telemetry data
      - name: Validate generated EMF logs
        id: log-validation
        run: ./gradlew validator:run --args='-c python/ec2/asg/log-validation.yml
          --testing-id ${{ env.TESTING_ID }}
          --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
          --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8001
          --region ${{ env.AWS_DEFAULT_REGION }}
          --metric-namespace ${{ env.METRIC_NAMESPACE }}
          --log-group ${{ env.LOG_GROUP_NAME }}
          --service-name python-sample-application-${{ env.TESTING_ID }}
          --remote-service-name python-sample-remote-application-${{ env.TESTING_ID }}
          --query-string ip=${{ env.REMOTE_SERVICE_IP }}
          --instance-ami ${{ env.EC2_INSTANCE_AMI }}
          --platform-info python-ec2-single-asg-${{ env.TESTING_ID }}
          --instance-id ${{ env.INSTANCE_ID }}
          --private-dns-name ${{ env.PRIVATE_DNS_NAME }}
          --rollup'

      - name: Validate generated metrics
        id: metric-validation
        if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
        run: ./gradlew validator:run --args='-c python/ec2/asg/metric-validation.yml
          --testing-id ${{ env.TESTING_ID }}
          --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
          --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8001
          --region ${{ env.AWS_DEFAULT_REGION }}
          --metric-namespace ${{ env.METRIC_NAMESPACE }}
          --log-group ${{ env.LOG_GROUP_NAME }}
          --service-name python-sample-application-${{ env.TESTING_ID }}
          --remote-service-name python-sample-remote-application-${{ env.TESTING_ID }}
          --query-string ip=${{ env.REMOTE_SERVICE_IP }}
          --instance-ami ${{ env.EC2_INSTANCE_AMI }}
          --platform-info python-ec2-single-asg-${{ env.TESTING_ID }}
          --instance-id ${{ env.INSTANCE_ID }}
          --private-dns-name ${{ env.PRIVATE_DNS_NAME }}
          --rollup'

      - name: Validate generated traces
        id: trace-validation
        if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
        run: ./gradlew validator:run --args='-c python/ec2/asg/trace-validation.yml
          --testing-id ${{ env.TESTING_ID }}
          --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
          --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8001
          --region ${{ env.AWS_DEFAULT_REGION }}
          --account-id ${{ env.ACCOUNT_ID }}
          --metric-namespace ${{ env.METRIC_NAMESPACE }}
          --log-group ${{ env.LOG_GROUP_NAME }}
          --service-name python-sample-application-${{ env.TESTING_ID }}
          --remote-service-name python-sample-remote-application-${{ env.TESTING_ID }}
          --query-string ip=${{ env.REMOTE_SERVICE_IP }}
          --instance-ami ${{ env.EC2_INSTANCE_AMI }}
          --platform-info python-ec2-single-asg-${{ env.TESTING_ID }}
          --instance-id ${{ env.INSTANCE_ID }}
          --private-dns-name ${{ env.PRIVATE_DNS_NAME }}
          --rollup'

      - name: Publish metric on test result
        if: always()
        run: |
          # Emit 0.0 when all three validations succeeded, 1.0 otherwise.
          if [ "${{ steps.log-validation.outcome }}" = "success" ] && [ "${{ steps.metric-validation.outcome }}" = "success" ] && [ "${{ steps.trace-validation.outcome }}" = "success" ]; then
            failure_value=0.0
          else
            failure_value=1.0
          fi
          aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
            --metric-name Failure \
            --dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
            --value $failure_value \
            --region ${{ env.AWS_DEFAULT_REGION }}
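          # To inspect the canary's health afterwards (illustrative command, not part
          # of the test; the time window is a placeholder):
          #   aws cloudwatch get-metric-statistics --namespace ADOT/GitHubActions \
          #     --metric-name Failure --statistics Sum --period 3600 \
          #     --start-time 2024-05-15T00:00:00Z --end-time 2024-05-16T00:00:00Z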

      # Clean up procedures
      - name: Terraform destroy
        if: always()
        continue-on-error: true
        working-directory: terraform/python/ec2/asg
        run: |
          terraform destroy -auto-approve \
            -var="test_id=${{ env.TESTING_ID }}"