aws-observability · harrryr · Jul 26, 2024 · Jul 28, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/.github/workflows/java-ec2-asg-e2e-test.yml b/.github/workflows/java-ec2-asg-e2e-test.yml
@@ -122,60 +122,13 @@ jobs:
               -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
               -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \
               -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}" \
+              -var="canary_type=${{ github.job }}" \
             || deployment_failed=$?
 
             if [ $deployment_failed -eq 1 ]; then
               echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
             fi
 
-            # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint.
-            # Attempts to connect will be made for up to 10 minutes
-            if [ $deployment_failed -eq 0 ]; then
-              echo "Attempting to connect to the endpoint"
-              main_service_instance_id=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names ec2-single-asg-${{ env.TESTING_ID }} --region ${{ env.E2E_TEST_AWS_REGION }} --query "AutoScalingGroups[].Instances[0].InstanceId" --output text)
-              main_service_public_ip=$(aws ec2 describe-instances --instance-ids $main_service_instance_id --region ${{ env.E2E_TEST_AWS_REGION }} --query "Reservations[].Instances[].PublicIpAddress" --output text)
-              main_service_private_dns_name=$(aws ec2 describe-instances --instance-ids $main_service_instance_id --region ${{ env.E2E_TEST_AWS_REGION }} --query "Reservations[].Instances[].PrivateDnsName" --output text)
-
-              echo "INSTANCE_ID=$main_service_instance_id" >> $GITHUB_ENV
-              echo "MAIN_SERVICE_ENDPOINT=$main_service_public_ip:8080" >> $GITHUB_ENV
-              echo "PRIVATE_DNS_NAME=$main_service_private_dns_name" >> $GITHUB_ENV
-              echo "EC2_INSTANCE_AMI=$(terraform output ec2_instance_ami)" >> $GITHUB_ENV
-              echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_public_ip)" >> $GITHUB_ENV
-
-              main_service_sample_app_endpoint=http://$main_service_public_ip:8080
-              echo "The main service endpoint is $main_service_sample_app_endpoint"
-          
-              attempt_counter=0
-              max_attempts=30
-              until $(curl --output /dev/null --silent --head --fail $(echo "$main_service_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
-
-              echo "Attempting to connect to the remote sample app endpoint"
-              remote_sample_app_endpoint=http://$(terraform output sample_app_remote_service_public_ip):8080/healthcheck
-              attempt_counter=0
-              max_attempts=30
-              until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
-            fi
-
             # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
             # resources created from terraform and try again.
             if [ $deployment_failed -eq 1 ]; then
@@ -195,14 +148,16 @@ jobs:
             fi
           done
 
-      # This steps increases the speed of the validation by creating the telemetry data in advance
-      - name: Call all test APIs
-        continue-on-error: true
-        run: |        
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call?ip=${{ env.REMOTE_SERVICE_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call"
+      - name: Get the sample app and EC2 instance information
+        working-directory: terraform/java/ec2/asg  # the `terraform output` calls below run from this directory
+        run: |
+          asg_instance_id=$(aws autoscaling describe-auto-scaling-groups --region ${{ env.E2E_TEST_AWS_REGION }} --auto-scaling-group-names ec2-single-asg-${{ env.TESTING_ID }} --query "AutoScalingGroups[].Instances[0].InstanceId" --output text)
+          asg_instance_dns=$(aws ec2 describe-instances --region ${{ env.E2E_TEST_AWS_REGION }} --instance-ids $asg_instance_id --query "Reservations[].Instances[].PrivateDnsName" --output text)
+          echo "INSTANCE_ID=${asg_instance_id}" >> "$GITHUB_ENV"
+          echo "MAIN_SERVICE_ENDPOINT=localhost:8080" >> "$GITHUB_ENV"
+          echo "PRIVATE_DNS_NAME=${asg_instance_dns}" >> "$GITHUB_ENV"
+          echo "EC2_INSTANCE_AMI=$(terraform output ec2_instance_ami)" >> "$GITHUB_ENV"
+          echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_private_ip)" >> "$GITHUB_ENV"
 
       - name: Initiate Gradlew Daemon
         if: steps.initiate-gradlew == 'failure'

diff --git a/.github/workflows/java-ec2-default-e2e-test.yml b/.github/workflows/java-ec2-default-e2e-test.yml
@@ -30,7 +30,6 @@ env:
   LOG_GROUP_NAME: /aws/application-signals/data
   TEST_RESOURCES_FOLDER: ${GITHUB_WORKSPACE}
 
-
 jobs:
   java-ec2-default:
     runs-on: ubuntu-latest
@@ -99,7 +98,7 @@ jobs:
       - name: Initiate Terraform
         uses: ./.github/workflows/actions/execute_and_retry
         with:
-          command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/ec2/default && terraform init && terraform validate"
+          command: "pwd && cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/ec2/default && terraform init && terraform validate"
           cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"
           max_retry: 6
           sleep_time: 60
@@ -123,48 +122,17 @@ jobs:
               -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
               -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \
               -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}" \
+              -var="canary_type=${{ github.job }}" \
             || deployment_failed=$?
+          
+            # Deliberately no dump of `terraform state show aws_key_pair.aws_ssh_key` or of the
+            # `private_key_content` output here: printing them would write SSH key material into the job log.
+          
 
             if [ $deployment_failed -eq 1 ]; then
               echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
             fi
 
-            # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint.
-            # Attempts to connect will be made for up to 10 minutes
-            if [ $deployment_failed -eq 0 ]; then
-              echo "Attempting to connect to the endpoint"
-              main_sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
-              attempt_counter=0
-              max_attempts=30
-              until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
-
-              echo "Attempting to connect to the remote sample app endpoint"
-              remote_sample_app_endpoint=http://$(terraform output sample_app_remote_service_public_ip):8080/healthcheck
-              attempt_counter=0
-              max_attempts=30
-              until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
-            fi
-
             # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
             # resources created from terraform and try again.
             if [ $deployment_failed -eq 1 ]; then
@@ -192,19 +160,10 @@ jobs:
       - name: Get the sample app and EC2 instance information
         working-directory: terraform/java/ec2/default
         run: |
-          echo "MAIN_SERVICE_ENDPOINT=$(terraform output sample_app_main_service_public_dns):8080" >> $GITHUB_ENV
-          echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_public_ip)" >> $GITHUB_ENV
+          echo "MAIN_SERVICE_ENDPOINT=localhost:8080" >> $GITHUB_ENV
+          echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_private_ip)" >> $GITHUB_ENV
           echo "MAIN_SERVICE_INSTANCE_ID=$(terraform output main_service_instance_id)" >> $GITHUB_ENV
 
-      # This steps increases the speed of the validation by creating the telemetry data in advance
-      - name: Call all test APIs
-        continue-on-error: true
-        run: |
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call?ip=${{ env.REMOTE_SERVICE_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call"
-
       - name: Initiate Gradlew Daemon
         if: steps.initiate-gradlew == 'failure'
         uses: ./.github/workflows/actions/execute_and_retry

diff --git a/.github/workflows/java-eks-e2e-test.yml b/.github/workflows/java-eks-e2e-test.yml
@@ -232,39 +232,6 @@ jobs:
 
               execute_and_retry 2 "kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 60
               execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 10
-
-              echo "Attempting to connect to the main sample app endpoint"
-              main_sample_app_endpoint=http://$(terraform output sample_app_endpoint)
-              attempt_counter=0
-              max_attempts=60
-              until $(curl --output /dev/null --silent --head --fail $(echo "$main_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint ($main_sample_app_endpoint). Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
-
-              echo "Attempting to connect to the remote sample app endpoint"
-              remote_sample_app_endpoint=http://$(terraform output sample_remote_app_endpoint)/healthcheck
-              echo $remote_sample_app_endpoint
-              attempt_counter=0
-              max_attempts=30
-              until $(curl --output /dev/null --silent --head --fail $(echo "$remote_sample_app_endpoint" | tr -d '"')); do
-                if [ ${attempt_counter} -eq ${max_attempts} ];then
-                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
-                  deployment_failed=1
-                  break
-                fi
-
-                printf '.'
-                attempt_counter=$(($attempt_counter+1))
-                sleep 10
-              done
             fi
 
             # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
@@ -333,18 +300,28 @@ jobs:
           echo "REMOTE_SERVICE_POD_IP=$(kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].status.podIP}')" >> $GITHUB_ENV
 
       - name: Get the sample app endpoint
-        working-directory: terraform/java/eks
-        run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
+        run: echo "APP_ENDPOINT=$(kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=sample-app -o jsonpath='{.items[0].status.podIP}'):8080" >> $GITHUB_ENV
 
-      # This steps increases the speed of the validation by creating the telemetry data in advance
-      - name: Call all test APIs
-        continue-on-error: true
+      - name: Deploy the traffic generator
         run: |
-          curl -S -s "http://${{ env.APP_ENDPOINT }}/outgoing-http-call"
-          curl -S -s "http://${{ env.APP_ENDPOINT }}/aws-sdk-call?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"
-          curl -S -s "http://${{ env.APP_ENDPOINT }}/client-call"
-          curl -S -s "http://${{ env.APP_ENDPOINT }}/mysql"
+          # Deploy the traffic generator
+          kubectl create deployment -n ${{ env.SAMPLE_APP_NAMESPACE }} traffic-generator \
+            --image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/e2e-test-resource:traffic-generator \
+            --replicas=1
+
+          # Patch it with ImagePull always policy so that it pulls the latest image from the ECR
+          kubectl patch deployment -n ${{ env.SAMPLE_APP_NAMESPACE }} traffic-generator --patch '{"spec": {"template": {"spec": {"containers": [{"name": "e2e-test-resource", "imagePullPolicy": "Always"}]}}}}'
+
+          # Set every environment variable in a single call so the deployment is re-rolled once instead of four times
+          kubectl set env -n ${{ env.SAMPLE_APP_NAMESPACE }} deployment/traffic-generator \
+            MAIN_ENDPOINT=${{ env.APP_ENDPOINT }} \
+            REMOTE_ENDPOINT=${{ env.REMOTE_SERVICE_POD_IP }} \
+            ID=${{ env.TESTING_ID }} \
+            CANARY_TYPE=${{ github.job }}
+
+          # Restart the traffic generator with the new configuration (best-effort), then give the new pod a moment to start
+          kubectl delete pod -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=traffic-generator || true
+          sleep 10
 
       - name: Initiate Gradlew Daemon
         if: steps.initiate-gradlew == 'failure'