[CI] Add CUDA on AWS run in pre-commit (#7846)

Pavel Chupin · web-flow · commit 6ee5a604bc70 · 2022-12-20T17:20:35.000-08:00
Reintroduce the change from #7790: add a CUDA run on AWS in addition to the CUDA run on the self-hosted runner. The self-hosted runner will be turned off for an OS upgrade. Note that pre-commit testing on this PR will test nothing because of the pull_request_target trigger. This change has to be merged first; its effect will then be visible on other pre-commit PRs, where we expect both the self-hosted and the AWS runs to execute in parallel. Tested on #7806.
diff --git a/.github/workflows/sycl_linux_build_and_test.yml b/.github/workflows/sycl_linux_build_and_test.yml
@@ -195,7 +195,9 @@ jobs:
 
   llvm_test_suite:
     needs: [build, aws-start]
-    if: ${{ !failure() && inputs.lts_matrix != '[]' }}
+    # Continue if build was successful. If aws-start is not successful all
+    # AWS tasks will fail, but all non-AWS tasks should continue.
+    if: ${{ always() && needs.build.result == 'success' && inputs.lts_matrix != '[]' }}
     strategy:
       fail-fast: false
       max-parallel: ${{ inputs.max_parallel }}
@@ -242,7 +244,7 @@ jobs:
         check_sycl_all: ${{ matrix.check_sycl_all }}
         results_name_suffix: ${{ matrix.config }}_${{ inputs.build_artifact_suffix }}
         cmake_args: '${{ matrix.cmake_args }} ${{ inputs.lts_cmake_extra_args }}'
-  
+
   khronos_sycl_cts:
     needs: build
     if: ${{ inputs.cts_matrix != '' }}
@@ -293,6 +295,8 @@ jobs:
   aws-stop:
     name: Stop AWS
     needs: [ aws-start, llvm_test_suite ]
+    # Always attempt to shut down the AWS instance, even if AWS start was not
+    # successful.
     if: ${{ always() && inputs.lts_aws_matrix != '[]' }}
     runs-on: ubuntu-latest
     environment: aws
diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml
@@ -48,7 +48,7 @@ jobs:
     uses: ./.github/workflows/sycl_gen_test_matrix.yml
     with:
       ref: ${{ github.event.pull_request.head.sha }}
-      lts_config: "hip_amdgpu;ocl_x64;ocl_gen9;l0_gen9;esimd_emu;cuda"
+      lts_config: "hip_amdgpu;ocl_x64;ocl_gen9;l0_gen9;esimd_emu;cuda;cuda_aws"
 
   linux_default:
     name: Linux
diff --git a/devops/actions/aws-ec2/action.yml b/devops/actions/aws-ec2/action.yml
@@ -34,6 +34,15 @@ inputs:
     #     groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
     #     sync; shutdown -h now
 
+    # us-east-1 region:
+
+    #   ami-01cb0573cb039ab24 (for g5 instances): NVIDIA GPU-Optimized AMI 22.06.0-676eed8d-dcf5-4784-87d7-0de463205c17 (ami-003f25e6e2d2db8f1 with /dev/sda1 disk) with docker and gh_runner (1001)
+    #     sudo -s
+    #     groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
+    #     sync; shutdown -h now
+
+    #  ami-058347ad2ce9aef73: ami-02ec0f344128253f9 copy in us-east-1 region
+
     # aws-spot:     Enable usage of spot instances to save money (less reliable). Makes sense only for start mode. Default true.
     # aws-disk:     AWS EC2 instance AMI specific disk device path and size in GB (8 by default). Makes sense only for start mode. Default "/dev/sda1:16".
     # aws-timebomp: AWS EC2 instance maximum live time. Makes sense only for start mode. Default "1h".
@@ -59,7 +68,7 @@ inputs:
   aws-region:
     description: "AWS EC2 region"
     required: false
-    default: "us-east-2" # Ohio
+    default: "us-east-1" # North Virginia
 
 runs:
   using: node16
diff --git a/devops/actions/aws-ec2/aws-ec2.js b/devops/actions/aws-ec2/aws-ec2.js
@@ -41,10 +41,10 @@ async function start(param_type, param_label, param_ami, param_spot, param_disk,
   const ec2types     = typeof param_type     === 'string' ? [ param_type ] : param_type;
   const label        = typeof param_label    === 'string' ? param_label : param_label[0];
   const ec2ami       = typeof param_ami      !== 'undefined' ? param_ami : "ami-0966bccbb521ccb24";
-  const ec2spot      = typeof param_spot     !== 'undefined' ? param_spot : true;
+  const ec2spot      = typeof param_spot     !== 'undefined' ? (param_spot === "false" ? false : true) : true;
   const ec2disk      = typeof param_disk     !== 'undefined' ? param_disk : "/dev/sda1:16";
   const timebomb     = typeof param_timebomb !== 'undefined' ? param_timebomb : "1h";
-  const onejob       = typeof param_onejob   !== 'undefined' ? param_onejob : true;
+  const onejob       = typeof param_onejob   !== 'undefined' ? (param_onejob === "false" ? false : true) : true;
   // ephemeral runner will exit after one job so we will terminate instance sooner
   const ephemeral_str = onejob ? "--ephemeral" : "";
 
diff --git a/devops/test_configs.json b/devops/test_configs.json
@@ -71,6 +71,19 @@
       "container_options": "--gpus all",
       "check_sycl_all": "cuda:gpu",
       "cmake_args": ""
+    },
+    {
+      "config": "cuda_aws",
+      "name": "[AWS] CUDA LLVM Test Suite",
+      "runs-on": "aws-cuda_${{ inputs.uniq }}",
+      "aws-ami": "ami-01cb0573cb039ab24",
+      "aws-type": [ "g5.2xlarge", "g5.4xlarge" ],
+      "aws-disk": "/dev/sda1:64",
+      "aws-spot": "false",
+      "image": "${{ inputs.cuda_image }}",
+      "container_options": "--gpus all",
+      "check_sycl_all": "cuda:gpu",
+      "cmake_args": ""
     }
   ],
   "cts": [