Skip to content

Commit ee781e4

Browse files
authored
[CI] Add AWS EC2 dynamic runner support (#6471)
This adds infrastructure to spawn AWS EC2 runners dynamically for LLVM Test Suite (lts) testing. It is only functional if you add "aws-type" keys, as well as the other keys, to the devops/test_configs.json configuration file like this: { "config": "hip_amdgpu", "name": "HIP AMDGPU LLVM Test Suite", "runs-on": "aws-amdgpu_${{ inputs.uniq }}", "aws-ami": "ami-0ccda708841dde988", "aws-type": [ "g4ad.2xlarge", "g4ad.4xlarge" ], "aws-spot": false, "aws-disk": "/dev/xvda:64", "image": "${{ inputs.amdgpu_image }}", "container_options": "--device=/dev/dri --device=/dev/kfd", "check_sycl_all": "hip:gpu,host", "cmake_args": "-DHIP_PLATFORM=\"AMD\" -DAMD_ARCH=\"gfx1031\"" }, { "config": "cuda", "name": "CUDA LLVM Test Suite", "runs-on": "aws-cuda_${{ inputs.uniq }}", "aws-ami": "ami-02ec0f344128253f9", "aws-type": [ "g4dn.2xlarge", "g4dn.4xlarge" ], "aws-disk": "/dev/xvda:64", "image": "${{ inputs.cuda_image }}", "container_options": "--gpus all", "check_sycl_all": "cuda:gpu,host", "cmake_args": "" } Also, please make sure that other non-AMD/NVIDIA GPU jobs do not have overly generic self-hosted runner labels like "Linux" or "x64"; otherwise they can be scheduled on these AWS hosts, and we do not want to use them for generic workloads. The Intel-provided AWS account is supposed to be used. To configure it for this repo, please do the following (I will keep this BKM schematic to avoid disclosing any sensitive info): 1 Log in to the AWS Intel account as admin 2 Go to IAM users (https://us-east-1.console.aws.amazon.com/iamv2/home?region=us-east-1#/users) 3 Click "Add users" 4 Select "Access key - Programmatic access" 5 Copy permissions from the existing user (sycl-ci) 6 Get the new user's AWS key and secret key strings (keep them private until step 11). 
7 Delete the original user sycl-ci (so I can no longer use this AWS account for the apstasen/llvm repo for test purposes) 8 Go to https://github.com/intel/llvm/settings/secrets/actions 9 Create an "aws" environment and make sure you select required reviewers for extra security (they need to pay special attention that PRs do not expose secrets by making changes to workflow .yml and devops .js files) 10 Create AWS_ACCESS_KEY and AWS_SECRET_KEY secrets using the newly obtained AWS IAM user key strings. 11 Destroy all copies of the AWS key and secret key strings (except the ones stored as GitHub "aws" environment secrets) 12 Create a repository secret GH_PERSONAL_ACCESS_TOKEN (or, even better, put it into the "aws" environment too for better security) containing a GitHub API key with "repo" permissions
1 parent 79434e4 commit ee781e4

File tree

10 files changed

+395
-3
lines changed

10 files changed

+395
-3
lines changed

.github/workflows/sycl_linux_build_and_test.yml

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ on:
4040
type: string
4141
required: false
4242
default: ""
43+
lts_aws_matrix:
44+
type: string
45+
required: false
46+
default: ""
4347
lts_cmake_extra_args:
4448
type: string
4549
required: false
@@ -155,9 +159,31 @@ jobs:
155159
name: sycl_lit_${{ inputs.build_artifact_suffix }}
156160
path: lit.tar.xz
157161

158-
llvm_test_suite:
162+
aws-start:
163+
name: Start AWS
159164
needs: build
160-
if: ${{ inputs.lts_matrix != '' }}
165+
if: ${{ inputs.lts_aws_matrix != '' }}
166+
runs-on: ubuntu-latest
167+
environment: aws
168+
steps:
169+
# Download the aws-ec2 action files from the sycl branch into ./aws-ec2 and
# install their npm dependencies. The URLs carry an explicit https:// scheme:
# a scheme-less URL makes wget start over plain HTTP, and the fetched script
# is subsequently run with the AWS and GitHub secrets as inputs.
- name: Setup script
170+
run: |
171+
mkdir -p ./aws-ec2
172+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2
173+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2
174+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2
175+
npm install ./aws-ec2
176+
- name: Start AWS EC2 runners
177+
uses: ./aws-ec2
178+
with:
179+
runs-on-list: ${{ inputs.lts_aws_matrix }}
180+
GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
181+
AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
182+
AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
183+
184+
llvm_test_suite:
185+
needs: [build, aws-start]
186+
if: ${{ !failure() && inputs.lts_matrix != '' }}
161187
strategy:
162188
fail-fast: false
163189
max-parallel: ${{ inputs.max_parallel }}
@@ -203,3 +229,26 @@ jobs:
203229
check_sycl_all: ${{ matrix.check_sycl_all }}
204230
results_name_suffix: ${{ matrix.config }}_${{ inputs.build_artifact_suffix }}
205231
cmake_args: '${{ matrix.cmake_args }} ${{ inputs.lts_cmake_extra_args }}'
232+
233+
aws-stop:
234+
name: Stop AWS
235+
needs: [ aws-start, llvm_test_suite ]
236+
# Run even when earlier jobs failed, but only if AWS runners were requested.
# Keyed on `lts_aws_matrix` (the input aws-start checks and this job passes as
# runs-on-list); the previous `lts_ats_matrix` was a misspelling of it.
if: ${{ always() && inputs.lts_aws_matrix != '' }}
237+
runs-on: ubuntu-latest
238+
environment: aws
239+
steps:
240+
# Download the aws-ec2 action files from the sycl branch into ./aws-ec2 and
# install their npm dependencies. The URLs carry an explicit https:// scheme:
# a scheme-less URL makes wget start over plain HTTP, and the fetched script
# is subsequently run with the AWS and GitHub secrets as inputs.
- name: Setup script
241+
run: |
242+
mkdir -p ./aws-ec2
243+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2
244+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2
245+
wget https://raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2
246+
npm install ./aws-ec2
247+
- name: Stop AWS EC2 runners
248+
uses: ./aws-ec2
249+
with:
250+
runs-on-list: ${{ inputs.lts_aws_matrix }}
251+
mode: stop
252+
GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
253+
AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
254+
AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}

.github/workflows/sycl_nightly.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ jobs:
2020
if: github.repository == 'intel/llvm'
2121
uses: ./.github/workflows/sycl_linux_build_and_test.yml
2222
needs: resolve_matrix
23+
secrets: inherit
2324
with:
2425
build_cache_root: "/__w/"
2526
build_artifact_suffix: default
@@ -29,6 +30,7 @@ jobs:
2930
if: github.repository == 'intel/llvm'
3031
uses: ./.github/workflows/sycl_linux_build_and_test.yml
3132
needs: resolve_matrix
33+
secrets: inherit
3234
with:
3335
build_cache_root: "/__w/"
3436
build_cache_suffix: opaque_pointers

.github/workflows/sycl_post_commit.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,16 @@ jobs:
2121
name: Linux Default
2222
needs: resolve_matrix
2323
uses: ./.github/workflows/sycl_linux_build_and_test.yml
24+
secrets: inherit
2425
with:
2526
build_cache_root: "/__w/llvm"
2627
build_artifact_suffix: "post_commit"
2728
lts_matrix: ${{ needs.resolve_matrix.outputs.lts_matrix }}
29+
lts_aws_matrix: ${{ needs.resolve_matrix.outputs.lts_aws_matrix }}
2830
linux_no_assert:
2931
name: Linux (no assert)
3032
uses: ./.github/workflows/sycl_linux_build_and_test.yml
33+
secrets: inherit
3134
with:
3235
build_cache_root: "/__w/llvm"
3336
build_cache_suffix: gcc_no_assertions

.github/workflows/sycl_precommit.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: SYCL
22

33
on:
4-
pull_request:
4+
pull_request_target:
55
branches:
66
- sycl
77
# Do not run builds if changes are only in the following locations
@@ -25,6 +25,7 @@ jobs:
2525
steps:
2626
- uses: actions/checkout@v2
2727
with:
28+
persist-credentials: false
2829
fetch-depth: 2
2930
- name: Run clang-format
3031
uses: ./devops/actions/clang-format
@@ -43,9 +44,11 @@ jobs:
4344
needs: [lint, resolve_matrix]
4445
if: always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint'))
4546
uses: ./.github/workflows/sycl_linux_build_and_test.yml
47+
secrets: inherit
4648
with:
4749
build_cache_root: "/__w/"
4850
build_cache_size: "8G"
4951
build_artifact_suffix: "default"
5052
build_cache_suffix: "default"
5153
lts_matrix: ${{ needs.resolve_matrix.outputs.lts_matrix }}
54+
lts_aws_matrix: ${{ needs.resolve_matrix.outputs.lts_aws_matrix }}

.github/workflows/sycl_resolve_test_matrix.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,18 @@ on:
1919
type: string
2020
required: true
2121
default: ""
22+
uniq:
23+
description: Unique string to name dynamic runners in AWS
24+
type: string
25+
required: false
26+
default: ${{ github.run_id }}-${{ github.run_attempt }}
2227
outputs:
2328
lts_matrix:
2429
description: "Generated Matrix"
2530
value: ${{ jobs.resolve_matrix.outputs.lts_matrix }}
31+
lts_aws_matrix:
32+
description: "Generated Matrix AWS subset"
33+
value: ${{ jobs.resolve_matrix.outputs.lts_aws_matrix }}
2634
jobs:
2735
resolve_matrix:
2836
name: Resolve Test Matrix

devops/actions/aws-ec2/action.yml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
name: aws-ec2
2+
description: Start AWS EC2 instances with Github actions runner agent in it
3+
4+
inputs:
5+
runs-on-list:
6+
description: "JSON string with array of objects with aws-type, runs-on, aws-ami, aws-spot, aws-disk, aws-timebomb, one-job properties"
7+
required: true
8+
# aws-type: AWS EC2 instance type. This property must be present if you want to trigger AWS EC2 instance start/stop.
9+
# runs-on: Name of the unique label assigned to the runner used as 'runs-on' property for the following jobs. Mandatory presence required.
10+
# aws-ami: AWS AMI id. Makes sense only for start mode. Default "ami-0966bccbb521ccb24".
11+
12+
# ami-0966bccbb521ccb24: Ubuntu 22.04 (ami-02f3416038bdb17fb with /dev/sda1 disk) with docker installed and gh_runner (1001) like this:
13+
# sudo -s
14+
# apt-get update
15+
# curl -fsSL https://get.docker.com -o /tmp/get-docker.sh
16+
# sh /tmp/get-docker.sh
17+
# groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
18+
# sync; shutdown -h now
19+
20+
# ami-02ec0f344128253f9: Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver (ami-06bf0a3f89fe08f0a with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
21+
# sudo -s
22+
# yum update -y
23+
# amazon-linux-extras install docker
24+
# sudo systemctl --now enable docker
25+
# distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
26+
# yum-config-manager --disable amzn2-graphics; yum clean expire-cache; yum install -y nvidia-docker2; systemctl restart docker
27+
# groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
28+
# sync; shutdown -h now
29+
30+
# ami-0ccda708841dde988: Amazon Linux 2 AMI with AMD Radeon Pro Driver (ami-0bb1072e787242eb6 with /dev/xvda disk) with docker installed and gh_runner (1001) like this:
31+
# sudo -s
32+
# amazon-linux-extras install docker
33+
# sudo systemctl --now enable docker
34+
# groupadd -g 1001 gh_runner; useradd gh_runner -u 1001 -g 1001 -m -s /bin/bash; usermod -aG docker gh_runner; usermod -aG video gh_runner
35+
# sync; shutdown -h now
36+
37+
# aws-spot: Enable usage of spot instances to save money (less reliable). Makes sense only for start mode. Default true.
38+
# aws-disk: AWS EC2 instance AMI specific disk device path and size in GB (8 by default). Makes sense only for start mode. Default "/dev/sda1:16".
39+
# aws-timebomb: AWS EC2 instance maximum live time. Makes sense only for start mode. Default "1h".
40+
# one-job: Will terminate AWS EC2 instance after one job (not waiting for stop job) saving money. Makes sense only for start mode. Default true.
41+
42+
mode:
43+
description: "Mode of operation: start or stop"
44+
required: false
45+
default: start
46+
47+
GH_PERSONAL_ACCESS_TOKEN:
48+
description: "Github personal access token with repo permission"
49+
required: true
50+
51+
AWS_ACCESS_KEY:
52+
description: "AWS access id"
53+
required: true
54+
55+
AWS_SECRET_KEY:
56+
description: "AWS access secret key"
57+
required: true
58+
59+
aws-region:
60+
description: "AWS EC2 region"
61+
required: false
62+
default: "us-east-2" # Ohio
63+
64+
# JavaScript action entry point. node12 is no longer a supported action
# runtime on GitHub-hosted runners, so a currently supported one is named.
# NOTE(review): confirm aws-ec2.js and its npm dependencies run on Node 20.
runs:
65+
using: node20
66+
main: ./aws-ec2.js

0 commit comments

Comments
 (0)