Skip to content

Commit efd2883

Browse files
authored
Add role to install NVIDIA DOCA on top of an existing "fat" image (#492)
* add doca role run by fatimage * add workflow to test doca build * make packer inventory groups clearer and allow defining no extra * update packer workflows for new packer config * define builds entirely via matrix * WIP: do DOCA CI build on top of current fat image * fixup matrix for changes * fix doca workflow typo * use current fatimage for doca test build * enable fatimage to be used for volume-backed builds * bump CI image * doca workflow: clean up image and only run on relevant changes * remove commented-out code * add DOCA README * fix DOCA role actually running * tidy up DOCA play * include doca packages in image summary * fix squid being selected for any StackHPC build VM * fix nightly build concurrency * re-add squid to StackHPC builder group * remove debugging exit * update image build docs * update packer docs
1 parent 4de581c commit efd2883

File tree

14 files changed

+323
-155
lines changed

14 files changed

+323
-155
lines changed

.github/workflows/doca.yml

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
name: Test DOCA extra build
2+
on:
3+
workflow_dispatch:
4+
push:
5+
branches:
6+
- main
7+
paths:
8+
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
9+
- 'ansible/roles/doca/**'
10+
- '.github/workflows/doca'
11+
pull_request:
12+
paths:
13+
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
14+
- 'ansible/roles/doca/**'
15+
- '.github/workflows/doca'
16+
17+
jobs:
18+
doca:
19+
name: doca-build
20+
concurrency:
21+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
22+
cancel-in-progress: true
23+
runs-on: ubuntu-22.04
24+
strategy:
25+
fail-fast: false # allow other matrix jobs to continue even if one fails
26+
matrix: # build RL8, RL9
27+
build:
28+
- image_name: openhpc-doca-RL8
29+
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
30+
inventory_groups: doca
31+
- image_name: openhpc-doca-RL9
32+
source_image_name_key: RL9
33+
inventory_groups: doca
34+
env:
35+
ANSIBLE_FORCE_COLOR: True
36+
OS_CLOUD: openstack
37+
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
38+
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
39+
40+
steps:
41+
- uses: actions/checkout@v2
42+
43+
- name: Load current fat images into GITHUB_ENV
44+
# see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
45+
run: |
46+
{
47+
echo 'FAT_IMAGES<<EOF'
48+
cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
49+
echo EOF
50+
} >> "$GITHUB_ENV"
51+
52+
- name: Record settings
53+
run: |
54+
echo CI_CLOUD: ${{ env.CI_CLOUD }}
55+
echo FAT_IMAGES: ${FAT_IMAGES}
56+
57+
- name: Setup ssh
58+
run: |
59+
set -x
60+
mkdir ~/.ssh
61+
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
62+
chmod 0600 ~/.ssh/id_rsa
63+
shell: bash
64+
65+
- name: Add bastion's ssh key to known_hosts
66+
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
67+
shell: bash
68+
69+
- name: Install ansible etc
70+
run: dev/setup-env.sh
71+
72+
- name: Write clouds.yaml
73+
run: |
74+
mkdir -p ~/.config/openstack/
75+
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
76+
shell: bash
77+
78+
- name: Setup environment
79+
run: |
80+
. venv/bin/activate
81+
. environments/.stackhpc/activate
82+
83+
- name: Build fat image with packer
84+
id: packer_build
85+
run: |
86+
set -x
87+
. venv/bin/activate
88+
. environments/.stackhpc/activate
89+
cd packer/
90+
packer init .
91+
92+
PACKER_LOG=1 packer build \
93+
-on-error=${{ vars.PACKER_ON_ERROR }} \
94+
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
95+
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
96+
-var "image_name=${{ matrix.build.image_name }}" \
97+
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
98+
openstack.pkr.hcl
99+
100+
- name: Get created image names from manifest
101+
id: manifest
102+
run: |
103+
. venv/bin/activate
104+
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
105+
while ! openstack image show -f value -c name $IMAGE_ID; do
106+
sleep 5
107+
done
108+
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
109+
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
110+
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
111+
echo $IMAGE_ID > image-id.txt
112+
echo $IMAGE_NAME > image-name.txt
113+
114+
- name: Make image usable for further builds
115+
run: |
116+
. venv/bin/activate
117+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
118+
119+
- name: Delete image for automatically-run workflows
120+
run: |
121+
. venv/bin/activate
122+
openstack image delete "${{ steps.manifest.outputs.image-id }}"
123+
if: ${{ github.event_name != 'workflow_dispatch' }}
124+
125+
- name: Upload manifest artifact
126+
uses: actions/upload-artifact@v4
127+
with:
128+
name: image-details-${{ matrix.build.image_name }}
129+
path: |
130+
./image-id.txt
131+
./image-name.txt
132+
overwrite: true

.github/workflows/fatimage.yml

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,30 +15,23 @@ jobs:
1515
openstack:
1616
name: openstack-imagebuild
1717
concurrency:
18-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
18+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
1919
cancel-in-progress: true
2020
runs-on: ubuntu-22.04
2121
strategy:
2222
fail-fast: false # allow other matrix jobs to continue even if one fails
2323
matrix: # build RL8, RL9
24-
os_version:
25-
- RL8
26-
- RL9
2724
build:
28-
- openstack.openhpc
25+
- image_name: openhpc-RL8
26+
source_image_name: rocky-latest-RL8
27+
inventory_groups: control,compute,login
28+
- image_name: openhpc-RL9
29+
source_image_name: rocky-latest-RL9
30+
inventory_groups: control,compute,login
2931
env:
3032
ANSIBLE_FORCE_COLOR: True
3133
OS_CLOUD: openstack
3234
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
33-
SOURCE_IMAGES_MAP: |
34-
{
35-
"RL8": {
36-
"openstack.openhpc": "rocky-latest-RL8"
37-
},
38-
"RL9": {
39-
"openstack.openhpc": "rocky-latest-RL9"
40-
}
41-
}
4235
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
4336

4437
steps:
@@ -85,13 +78,11 @@ jobs:
8578
8679
PACKER_LOG=1 packer build \
8780
-on-error=${{ vars.PACKER_ON_ERROR }} \
88-
-only=${{ matrix.build }} \
8981
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
90-
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
82+
-var "source_image_name=${{ matrix.build.source_image_name }}" \
83+
-var "image_name=${{ matrix.build.image_name }}" \
84+
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
9185
openstack.pkr.hcl
92-
env:
93-
PKR_VAR_os_version: ${{ matrix.os_version }}
94-
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}
9586
9687
- name: Get created image names from manifest
9788
id: manifest
@@ -102,13 +93,20 @@ jobs:
10293
sleep 5
10394
done
10495
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
96+
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
97+
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
10598
echo $IMAGE_ID > image-id.txt
10699
echo $IMAGE_NAME > image-name.txt
107100
101+
- name: Make image usable for further builds
102+
run: |
103+
. venv/bin/activate
104+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
105+
108106
- name: Upload manifest artifact
109107
uses: actions/upload-artifact@v4
110108
with:
111-
name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
109+
name: image-details-${{ matrix.build.image_name }}
112110
path: |
113111
./image-id.txt
114112
./image-name.txt

.github/workflows/nightlybuild.yml

Lines changed: 20 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,32 +11,29 @@ on:
1111
- SMS
1212
- ARCUS
1313
schedule:
14-
- cron: '0 0 * * *' # Run at midnight
14+
- cron: '0 0 * * *' # Run at midnight on default branch
1515

1616
jobs:
1717
openstack:
1818
name: openstack-imagebuild
1919
concurrency:
20-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
20+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
2121
cancel-in-progress: true
2222
runs-on: ubuntu-22.04
2323
strategy:
2424
fail-fast: false # allow other matrix jobs to continue even if one fails
2525
matrix: # build RL8, RL9
26-
os_version:
27-
- RL8
28-
- RL9
2926
build:
30-
- openstack.rocky-latest
27+
- image_name: rocky-latest-RL8
28+
source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
29+
inventory_groups: update
30+
- image_name: rocky-latest-RL9
31+
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
32+
inventory_groups: update
3133
env:
3234
ANSIBLE_FORCE_COLOR: True
3335
OS_CLOUD: openstack
3436
CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
35-
SOURCE_IMAGES_MAP: |
36-
{
37-
"RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
38-
"RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
39-
}
4037
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
4138

4239
steps:
@@ -83,15 +80,12 @@ jobs:
8380
8481
PACKER_LOG=1 packer build \
8582
-on-error=${{ vars.PACKER_ON_ERROR }} \
86-
-only=${{ matrix.build }} \
8783
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
88-
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
84+
-var "source_image_name=${{ matrix.build.source_image_name }}" \
85+
-var "image_name=${{ matrix.build.image_name }}" \
86+
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
8987
openstack.pkr.hcl
9088
91-
env:
92-
PKR_VAR_os_version: ${{ matrix.os_version }}
93-
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }}
94-
9589
- name: Get created image names from manifest
9690
id: manifest
9791
run: |
@@ -125,7 +119,7 @@ jobs:
125119
name: upload-nightly-targets
126120
needs: openstack
127121
concurrency:
128-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }}
122+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }}
129123
cancel-in-progress: true
130124
runs-on: ubuntu-22.04
131125
strategy:
@@ -135,18 +129,15 @@ jobs:
135129
- LEAFCLOUD
136130
- SMS
137131
- ARCUS
138-
os_version:
139-
- RL8
140-
- RL9
141-
image:
142-
- rocky-latest
132+
build:
133+
- image_name: rocky-latest-RL8
134+
- image_name: rocky-latest-RL9
143135
exclude:
144136
- target_cloud: LEAFCLOUD
145137
env:
146138
OS_CLOUD: openstack
147139
SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
148140
TARGET_CLOUD: ${{ matrix.target_cloud }}
149-
IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}"
150141
steps:
151142
- uses: actions/checkout@v2
152143

@@ -161,42 +152,37 @@ jobs:
161152
. venv/bin/activate
162153
pip install -U pip
163154
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
164-
shell: bash
165155
166156
- name: Write clouds.yaml
167157
run: |
168158
mkdir -p ~/.config/openstack/
169159
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml
170160
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml
171-
shell: bash
172161
173162
- name: Download source image
174163
run: |
175164
. venv/bin/activate
176165
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml
177-
openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }}
178-
shell: bash
166+
openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }}
179167
180168
- name: Upload to target cloud
181169
run: |
182170
. venv/bin/activate
183171
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
184172
185-
openstack image create "${{ env.IMAGE_NAME }}" \
186-
--file "${{ env.IMAGE_NAME }}" \
173+
openstack image create "${{ matrix.build.image_name }}" \
174+
--file "${{ matrix.build.image_name }}" \
187175
--disk-format qcow2 \
188-
shell: bash
189176
190177
- name: Delete old latest image from target cloud
191178
run: |
192179
. venv/bin/activate
193180
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
194181
195-
IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l)
182+
IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l)
196183
if [ "$IMAGE_COUNT" -gt 1 ]; then
197-
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1)
184+
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1)
198185
openstack image delete "$OLD_IMAGE_ID"
199186
else
200187
echo "Only one image exists, skipping deletion."
201188
fi
202-
shell: bash

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,5 @@ roles/*
6666
!roles/lustre/**
6767
!roles/dnf_repos/
6868
!roles/dnf_repos/**
69+
!roles/doca/
70+
!roles/doca/**

ansible/cleanup.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,5 +61,10 @@
6161
os: "{{ ansible_distribution }} {{ ansible_distribution_version }}"
6262
kernel: "{{ ansible_kernel }}"
6363
ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}"
64+
doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}"
6465
cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}"
6566
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
67+
68+
- name: Show image summary
69+
debug:
70+
var: image_info

ansible/fatimage.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
tasks:
77
- name: Report hostname (= final image name)
88
command: hostname
9+
- name: Report inventory groups
10+
debug:
11+
var: group_names
912

1013
- name: Run pre.yml hook
1114
vars:
@@ -199,6 +202,14 @@
199202
name: cloudalchemy.grafana
200203
tasks_from: install.yml
201204

205+
- hosts: doca
206+
become: yes
207+
gather_facts: yes
208+
tasks:
209+
- name: Install NVIDIA DOCA
210+
import_role:
211+
name: doca
212+
202213
- name: Run post.yml hook
203214
vars:
204215
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"

0 commit comments

Comments
 (0)