Skip to content

Commit 6ec3a73

Browse files
sjpb and bertiethorpe authored
Add RL9 cuda build variant (#428)
* determine cuda distro automatically
* fix typo in CUDA samples
* make facts available for cuda
* add RL9 cuda build variant
* fix typo in build definitions
* set packer build volume sizes depending on build variant
* fix volume size definition
* fix cuda version to work around issue with 12-6-0-1
* don't fail all builds if one fails
* bump CUDA builder disk size (build ran out of space)
* download cuda image to /mnt on gh runner
* download cuda image to /mnt on gh runner
* fix fatimage.yml mnt permissions
* Update main.yml
* switch to open nvidia drivers
* bump CI images
* make packer build volume-backed optional again

---------

Co-authored-by: bertiethorpe <[email protected]>
Co-authored-by: bertiethorpe <[email protected]>
1 parent 513ad1c commit 6ec3a73

File tree

8 files changed

+37
-34
lines changed

8 files changed

+37
-34
lines changed

.github/workflows/fatimage.yml

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,20 @@ jobs:
1010
name: openstack-imagebuild
1111
runs-on: ubuntu-22.04
1212
strategy:
13-
matrix:
13+
fail-fast: false # allow other matrix jobs to continue even if one fails
14+
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
1415
os_version:
1516
- RL8
1617
- RL9
1718
build:
1819
- openstack.openhpc
1920
- openstack.openhpc-ofed
21+
- openstack.openhpc-cuda
2022
exclude:
2123
- os_version: RL8
2224
build: openstack.openhpc-ofed
25+
- os_version: RL8
26+
build: openstack.openhpc-cuda
2327
- os_version: RL9
2428
build: openstack.openhpc
2529
env:
@@ -81,7 +85,9 @@ jobs:
8185
- name: Download image
8286
run: |
8387
. venv/bin/activate
84-
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
88+
sudo mkdir /mnt/images
89+
sudo chmod 777 /mnt/images
90+
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
8591
8692
- name: Set up QEMU
8793
uses: docker/setup-qemu-action@v3
@@ -95,13 +101,13 @@ jobs:
95101
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
96102

97103
- name: mount qcow2 file
98-
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
104+
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
99105

100106
- name: Run Trivy vulnerability scanner
101107
uses: aquasecurity/[email protected]
102108
with:
103109
scan-type: fs
104-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
110+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
105111
scanners: "vuln"
106112
format: sarif
107113
output: "${{ steps.manifest.outputs.image-name }}.sarif"
@@ -117,7 +123,7 @@ jobs:
117123
uses: aquasecurity/[email protected]
118124
with:
119125
scan-type: fs
120-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
126+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
121127
scanners: "vuln"
122128
format: table
123129
exit-code: '1'

ansible/extras.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
- name: Setup CUDA
2222
hosts: cuda
2323
become: yes
24-
gather_facts: no
24+
gather_facts: yes
2525
tags: cuda
2626
tasks:
2727
- import_role:

ansible/roles/cuda/defaults/main.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
cuda_distro: rhel8
1+
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
33
cuda_driver_stream: default
4+
cuda_package_version: 'latest'
45
cuda_packages:
5-
- cuda
6+
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
67
- nvidia-gds
78
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8-
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
9+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
910
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1011
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1112
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,13 @@
2424
failed_when: false
2525
register: _cuda_driver_module_enabled
2626

27-
- name: List nvidia driver dnf module stream versions
28-
shell:
29-
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30-
# Output of interest from command is something like (some whitespace removed):
31-
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32-
changed_when: false
33-
register: _cuda_driver_module_streams
34-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35-
3627
- name: Enable nvidia driver module
37-
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
28+
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
3829
register: _cuda_driver_module_enable
3930
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
4031
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
4132

42-
- name: Install nvidia drivers # TODO: make removal possible?
33+
- name: Install nvidia drivers
4334
ansible.builtin.command: dnf module install -y nvidia-driver
4435
register: _cuda_driver_install
4536
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

environments/.stackhpc/ARCUS.pkrvars.hcl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
flavor = "vm.ska.cpu.general.small"
2-
use_blockstorage_volume = true
3-
volume_size = 15 # GB
4-
image_disk_format = "qcow2"
52
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
63
ssh_keypair_name = "slurm-app-ci"
74
ssh_private_key_file = "~/.ssh/id_rsa"

environments/.stackhpc/LEAFCLOUD.pkrvars.hcl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
flavor = "ec1.large"
2-
use_blockstorage_volume = true
3-
volume_size = 15 # GB
42
volume_type = "unencrypted"
5-
image_disk_format = "qcow2"
63
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
74
ssh_keypair_name = "slurm-app-ci"
85
ssh_private_key_file = "~/.ssh/id_rsa"

environments/.stackhpc/terraform/main.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ variable "cluster_image" {
3030
type = map(string)
3131
default = {
3232
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
33-
RL8: "openhpc-RL8-240813-1317-1b370a36"
34-
RL9: "openhpc-ofed-RL9-240813-1317-1b370a36"
33+
RL8: "openhpc-RL8-240904-1509-1687368f"
34+
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
3535
}
3636
}
3737

packer/openstack.pkr.hcl

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ variable "manifest_output_path" {
120120

121121
variable "use_blockstorage_volume" {
122122
type = bool
123-
default = false
123+
default = true
124124
}
125125

126126
variable "volume_type" {
@@ -129,13 +129,18 @@ variable "volume_type" {
129129
}
130130

131131
variable "volume_size" {
132-
type = number
133-
default = null # When not specified use the size of the builder instance root disk
132+
type = map(number)
133+
default = {
134+
# fat image builds, GB:
135+
openhpc = 15
136+
openhpc-ofed = 15
137+
openhpc-cuda = 30
138+
}
134139
}
135140

136141
variable "image_disk_format" {
137142
type = string
138-
default = null # When not specified use the image default
143+
default = "qcow2"
139144
}
140145

141146
variable "metadata" {
@@ -150,6 +155,7 @@ variable "groups" {
150155
# fat image builds:
151156
openhpc = ["control", "compute", "login"]
152157
openhpc-ofed = ["control", "compute", "login", "ofed"]
158+
openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"]
153159
}
154160
}
155161

@@ -158,11 +164,11 @@ source "openstack" "openhpc" {
158164
flavor = var.flavor
159165
use_blockstorage_volume = var.use_blockstorage_volume
160166
volume_type = var.volume_type
167+
volume_size = var.volume_size[source.name]
161168
metadata = var.metadata
162169
networks = var.networks
163170
floating_ip_network = var.floating_ip_network
164171
security_groups = var.security_groups
165-
volume_size = var.volume_size
166172

167173
# Input image:
168174
source_image = "${var.source_image[var.os_version]}"
@@ -178,7 +184,7 @@ source "openstack" "openhpc" {
178184
ssh_bastion_private_key_file = var.ssh_bastion_private_key_file
179185

180186
# Output image:
181-
image_disk_format = var.image_disk_format
187+
image_disk_format = "qcow2"
182188
image_visibility = var.image_visibility
183189
image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
184190
}
@@ -195,6 +201,11 @@ build {
195201
name = "openhpc-ofed"
196202
}
197203

204+
# CUDA fat image:
205+
source "source.openstack.openhpc" {
206+
name = "openhpc-cuda"
207+
}
208+
198209
# Extended site-specific image, built on fat image:
199210
source "source.openstack.openhpc" {
200211
name = "openhpc-extra"

0 commit comments

Comments
 (0)