Skip to content

Commit fed2d6e

Browse files
authored
Pin nvidia-driver and cuda packages to working packages (#496)
* move cuda tasks to install
* pin nvidia driver to working version and autodetect os/arch
* make install of cuda packages optional
* don't run cuda install tasks unless during build
* move doca install before cuda
* update cuda docs
* add cuda to extra build test CI
* add cuda runtime tasks
* fix typo in extras playbook
* bump extra build size to 30GB for cuda
* pin both cuda package version
* make cuda idempotent/restartable
* allow using computed tasks_from for cuda role
* fix showing image summary
* rename nvidia driver version var
* bump CI image
1 parent 15ed0a3 commit fed2d6e

File tree

9 files changed

+61
-31
lines changed

9 files changed

+61
-31
lines changed

.github/workflows/doca.yml renamed to .github/workflows/extra.yml

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test DOCA extra build
1+
name: Test extra build
22
on:
33
workflow_dispatch:
44
push:
@@ -7,16 +7,18 @@ on:
77
paths:
88
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
99
- 'ansible/roles/doca/**'
10-
- '.github/workflows/doca'
10+
- 'ansible/roles/cuda/**'
11+
- '.github/workflows/extra.yml'
1112
pull_request:
1213
paths:
1314
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
1415
- 'ansible/roles/doca/**'
15-
- '.github/workflows/doca'
16+
- 'ansible/roles/cuda/**'
17+
- '.github/workflows/extra.yml'
1618

1719
jobs:
1820
doca:
19-
name: doca-build
21+
name: extra-build
2022
concurrency:
2123
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
2224
cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
2527
fail-fast: false # allow other matrix jobs to continue even if one fails
2628
matrix: # build RL8, RL9
2729
build:
28-
- image_name: openhpc-doca-RL8
30+
- image_name: openhpc-extra-RL8
2931
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
30-
inventory_groups: doca
31-
- image_name: openhpc-doca-RL9
32+
inventory_groups: doca,cuda
33+
volume_size: 30 # needed for cuda
34+
- image_name: openhpc-extra-RL9
3235
source_image_name_key: RL9
33-
inventory_groups: doca
36+
inventory_groups: doca,cuda
37+
volume_size: 30 # needed for cuda
3438
env:
3539
ANSIBLE_FORCE_COLOR: True
3640
OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
9599
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
96100
-var "image_name=${{ matrix.build.image_name }}" \
97101
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
102+
-var "volume_size=${{ matrix.build.volume_size }}" \
98103
openstack.pkr.hcl
99104
100105
- name: Get created image names from manifest

ansible/cleanup.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,4 @@
6666
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
6767

6868
- name: Show image summary
69-
debug:
70-
var: image_info
69+
command: cat /var/lib/image/image.json

ansible/extras.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
gather_facts: yes
2525
tags: cuda
2626
tasks:
27-
- import_role:
27+
- include_role:
2828
name: cuda
29+
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
2930

3031
- name: Persist hostkeys across rebuilds
3132
# Must be after filesystems.yml (for storage)

ansible/fatimage.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@
2929

3030
- import_playbook: bootstrap.yml
3131

32+
- hosts: doca
33+
become: yes
34+
gather_facts: yes
35+
tasks:
36+
- name: Install NVIDIA DOCA
37+
import_role:
38+
name: doca
39+
3240
- name: Run post-bootstrap.yml hook
3341
vars:
3442
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -220,15 +228,15 @@
220228
import_role:
221229
name: doca
222230

223-
- import_playbook: disable-repos.yml
224-
225231
- name: Run post.yml hook
226232
vars:
227233
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
228234
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
229235
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
230236
when: hook_path | exists
231237

238+
- import_playbook: disable-repos.yml
239+
232240
- hosts: builder
233241
become: yes
234242
gather_facts: yes

ansible/roles/cuda/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# cuda
22

3-
Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
3+
Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

55
## Prerequisites
66

77
Requires OFED or DOCA to be installed to provide required kernel-* packages.
88

99
## Role Variables
1010

11-
- `cuda_distro`: Optional. Default `rhel8`.
12-
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13-
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
11+
- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
12+
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. Default `560-open`. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
1413
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
14+
- `cuda_package_version`: Optional. Default `12.6.3-1`. Use `latest` to install the latest packages if not installed (already-installed packages won't be upgraded), or `'none'` to skip installing CUDA.
1515
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
2-
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3-
cuda_driver_stream: default
4-
cuda_package_version: 'latest'
1+
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
2+
cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages
3+
cuda_package_version: '12.6.3-1'
54
cuda_packages:
65
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
76
- nvidia-gds

ansible/roles/cuda/tasks/main.yml renamed to ansible/roles/cuda/tasks/install.yml

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
33

4-
- name: Check for OFED
4+
- name: Check for OFED/DOCA
55
command:
66
cmd: dnf list --installed rdma-core
77
register: _dnf_rdma_core
@@ -10,41 +10,53 @@
1010
- name: Assert OFED installed
1111
assert:
1212
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?"
13+
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
1414

1515
- name: Install cuda repo
1616
get_url:
17-
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
18-
url: "{{ cuda_repo }}"
17+
dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo"
18+
url: "{{ cuda_repo_url }}"
1919

2020
- name: Check if nvidia driver module is enabled
21-
shell:
22-
cmd: dnf module list --enabled nvidia-driver
21+
ansible.builtin.command: dnf module list --enabled nvidia-driver
2322
changed_when: false
2423
failed_when: false
2524
register: _cuda_driver_module_enabled
2625

2726
- name: Enable nvidia driver module
28-
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
27+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}"
2928
register: _cuda_driver_module_enable
3029
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3130
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
3231

32+
- name: Check if nvidia driver module is installed
33+
ansible.builtin.command: dnf module list --installed nvidia-driver
34+
changed_when: false
35+
failed_when: false
36+
register: _cuda_driver_module_installed
37+
3338
- name: Install nvidia drivers
3439
ansible.builtin.command: dnf module install -y nvidia-driver
3540
register: _cuda_driver_install
36-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
41+
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
3742
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
3843

44+
- name: Check kernel has not been modified
45+
assert:
46+
that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched
47+
fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}"
48+
3949
- name: Install cuda packages
4050
ansible.builtin.dnf:
4151
name: "{{ cuda_packages }}"
52+
when: cuda_package_version != 'none'
4253
register: cuda_package_install
4354

4455
- name: Add cuda binaries to path
4556
lineinfile:
4657
path: /etc/profile.d/sh.local
4758
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'
59+
when: cuda_package_version != 'none'
4860

4961
- name: Enable NVIDIA Persistence Daemon
5062
systemd:
@@ -60,3 +72,4 @@
6072
- name: Wait for hosts to be reachable
6173
wait_for_connection:
6274
sleep: 15
75+
when: cuda_package_install.changed

ansible/roles/cuda/tasks/runtime.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
- name: Ensure NVIDIA Persistence Daemon state
2+
systemd:
3+
name: nvidia-persistenced
4+
enabled: true
5+
state: "{{ cuda_persistenced_state }}"
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-241218-1011-5effb3fa",
4-
"RL9": "openhpc-RL9-241218-1011-5effb3fa"
3+
"RL8": "openhpc-RL8-241218-1705-09ac4268",
4+
"RL9": "openhpc-RL9-241218-1705-09ac4268"
55
}
66
}

0 commit comments

Comments
 (0)