Skip to content

Commit 089d85c

Browse files
authored
Merge pull request #315 from stackhpc/fix/nvidia-driver-install
Make nvidia-driver install idempotent
2 parents 3c70674 + 1472bd6 commit 089d85c

File tree

4 files changed

+31
-11
lines changed

4 files changed

+31
-11
lines changed

ansible/roles/cuda/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@ Requires OFED to be installed to provide required kernel-* packages.
1010

1111
- `cuda_distro`: Optional. Default `rhel8`.
1212
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13+
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
1314
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
1415
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
cuda_distro: rhel8
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3+
cuda_driver_stream: default
34
cuda_packages:
45
- cuda
56
- nvidia-gds
6-
# cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
7-
cuda_version_short: "{{ cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
7+
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"  # major.minor, e.g. '12.1'; both parts come from the discovered _cuda_version_tuple
89
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
910
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1011
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,40 @@
1717
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
1818
url: "{{ cuda_repo }}"
1919

20+
- name: Check if nvidia driver module is enabled
21+
ansible.builtin.command:  # plain command, no pipes/redirects, so no shell is needed
22+
cmd: dnf module list --enabled nvidia-driver
23+
changed_when: false
24+
failed_when: false
25+
register: _cuda_driver_module_enabled
26+
27+
- name: List nvidia driver dnf module stream versions
28+
shell:
29+
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30+
# Output of interest from command is something like (some whitespace removed):
31+
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32+
changed_when: false
33+
register: _cuda_driver_module_streams
34+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35+
2036
- name: Enable nvidia driver module
21-
ansible.builtin.command: dnf module enable -y nvidia-driver:latest-dkms
22-
register: nvidiadriver_enable
23-
changed_when: "'Nothing to do' not in nvidiadriver_enable.stdout"
37+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
38+
register: _cuda_driver_module_enable
39+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
40+
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
2441

25-
- name: Install nvidia driver module
26-
ansible.builtin.command: dnf module install -y nvidia-driver:latest-dkms
27-
register: nvidiadriver_install
28-
changed_when: "'Nothing to do' not in nvidiadriver_install.stdout"
42+
- name: Install nvidia drivers # TODO: make removal possible?
43+
ansible.builtin.command: dnf module install -y nvidia-driver
44+
register: _cuda_driver_install
45+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
46+
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
2947

3048
- name: Install cuda packages
3149
ansible.builtin.dnf:
3250
name: "{{ cuda_packages }}"
3351
register: cuda_package_install
3452

35-
- name: Add latest cuda binaries to path
53+
- name: Add cuda binaries to path
3654
lineinfile:
3755
path: /etc/profile.d/sh.local
3856
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'

ansible/roles/cuda/tasks/samples.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
- name: Set fact for discovered cuda version
77
set_fact:
8-
cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
8+
_cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
99

1010
- name: Ensure cuda_samples_path exists
1111
file:

0 commit comments

Comments
 (0)