Skip to content

Commit 2a832f5

Browse files
committed
make dnf module install of nvidia-driver idempotent
1 parent 3c70674 commit 2a832f5

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

ansible/roles/cuda/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

5+
To avoid unwanted package updates that break functionality, on first use this role enables the DKMS-flavour `nvidia-driver` DNF module stream with the highest version number currently available. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change which stream is enabled, even if streams for later versions become available. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all of its packages should be removed manually.
6+
57
## Prerequisites
68

79
Requires OFED to be installed to provide required kernel-* packages.

ansible/roles/cuda/tasks/main.yml

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,40 @@
1717
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
1818
url: "{{ cuda_repo }}"
1919

20+
- name: Check if nvidia driver module is enabled
21+
shell:
22+
cmd: dnf module list --enabled nvidia-driver
23+
changed_when: false
24+
failed_when: false
25+
register: _cuda_driver_module_enabled
26+
27+
- name: List nvidia driver dnf module stream versions
28+
shell:
29+
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30+
# Output of interest from command is something like (some whitespace removed):
31+
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32+
changed_when: false
33+
register: _cuda_driver_module_streams
34+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35+
2036
- name: Enable nvidia driver module
21-
ansible.builtin.command: dnf module enable -y nvidia-driver:latest-dkms
22-
register: nvidiadriver_enable
23-
changed_when: "'Nothing to do' not in nvidiadriver_enable.stdout"
37+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
38+
register: _cuda_driver_module_enable
39+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
40+
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
2441

25-
- name: Install nvidia driver module
26-
ansible.builtin.command: dnf module install -y nvidia-driver:latest-dkms
27-
register: nvidiadriver_install
28-
changed_when: "'Nothing to do' not in nvidiadriver_install.stdout"
42+
- name: Install nvidia drivers # TODO: make removal possible?
43+
ansible.builtin.command: dnf module install -y nvidia-driver
44+
register: _cuda_driver_install
45+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
46+
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
2947

3048
- name: Install cuda packages
3149
ansible.builtin.dnf:
3250
name: "{{ cuda_packages }}"
3351
register: cuda_package_install
3452

35-
- name: Add latest cuda binaries to path
53+
- name: Add cuda binaries to path
3654
lineinfile:
3755
path: /etc/profile.d/sh.local
3856
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'

0 commit comments

Comments
 (0)