Skip to content

Commit 994d8f6

Adds support for NVIDIA MIG configuration
1 parent 4ef0c82 commit 994d8f6

13 files changed: +171 -1 lines changed

ansible/.gitignore

Lines changed: 3 additions & 0 deletions

@@ -90,3 +90,6 @@ roles/*
 !roles/gateway/**
 !roles/alertmanager/
 !roles/alertmanager/**
+!roles/slurm_recompile/
+!roles/slurm_recompile/**
+

ansible/extras.yml

Lines changed: 14 additions & 0 deletions

@@ -48,6 +48,20 @@
         name: cuda
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
 
+- name: Setup vGPU
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
+  handlers:
+    - name: reboot
+      fail:
+        msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
+
 - name: Persist hostkeys across rebuilds
   # Must be after filesystems.yml (for storage)
   # and before portal.yml (where OOD login node hostkeys are scanned)
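
Once this play is in place, the vGPU/MIG configuration can be exercised on its own via its tag. The invocation below is a sketch, assuming the appliance's usual layout where playbooks are run from the repository root with an environment activated:

```bash
# Apply only the vGPU/MIG play to hosts in the [vgpu] group
ansible-playbook ansible/extras.yml --tags vgpu --limit vgpu
```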

ansible/fatimage.yml

Lines changed: 10 additions & 0 deletions

@@ -250,6 +250,16 @@
         name: cloudalchemy.grafana
         tasks_from: install.yml
 
+- name: Add support for NVIDIA GPU auto detection to Slurm
+  hosts: cuda
+  become: yes
+  tasks:
+    - name: Recompile slurm
+      import_role:
+        name: slurm_recompile
+      vars:
+        slurm_recompile_nvml: "{{ groups.cuda | length > 0 }}"
+
 - name: Run post.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"

ansible/roles/compute_init/README.md

Lines changed: 1 addition & 0 deletions

@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
 | extras.yml | basic_users | All functionality [6] | No |
 | extras.yml | eessi | All functionality [7] | No |
 | extras.yml | cuda | None required - use image build | Yes [8] |
+| extras.yml | vgpu | All functionality | Yes |
 | extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
 | extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
 | extras.yml | k9s (install) | Not relevant during boot | n/a |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 8 additions & 0 deletions

@@ -19,6 +19,8 @@
     enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
     enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
     enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
+    enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"
+
 
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     resolv_conf_nameservers: []
@@ -295,6 +297,12 @@
         cmd: "cvmfs_config setup"
       when: enable_eessi
 
+    - name: Configure VGPUs
+      include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: 'configure.yml'
+      when: enable_vgpu
+
     # NB: don't need conditional block on enable_compute as have already exited
     # if not the case
     - name: Write Munge key

ansible/roles/cuda/tasks/facts.yml

Lines changed: 5 additions & 0 deletions

---

- name: Set cuda_facts_version_short
  set_fact:
    cuda_facts_version_short: "{{ cuda_version_short }}"

ansible/roles/slurm_recompile/defaults/main.yml

Lines changed: 2 additions & 0 deletions

---
slurm_recompile_nvml: false

ansible/roles/slurm_recompile/tasks/main.yml

Lines changed: 41 additions & 0 deletions

---
- name: Get facts about CUDA installation
  import_role:
    name: cuda
    tasks_from: facts.yml

- name: Gather the package facts
  ansible.builtin.package_facts:
    manager: auto

- name: Set fact containing slurm package facts
  set_fact:
    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"

- name: Recompile and install slurm packages
  shell: |
    #!/bin/bash
    source /etc/profile
    set -eux
    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
    rpm -i slurm-ohpc-*.src.rpm
    cd /root/rpmbuild/SPECS
    dnf builddep -y slurm.spec
    rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
  become: true

- name: Workaround missing symlink
  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
  args:
    creates: /lib64/libnvidia-ml.so
  when: slurm_recompile_nvml | bool

- name: Cleanup Dependencies
  shell: |
    #!/bin/bash
    set -eux
    set -o pipefail
    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
  become: true
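
A quick way to confirm the rebuild actually produced NVML-enabled packages is to check the GPU plugin's linkage and ask slurmd to report autodetected GRES. This is a sketch rather than part of the commit; the plugin path assumes a standard OpenHPC/Slurm layout:

```bash
# Check the NVML GPU plugin exists and resolves libnvidia-ml
ldd /usr/lib64/slurm/gpu_nvml.so | grep libnvidia-ml

# Print the GRES slurmd autodetects on this node (requires the NVIDIA driver to be loaded)
sudo slurmd -G
```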

ansible/validate.yml

Lines changed: 10 additions & 0 deletions

@@ -83,3 +83,13 @@
     - import_role:
         name: lustre
         tasks_from: validate.yml
+
+- name: Validate vGPU configuration
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: validate.yml

docs/mig.md

Lines changed: 65 additions & 0 deletions

# vGPU/MIG configuration

This page details how to configure Multi-Instance GPU (MIG) in Slurm.

## Pre-requisites

- An image built with CUDA support. This automatically recompiles Slurm against NVML.

## Inventory

Add the relevant hosts to the ``vgpu`` group, for example in `environments/$ENV/inventory/groups`:

```ini
[vgpu:children]
cuda
```

## Configuration

Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu) role.

For example, in `environments/<environment>/inventory/group_vars/all/vgpu`:

```yaml
---
vgpu_definitions:
  - pci_address: "0000:17:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
  - pci_address: "0000:81:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
```

The appliance uses the driver installed via the ``cuda`` role. Use ``lspci`` to determine the PCI addresses.
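
For example, NVIDIA devices (PCI vendor ID `10de`) can be listed with standard `lspci` usage; this is only an illustration:

```bash
# The first column of each line is the PCI address to use in vgpu_definitions
lspci -nn -d 10de:
```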

## compute_init

Use the ``vgpu`` metadata option to enable creation of MIG devices on rebuild.
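
How that metadata is applied depends on the deployment; the appliance normally templates it via OpenTofu/Terraform, so the direct OpenStack CLI call below is only an illustration (the server name is a placeholder):

```bash
# Set the key read as os_metadata.meta.vgpu by the compute-init script
openstack server set --property vgpu=true mycluster-compute-gpu-0
```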

## GRES configuration

Enable GRES autodetection. This can be set as a host or group var:

```yaml
openhpc_gres_autodetect: nvml
```

You should stop Terraform templating out `partitions.yml` and instead specify `openhpc_slurm_partitions` manually.
An example of specifying GRES resources is shown below
(`environments/<environment>/inventory/group_vars/all/partitions-manual.yml`):

```yaml
openhpc_slurm_partitions:
  - name: cpu
  - name: gpu
    gres:
      # Two cards not partitioned with MIG
      - conf: "gpu:nvidia_h100_80gb_hbm3:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
```
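
As a final sanity check (not part of this commit), the MIG instances and the GRES Slurm derives from them can be inspected with standard tooling once a node has been configured:

```bash
# List physical GPUs and the MIG instances created on them
nvidia-smi -L

# Show the GRES associated with each partition's nodes
sinfo -o "%P %N %G"
```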

Lines changed: 4 additions & 0 deletions

---

# Nvidia driver is provided by cuda role.
vgpu_nvidia_driver_install_enabled: false

environments/common/inventory/groups

Lines changed: 4 additions & 0 deletions

@@ -112,6 +112,10 @@ freeipa_client
 [cuda]
 # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
 
+[vgpu]
+# FIXME: Update once PR merged
+# Hosts where vGPU/MIG should be configured - see https://github.com/stackhpc/ansible-collection-linux/pull/43/files#diff-74e43d9a34244aa54721f4dbd12a029baa87957afd762b88c2677aa75414f514R75
+
 [eessi]
 # Hosts on which EESSI stack should be configured

requirements.yml

Lines changed: 4 additions & 1 deletion

@@ -4,7 +4,7 @@ roles:
     version: v25.3.2
     name: stackhpc.nfs
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v0.28.0
+    version: feature/gres-autodetect
     name: stackhpc.openhpc
   - src: https://github.com/stackhpc/ansible-node-exporter.git
     version: stackhpc
@@ -55,4 +55,7 @@ collections:
     version: 0.0.15
   - name: stackhpc.pulp
     version: 0.5.5
+  - name: https://github.com/stackhpc/ansible-collection-linux
+    type: git
+    version: feature/mig-only
 ...
