File tree Expand file tree Collapse file tree 13 files changed +171
-1
lines changed
environments/common/inventory Expand file tree Collapse file tree 13 files changed +171
-1
lines changed Original file line number Diff line number Diff line change @@ -90,3 +90,6 @@ roles/*
90
90
! roles /gateway /**
91
91
! roles /alertmanager /
92
92
! roles /alertmanager /**
93
+ ! roles /slurm_recompile /
94
+ ! roles /slurm_recompile /**
95
+
Original file line number Diff line number Diff line change 48
48
name : cuda
49
49
tasks_from : " {{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
50
50
51
+ - name : Setup vGPU
52
+ hosts : vgpu
53
+ become : yes
54
+ gather_facts : yes
55
+ tags : vgpu
56
+ tasks :
57
+ - include_role :
58
+ name : stackhpc.linux.vgpu
59
+ tasks_from : " {{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60
+ handlers :
61
+ - name : reboot
62
+ fail :
63
+ msg : Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
64
+
51
65
- name : Persist hostkeys across rebuilds
52
66
# Must be after filesystems.yml (for storage)
53
67
# and before portal.yml (where OOD login node hostkeys are scanned)
Original file line number Diff line number Diff line change 250
250
name : cloudalchemy.grafana
251
251
tasks_from : install.yml
252
252
253
+ - name : Add support for NVIDIA GPU auto detection to Slurm
254
+ hosts : cuda
255
+ become : yes
256
+ tasks :
257
+ - name : Recompile slurm
258
+ import_role :
259
+ name : slurm_recompile
260
+ vars :
261
+ slurm_recompile_nvml : " {{ groups.cuda | length > 0 }}" # name must match the variable read by the slurm_recompile role
262
+
253
263
- name : Run post.yml hook
254
264
vars :
255
265
appliances_environment_root : " {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Original file line number Diff line number Diff line change @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
75
75
| extras.yml | basic_users | All functionality [ 6] | No |
76
76
| extras.yml | eessi | All functionality [ 7] | No |
77
77
| extras.yml | cuda | None required - use image build | Yes [ 8] |
78
+ | extras.yml | vgpu | All functionality | Yes |
78
79
| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
79
80
| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
80
81
| extras.yml | k9s (install) | Not relevant during boot | n/a |
Original file line number Diff line number Diff line change 19
19
enable_basic_users : " {{ os_metadata.meta.basic_users | default(false) | bool }}"
20
20
enable_eessi : " {{ os_metadata.meta.eessi | default(false) | bool }}"
21
21
enable_chrony : " {{ os_metadata.meta.chrony | default(false) | bool }}"
22
+ enable_vgpu : " {{ os_metadata.meta.vgpu | default(false) | bool }}"
23
+
22
24
23
25
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
24
26
resolv_conf_nameservers : []
295
297
cmd : " cvmfs_config setup"
296
298
when : enable_eessi
297
299
300
+ - name : Configure VGPUs
301
+ include_role :
302
+ name : stackhpc.linux.vgpu
303
+ tasks_from : ' configure.yml'
304
+ when : enable_vgpu
305
+
298
306
# NB: don't need conditional block on enable_compute as have already exited
299
307
# if not the case
300
308
- name : Write Munge key
Original file line number Diff line number Diff line change
1
+ ---
2
+
3
+ - name : Set cuda_facts_version_short
4
+ set_fact :
5
+ cuda_facts_version_short : " {{ cuda_version_short }}"
Original file line number Diff line number Diff line change
1
+ ---
2
+ slurm_recompile_nvml : false
Original file line number Diff line number Diff line change
1
+ ---
2
+ - name : Get facts about CUDA installation
3
+ import_role :
4
+ name : cuda
5
+ tasks_from : facts.yml
6
+
7
+ - name : Gather the package facts
8
+ ansible.builtin.package_facts :
9
+ manager : auto
10
+
11
+ - name : Set fact containing slurm package facts
12
+ set_fact :
13
+ slurm_package : " {{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
14
+
15
+ - name : Recompile and install slurm packages
16
+ shell : |
17
+ #!/bin/bash
18
+ source /etc/profile
19
+ set -eux
20
+ dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
21
+ rpm -i slurm-ohpc-*.src.rpm
22
+ cd /root/rpmbuild/SPECS
23
+ dnf builddep -y slurm.spec
24
+ rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
25
+ dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
26
+ become : true
27
+
28
+ - name : Workaround missing symlink
29
+ # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
30
+ command : ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
31
+ args :
32
+ creates : /lib64/libnvidia-ml.so
33
+ when : slurm_recompile_nvml | bool
34
+
35
+ - name : Cleanup Dependencies
36
+ shell : |
37
+ #!/bin/bash
38
+ set -eux
39
+ set -o pipefail
40
+ dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
41
+ become : true
Original file line number Diff line number Diff line change 83
83
- import_role :
84
84
name : lustre
85
85
tasks_from : validate.yml
86
+
87
+ - name : Validate vGPU configuration
88
+ hosts : vgpu
89
+ become : yes
90
+ gather_facts : yes
91
+ tags : vgpu
92
+ tasks :
93
+ - include_role :
94
+ name : stackhpc.linux.vgpu
95
+ tasks_from : validate.yml
Original file line number Diff line number Diff line change
1
+ # vGPU/MIG configuration
2
+
3
+ This page details how to configure Multi Instance GPU (MIG) in Slurm.
4
+
5
+ ## Pre-requisites
6
+
7
+ - Image built with cuda support. This should automatically recompile slurm against NVML.
8
+
9
+ ## Inventory
10
+
11
+ Add relevant hosts to the `` vgpu `` group, for example in `` environments/$ENV/inventory/groups ``:
12
+
13
+ ```
14
+ [vgpu:children]
15
+ cuda
16
+ ```
17
+
18
+ ## Configuration
19
+
20
+ Use variables from the [ stackhpc.linux.vgpu] ( https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu ) role.
21
+
22
+ For example in: ` environments/<environment>/inventory/group_vars/all/vgpu ` :
23
+
24
+ ```
25
+ ---
26
+ vgpu_definitions:
27
+ - pci_address: "0000:17:00.0"
28
+ mig_devices:
29
+ "1g.10gb": 4
30
+ "4g.40gb": 1
31
+ - pci_address: "0000:81:00.0"
32
+ mig_devices:
33
+ "1g.10gb": 4
34
+ "4g.40gb": 1
35
+ ```
36
+
37
+ The appliance will use the driver installed via the `` cuda `` role. Use `` lspci `` to determine the PCI
38
+ addresses.
39
+
40
+ ## compute_init
41
+
42
+ Use the `` vgpu `` metadata option to enable creation of MIG devices on rebuild.
43
+
44
+ ## gres configuration
45
+
46
+ Enable gres autodetection. This can be set as a host or group var.
47
+
48
+ ```
49
+ openhpc_gres_autodetect: nvml
50
+ ```
51
+
52
+ You should stop terraform templating out partitions.yml and specify ` openhpc_slurm_partitions ` manually.
53
+ An example of specifying gres resources is shown below
54
+ (` environments/<environment>/inventory/group_vars/all/partitions-manual.yml ` ):
55
+
56
+ ```
57
+ openhpc_slurm_partitions:
58
+ - name: cpu
59
+ - name: gpu
60
+ gres:
61
+ # Two cards not partitioned with MIG
62
+ - conf: "gpu:nvidia_h100_80gb_hbm3:2"
63
+ - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
64
+ - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
65
+ ```
Original file line number Diff line number Diff line change
1
+ ---
2
+
3
+ # Nvidia driver is provided by cuda role.
4
+ vgpu_nvidia_driver_install_enabled: false
Original file line number Diff line number Diff line change @@ -112,6 +112,10 @@ freeipa_client
112
112
[cuda]
113
113
# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
114
114
115
+ [vgpu]
116
+ # FIXME: Update once PR merged
117
+ # Hosts where vGPU/MIG should be configured - see https://github.com/stackhpc/ansible-collection-linux/pull/43/files#diff-74e43d9a34244aa54721f4dbd12a029baa87957afd762b88c2677aa75414f514R75
118
+
115
119
[eessi]
116
120
# Hosts on which EESSI stack should be configured
117
121
Original file line number Diff line number Diff line change 4
4
version : v25.3.2
5
5
name : stackhpc.nfs
6
6
- src : https://github.com/stackhpc/ansible-role-openhpc.git
7
- version : v0.28.0
7
+ version : feature/gres-autodetect
8
8
name : stackhpc.openhpc
9
9
- src : https://github.com/stackhpc/ansible-node-exporter.git
10
10
version : stackhpc
@@ -55,4 +55,7 @@ collections:
55
55
version : 0.0.15
56
56
- name : stackhpc.pulp
57
57
version : 0.5.5
58
+ - name : https://github.com/stackhpc/ansible-collection-linux
59
+ type : git
60
+ version : feature/mig-only
58
61
...
You can’t perform that action at this time.
0 commit comments