Skip to content

Commit bd6526f

Browse files
authored
Merge branch 'main' into feat/etc_hosts
2 parents 2439d80 + 9f4ef8e commit bd6526f

File tree

12 files changed

+172
-1
lines changed

12 files changed

+172
-1
lines changed

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ roles/*
3434
!roles/mysql/**
3535
!roles/systemd/
3636
!roles/systemd/**
37+
!roles/cuda/
38+
!roles/cuda/**
3739
!roles/freeipa/
3840
!roles/freeipa/**
3941
!roles/proxy/

ansible/adhoc/cudatests.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Ad-hoc play: runs the `samples.yml` task file of the `cuda` role against
# every host in the `cuda` inventory group.
- hosts: cuda
  become: true
  # Fact gathering is skipped for this play.
  gather_facts: false
  tags: cuda_samples
  tasks:
    - ansible.builtin.import_role:
        name: cuda
        tasks_from: samples.yml

ansible/extras.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Applies the `cuda` role (its default `main.yml` tasks) to every host in
# the `cuda` inventory group.
- hosts: cuda
  become: true
  # Fact gathering is skipped for this play.
  gather_facts: false
  tags: cuda
  tasks:
    - ansible.builtin.import_role:
        name: cuda

ansible/fatimage.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232
dnf:
3333
name: nfs-utils
3434

35+
- import_playbook: extras.yml
36+
37+
- hosts: builder
38+
become: yes
39+
gather_facts: no
40+
tasks:
3541
# - import_playbook: slurm.yml
3642
- name: OpenHPC
3743
import_role:

ansible/roles/cuda/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cuda
2+
3+
Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
4+
5+
## Prerequisites
6+
7+
OFED must already be installed, as it provides the required kernel-* packages.
8+
9+
## Role Variables
10+
11+
- `cuda_distro`: Optional. Default `rhel8`.
12+
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13+
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Distribution identifier; substituted into the repository URL below and into
# the name of the .repo file written under /etc/yum.repos.d/.
cuda_distro: 'rhel8'

# URL of the repository definition file to download.
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"

# Packages installed by the role's "Install cuda packages" task.
cuda_packages:
  - 'cuda'
  - 'nvidia-gds'

# cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')

# "<major>.<minor>" form of the discovered version; only resolvable once
# cuda_version_tuple has been set.
cuda_version_short: "{{ cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"

# Release tarball of the NVIDIA cuda-samples repository for that version.
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"

# Directory the samples are unpacked into and built under.
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"

# Sample programs to build (directory names under Samples/1_Utilities).
cuda_samples_programs:
  - 'deviceQuery'
  - 'bandwidthTest'

# cuda_devices: # discovered from deviceQuery run

ansible/roles/cuda/tasks/main.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2+
# Install the NVIDIA driver module and CUDA packages, add the CUDA binaries to
# PATH, enable the persistence daemon, and reboot if packages were installed.
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation

- name: Check for OFED
  ansible.builtin.command:
    cmd: dnf list --installed rdma-core
  register: _dnf_rdma_core
  changed_when: false
  # dnf exits non-zero when rdma-core is not installed at all; do not fail
  # here so the assertion below can report the problem with a clear message.
  failed_when: false

- name: Assert OFED installed
  ansible.builtin.assert:
    that: "'mlnx' in _dnf_rdma_core.stdout"
    fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?"

- name: Install cuda repo
  ansible.builtin.get_url:
    dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
    url: "{{ cuda_repo }}"
    # Explicit permissions rather than relying on the remote umask.
    mode: "0644"

- name: Enable nvidia driver module
  ansible.builtin.command: dnf module enable -y nvidia-driver:latest-dkms
  register: nvidiadriver_enable
  # dnf prints 'Nothing to do' when the module stream is already enabled.
  changed_when: "'Nothing to do' not in nvidiadriver_enable.stdout"

- name: Install nvidia driver module
  ansible.builtin.command: dnf module install -y nvidia-driver:latest-dkms
  register: nvidiadriver_install
  changed_when: "'Nothing to do' not in nvidiadriver_install.stdout"

- name: Install cuda packages
  ansible.builtin.dnf:
    name: "{{ cuda_packages }}"
  # Registered so the reboot below only happens when something was installed.
  register: cuda_package_install

- name: Add latest cuda binaries to path
  ansible.builtin.lineinfile:
    path: /etc/profile.d/sh.local
    # Evaluated at login: picks the highest-versioned /usr/local/cuda-* dir.
    line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'

- name: Enable NVIDIA Persistence Daemon
  ansible.builtin.systemd:
    name: nvidia-persistenced
    enabled: true
    state: started

- name: Reboot
  ansible.builtin.reboot:
    post_reboot_delay: 30
  when: cuda_package_install.changed

- name: Wait for hosts to be reachable
  ansible.builtin.wait_for_connection:
    sleep: 15
  # Only meaningful after the reboot above; skipped when no reboot happened.
  when: cuda_package_install.changed

ansible/roles/cuda/tasks/samples.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Download, build and run the NVIDIA CUDA samples matching the installed CUDA
# version, then print a per-device summary of the bandwidth test results.

- name: Read cuda version file
  ansible.builtin.slurp:
    src: /usr/local/cuda/version.json
  register: _cuda_samples_version

- name: Set fact for discovered cuda version
  ansible.builtin.set_fact:
    # slurp returns base64-encoded content; decode and parse it as JSON.
    cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'

- name: Ensure cuda_samples_path exists
  ansible.builtin.file:
    state: directory
    path: "{{ cuda_samples_path }}"
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"

- name: Download cuda sample release
  ansible.builtin.unarchive:
    remote_src: true
    src: "{{ cuda_samples_release_url }}"
    dest: "{{ cuda_samples_path }}"
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    # Skip the download when this release has already been unpacked; this is
    # the same directory the build task below changes into.
    creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}"

- name: Build cuda samples
  # No shell features are needed, so use command rather than shell.
  ansible.builtin.command:
    cmd: make
    chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/Samples/1_Utilities/{{ item }}"
    # Skip the build when the release binary already exists.
    creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/{{ item }}"
  loop: "{{ cuda_samples_programs }}"

- name: Run cuda deviceQuery
  ansible.builtin.command:
    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/deviceQuery"
  register: _cuda_devicequery
  # Query only; never report a change.
  changed_when: false

- name: Set fact for cuda devices
  ansible.builtin.set_fact:
    # Collect the device indices from 'Device <n>:' lines of the output.
    cuda_devices: "{{ _cuda_devicequery.stdout | regex_findall('Device (\\d+):') }}"

- name: Run cuda bandwidth test
  ansible.builtin.command:
    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/bandwidthTest --device={{ item }}"
  register: _cuda_bandwidthtest
  # Measurement only; never report a change.
  changed_when: false
  loop: "{{ cuda_devices }}"
  loop_control:
    label: "Device {{ item }}" # e.g '0'

- name: Summarise bandwidth test output
  ansible.builtin.debug:
    # bandwidthTest reports bandwidth in GB/s (gigabytes per second).
    msg: |
      {{ _parts[1].splitlines()[0] | trim }}
      Bandwidths: (GB/s)
        Host to Device: {{ _parts[2].split()[-1] }}
        Device to Host: {{ _parts[3].split()[-1] }}
        Device to Device: {{ _parts[4].split()[-1] }}
      {{ ': '.join(_parts[5].split('=') | map('trim')) }}
      {{ _parts[6] }}
  loop: "{{ _cuda_bandwidthtest.results }}"
  vars:
    # The test output is split into its blank-line-separated sections.
    _parts: "{{ item.stdout.split('\n\n') }}"
  loop_control:
    label: "Device {{ item.item }}" # e.g '0'

ansible/site.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
when: hook_path | exists
1919

2020
- import_playbook: filesystems.yml
21+
- import_playbook: extras.yml
2122
- import_playbook: slurm.yml
2223
- import_playbook: portal.yml
2324
- import_playbook: monitoring.yml

environments/common/inventory/groups

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ grafana
104104
control
105105
prometheus
106106

107+
[cuda]
108+
# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
107109
[resolv_conf]
108110
# Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md
109111

environments/common/layouts/everything

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ compute
5555
[etc_hosts]
5656
# Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md
5757

58+
[cuda]
59+
# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
60+
5861
[eessi:children]
5962
openhpc
6063

requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ roles:
33
- src: stackhpc.nfs
44
version: v22.9.1
55
- src: https://github.com/stackhpc/ansible-role-openhpc.git
6-
version: v0.18.0 # requires/uses openhpc v2.6.1
6+
version: v0.20.0 # Allow multiple empty partitions by @sjpb in #156
77
name: stackhpc.openhpc
88
- src: https://github.com/stackhpc/ansible-node-exporter.git
99
version: feature/no-install

0 commit comments

Comments
 (0)