Skip to content

Commit a2dde14

Browse files
authored
Merge branch 'main' into feat/compute-script-sb
2 parents 3b9eb46 + 8059d24 commit a2dde14

File tree

14 files changed

+127
-46
lines changed

14 files changed

+127
-46
lines changed

.github/workflows/doca.yml renamed to .github/workflows/extra.yml

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test DOCA extra build
1+
name: Test extra build
22
on:
33
workflow_dispatch:
44
push:
@@ -7,16 +7,18 @@ on:
77
paths:
88
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
99
- 'ansible/roles/doca/**'
10-
- '.github/workflows/doca'
10+
- 'ansible/roles/cuda/**'
11+
- '.github/workflows/extra.yml'
1112
pull_request:
1213
paths:
1314
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
1415
- 'ansible/roles/doca/**'
15-
- '.github/workflows/doca'
16+
- 'ansible/roles/cuda/**'
17+
- '.github/workflows/extra.yml'
1618

1719
jobs:
1820
doca:
19-
name: doca-build
21+
name: extra-build
2022
concurrency:
2123
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
2224
cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
2527
fail-fast: false # allow other matrix jobs to continue even if one fails
2628
matrix: # build RL8, RL9
2729
build:
28-
- image_name: openhpc-doca-RL8
30+
- image_name: openhpc-extra-RL8
2931
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
30-
inventory_groups: doca
31-
- image_name: openhpc-doca-RL9
32+
inventory_groups: doca,cuda
33+
volume_size: 30 # needed for cuda
34+
- image_name: openhpc-extra-RL9
3235
source_image_name_key: RL9
33-
inventory_groups: doca
36+
inventory_groups: doca,cuda
37+
volume_size: 30 # needed for cuda
3438
env:
3539
ANSIBLE_FORCE_COLOR: True
3640
OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
9599
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
96100
-var "image_name=${{ matrix.build.image_name }}" \
97101
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
102+
-var "volume_size=${{ matrix.build.volume_size }}" \
98103
openstack.pkr.hcl
99104
100105
- name: Get created image names from manifest

.github/workflows/fatimage.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ jobs:
2323
matrix: # build RL8, RL9
2424
build:
2525
- image_name: openhpc-RL8
26-
source_image_name: rocky-latest-RL8
27-
inventory_groups: control,compute,login
26+
source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
27+
inventory_groups: control,compute,login,update
2828
- image_name: openhpc-RL9
29-
source_image_name: rocky-latest-RL9
30-
inventory_groups: control,compute,login
29+
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
30+
inventory_groups: control,compute,login,update
3131
env:
3232
ANSIBLE_FORCE_COLOR: True
3333
OS_CLOUD: openstack

ansible/cleanup.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,4 @@
6666
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
6767

6868
- name: Show image summary
69-
debug:
70-
var: image_info
69+
command: cat /var/lib/image/image.json

ansible/extras.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
gather_facts: yes
2525
tags: cuda
2626
tasks:
27-
- import_role:
27+
- include_role:
2828
name: cuda
29+
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
2930

3031
- name: Persist hostkeys across rebuilds
3132
# Must be after filesystems.yml (for storage)
@@ -56,3 +57,13 @@
5657
tasks:
5758
- import_role:
5859
name: k9s
60+
61+
- hosts: extra_packages
62+
become: yes
63+
tags:
64+
- extra_packages
65+
tasks:
66+
- name: Install additional packages
67+
dnf:
68+
name: "{{ appliances_extra_packages }}"
69+
when: appliances_mode != 'configure' or appliances_extra_packages_during_configure

ansible/fatimage.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@
2929

3030
- import_playbook: bootstrap.yml
3131

32+
- hosts: doca
33+
become: yes
34+
gather_facts: yes
35+
tasks:
36+
- name: Install NVIDIA DOCA
37+
import_role:
38+
name: doca
39+
3240
- name: Run post-bootstrap.yml hook
3341
vars:
3442
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -230,15 +238,15 @@
230238
import_role:
231239
name: doca
232240

233-
- import_playbook: disable-repos.yml
234-
235241
- name: Run post.yml hook
236242
vars:
237243
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
238244
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
239245
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
240246
when: hook_path | exists
241247

248+
- import_playbook: disable-repos.yml
249+
242250
- hosts: builder
243251
become: yes
244252
gather_facts: yes

ansible/roles/cuda/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# cuda
22

3-
Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
3+
Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

55
## Prerequisites
66

77
Requires OFED to be installed to provide required kernel-* packages.
88

99
## Role Variables
1010

11-
- `cuda_distro`: Optional. Default `rhel8`.
12-
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13-
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
11+
- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
12+
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
1413
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
14+
- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA.
1515
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
2-
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3-
cuda_driver_stream: default
4-
cuda_package_version: 'latest'
1+
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
2+
cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages
3+
cuda_package_version: '12.6.3-1'
54
cuda_packages:
65
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
76
- nvidia-gds

ansible/roles/cuda/tasks/main.yml renamed to ansible/roles/cuda/tasks/install.yml

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
33

4-
- name: Check for OFED
4+
- name: Check for OFED/DOCA
55
command:
66
cmd: dnf list --installed rdma-core
77
register: _dnf_rdma_core
@@ -10,41 +10,53 @@
1010
- name: Assert OFED installed
1111
assert:
1212
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?"
13+
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
1414

1515
- name: Install cuda repo
1616
get_url:
17-
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
18-
url: "{{ cuda_repo }}"
17+
dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo"
18+
url: "{{ cuda_repo_url }}"
1919

2020
- name: Check if nvidia driver module is enabled
21-
shell:
22-
cmd: dnf module list --enabled nvidia-driver
21+
ansible.builtin.command: dnf module list --enabled nvidia-driver
2322
changed_when: false
2423
failed_when: false
2524
register: _cuda_driver_module_enabled
2625

2726
- name: Enable nvidia driver module
28-
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
27+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}"
2928
register: _cuda_driver_module_enable
3029
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3130
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
3231

32+
- name: Check if nvidia driver module is installed
33+
ansible.builtin.command: dnf module list --installed nvidia-driver
34+
changed_when: false
35+
failed_when: false
36+
register: _cuda_driver_module_installed
37+
3338
- name: Install nvidia drivers
3439
ansible.builtin.command: dnf module install -y nvidia-driver
3540
register: _cuda_driver_install
36-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
41+
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
3742
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
3843

44+
- name: Check kernel has not been modified
45+
assert:
46+
that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched
47+
fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}"
48+
3949
- name: Install cuda packages
4050
ansible.builtin.dnf:
4151
name: "{{ cuda_packages }}"
52+
when: cuda_package_version != 'none'
4253
register: cuda_package_install
4354

4455
- name: Add cuda binaries to path
4556
lineinfile:
4657
path: /etc/profile.d/sh.local
4758
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'
59+
when: cuda_package_version != 'none'
4860

4961
- name: Enable NVIDIA Persistence Daemon
5062
systemd:
@@ -60,3 +72,4 @@
6072
- name: Wait for hosts to be reachable
6173
wait_for_connection:
6274
sleep: 15
75+
when: cuda_package_install.changed

ansible/roles/cuda/tasks/runtime.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
- name: Ensure NVIDIA Persistence Daemon state
2+
systemd:
3+
name: nvidia-persistenced
4+
enabled: true
5+
state: "{{ cuda_persistenced_state }}"

docs/operations.md

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,30 @@ This is a usually a two-step process:
6363
Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).
6464

6565
# Adding Additional Packages
66-
Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. For example:
67-
68-
# environments/foo-base/inventory/group_vars/all/openhpc.yml:
69-
openhpc_packages_extra:
66+
By default, the following utility packages are installed during build:
67+
- htop
68+
- nano
69+
- screen
70+
- tmux
71+
- wget
72+
- bind-utils
73+
- net-tools
74+
- postfix
75+
- git
76+
- latest python version for system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4)
77+
78+
Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
79+
80+
```yaml
81+
# environments/foo-base/inventory/group_vars/all/defaults.yml:
82+
appliances_extra_packages_other:
7083
- somepackage
7184
- anotherpackage
7285

7386

7487
The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules.
7588

76-
To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images.
89+
If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_extra_packages_during_configure` overridden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users).
7790

7891
If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
7992
- `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-241216-1607-2357a730",
4-
"RL9": "openhpc-RL9-241216-1607-2357a730"
3+
"RL8": "openhpc-RL8-241219-1232-7f84fed4",
4+
"RL9": "openhpc-RL9-241219-1145-7f84fed4"
55
}
66
}

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,16 +80,37 @@ appliances_local_users_default:
8080
appliances_local_users_extra: [] # see format of appliances_local_users_default above
8181
appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}"
8282

83-
###########################################################################################
83+
################## bootstrap: extra package installs ######################################
84+
85+
appliances_extra_packages_default:
86+
- htop
87+
- nano
88+
- screen
89+
- tmux
90+
- wget
91+
- bind-utils
92+
- net-tools
93+
- postfix
94+
- git
95+
- "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}"
96+
97+
98+
appliances_extra_packages_during_configure: false
99+
100+
appliances_extra_packages_other: []
101+
102+
appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}"
103+
104+
###################### ark repo timestamps ###################################################
84105

85106
appliances_repo_timestamps:
86107
baseos:
87-
'9.4': 20240816T002610
108+
'9.4': 20241115T011711
88109
appstream:
89-
'9.4': 20240816T002610
110+
'9.4': 20241112T003151
90111
crb:
91-
'9.4': 20240816T002610
112+
'9.4': 20241115T003133
92113
extras:
93-
'9.4': 20240816T002610
114+
'9.4': 20241118T002802
94115
epel:
95-
'9': 20240902T080424
116+
'9': 20241213T010218

environments/common/inventory/groups

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ freeipa_client
148148
[lustre]
149149
# Hosts to run lustre client
150150

151+
[extra_packages]
152+
# Hosts to install specified additional packages on
153+
151154
[dnf_repos:children]
152155
# Hosts to replace system repos with Pulp repos
153156
# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users

environments/common/layouts/everything

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,7 @@ control
9696

9797
[lustre]
9898
# Hosts to run lustre client
99+
100+
[extra_packages:children]
101+
# Hosts to install specified additional packages on
102+
cluster

0 commit comments

Comments
 (0)