Skip to content

Commit f4b02ce

Browse files
committed
Merge branch 'main' into feat/no-ohpc
2 parents 29c8018 + 6f31af4 commit f4b02ce

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+856
-74
lines changed

.github/workflows/fatimage.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
. environments/.stackhpc/activate
4848
cd packer/
4949
packer init .
50-
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
50+
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
5151
5252
- name: Get created image name from manifest
5353
id: manifest

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,5 @@ roles/*
4242
!roles/proxy/**
4343
!roles/resolv_conf/
4444
!roles/resolv_conf/**
45+
!roles/cve-2023-41914
46+
!roles/cve-2023-41914/**

ansible/adhoc/backup-keytabs.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Use ONE of the following tags on this playbook:
2+
# - retrieve: copies keytabs out of the state volume to the environment
3+
# - deploy: copies keytabs from the environment to the state volume
4+
5+
- hosts: freeipa_client
6+
become: yes
7+
gather_facts: no
8+
tasks:
9+
- import_role:
10+
name: freeipa
11+
tasks_from: backup-keytabs.yml

ansible/adhoc/cve-2023-41914.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
- hosts: openhpc
2+
gather_facts: no
3+
become: yes
4+
tasks:
5+
- import_role:
6+
name: cve-2023-41914

ansible/bootstrap.yml

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,21 @@
8282
policy: "{{ selinux_policy }}"
8383
register: sestatus
8484

85+
- hosts: freeipa_server
86+
# Done here as it might be providing DNS
87+
tags:
88+
- freeipa
89+
- freeipa_server
90+
gather_facts: yes
91+
become: yes
92+
tasks:
93+
- name: Install FreeIPA server
94+
import_role:
95+
name: freeipa
96+
tasks_from: server.yml
97+
98+
# --- tasks after here require access to package repos ---
99+
85100
- hosts: firewalld
86101
gather_facts: false
87102
become: yes
@@ -112,16 +127,6 @@
112127
tasks_from: config.yml
113128
tags: config
114129

115-
- name: Setup EESSI
116-
hosts: eessi
117-
tags: eessi
118-
become: true
119-
gather_facts: false
120-
tasks:
121-
- name: Install and configure EESSI
122-
import_role:
123-
name: eessi
124-
125130
- hosts: update
126131
gather_facts: false
127132
become: yes

ansible/ci/retrieve_inventory.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
gather_facts: no
88
vars:
99
cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475
10-
cluster_network: WCDC-iLab-60
10+
ci_vars_file: "{{ appliances_environment_root + '/terraform/' + lookup('env', 'CI_CLOUD') }}.tfvars"
11+
cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}"
1112
tasks:
1213
- name: Get control host IP
1314
set_fact:

ansible/extras.yml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,25 @@
1-
- hosts: cuda
1+
- hosts: basic_users
2+
become: yes
3+
tags:
4+
- basic_users
5+
- users
6+
gather_facts: yes
7+
tasks:
8+
- import_role:
9+
name: basic_users
10+
11+
- name: Setup EESSI
12+
hosts: eessi
13+
tags: eessi
14+
become: true
15+
gather_facts: false
16+
tasks:
17+
- name: Install and configure EESSI
18+
import_role:
19+
name: eessi
20+
21+
- name: Setup CUDA
22+
hosts: cuda
223
become: yes
324
gather_facts: no
425
tags: cuda

ansible/fatimage.yml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Builder version of site.yml just installing binaries
22

3+
- hosts: builder
4+
become: no
5+
gather_facts: no
6+
tasks:
7+
- name: Report hostname (= final image name)
8+
command: hostname
9+
310
- name: Run pre.yml hook
411
vars:
512
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -27,6 +34,13 @@
2734
state: stopped
2835
enabled: false
2936

37+
# - import_playbook: iam.yml
38+
- name: Install FreeIPA client
39+
import_role:
40+
name: freeipa
41+
tasks_from: client-install.yml
42+
when: "'freeipa_client' in group_names"
43+
3044
# - import_playbook: filesystems.yml
3145
- name: nfs
3246
dnf:
@@ -44,7 +58,7 @@
4458
tasks_from: "install-{{ openhpc_install_type }}.yml"
4559

4660
- name: Include distribution variables for osc.ood
47-
include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky.yml"
61+
include_vars: "{{ appliances_repository_root }}/ansible/roles/osc.ood/vars/Rocky/8.yml"
4862
# FUTURE: install-apps.yml - this is git clones
4963

5064
# - import_playbook: portal.yml
@@ -141,8 +155,6 @@
141155
name: cloudalchemy.grafana
142156
tasks_from: install.yml
143157

144-
# - import_playbook: iam.yml - nothing to do
145-
146158
- name: Run post.yml hook
147159
vars:
148160
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"

ansible/iam.yml

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,42 @@
1-
- hosts: basic_users
1+
- hosts: freeipa_client
2+
tags:
3+
- freeipa
4+
- freeipa_server # as this is only relevant if using freeipa_server
5+
- freeipa_host
6+
gather_facts: no
27
become: yes
8+
tasks:
9+
- name: Ensure FreeIPA client hosts are added to the FreeIPA server
10+
import_role:
11+
name: freeipa
12+
tasks_from: addhost.yml
13+
when: groups['freeipa_server'] | length > 0
14+
15+
- hosts: freeipa_client
316
tags:
4-
- basic_users
17+
- freeipa
18+
- freeipa_client
519
gather_facts: yes
20+
become: yes
21+
tasks:
22+
- name: Install FreeIPA client
23+
import_role:
24+
name: freeipa
25+
tasks_from: client-install.yml
26+
- name: Enrol FreeIPA client
27+
import_role:
28+
name: freeipa
29+
tasks_from: enrol.yml
30+
31+
- hosts: freeipa_server
32+
tags:
33+
- freeipa
34+
- freeipa_server
35+
- users
36+
gather_facts: yes
37+
become: yes
638
tasks:
7-
- import_role:
8-
name: basic_users
39+
- name: Add FreeIPA users
40+
import_role:
41+
name: freeipa
42+
tasks_from: users.yml

ansible/roles/cuda/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@ Requires OFED to be installed to provide required kernel-* packages.
1010

1111
- `cuda_distro`: Optional. Default `rhel8`.
1212
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13+
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
1314
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
1415
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
cuda_distro: rhel8
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3+
cuda_driver_stream: default
34
cuda_packages:
45
- cuda
56
- nvidia-gds
6-
# cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
7-
cuda_version_short: "{{ cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
7+
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
89
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
910
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1011
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,40 @@
1717
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
1818
url: "{{ cuda_repo }}"
1919

20+
- name: Check if nvidia driver module is enabled
21+
shell:
22+
cmd: dnf module list --enabled nvidia-driver
23+
changed_when: false
24+
failed_when: false
25+
register: _cuda_driver_module_enabled
26+
27+
- name: List nvidia driver dnf module stream versions
28+
shell:
29+
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30+
# Output of interest from command is something like (some whitespace removed):
31+
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32+
changed_when: false
33+
register: _cuda_driver_module_streams
34+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35+
2036
- name: Enable nvidia driver module
21-
ansible.builtin.command: dnf module enable -y nvidia-driver:latest-dkms
22-
register: nvidiadriver_enable
23-
changed_when: "'Nothing to do' not in nvidiadriver_enable.stdout"
37+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
38+
register: _cuda_driver_module_enable
39+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
40+
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
2441

25-
- name: Install nvidia driver module
26-
ansible.builtin.command: dnf module install -y nvidia-driver:latest-dkms
27-
register: nvidiadriver_install
28-
changed_when: "'Nothing to do' not in nvidiadriver_install.stdout"
42+
- name: Install nvidia drivers # TODO: make removal possible?
43+
ansible.builtin.command: dnf module install -y nvidia-driver
44+
register: _cuda_driver_install
45+
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
46+
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
2947

3048
- name: Install cuda packages
3149
ansible.builtin.dnf:
3250
name: "{{ cuda_packages }}"
3351
register: cuda_package_install
3452

35-
- name: Add latest cuda binaries to path
53+
- name: Add cuda binaries to path
3654
lineinfile:
3755
path: /etc/profile.d/sh.local
3856
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'

ansible/roles/cuda/tasks/samples.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
- name: Set fact for discovered cuda version
77
set_fact:
8-
cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
8+
_cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
99

1010
- name: Ensure cuda_samples_path exists
1111
file:
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# cve-2023-41914
2+
3+
This role fixes [Slurm CVE-2023-41914](https://lists.schedmd.com/pipermail/slurm-announce/2023/000100.html):
4+
5+
> A number of race conditions have been identified within the slurmd/slurmstepd processes that can lead to the user taking ownership of an arbitrary file on the system. A related issue can lead to the user overwriting an arbitrary file on the compute node (although with data that is not directly under their control). A related issue can also lead to the user deleting all files and sub-directories of an arbitrary target directory on the compute node.
6+
7+
**NB:** It is only suitable for use on systems installed from OpenHPC v2.6.1 (Slurm v22.05).
8+
9+
At the time of writing, new OpenHPC packages have been built but are not available from the repositories (reference), hence `dnf update ...` is not available.
10+
11+
This role can be run in two ways:
12+
13+
1. To remediate an existing system, run `tasks/main.yml`, e.g. using the playbook `ansible/adhoc/cve-2023-41914.yml`. This will:
14+
- Stop all Slurm services
15+
- Backup the slurmdbd mysql database to the volume-backed directory `/var/lib/state/mysql-backups/` on the control node (by default).
16+
- Uninstall the affected packages and install updated rpms from the OpenHPC build system.
17+
- Restart Slurm services.
18+
19+
**NB**: This playbook will ALWAYS stop and restart Slurm, even if no updates are actually required.
20+
21+
2. To remediate images during build (i.e. no Slurm services are running, no Slurm database exists), run `tasks/install-rpms.yml`, e.g. using the following in an environment pre-hook:
22+
23+
```yaml
24+
- hosts: builder
25+
gather_facts: no
26+
become: yes
27+
tasks:
28+
- name: Apply fixes for cve-2023-41914
29+
import_role:
30+
name: cve-2023-41914
31+
tasks_from: install-rpms.yml
32+
```
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
# _cve_2023_41814_installed_slurm: []
3+
cve_2023_41914_mysql_backup_path: "{{ mysql_datadir }}-backups/{{ lookup('pipe', 'date --iso-8601=seconds') }}.sql"
4+
5+
cve_2023_41914_rpm_url: http://obs.openhpc.community:82/OpenHPC:/2.6.2:/Factory/EL_8/x86_64
6+
cve_2023_41914_rpms: # see cve_2023_41914_rpm_url
7+
- slurm-ohpc # has to be first as dependency
8+
- slurm-contribs-ohpc
9+
- slurm-devel-ohpc
10+
- slurm-example-configs-ohpc
11+
- slurm-libpmi-ohpc
12+
- slurm-ohpc-slurmrestd
13+
- slurm-openlava-ohpc
14+
- slurm-pam_slurm-ohpc
15+
- slurm-perlapi-ohpc
16+
- slurm-slurmctld-ohpc
17+
- slurm-slurmd-ohpc
18+
- slurm-slurmdbd-ohpc
19+
- slurm-sview-ohpc
20+
- slurm-torque-ohpc
21+
cve_2023_41914_rpm_fix_ver: '22.05.10'
22+
cve_2023_41914_rpm_fix_release: '2.1.ohpc.2.6.2'
23+
_cve_2023_41814_updates: []
24+
cve_2023_41914_pkglist_path: "{{ appliances_environment_root }}/{{ inventory_hostname }}-cve_2023_41814_updates"
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
- name: Validate suitability
2+
include_tasks: validate.yml
3+
when: _cve_2023_41814_installed_pkgs is undefined
4+
5+
- name: Identify packages to update
6+
set_fact:
7+
_cve_2023_41814_updates: "{{ _cve_2023_41814_updates + [item] }}"
8+
loop: "{{ cve_2023_41914_rpms }}"
9+
when:
10+
- item in ansible_facts.packages
11+
- cve_2023_41914_rpm_fix_ver is version(ansible_facts.packages[item][0].version, '>')
12+
13+
- name: Write packages to be modified to a file
14+
# allows recovery from failures in subsequent package deletion/rpm install
15+
copy:
16+
dest: "{{ cve_2023_41914_pkglist_path }}"
17+
content: "{{ _cve_2023_41814_updates | to_nice_yaml }}"
18+
when: _cve_2023_41814_updates | length > 0
19+
delegate_to: localhost
20+
21+
- name: Read packages to modify
22+
set_fact:
23+
_cve_2023_41814_updates: "{{ lookup('file', cve_2023_41914_pkglist_path) | from_yaml }}"
24+
25+
- name: Identify architecture
26+
setup:
27+
gather_subset: architecture
28+
29+
- name: Remove installed packages
30+
dnf:
31+
name: "{{ _cve_2023_41814_updates }}"
32+
state: absent
33+
34+
- name: Install rpms
35+
dnf:
36+
name: "{{ cve_2023_41914_rpm_url }}/{{ item }}-{{ cve_2023_41914_rpm_fix_ver }}-{{ cve_2023_41914_rpm_fix_release }}.{{ ansible_architecture }}.rpm"
37+
loop: "{{ _cve_2023_41814_updates }}"
38+
register: _cve_2023_41814_rpm_installs
39+
40+
- name: Reload systemd units
41+
command: systemctl daemon-reload
42+
when: _cve_2023_41814_rpm_installs.changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- include_tasks: validate.yml
2+
- include_tasks: pre-upgrade.yml
3+
- include_tasks: install-rpms.yml
4+
- include_tasks: post-upgrade.yml

0 commit comments

Comments
 (0)