Skip to content

Commit a803a3d

Browse files
authored
Merge pull request #25 from stackhpc/prod2312-lab-slurm3
Get custom slurm working in lab
2 parents 95493a1 + c2ade7a commit a803a3d

File tree

8 files changed

+149
-22
lines changed

8 files changed

+149
-22
lines changed

environments/lab/hooks/build.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Builds Slurm inside a podman container and exports the install tree into
# <appliances_environment_root>/slurmbuild/<slurm_build_version>.

- name: Check slurm_build_version is set
  # With an empty version the cleanup glob below would expand to the whole
  # slurmbuild directory rather than a single version directory.
  assert:
    that:
      - slurm_build_version is defined
      - slurm_build_version | string | length > 0
    fail_msg: "slurm_build_version must be a non-empty Slurm version string"

- name: Ensure build directory exists
  file:
    state: directory
    path: "{{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}"

- name: Ensure build directory is empty
  shell:
    # shell (not command) is required for the glob; the directory itself is
    # quoted so that only the trailing /* is subject to shell expansion.
    cmd: "rm -rvf {{ (appliances_environment_root ~ '/slurmbuild/' ~ slurm_build_version) | quote }}/*"
  register: _empty_build_dir
  # rm -v prints one line per removed entry, so no output means nothing changed.
  changed_when: _empty_build_dir.stdout_lines | length > 0

- name: Build container
  command:
    # SLURM_VERSION is passed explicitly so the Slurm release that gets built
    # matches the image tag and output directory, instead of silently using
    # the default baked into the Dockerfile.
    # NOTE(review): assumes slurm_build_version is an upstream release number
    # (e.g. 23.11.0) that the Dockerfile can download - confirm.
    cmd: >-
      podman --tmpdir=/mnt/image-storage/tmp build
      --build-arg SLURM_VERSION={{ slurm_build_version }}
      --build-arg SLURM_PREFIX={{ slurm_build_dir }}
      --build-arg SLURM_SYSCONFDIR={{ openhpc_slurm_conf_path | dirname }}
      -t slurm-{{ slurm_build_version }}
      --output ./{{ slurm_build_version }}
      .
    chdir: "{{ appliances_environment_root }}/slurmbuild"
  # TODO: doesn't look idempotent although it is
  # The command module always reports "changed"; stated explicitly here.
  changed_when: true

environments/lab/hooks/pre.yml

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,50 @@
1-
- name: NREL lab fixes - Workaround no internal DNS
2-
hosts: all
3-
become: true
4-
gather_facts: false
5-
tags: etc_hosts
6-
tasks:
7-
- name: Create /etc/hosts for all nodes as DNS doesn't work
8-
# the interface used as ansible_host is defined by terraform's `access_network` parameter, so this is deterministic for multi-rail hosts
9-
blockinfile:
10-
path: /etc/hosts
11-
create: yes
12-
state: present
13-
block: |
14-
{% for hostname in groups['all'] %}
15-
{{ hostvars[hostname]['ansible_host'] }} {{ hostname }}
16-
{% endfor %}
17-
181
# Compute-node preparation; selectable on its own via the "scratch" tag.
- name: NREL lab fixes - For compute nodes
  hosts: compute
  tags: scratch
  gather_facts: false
  become: true
  tasks:
    - name: Create scratch directory - on local SSD on prod
      file:
        state: directory
        path: /var/scratch
11+
12+
# Runs the container build on the deploy host (localhost), without privilege
# escalation or fact gathering.
- name: Build custom Slurm
  hosts: localhost
  become: false
  gather_facts: false
  tags: slurm
  tasks:
    - name: Run the Slurm build tasks
      include_tasks: build.yml
19+
20+
# Copies the locally built Slurm tree from the deploy host to the control node.
- name: Copy custom Slurm to storage
  hosts: control
  become: true
  gather_facts: false
  tags: slurm
  tasks:
    - name: Ensure shared slurm directory exists
      file:
        state: directory
        path: "{{ slurm_build_dir }}"  # NB this will be exported by nfs filesystems.yml
        owner: root
        group: root
        mode: "u=rwX,go=rX"

    - name: Copy custom slurm
      # The trailing "/" on each src copies the directory's contents into dest
      # rather than the directory itself.
      copy:
        src: "{{ item.src }}"
        dest: "{{ item.dest }}"
        owner: root
        group: root
        mode: "u=rwx,go=rx"
      loop:
        - src: "{{ slurm_local_build_dir }}/sbin/"
          dest: "{{ openhpc_sbin_dir }}"
        - src: "{{ slurm_local_build_dir }}/lib/"
          dest: "{{ openhpc_lib_dir }}"
        - src: "{{ slurm_local_build_dir }}/bin/"
          dest: "{{ openhpc_bin_dir }}"
      vars:
        # Where the build tasks export the install tree on the deploy host.
        slurm_local_build_dir: "{{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}"
50+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
# Don't have os_manila for lab so fake it using NFS
22
[nfs:children]
33
openhpc
4+
5+
# Don't have working internal DNS
6+
[etc_hosts:children]
7+
cluster

environments/lab/inventory/group_vars/openhpc/overrides.yml renamed to environments/lab/inventory/group_vars/all/openhpc.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ slurm_build_path: /nopt/vtest/slurm
77
slurm_build_dir: "{{ slurm_build_path }}/{{ slurm_build_version }}"
88

99
openhpc_sbin_dir: "{{ slurm_build_dir }}/sbin"
10-
openhpc_lib_dir: "{{ slurm_build_dir }}/slurm"
10+
openhpc_lib_dir: "{{ slurm_build_dir }}/lib" # TODO: investigating RPATH shows it expects to find /nopt/vtest/slurm/23.11.0/lib/slurm which needs this
1111
openhpc_bin_dir: "{{ slurm_build_dir }}/bin"
1212
openhpc_slurm_conf_path: "{{ slurm_build_dir }}/etc/slurm.conf"
1313

1414

1515
openhpc_slurm_partitions:
1616
- name: "sm"
17-
default: NO
17+
default: YES
1818
maxtime: "1-0" # 1 days 0 hours
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Stage 1: build and install Slurm from the upstream release tarball.
FROM rockylinux:9 AS build-stage

# NB: Dockerfile comments must be on their own line - text after an
# instruction is parsed as part of that instruction, not as a comment.

# Slurm release to build, from https://www.schedmd.com/downloads.php
ARG SLURM_VERSION=23.11.0
# Should match directory Slurm is installed at
ARG SLURM_PREFIX=/opt/slurm
# Should match directory slurm.conf will be in
ARG SLURM_SYSCONFDIR=/etc/slurm

# Build dependencies. epel-release and the crb repo must be enabled before the
# main package install below.
RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core epel-release \
    && yum config-manager --set-enabled crb \
    && yum -y install \
       wget \
       bzip2 \
       perl \
       gcc \
       gcc-c++ \
       git \
       gnupg \
       make \
       munge \
       munge-devel \
       python3-devel \
       python3-pip \
       python3 \
       mariadb-server \
       mariadb-devel \
       psmisc \
       bash-completion \
       vim-enhanced \
       http-parser-devel \
       json-c-devel \
       mpitests-openmpi \
       pmix-devel \
       hwloc \
       hwloc-devel \
       dbus-devel \
    && yum clean all \
    && rm -rf /var/cache/yum

RUN pip3 install Cython nose

# Fetch and unpack the requested release; it extracts to /slurm-<version>.
RUN set -x \
    && wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \
    && tar -xjf slurm-${SLURM_VERSION}.tar.bz2

WORKDIR /slurm-${SLURM_VERSION}

RUN set -x && ./configure \
    --enable-debug \
    --prefix=${SLURM_PREFIX} \
    --without-rpath \
    --sysconfdir=${SLURM_SYSCONFDIR} \
    --with-mysql_config=/usr/bin

RUN set -x && make install

ENTRYPOINT ["/bin/bash"]


# Stage 2: contains only the installed Slurm tree, for export with
# `podman build --output`.
FROM scratch AS export-stage

# ARG values do not carry over between stages, so this is re-declared here.
# The default must match the build-stage default, otherwise the COPY below
# looks for a directory that was never created when no --build-arg is given.
# Should match directory Slurm is installed at
ARG SLURM_PREFIX=/opt/slurm
# RUN ls ${SLURM_PREFIX}
COPY --from=build-stage ${SLURM_PREFIX}/ .

environments/lab/slurmbuild/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
This uses a podman container to build Slurm, which is then copied out of the container into a version directory.
2+
3+
The following arguments to `./configure` are important:
4+
- `--prefix` must match the path the binaries appear to be at (i.e. from the NFS client side). This is because:
5+
- The `slurm{ctld,d,dbd}` executables hardcode an RPATH, even when passing the `--without-rpath` flag to ./configure.
6+
This means unless the path they are executed at matches the build prefix, they can't find `libslurmfull.so` on startup,
7+
even with entries in `/etc/ld.so.conf.d/`.
8+
- `PluginDir` defaults to being based on the build prefix. Although it can be overridden in `slurm.conf`, the `slurmd`s do not appear to get this parameter when running configless, so they fail to start, reporting that the (default) plugin dir doesn't exist.
9+
- `--sysconfdir` must match the path the `slurm.conf` file is at on the nodes. Otherwise `s*` commands running on nodes *without* `slurmd` (i.e. the control node only, for a standard Slurm appliance configuration) cannot find the configuration file unless the `SLURM_CONF` environment variable is set.
10+
11+
Note that a tmpdir is hardcoded to a volume mounted on the lab deploy host, due to its small root filesystem.

environments/nrel/inventory/group_vars/openhpc/overrides.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ openhpc_generic_packages:
110110
- mpitests-openmpi
111111

112112
# Additional parameters to set in slurm.conf - use yaml format
113+
openhpc_epilog: '/nopt/slurm/etc/epilog.d/*'
113114
openhpc_slurmd_spool_dir: /var/spool/slurm/slurmd
114115
openhpc_config_extra:
115116
LaunchParameters: use_interactive_step
@@ -135,7 +136,7 @@ openhpc_config_extra:
135136
# Prolog: '/nopt/slurm/etc/prolog.d/*'
136137
# PrologFlags: 'X11'
137138
# X11Parameters: 'local_xauthority'
138-
# Epilog: '/nopt/slurm/etc/epilog.d/*'
139+
Epilog: '<absent>' # /nopt/slurm/etc/epilog.d/*'
139140
# PrologEpilogTimeout: 180
140141
# UnkillableStepTimeout: 180
141142

@@ -178,7 +179,7 @@ openhpc_config_extra:
178179

179180
# SCHEDULING
180181
SchedulerType: 'sched/backfill'
181-
SelectType: 'select/cons_res'
182+
SelectType: 'select/cons_tres'
182183
SelectTypeParameters: 'CR_Core'
183184
EnforcePartLimits: 'ALL'
184185
SchedulerParameters:

requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ roles:
33
- src: stackhpc.nfs
44
version: v23.12.1 # Tolerate stale nfs file handles
55
- src: https://github.com/stackhpc/ansible-role-openhpc.git
6-
version: feat/no-ohpc # https://github.com/stackhpc/ansible-role-openhpc/pull/162
6+
version: 5b73b8a # https://github.com/stackhpc/ansible-role-openhpc/pull/163 # TODO: bump on release
77
name: stackhpc.openhpc
88
- src: https://github.com/stackhpc/ansible-node-exporter.git
99
version: stackhpc

0 commit comments

Comments
 (0)