Skip to content

Commit c553f5e

Browse files
committed
WIP: deploy custom slurm in lab
1 parent 8cf88bd commit c553f5e

File tree

5 files changed

+141
-1
lines changed

5 files changed

+141
-1
lines changed

environments/lab/hooks/build.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
# Build Slurm in a podman container and export the installed tree into
# {{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}.
# Templated values are passed through the `quote` filter so that paths
# containing spaces or shell metacharacters cannot break the command lines.

- name: Ensure build directory exists
  file:
    state: directory
    path: "{{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}"

# `rm -v` prints one line per removed entry; that output drives changed_when.
# The glob is deliberately left outside the quoted path so the shell expands it.
- name: Ensure build directory is empty
  shell:
    cmd: "rm -rvf {{ (appliances_environment_root ~ '/slurmbuild/' ~ slurm_build_version) | quote }}/*"
  register: _empty_build_dir
  changed_when: _empty_build_dir.stdout_lines | length > 0

# SLURM_VERSION is passed explicitly so the Slurm release that is built matches
# the image tag and output directory, which are both named after
# slurm_build_version (otherwise the Dockerfile's default version is built).
- name: Build container
  command:
    cmd: >-
      podman --tmpdir=/mnt/image-storage/tmp build
      --build-arg SLURM_VERSION={{ slurm_build_version | quote }}
      --build-arg SLURM_PREFIX={{ slurm_build_dir | quote }}
      --build-arg SLURM_SYSCONFDIR={{ openhpc_slurm_conf_path | dirname | quote }}
      . -t slurm-{{ slurm_build_version | quote }}
      --output ./{{ slurm_build_version | quote }}
    chdir: "{{ appliances_environment_root }}/slurmbuild"
  # TODO: this task always reports "changed", so it doesn't look idempotent
  # although it is

environments/lab/hooks/pre.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,49 @@
1919
hosts: compute
2020
become: true
2121
gather_facts: false
22+
tags: scratch
2223
tasks:
2324
- name: Create scratch directory - on local SSD on prod
2425
file:
2526
path: /var/scratch
2627
state: directory
28+
29+
# Runs the container build tasks from build.yml on localhost, without
# privilege escalation or fact gathering. Select with `--tags slurm`.
- name: Build custom Slurm
  hosts: localhost
  become: false
  gather_facts: false
  tags: slurm
  tasks:
    - name: Include Slurm container build tasks
      include_tasks: build.yml
36+
37+
# Creates slurm_build_dir on the target host and copies the lib/ directory of
# the local build output into openhpc_lib_dir. The sbin/ and bin/ loop entries
# are currently commented out.
- name: Copy custom Slurm to storage
  hosts: control  # doesn't matter, just needs to be one
  become: true
  gather_facts: false
  tags: slurm
  tasks:
    - name: Ensure shared slurm directory exists
      file:
        state: directory
        path: "{{ slurm_build_dir }}"
        owner: root
        group: root
        mode: "u=rwX,go=rX"

    # A trailing slash on src copies the directory's contents rather than the
    # directory itself.
    - name: Copy custom slurm
      copy:
        src: "{{ item.src }}"
        dest: "{{ item.dest }}"
        owner: root
        group: root
        mode: "u=rwx,go=rx"
      loop:
        # - src: "{{ slurm_local_build_dir }}/sbin/"
        #   dest: "{{ openhpc_sbin_dir }}"
        - src: "{{ slurm_local_build_dir }}/lib/"
          dest: "{{ openhpc_lib_dir }}"
        # - src: "{{ slurm_local_build_dir }}/bin/"
        #   dest: "{{ openhpc_bin_dir }}"
      vars:
        # Directory the container build exported the Slurm tree into.
        slurm_local_build_dir: "{{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}"
67+

environments/lab/inventory/group_vars/openhpc/overrides.yml renamed to environments/lab/inventory/group_vars/all/openhpc.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ slurm_build_path: /nopt/vtest/slurm
77
slurm_build_dir: "{{ slurm_build_path }}/{{ slurm_build_version }}"
88

99
openhpc_sbin_dir: "{{ slurm_build_dir }}/sbin"
10-
openhpc_lib_dir: "{{ slurm_build_dir }}/slurm"
10+
openhpc_lib_dir: "{{ slurm_build_dir }}/lib"  # TODO: investigating the RPATH shows it expects to find /nopt/vtest/slurm/23.11.0/lib/slurm, which requires this to point at the lib/ directory
1111
openhpc_bin_dir: "{{ slurm_build_dir }}/bin"
1212
openhpc_slurm_conf_path: "{{ slurm_build_dir }}/etc/slurm.conf"
1313

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Two-stage build:
#   build-stage  - compiles Slurm from the upstream tarball and installs it
#                  under SLURM_PREFIX.
#   export-stage - an empty image containing only the installed tree, so that
#                  `podman build --output <dir>` writes just those files.
#
# NOTE: Dockerfile comments must be on their own line; text after an
# instruction's arguments is parsed as further arguments, not as a comment.
FROM rockylinux:8 AS build-stage

# From https://www.schedmd.com/downloads.php
ARG SLURM_VERSION=23.11.0
# Should match directory Slurm is installed at
ARG SLURM_PREFIX=/opt/slurm
# Should match directory slurm.conf will be in
ARG SLURM_SYSCONFDIR=/etc/slurm

RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core epel-release \
    && yum config-manager --set-enabled powertools \
    && yum -y install \
        wget \
        bzip2 \
        perl \
        gcc \
        gcc-c++ \
        git \
        gnupg \
        make \
        munge \
        munge-devel \
        python3-devel \
        python3-pip \
        python3 \
        mariadb-server \
        mariadb-devel \
        psmisc \
        bash-completion \
        vim-enhanced \
        http-parser-devel \
        json-c-devel \
        mpitests-openmpi \
        pmix-devel \
        hwloc \
        hwloc-devel \
    && yum clean all \
    && rm -rf /var/cache/yum

RUN alternatives --set python /usr/bin/python3

RUN pip3 install Cython nose

# Download and unpack the source; it extracts to /slurm-${SLURM_VERSION}.
RUN set -x \
    && wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \
    && tar --bzip -x -f slurm*tar.bz2

WORKDIR /slurm-${SLURM_VERSION}

RUN set -x && ./configure \
    --enable-debug \
    --prefix=${SLURM_PREFIX} \
    --without-rpath \
    --sysconfdir=${SLURM_SYSCONFDIR} \
    --with-mysql_config=/usr/bin

RUN set -x && make install

ENTRYPOINT ["/bin/bash"]


FROM scratch AS export-stage

# ARG values are scoped to a single stage, so SLURM_PREFIX is declared again.
# The default must equal the build-stage default, otherwise a build without
# --build-arg SLURM_PREFIX=... would try to copy a path that was never created.
# Should match directory Slurm is installed at
ARG SLURM_PREFIX=/opt/slurm
# RUN ls ${SLURM_PREFIX}
COPY --from=build-stage ${SLURM_PREFIX}/ .

environments/lab/slurmbuild/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
This uses a podman container to build Slurm, which is then copied out of the container into a version directory.
2+
3+
The following arguments to `./configure` are important:
4+
- `--prefix` must match the path the binaries appear to be at (i.e. from the NFS client side). This is because:
5+
- The `slurm{ctld,d,dbd}` executables hardcode an RPATH, even when passing the `--without-rpath` flag to ./configure.
6+
This means unless the path they are executed at matches the build prefix, they can't find `libslurmfull.so` on startup,
7+
even with entries in `/etc/ld.so.conf.d/`.
8+
- `PluginDir` defaults to being based on the build prefix. Although it can be overridden in `slurm.conf`, the `slurmd`s do not appear to get this parameter when running configless, so they fail to start, reporting that the (default) plugin dir doesn't exist.
9+
- `--sysconfdir` must match the path the `slurm.conf` file is at on the nodes. Otherwise `s*` commands running on nodes *without* `slurmd` (i.e. the control node only, for a standard Slurm appliance configuration) cannot find the configuration file unless the `SLURM_CONF` environment variable is set.
10+
11+
Note that a tmpdir is hardcoded to a volume mounted on the lab deploy host, due to its small root filesystem.

0 commit comments

Comments
 (0)