Skip to content

Commit 2362b5e

Browse files
committed
support multiple partitions in hpctests
1 parent be6eafc commit 2362b5e

File tree

9 files changed

+26
-6
lines changed

9 files changed

+26
-6
lines changed

ansible/roles/hpctests/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ Role Variables
2424
--------------
2525

2626
- `hpctests_rootdir`: Required. Path to root of test directory tree, which must be on a r/w filesystem shared to all cluster nodes under test. The last directory component will be created.
27-
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the default partition are used. Note nodes selected **must** be in the default partition.
27+
- `hpctests_partition`: Optional. Name of the Slurm partition to use. If not set, the default partition is used.
28+
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'`, defining the nodes to use. If not set, all nodes in the selected partition are used.
2829
- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use).
2930
- `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user).
3031
- `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html).

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ hpctests_hpl_NB: 192
1010
hpctests_hpl_mem_frac: 0.8
1111
hpctests_hpl_arch: linux64
1212
#hpctests_nodes:
13+
#hpctests_partition:

ansible/roles/hpctests/library/slurm_node_info.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
options
2424
nodes:
2525
description:
26-
- Slurm nodenames for which information is required. These must be homogenous.
26+
- Slurm nodenames for which information is required.
2727
required: true
2828
type: list
2929
requirements:
@@ -56,7 +56,6 @@ def run_module():
5656
print(values)
5757
for ix, param in enumerate(params):
5858
info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']]
59-
# info[param] = [nodeinfo[nodelist_ix] for nodeinfo in values]
6059
result['info'] = info
6160

6261
module.exit_json(**result)

ansible/roles/hpctests/tasks/hpl-solo.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
- debug:
4343
msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}"
4444

45-
- name: Get all nodes
46-
shell: "sinfo --Node --noheader --format %N" # TODO: assumes only one partition, although actually excluding nodes not in the default partition should be fine.
45+
- name: Get all nodes in partition
46+
shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}"
4747
register: all_nodes
4848
changed_when: false
4949

@@ -74,6 +74,11 @@
7474
vars:
7575
hpctests_hplsolo_ntasks: 2 # TODO: FIXME
7676

77+
- name: Remove previous outputs
78+
# The number of output files depends on the number of nodes, so it can differ between partitions; stale files from a previous run would therefore not all be overwritten.
79+
shell:
80+
cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out"
81+
7782
- name: Run hpl-solo
7883
shell: sbatch --wait hpl-solo.sh
7984
become: no

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
---
22

3+
- name: Get partition information
4+
shell: "sinfo --format %P --noheader"
5+
register: _sinfo_partitions
6+
changed_when: false
7+
8+
- name: Select default partition if hpctests_partition not given
9+
set_fact:
10+
hpctests_partition: "{{ _sinfo_partitions.stdout_lines | select('contains', '*') | first | trim('*') }}"
11+
when: hpctests_partition is not defined
12+
313
- name: Get info about compute nodes
4-
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --format %N"
14+
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N"
515
register: hpctests_computes
616
changed_when: false
717
failed_when: hpctests_computes.rc != 0

ansible/roles/hpctests/templates/hpl-build.sh.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#SBATCH --output=%x.%a.out
55
#SBATCH --error=%x.%a.out
66
#SBATCH --exclusive
7+
#SBATCH --partition={{ hpctests_partition }}
78
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %}
89

910
echo HPL arch: {{ hpctests_hpl_arch }}

ansible/roles/hpctests/templates/hpl-solo.sh.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#SBATCH --error=%x.%a.out
66
#SBATCH --exclusive
77
#SBATCH --array=0-{{ hpctests_computes.stdout_lines | length - 1 }}
8+
#SBATCH --partition={{ hpctests_partition }}
89
{% if hpctests_hplsolo_excluded_nodes | length > 0 %}
910
#SBATCH --exclude={{ hpctests_hplsolo_excluded_nodes | join(',') }}
1011
{% endif %}

ansible/roles/hpctests/templates/pingmatrix.sh.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#SBATCH --output=%x.out
66
#SBATCH --error=%x.out
77
#SBATCH --exclusive
8+
#SBATCH --partition={{ hpctests_partition }}
89
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}
910

1011
export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}

ansible/roles/hpctests/templates/pingpong.sh.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#SBATCH --output=%x.out
66
#SBATCH --error=%x.out
77
#SBATCH --exclusive
8+
#SBATCH --partition={{ hpctests_partition }}
89
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}
910

1011
export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}

0 commit comments

Comments
 (0)