Skip to content

Commit d4055a7

Browse files
committed
Merge branch 'main' into fix/elasticsearch
2 parents 69a51f3 + a4ab33f commit d4055a7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+143
-580
lines changed

.github/workflows/stackhpc.yml

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ jobs:
1212
strategy:
1313
matrix:
1414
cloud:
15-
- "smslabs" # SMS-Labs OpenStack in stackhpc-ci project
1615
- "arcus" # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
1716
fail-fast: false # as want clouds to continue independently
1817
concurrency: ${{ matrix.cloud }}
@@ -70,7 +69,9 @@ jobs:
7069
. venv/bin/activate
7170
. environments/${{ matrix.cloud }}/activate
7271
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
73-
echo "::set-output name=messages::$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
72+
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
73+
echo $TF_FAIL_MSGS
74+
echo "::set-output name=messages::${TF_FAIL_MSGS}"
7475
env:
7576
OS_CLOUD: openstack
7677
TF_VAR_cluster_name: ci${{ github.run_id }}
@@ -87,8 +88,7 @@ jobs:
8788
TF_VAR_cluster_name: ci${{ github.run_id }}
8889
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}
8990

90-
- name: Directly configure cluster and build compute, login and control images
91-
# see pre-hook for the image build
91+
- name: Directly configure cluster
9292
run: |
9393
. venv/bin/activate
9494
. environments/${{ matrix.cloud }}/activate
@@ -131,9 +131,20 @@ jobs:
131131
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
132132
env:
133133
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
134-
135-
- name: Test reimage of login and compute nodes
136-
# TODO: test control node reimage
134+
135+
- name: Build packer images
136+
run: |
137+
. venv/bin/activate
138+
. environments/${{ matrix.cloud }}/activate
139+
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
140+
cd packer/
141+
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
142+
env:
143+
OS_CLOUD: openstack
144+
ANSIBLE_FORCE_COLOR: True
145+
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
146+
147+
- name: Test reimage of nodes
137148
run: |
138149
. venv/bin/activate
139150
. environments/${{ matrix.cloud }}/activate

ansible/bootstrap.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
# Need to change working directory otherwise we try to switch back to non-existent directory.
2424
become_flags: '-i'
2525
become: true
26+
- name: Reset ssh connection to allow user changes to affect ansible_user
27+
meta: reset_connection
2628

2729
- hosts: selinux
2830
gather_facts: false
@@ -94,6 +96,8 @@
9496
become: yes
9597
tags:
9698
- reboot
99+
- selinux
100+
- update
97101
tasks:
98102
- name: Check for pending reboot from package updates
99103
stat:

ansible/roles/fail2ban/.travis.yml

Lines changed: 0 additions & 29 deletions
This file was deleted.

ansible/roles/firewalld/.travis.yml

Lines changed: 0 additions & 29 deletions
This file was deleted.

ansible/roles/hpctests/.travis.yml

Lines changed: 0 additions & 29 deletions
This file was deleted.
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import matplotlib as mpl
2+
import matplotlib.pyplot as plt
3+
from matplotlib import ticker
4+
import numpy as np
5+
import os
6+
7+
def sizeof_fmt(num, suffix='B'):
    """ Format a byte count as a human-readable string using binary (1024-based) prefixes.

        num: number to format (int or float, may be negative).
        suffix: unit string appended after the prefix, default 'B'.

        Returns a str with one decimal place, e.g. 1536 -> '1.5KiB'. Values of 1024**8 or
        more are all expressed in 'Yi'.
    """
    # from https://stackoverflow.com/a/1094933/916373
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    # fell off the end of the prefix list: everything larger is reported in Yi
    return "%.1f%s%s" % (num, 'Yi', suffix)
15+
16+
def read_imb_out(path):
    """ Read stdout from an IMB-MPI1 run.

        Returns a dict with:
            key:= int, total number of processes involved
            value:= list of columns (one list per table column, in table order), i.e. one
                    entry per results table. Column types are as per COLTYPES below.

        If multiple results tables are present it is assumed that they are all the same benchmark,
        and only differ in the number of processes.

        Raises ValueError if the benchmark is not one of those in COLTYPES, or if the
        "# Benchmarking" line is not immediately followed by a "# #processes = N" line.
    """

    data = {}

    COLTYPES = { # all benchmark names here should be lowercase
        'uniband': (int, int, float, int), # #bytes #repetitions Mbytes/sec Msg/sec
        'biband': (int, int, float, int),
        'pingpong': (int, int, float, float), # #bytes #repetitions t[usec] Mbytes/sec
        'alltoall': (int, int, float, float, float) # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec]
    }
    NPROCS_PREFIX = '# #processes = '

    with open(path) as f:
        for line in f:
            if not line.startswith('# Benchmarking '):
                continue
            benchmark = line.split()[-1].lower()
            if benchmark not in COLTYPES:
                raise ValueError('Do not know how to read %r benchmark in %s' % (benchmark, path))
            converters = COLTYPES[benchmark]

            # next(f, '') rather than next(f): a truncated file must not leak StopIteration
            line = next(f, '')
            if not line.startswith(NPROCS_PREFIX):
                raise ValueError('expected %r, got %r' % (NPROCS_PREFIX, line))
            n_procs = int(line.split('=')[-1].strip())

            # Skip remaining '#' lines (may or may not include line
            # "# .. additional processes waiting in MPI_Barrier", plus other # lines).
            # The first line not starting with '#' is consumed here too and not parsed as data.
            while line.startswith('#'):
                line = next(f, '')

            # data rows run until the first blank line (or end of file)
            rows = []
            while True:
                line = next(f, '').strip()
                if line == '':
                    break
                rows.append([conv(v) for (conv, v) in zip(converters, line.split())])

            # turn data around: list of rows -> list of columns
            data[n_procs] = [[r[ic] for r in rows] for ic in range(len(converters))]
    return data
62+
63+
if __name__ == '__main__':
    # Usage: <this script> PATH
    # PATH is the stdout file of an IMB-MPI1 PingPong run. A latency/bandwidth plot is
    # written next to it and the plot's path is printed on stdout.
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: %s IMB_OUTPUT_FILE' % sys.argv[0])
    d = read_imb_out(sys.argv[1])
    if len(d) > 1:
        raise ValueError('Found > 1 benchmark in %s' % sys.argv[1])
    outdir = os.path.dirname(sys.argv[1])
    for n, df in d.items():
        # df is a list of columns; for pingpong: 0 = #bytes, 2 = t[usec], 3 = Mbytes/sec
        fig, ax1 = plt.subplots()
        ax2 = ax1.twinx() # second y-axis sharing the #bytes x-axis
        ax1.plot(df[0], df[2], label='latency', color='b')
        ax2.plot(df[0], df[3], label='bandwidth', color='r')
        ax1.set_xscale('log', base=2)
        ax1.set_yscale('log', base=10)
        ax1.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x)))
        ax1.grid(True, which="both")
        ax1.set_xlabel('#bytes')
        # raw string: '\m' is not a valid Python escape, the backslash is for mathtext
        ax1.set_ylabel(r'latency ($\mu$s)', color='b')
        ax2.set_ylabel('bandwidth (Mbytes/sec)', color='r')
        fig.legend(loc='upper left')
        plt.tight_layout()
        # NOTE(review): tasks/pingpong.yml uses `creates: .../latency.png` but the file
        # written here is pingpong.png - confirm which name is intended.
        figpath = os.path.join(outdir, 'pingpong.png')
        plt.savefig(figpath)
        plt.close(fig)
        print(figpath)

ansible/roles/hpctests/library/slurm_node_info.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,16 @@ def run_module():
4747
if module.check_mode:
4848
module.exit_json(**result)
4949

50-
node_spec = ','.join(module.params['nodes'])
51-
_, stdout,_ = module.run_command("sinfo --Format All --node %s" % node_spec, check_rc=True)
50+
_, stdout,_ = module.run_command("sinfo --Format All --Node", check_rc=True) # `--nodes` doesn't filter enough, other partitions are still shown
5251
lines = stdout.splitlines()
53-
# if len(lines) > 2:
54-
# raise ValueError('Info requested for nodes which are not homogenous: %s' % lines)
5552
info = {}
5653
params = [v.strip() for v in lines[0].split('|')]
5754
values = [line.split('|') for line in lines[1:]]
55+
nodelist_ix = params.index('NODELIST')
5856
print(values)
5957
for ix, param in enumerate(params):
60-
info[param] = [nodeinfo[ix].strip() for nodeinfo in values]
58+
info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']]
59+
# info[param] = [nodeinfo[nodelist_ix] for nodeinfo in values]
6160
result['info'] = info
6261

6362
module.exit_json(**result)

ansible/roles/hpctests/tasks/hpl-solo.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
- name: Check nodes are homogenous
1717
assert:
1818
that: "{{ hpctests_nodeinfo.info[item] | unique | length == 1 }}"
19-
fail_msg: "Selected nodes are not homogenous: {{ item }} = {{ hpctests_nodeinfo.info[item] }}"
19+
fail_msg: "Selected nodes are not homogenous: {{ item }} ({{ hpctests_nodeinfo.info['NODELIST'] }}) = {{ hpctests_nodeinfo.info[item] }}"
2020
loop:
2121
- SOCKETS
2222
- CORES

ansible/roles/hpctests/tasks/pingpong.yml

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,41 @@
2424
chdir: "{{ hpctests_rootdir }}/pingpong"
2525
register: hpctests_pingpong_sbatch
2626

27-
- name: Read pingpong
27+
- set_fact:
28+
_pingpong_jobid: "{{ hpctests_pingpong_sbatch.stdout.split()[-1] }}"
29+
- set_fact:
30+
_pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{_pingpong_jobid}}/pingpong.sh.out"
31+
32+
- name: Retrieve results file
33+
ansible.builtin.fetch:
34+
src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
35+
dest: "{{ _pingpong_local_output }}"
36+
flat: yes
37+
38+
- name: Read pingpong results
2839
read_imb_pingpong:
29-
path: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
40+
path: "{{ _pingpong_local_output }}"
3041
register: hpctests_pingpong_out
42+
delegate_to: localhost
3143

3244
- name: Read nodes used
33-
shell: "grep 'SLURM_JOB_NODELIST:' {{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
45+
shell: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}"
3446
register: hpctests_pingpong_run_nodes
47+
delegate_to: localhost
3548

49+
- name: Plot image
50+
shell:
51+
cmd: "python {{lookup('env', 'APPLIANCES_REPO_ROOT') }}/ansible/roles/hpctests/files/plot_imb_pingpong.py {{ _pingpong_local_output }}"
52+
creates: "{{ _pingpong_local_output | dirname }}/latency.png"
53+
register: _pingpong_plot
54+
delegate_to: localhost
55+
3656
- debug:
3757
msg: |
38-
Summary for pingpong (2x scheduler-selected nodes) job {{ hpctests_pingpong_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
58+
Summary for pingpong (2x scheduler-selected nodes) job {{ _pingpong_jobid }} (using interface {{ hpctests_ucx_net_devices }}):
3959
nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
4060
zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
4161
max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
62+
63+
See plot on localhost:
64+
{{ _pingpong_plot.stdout }}

ansible/roles/hpctests/templates/hpl-build.sh.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#SBATCH --output=%x.%a.out
55
#SBATCH --error=%x.%a.out
66
#SBATCH --exclusive
7-
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}
7+
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %}
88

99
echo HPL arch: {{ hpctests_hpl_arch }}
1010

environments/arcus/hooks/pre.yml

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,3 @@
1-
- hosts: localhost
2-
become: false
3-
tags: build
4-
tasks:
5-
- name: Ensure secrets generated
6-
include_role:
7-
name: passwords
8-
9-
- name: Build packer images
10-
shell:
11-
cmd: |
12-
cd packer
13-
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
14-
chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
15-
when: "'builder' not in group_names" # avoid recursion!
16-
register: packer_run
17-
async: 2700 # 45 minutes
18-
poll: 0
19-
201
- hosts: all
212
become: true
223
tags: etc_hosts

environments/common/inventory/groups

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,9 @@ cluster
7070
# Hosts to install fail2ban on to protect SSH - uses firewalld
7171
# https://www.fail2ban.org/wiki/index.php/Main_Page
7272

73-
[firewalld]
73+
[firewalld:children]
7474
# Hosts to install firewalld on - see ansible/roles/firewalld
75+
fail2ban
7576

7677
[block_devices]
7778
# Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md

environments/common/layouts/everything

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,6 @@ cluster
3737
# Hosts to install fail2ban on to protect SSH
3838
login
3939

40-
[firewalld:children]
41-
# Hosts to install firewalld on
42-
fail2ban
43-
4440
[block_devices:children]
4541
# Environment-specific so not defined here
4642

environments/smslabs/.gitignore

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)