Skip to content

Commit f8839fe

Browse files
authored
Merge branch 'main' into fix/templating2
2 parents 5d53dd6 + 531b3b9 commit f8839fe

File tree

45 files changed

+345
-103
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+345
-103
lines changed

.github/workflows/stackhpc.yml

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
cloud:
1515
- "arcus" # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
1616
fail-fast: false # as want clouds to continue independently
17-
concurrency: ${{ matrix.cloud }}
17+
concurrency: ${{ github.ref }} # to branch/PR
1818
runs-on: ubuntu-20.04
1919
steps:
2020
- uses: actions/checkout@v2
@@ -52,30 +52,52 @@ jobs:
5252
smslabs_CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
5353
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
5454

55-
- name: Provision infrastructure
56-
id: provision
55+
- name: Provision ports, inventory and other infrastructure apart from nodes
56+
id: provision_ports
5757
run: |
5858
. venv/bin/activate
5959
. environments/${{ matrix.cloud }}/activate
6060
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
61-
terraform apply -auto-approve
61+
TF_VAR_create_nodes=false terraform apply -auto-approve
6262
env:
6363
OS_CLOUD: openstack
6464
TF_VAR_cluster_name: ci${{ github.run_id }}
6565

66+
- name: Setup environment-specific inventory/terraform inputs
67+
run: |
68+
. venv/bin/activate
69+
. environments/${{ matrix.cloud }}/activate
70+
ansible-playbook ansible/adhoc/generate-passwords.yml
71+
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
72+
ansible-playbook ansible/adhoc/template-cloud-init.yml
73+
env:
74+
ANSIBLE_FORCE_COLOR: True
75+
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
76+
77+
- name: Provision servers
78+
id: provision_servers
79+
run: |
80+
. venv/bin/activate
81+
. environments/${{ matrix.cloud }}/activate
82+
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
83+
terraform apply -auto-approve
84+
env:
85+
OS_CLOUD: openstack
86+
TF_VAR_cluster_name: ci${{ github.run_id }}
87+
6688
- name: Get server provisioning failure messages
6789
id: provision_failure
6890
run: |
6991
. venv/bin/activate
7092
. environments/${{ matrix.cloud }}/activate
7193
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
7294
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
73-
echo $TF_FAIL_MSGS
95+
echo TF failure messages: $TF_FAIL_MSGS
7496
echo "::set-output name=messages::${TF_FAIL_MSGS}"
7597
env:
7698
OS_CLOUD: openstack
7799
TF_VAR_cluster_name: ci${{ github.run_id }}
78-
if: always() && steps.provision.outcome == 'failure'
100+
if: always() && steps.provision_servers.outcome == 'failure'
79101

80102
- name: Delete infrastructure if failed due to lack of hosts
81103
run: |
@@ -86,20 +108,17 @@ jobs:
86108
env:
87109
OS_CLOUD: openstack
88110
TF_VAR_cluster_name: ci${{ github.run_id }}
89-
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}
111+
# Step outputs are exposed as steps.<id>.outputs.<name>; the bare steps.<id>.<name> form evaluates to empty, so contains() would never match and this cleanup step would never run.
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}
90112

91113
- name: Directly configure cluster
92114
run: |
93115
. venv/bin/activate
94116
. environments/${{ matrix.cloud }}/activate
95117
ansible all -m wait_for_connection
96-
ansible-playbook ansible/adhoc/generate-passwords.yml
97-
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
98118
ansible-playbook -vv ansible/site.yml
99119
env:
100120
OS_CLOUD: openstack
101121
ANSIBLE_FORCE_COLOR: True
102-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
103122

104123
- name: Run MPI-based tests
105124
run: |
@@ -135,23 +154,24 @@ jobs:
135154
--server-response \
136155
--no-check-certificate \
137156
--http-user=testuser \
138-
--http-password=${TEST_USER_PASSWORD} https://${openondemand_servername} \
157+
--http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \
139158
2>&1)
140159
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
141160
env:
142-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
161+
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
143162

144163
- name: Build packer images
145164
run: |
146165
. venv/bin/activate
147166
. environments/${{ matrix.cloud }}/activate
148-
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
167+
ansible-playbook ansible/adhoc/generate-passwords.yml
168+
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
149169
cd packer/
150170
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
151171
env:
152172
OS_CLOUD: openstack
153173
ANSIBLE_FORCE_COLOR: True
154-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
174+
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
155175

156176
- name: Test reimage of nodes
157177
run: |

ansible/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ roles/*
2626
!roles/slurm_exporter/**
2727
!roles/firewalld/
2828
!roles/firewalld/**
29+
!roles/etc_hosts/
30+
!roles/etc_hosts/**
31+
!roles/cloud_init/
32+
!roles/cloud_init/**
2933
!roles/mysql/
3034
!roles/mysql/**
3135
!roles/systemd/

ansible/adhoc/hpctests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
---
66

7-
- hosts: hpctests[0] # TODO: might want to make which node is used selectable?
7+
- hosts: login[0] # TODO: might want to make which node is used selectable?
88
become: false
99
gather_facts: false
1010
tasks:

ansible/adhoc/template-cloud-init.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
- hosts: cloud_init
2+
become: no
3+
gather_facts: no
4+
tasks:
5+
- name: Template out cloud-init userdata
6+
import_role:
7+
name: cloud_init
8+
tasks_from: template.yml
9+
delegate_to: localhost

ansible/bootstrap.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,19 @@
1313
to update these variable names. ** NB: The actual secrets will not be changed.**
1414
when: "'secrets_openhpc_' in (hostvars[inventory_hostname] | join)"
1515

16+
- hosts: etc_hosts
17+
gather_facts: false
18+
tags: etc_hosts
19+
become: yes
20+
tasks:
21+
- name: Template /etc/hosts
22+
copy:
23+
content: "{{ etc_hosts_template }}"
24+
dest: /etc/hosts
25+
owner: root
26+
group: root
27+
mode: u=rw,og=r
28+
1629
- hosts: cluster
1730
gather_facts: false
1831
tasks:
@@ -74,6 +87,20 @@
7487
- import_role:
7588
name: fail2ban
7689

90+
- name: Setup podman
91+
hosts: podman
92+
tags: podman
93+
tasks:
94+
- import_role:
95+
name: podman
96+
tasks_from: prereqs.yml
97+
tags: prereqs
98+
99+
- import_role:
100+
name: podman
101+
tasks_from: config.yml
102+
tags: config
103+
77104
- hosts: update
78105
gather_facts: false
79106
become: yes

ansible/ci/test_reimage.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@
4545
gather_facts: no
4646
tags: reimage_compute
4747
tasks:
48-
# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
48+
# TODO: This is specific to arcus environment config - could generalise to all compute nodes
4949
- name: Request compute node rebuild via Slurm
5050
shell:
51-
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
51+
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-3]
5252
become: yes
5353

5454
- name: Check compute node rebuild completed

ansible/monitoring.yml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,6 @@
11
# ---
22
# # NOTE: Requires slurmdbd
33

4-
- name: Setup podman
5-
hosts: podman
6-
tags: podman
7-
tasks:
8-
- import_role:
9-
name: podman
10-
tasks_from: prereqs.yml
11-
tags: prereqs
12-
13-
- import_role:
14-
name: podman
15-
tasks_from: config.yml
16-
tags: config
17-
184
- name: Setup elasticsearch
195
hosts: opendistro
206
tags: opendistro

ansible/roles/cloud_init/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# cloud_init
2+
3+
Create cloud init userdata for instance groups.
4+
5+
# Requirements
6+
Image and cloud environment supporting cloud-init.
7+
8+
# Role Variables
9+
10+
- `cloud_init_output_path`: Required. Path to output userdata files to.
11+
- `cloud_init_userdata_templates`: Optional list. Each element is a dict with keys/values as follows:
12+
- `module`: Required str. Name of cloud_init [module](https://cloudinit.readthedocs.io/en/latest/topics/modules.html)
13+
- `group`: Optional str. Name of inventory group to which this config applies - if no group is specified then it applies to all groups. This allows defining `cloud_init_userdata_templates` for group `all`.
14+
- `template`: Jinja template for cloud_init module [configuration](https://cloudinit.readthedocs.io/en/latest/topics/modules.html).
15+
16+
Elements may repeat `module`; the resulting userdata cloud-config file will contain configuration from all applicable (by group) elements for that module.
17+
18+
Note that the appliance [constructs](../../../environments/common/inventory/group_vars/all/cloud_init.yml) `cloud_init_userdata_templates` from `cloud_init_userdata_templates_default` and `cloud_init_userdata_templates_extra` to
19+
allow easier customisation in specific environments.
20+
21+
# Dependencies
22+
None.
23+
24+
# Example Playbook
25+
See `ansible/adhoc/rebuild.yml`.
26+
27+
# License
28+
Apache 2.0
29+
30+
# Author Information
31+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#cloud_init_output_path:
2+
cloud_init_userdata_templates: []
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
- name: Template out cloud-init userdata
3+
ansible.builtin.template:
4+
src: userdata.yml.j2
5+
dest: "{{ cloud_init_output_path }}/{{ inventory_hostname }}.userdata.yml"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#cloud-config
2+
disable_ec2_metadata: true
3+
4+
{% for module, tmpls in cloud_init_userdata_templates | groupby(attribute='module') %}
5+
{% for tmpl in tmpls %}
6+
{% if not 'group' in tmpl or tmpl.group in group_names %}
7+
{% if loop.first %}
8+
{{ module }}:
9+
{% endif %}
10+
{{ tmpl.template }}
11+
{% endif %}
12+
{% endfor %}
13+
{% endfor %}

ansible/roles/etc_hosts/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# etc_hosts
2+
3+
This role provides documentation only.
4+
5+
Hosts in the `etc_hosts` group get `/etc/hosts` created via `cloud-init`. The generated file defines all hosts in this group using `ansible_host` as the IP address and `inventory_hostname` as the canonical hostname. This may need overriding for multi-homed hosts. See `environments/common/inventory/group_vars/all/cloud_init.yml` for configuration.

ansible/roles/hpctests/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ Role Variables
2424
--------------
2525

2626
- `hpctests_rootdir`: Required. Path to root of test directory tree, which must be on a r/w filesystem shared to all cluster nodes under test. The last directory component will be created.
27-
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the default partition are used. Note nodes selected **must** be in the default partition.
27+
- `hpctests_partition`: Optional. Name of partition to use, otherwise default partition is used.
28+
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the selected partition are used.
2829
- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use).
2930
- `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user).
3031
- `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html).

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ hpctests_hpl_NB: 192
1010
hpctests_hpl_mem_frac: 0.8
1111
hpctests_hpl_arch: linux64
1212
#hpctests_nodes:
13+
#hpctests_partition:

ansible/roles/hpctests/library/slurm_node_info.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
options
2424
nodes:
2525
description:
26-
- Slurm nodenames for which information is required. These must be homogenous.
26+
- Slurm nodenames for which information is required.
2727
required: true
2828
type: list
2929
requirements:
@@ -56,7 +56,6 @@ def run_module():
5656
print(values)
5757
for ix, param in enumerate(params):
5858
info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']]
59-
# info[param] = [nodeinfo[nodelist_ix] for nodeinfo in values]
6059
result['info'] = info
6160

6261
module.exit_json(**result)

ansible/roles/hpctests/tasks/hpl-solo.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
- debug:
4343
msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}"
4444

45-
- name: Get all nodes
46-
shell: "sinfo --Node --noheader --format %N" # TODO: assumes only one partition, although actually excluding nodes not in the default partition should be fine.
45+
- name: Get all nodes in partition
46+
shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}"
4747
register: all_nodes
4848
changed_when: false
4949

@@ -74,6 +74,11 @@
7474
vars:
7575
hpctests_hplsolo_ntasks: 2 # TODO: FIXME
7676

77+
- name: Remove previous outputs
78+
# The number of output files depends on the number of nodes, so it differs between partitions; stale files from a previous run would therefore not all be overwritten.
79+
shell:
80+
cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out"
81+
7782
- name: Run hpl-solo
7883
shell: sbatch --wait hpl-solo.sh
7984
become: no
@@ -111,10 +116,11 @@
111116
tags: postpro
112117
debug:
113118
msg: |
114-
Summary for hpl-solo ({{ hpctests_computes.stdout_lines | length }} nodes) job {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
119+
Summary for hpl-solo on {{ hpctests_computes.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
120+
115121
Max: {{ perf.stdout_lines | map('float') | max }} gflops
116122
Min: {{ perf.stdout_lines | map('float') | min }} gflops
117-
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
123+
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
118124
119125
Individual node results (gflops):
120126
{{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float') )) | to_nice_yaml }}

ansible/roles/hpctests/tasks/pingmatrix.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@
6161
- name: Summarise results
6262
debug:
6363
msg: |
64-
Summary for pingmatrix (pairwise on {{ slurm_names.stdout_lines | length }} nodes) job {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
64+
Summary for pingmatrix pairwise over {{ slurm_names.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
65+
6566
{{ nxnlatbw['stats'] | to_nice_yaml }}
67+
6668
Tabular output on ansible control host at {{ hpctests_outdir }}/pingmatrix.html

ansible/roles/hpctests/tasks/pingpong.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,11 @@
5555

5656
- debug:
5757
msg: |
58-
Summary for pingpong (2x scheduler-selected nodes) job {{ _pingpong_jobid }} (using interface {{ hpctests_ucx_net_devices }}):
59-
nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
60-
zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
61-
max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
58+
Summary for pingpong using 2x scheduler-selected nodes in '{{ hpctests_partition }}' partition, job ID {{ _pingpong_jobid }}, device '{{ hpctests_ucx_net_devices }}':
59+
60+
Nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
61+
Zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
62+
Max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
6263
6364
See plot on localhost:
6465
{{ _pingpong_plot.stdout }}

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
---
22

3+
- name: Get partition information
4+
shell: "sinfo --format %P --noheader"
5+
register: _sinfo_partitions
6+
changed_when: false
7+
8+
- name: Select default partition if hpctests_partition not given
9+
set_fact:
10+
hpctests_partition: "{{ _sinfo_partitions.stdout_lines | select('contains', '*') | first | trim('*') }}"
11+
when: hpctests_partition is not defined
12+
313
- name: Get info about compute nodes
4-
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --format %N"
14+
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N"
515
register: hpctests_computes
616
changed_when: false
717
failed_when: hpctests_computes.rc != 0

0 commit comments

Comments
 (0)