Skip to content

Commit 492a0d1

Browse files
authored
Merge branch 'main' into fix/templating2
2 parents f8839fe + ae5418b commit 492a0d1

File tree

14 files changed

+91
-202
lines changed

14 files changed

+91
-202
lines changed

.github/workflows/stackhpc.yml

Lines changed: 30 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ jobs:
1616
fail-fast: false # we want each cloud's job to continue independently of the others
1717
concurrency: ${{ github.ref }} # one concurrency group per branch/PR
1818
runs-on: ubuntu-20.04
19+
env:
20+
ANSIBLE_FORCE_COLOR: True
21+
OS_CLOUD: openstack
22+
TF_VAR_cluster_name: ci${{ github.run_id }}
1923
steps:
2024
- uses: actions/checkout@v2
2125

@@ -26,7 +30,6 @@ jobs:
2630
echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
2731
chmod 0600 ~/.ssh/id_rsa
2832
env:
29-
smslabs_SSH_KEY: ${{ secrets.SSH_KEY }}
3033
arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}
3134

3235
- name: Add bastion's ssh key to known_hosts
@@ -49,19 +52,14 @@ jobs:
4952
echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
5053
shell: bash
5154
env:
52-
smslabs_CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
5355
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
5456

5557
- name: Provision ports, inventory and other infrastructure apart from nodes
56-
id: provision_ports
5758
run: |
5859
. venv/bin/activate
5960
. environments/${{ matrix.cloud }}/activate
6061
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
6162
TF_VAR_create_nodes=false terraform apply -auto-approve
62-
env:
63-
OS_CLOUD: openstack
64-
TF_VAR_cluster_name: ci${{ github.run_id }}
6563
6664
- name: Setup environment-specific inventory/terraform inputs
6765
run: |
@@ -71,7 +69,6 @@ jobs:
7169
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
7270
ansible-playbook ansible/adhoc/template-cloud-init.yml
7371
env:
74-
ANSIBLE_FORCE_COLOR: True
7572
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
7673

7774
- name: Provision servers
@@ -81,9 +78,6 @@ jobs:
8178
. environments/${{ matrix.cloud }}/activate
8279
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
8380
terraform apply -auto-approve
84-
env:
85-
OS_CLOUD: openstack
86-
TF_VAR_cluster_name: ci${{ github.run_id }}
8781
8882
- name: Get server provisioning failure messages
8983
id: provision_failure
@@ -94,9 +88,6 @@ jobs:
9488
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
9589
echo TF failure messages: $TF_FAIL_MSGS
9690
echo "::set-output name=messages::${TF_FAIL_MSGS}"
97-
env:
98-
OS_CLOUD: openstack
99-
TF_VAR_cluster_name: ci${{ github.run_id }}
10091
if: always() && steps.provision_servers.outcome == 'failure'
10192

10293
- name: Delete infrastructure if failed due to lack of hosts
@@ -105,29 +96,21 @@ jobs:
10596
. environments/${{ matrix.cloud }}/activate
10697
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
10798
terraform destroy -auto-approve
108-
env:
109-
OS_CLOUD: openstack
110-
TF_VAR_cluster_name: ci${{ github.run_id }}
11199
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.messages, 'not enough hosts available') }}
112100

113101
- name: Directly configure cluster
114102
run: |
115103
. venv/bin/activate
116104
. environments/${{ matrix.cloud }}/activate
117105
ansible all -m wait_for_connection
118-
ansible-playbook -vv ansible/site.yml
119-
env:
120-
OS_CLOUD: openstack
121-
ANSIBLE_FORCE_COLOR: True
106+
ansible-playbook -v ansible/site.yml
107+
ansible-playbook -v ansible/ci/check_slurm.yml
122108
123109
- name: Run MPI-based tests
124110
run: |
125111
. venv/bin/activate
126112
. environments/${{ matrix.cloud }}/activate
127113
ansible-playbook -vv ansible/adhoc/hpctests.yml
128-
env:
129-
ANSIBLE_FORCE_COLOR: True
130-
OS_CLOUD: openstack
131114
132115
- name: Confirm Open Ondemand is up (via SOCKS proxy)
133116
run: |
@@ -161,62 +144,62 @@ jobs:
161144
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
162145

163146
- name: Build packer images
147+
id: packer_build
164148
run: |
165149
. venv/bin/activate
166150
. environments/${{ matrix.cloud }}/activate
167-
ansible-playbook ansible/adhoc/generate-passwords.yml
168-
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
169151
cd packer/
170152
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
171-
env:
172-
OS_CLOUD: openstack
173-
ANSIBLE_FORCE_COLOR: True
174-
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
153+
../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
175154
176-
- name: Test reimage of nodes
155+
- name: Test reimage of login nodes (via rebuild adhoc)
177156
run: |
178157
. venv/bin/activate
179158
. environments/${{ matrix.cloud }}/activate
180-
ansible all -m wait_for_connection
181-
ansible-playbook -vv ansible/ci/test_reimage.yml
182-
env:
183-
OS_CLOUD: openstack
184-
ANSIBLE_FORCE_COLOR: True
159+
ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
160+
ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
161+
ansible-playbook -v ansible/ci/check_slurm.yml
162+
163+
- name: Test reimage of compute nodes (via slurm)
164+
run: |
165+
. venv/bin/activate
166+
. environments/${{ matrix.cloud }}/activate
167+
ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
168+
ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
169+
ansible-playbook -v ansible/ci/check_slurm.yml
170+
171+
- name: Test reimage of control node (via rebuild adhoc)
172+
run: |
173+
. venv/bin/activate
174+
. environments/${{ matrix.cloud }}/activate
175+
ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
176+
ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
177+
ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
178+
ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
179+
ansible-playbook -v ansible/ci/check_slurm.yml
185180
186181
- name: Check sacct state survived reimage
187182
run: |
188183
. venv/bin/activate
189184
. environments/${{ matrix.cloud }}/activate
190185
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
191-
env:
192-
ANSIBLE_FORCE_COLOR: True
193-
OS_CLOUD: openstack
194186
195187
- name: Check MPI-based tests are shown in Grafana
196188
run: |
197189
. venv/bin/activate
198190
. environments/${{ matrix.cloud }}/activate
199191
ansible-playbook -vv ansible/ci/check_grafana.yml
200-
env:
201-
ANSIBLE_FORCE_COLOR: True
202-
OS_CLOUD: openstack
203192
204193
- name: Delete infrastructure
205194
run: |
206195
. venv/bin/activate
207196
. environments/${{ matrix.cloud }}/activate
208197
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
209198
terraform destroy -auto-approve
210-
env:
211-
OS_CLOUD: openstack
212-
TF_VAR_cluster_name: ci${{ github.run_id }}
213199
if: ${{ success() || cancelled() }}
214200

215201
- name: Delete images
216202
run: |
217203
. venv/bin/activate
218204
. environments/${{ matrix.cloud }}/activate
219205
ansible-playbook -vv ansible/ci/delete_images.yml
220-
env:
221-
OS_CLOUD: openstack
222-
ANSIBLE_FORCE_COLOR: True

ansible/adhoc/hpctests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
---
66

7-
- hosts: login[0] # TODO: might want to make which node is used selectable?
7+
- hosts: hpctests[0] # TODO: might want to make which node is used selectable?
88
become: false
99
gather_facts: false
1010
tasks:

ansible/bootstrap.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,6 @@
117117
async: "{{ 30 * 60 }}" # wait for up to 30 minutes
118118
poll: 15 # check every 15 seconds
119119
register: updates
120-
- debug:
121-
var: updates
122120
- name: Ensure update log directory on localhost exists
123121
file:
124122
path: "{{ update_log_path | dirname }}"

ansible/ci/check_slurm.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
- hosts: login:!builder # won't have a slurm control daemon when in build
2+
become: no
3+
gather_facts: false
4+
tasks:
5+
- name: Run sinfo
6+
shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,partition,partition_state,max_jobtime,num_nodes,node_state
7+
register: sinfo
8+
changed_when: false
9+
until: "'boot' not in sinfo.stdout_lines"
10+
retries: 5
11+
delay: 10
12+
- name: Check nodes have expected slurm state
13+
assert:
14+
that: sinfo.stdout_lines == expected_sinfo
15+
fail_msg: |
16+
sinfo output not as expected:
17+
actual:
18+
{{ sinfo.stdout_lines }}
19+
expected:
20+
{{ expected_sinfo }}
21+
<end>
22+
vars:
23+
expected_sinfo:
24+
- "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle"
25+
- "{{ openhpc_cluster_name }}-compute-[2-3] extra up 60-00:00:00 2 idle"

ansible/ci/test_reimage.yml

Lines changed: 0 additions & 122 deletions
This file was deleted.

ansible/roles/hpctests/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Role Variables
3535
The following variables should not generally be changed:
3636
- `hpctests_pingmatrix_modules`: Optional. List of modules to load for pingmatrix test. Defaults are suitable for OpenHPC 2.x cluster using the required packages.
3737
- `hpctests_pingpong_modules`: As above but for pingpong test.
38+
- `hpctests_pingpong_plot`: Whether to plot pingpong results. Default `yes`.
3839
- `hpctests_hpl_modules`: As above but for hpl tests.
3940
- `hpctests_hpl_version`: Version of HPL
4041

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
hpctests_rootdir:
33
hpctests_pingmatrix_modules: [gnu9 openmpi4]
44
hpctests_pingpong_modules: [gnu9 openmpi4 imb]
5+
hpctests_pingpong_plot: yes
56
hpctests_hpl_modules: [gnu9 openmpi4 openblas]
67
hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests"
78
hpctests_ucx_net_devices: all

ansible/roles/hpctests/tasks/pingpong.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,11 @@
4848

4949
- name: Plot image
5050
shell:
51-
cmd: "python {{lookup('env', 'APPLIANCES_REPO_ROOT') }}/ansible/roles/hpctests/files/plot_imb_pingpong.py {{ _pingpong_local_output }}"
51+
cmd: "python {{ role_path }}/files/plot_imb_pingpong.py {{ _pingpong_local_output }}"
5252
creates: "{{ _pingpong_local_output | dirname }}/latency.png"
5353
register: _pingpong_plot
5454
delegate_to: localhost
55+
when: hpctests_pingpong_plot | bool
5556

5657
- debug:
5758
msg: |
@@ -61,5 +62,7 @@
6162
Zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
6263
Max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
6364
65+
{% if hpctests_pingpong_plot %}
6466
See plot on localhost:
6567
{{ _pingpong_plot.stdout }}
68+
{% endif %}

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
- name: Select default partition if hpctests_partition not given
99
set_fact:
10-
hpctests_partition: "{{ _sinfo_partitions.stdout_lines | select('contains', '*') | first | trim('*') }}"
10+
hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}"
1111
when: hpctests_partition is not defined
1212

1313
- name: Get info about compute nodes

0 commit comments

Comments
 (0)