
Commit 9e4379e

Merge branch 'main' into fix/hpctests-partitions-jinja
2 parents: 96c270e + 9a6e0c3

File tree

8 files changed: +79 -199 lines


.github/workflows/stackhpc.yml

Lines changed: 30 additions & 47 deletions
@@ -16,6 +16,10 @@ jobs:
       fail-fast: false # as want clouds to continue independently
     concurrency: ${{ github.ref }} # to branch/PR
     runs-on: ubuntu-20.04
+    env:
+      ANSIBLE_FORCE_COLOR: True
+      OS_CLOUD: openstack
+      TF_VAR_cluster_name: ci${{ github.run_id }}
     steps:
       - uses: actions/checkout@v2

@@ -26,7 +30,6 @@ jobs:
           echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
           chmod 0600 ~/.ssh/id_rsa
         env:
-          smslabs_SSH_KEY: ${{ secrets.SSH_KEY }}
           arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}

       - name: Add bastion's ssh key to known_hosts

@@ -49,19 +52,14 @@ jobs:
           echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
         shell: bash
         env:
-          smslabs_CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
           arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}

       - name: Provision ports, inventory and other infrastructure apart from nodes
-        id: provision_ports
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
           TF_VAR_create_nodes=false terraform apply -auto-approve
-        env:
-          OS_CLOUD: openstack
-          TF_VAR_cluster_name: ci${{ github.run_id }}

       - name: Setup environment-specific inventory/terraform inputs
         run: |

@@ -71,7 +69,6 @@ jobs:
           echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
           ansible-playbook ansible/adhoc/template-cloud-init.yml
         env:
-          ANSIBLE_FORCE_COLOR: True
           TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

       - name: Provision servers

@@ -81,9 +78,6 @@ jobs:
           . environments/${{ matrix.cloud }}/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
           terraform apply -auto-approve
-        env:
-          OS_CLOUD: openstack
-          TF_VAR_cluster_name: ci${{ github.run_id }}

       - name: Get server provisioning failure messages
         id: provision_failure

@@ -94,9 +88,6 @@ jobs:
           TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
           echo TF failure messages: $TF_FAIL_MSGS
           echo "::set-output name=messages::${TF_FAIL_MSGS}"
-        env:
-          OS_CLOUD: openstack
-          TF_VAR_cluster_name: ci${{ github.run_id }}
         if: always() && steps.provision_servers.outcome == 'failure'

       - name: Delete infrastructure if failed due to lack of hosts

@@ -105,29 +96,21 @@ jobs:
           . environments/${{ matrix.cloud }}/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
           terraform destroy -auto-approve
-        env:
-          OS_CLOUD: openstack
-          TF_VAR_cluster_name: ci${{ github.run_id }}
         if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.messages, 'not enough hosts available') }}

       - name: Directly configure cluster
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           ansible all -m wait_for_connection
-          ansible-playbook -vv ansible/site.yml
-        env:
-          OS_CLOUD: openstack
-          ANSIBLE_FORCE_COLOR: True
+          ansible-playbook -v ansible/site.yml
+          ansible-playbook -v ansible/ci/check_slurm.yml

       - name: Run MPI-based tests
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           ansible-playbook -vv ansible/adhoc/hpctests.yml
-        env:
-          ANSIBLE_FORCE_COLOR: True
-          OS_CLOUD: openstack

       - name: Confirm Open Ondemand is up (via SOCKS proxy)
         run: |

@@ -161,62 +144,62 @@ jobs:
           TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

       - name: Build packer images
+        id: packer_build
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
-          ansible-playbook ansible/adhoc/generate-passwords.yml
-          echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
           cd packer/
           PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
-        env:
-          OS_CLOUD: openstack
-          ANSIBLE_FORCE_COLOR: True
-          TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
+          ../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs

-      - name: Test reimage of nodes
+      - name: Test reimage of login nodes (via rebuild adhoc)
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
-          ansible all -m wait_for_connection
-          ansible-playbook -vv ansible/ci/test_reimage.yml
-        env:
-          OS_CLOUD: openstack
-          ANSIBLE_FORCE_COLOR: True
+          ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
+          ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
+          ansible-playbook -v ansible/ci/check_slurm.yml
+
+      - name: Test reimage of compute nodes (via slurm)
+        run: |
+          . venv/bin/activate
+          . environments/${{ matrix.cloud }}/activate
+          ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
+          ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
+          ansible-playbook -v ansible/ci/check_slurm.yml
+
+      - name: Test reimage of control node (via rebuild adhoc)
+        run: |
+          . venv/bin/activate
+          . environments/${{ matrix.cloud }}/activate
+          ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
+          ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
+          ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
+          ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
+          ansible-playbook -v ansible/ci/check_slurm.yml

       - name: Check sacct state survived reimage
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
-        env:
-          ANSIBLE_FORCE_COLOR: True
-          OS_CLOUD: openstack

       - name: Check MPI-based tests are shown in Grafana
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           ansible-playbook -vv ansible/ci/check_grafana.yml
-        env:
-          ANSIBLE_FORCE_COLOR: True
-          OS_CLOUD: openstack

       - name: Delete infrastructure
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
           terraform destroy -auto-approve
-        env:
-          OS_CLOUD: openstack
-          TF_VAR_cluster_name: ci${{ github.run_id }}
         if: ${{ success() || cancelled() }}

       - name: Delete images
         run: |
           . venv/bin/activate
           . environments/${{ matrix.cloud }}/activate
           ansible-playbook -vv ansible/ci/delete_images.yml
-        env:
-          OS_CLOUD: openstack
-          ANSIBLE_FORCE_COLOR: True
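
Note: the new "Test reimage of compute nodes (via slurm)" step drives the reimage through Slurm itself rather than the rebuild adhoc playbook. A minimal sketch of the equivalent commands run directly on the login node, assuming (as in this appliance) a RebootProgram that rebuilds a node via OpenStack when the reboot reason has the form 'rebuild image:<uuid>'; the cluster name and image UUID below are placeholders:

# Drain each node and reboot (here: rebuild) it once running jobs finish;
# nextstate=RESUME returns it to service automatically when it responds again.
NEW_IMAGE_ID=9aabd73d-e550-4116-a90c-700478b722ce
sudo scontrol reboot ASAP nextstate=RESUME \
    reason="rebuild image:${NEW_IMAGE_ID}" mycluster-compute-[0-3]

# As in ansible/ci/check_slurm.yml, poll until no node still reports a
# boot-related state, then inspect the final node states:
until ! sinfo --noheader --format="%N %P %a %l %D %t" | grep -q boot; do
    sleep 10
done
sinfo --noheader --format="%N %P %a %l %D %t" | sort

Because scontrol reboot ASAP drains jobs before acting, the workflow only needs wait_for_connection and a check_slurm run afterwards.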

ansible/bootstrap.yml

Lines changed: 0 additions & 2 deletions
@@ -117,8 +117,6 @@
       async: "{{ 30 * 60 }}" # wait for up to 30 minutes
       poll: 15 # check every 15 seconds
       register: updates
-    - debug:
-        var: updates
     - name: Ensure update log directory on localhost exists
       file:
         path: "{{ update_log_path | dirname }}"

ansible/ci/check_slurm.yml

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+- hosts: login:!builder # won't have a slurm control daemon when in build
+  become: no
+  gather_facts: false
+  tasks:
+    - name: Run sinfo
+      shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,partition,partition_state,max_jobtime,num_nodes,node_state
+      register: sinfo
+      changed_when: false
+      until: "'boot' not in sinfo.stdout_lines"
+      retries: 5
+      delay: 10
+    - name: Check nodes have expected slurm state
+      assert:
+        that: sinfo.stdout_lines == expected_sinfo
+        fail_msg: |
+          sinfo output not as expected:
+          actual:
+          {{ sinfo.stdout_lines }}
+          expected:
+          {{ expected_sinfo }}
+          <end>
+      vars:
+        expected_sinfo:
+          - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle"
+          - "{{ openhpc_cluster_name }}-compute-[2-3] extra up 60-00:00:00 2 idle"

ansible/ci/test_reimage.yml

Lines changed: 0 additions & 122 deletions
This file was deleted.

dev/output_manifest.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Set github workflow output parameters defining image IDs from a packer manifest.
+# Usage:
+#     ./dev/output_manifest.py packer/packer-manifest.json
+
+# E.g. assuming the default packer builds this will produce something like:
+#   ::set-output name=NEW_COMPUTE_IMAGE_ID::9aabd73d-e550-4116-a90c-700478b722ce
+#   ::set-output name=NEW_LOGIN_IMAGE_ID::87b41d58-d7e3-4c38-be05-453c3287ecab
+#   ::set-output name=NEW_CONTROL_IMAGE_ID::7f812168-73fe-4a60-b9e9-9109a405390d
+# which can be used in subsequent workflow steps: [1]
+#
+# [1]: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-setting-a-value
+
+import sys, json
+output = {}
+with open(sys.argv[1]) as f:
+    data = json.load(f)
+for build in data['builds']:
+    node_type = build['custom_data']['source']
+    image_id = build['artifact_id']
+    output[node_type] = image_id # NB: this deliberately gets the LAST build for a node type
+for node_type, image_id in output.items():
+    print('::set-output name=NEW_%s_IMAGE_ID::%s' % (node_type.upper(), image_id))
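
A hypothetical run of the script against a cut-down manifest; the field names ('builds', 'custom_data.source', 'artifact_id') are those the script reads, and the UUIDs are taken from its example comments:

# Write a minimal two-build manifest, then emit the workflow outputs:
cat > packer-manifest.json <<'EOF'
{"builds": [
  {"custom_data": {"source": "login"},
   "artifact_id": "87b41d58-d7e3-4c38-be05-453c3287ecab"},
  {"custom_data": {"source": "compute"},
   "artifact_id": "9aabd73d-e550-4116-a90c-700478b722ce"}
]}
EOF
./dev/output_manifest.py packer-manifest.json
# ::set-output name=NEW_LOGIN_IMAGE_ID::87b41d58-d7e3-4c38-be05-453c3287ecab
# ::set-output name=NEW_COMPUTE_IMAGE_ID::9aabd73d-e550-4116-a90c-700478b722ce

Since later entries overwrite earlier ones in the dict, a manifest with several builds for one source yields the last image ID, as the NB comment notes.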

environments/arcus/hooks/check_slurm.yml

Lines changed: 0 additions & 21 deletions
This file was deleted.

environments/arcus/hooks/post.yml

Lines changed: 0 additions & 6 deletions
This file was deleted.
