Skip to content

Commit 1b5e6a4

Browse files
committed
make smslabs CI similar to ci/arcus - running image build in parallel with direct deploy
1 parent 283b19c commit 1b5e6a4

File tree

7 files changed

+160
-100
lines changed

7 files changed

+160
-100
lines changed

.github/workflows/smslabs.yml

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
pull_request:
88
concurrency: stackhpc-ci # openstack project
99
jobs:
10-
openstack-example:
10+
smslabs:
1111
runs-on: ubuntu-20.04
1212
steps:
1313
- uses: actions/checkout@v2
@@ -77,63 +77,55 @@ jobs:
7777
TF_VAR_cluster_name: ci${{ github.run_id }}
7878
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}
7979

80-
- name: Configure infrastructure
80+
- name: Directly configure cluster and build compute + login images
81+
# see pre-hook for the image build
8182
run: |
8283
. venv/bin/activate
8384
. environments/smslabs/activate
8485
ansible all -m wait_for_connection
8586
ansible-playbook ansible/adhoc/generate-passwords.yml
87+
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
8688
ansible-playbook -vv ansible/site.yml
8789
env:
90+
OS_CLOUD: openstack
8891
ANSIBLE_FORCE_COLOR: True
8992
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
9093

91-
- name: Run MPI-based tests
92-
run: |
93-
. venv/bin/activate
94-
. environments/smslabs/activate
95-
ansible-playbook -vv ansible/adhoc/hpctests.yml
96-
env:
97-
ANSIBLE_FORCE_COLOR: True
98-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
99-
100-
- name: Build control and compute images
94+
- name: Test reimage of login and compute nodes
10195
run: |
10296
. venv/bin/activate
10397
. environments/smslabs/activate
104-
cd packer
105-
PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
98+
ansible all -m wait_for_connection
99+
ansible-playbook -vv ansible/ci/test_reimage.yml
106100
env:
107101
OS_CLOUD: openstack
108-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
102+
ANSIBLE_FORCE_COLOR: True
109103

110-
- name: Reimage compute nodes via slurm and check cluster still up
104+
- name: Run MPI-based tests
111105
run: |
112106
. venv/bin/activate
113107
. environments/smslabs/activate
114-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml
115-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
108+
ansible-playbook -vv ansible/adhoc/hpctests.yml
116109
env:
110+
ANSIBLE_FORCE_COLOR: True
117111
OS_CLOUD: openstack
118-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
119112

120-
- name: Reimage login nodes via openstack and check cluster still up
113+
- name: Delete infrastructure
121114
run: |
122115
. venv/bin/activate
123116
. environments/smslabs/activate
124-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml
125-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
117+
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
118+
terraform destroy -auto-approve
126119
env:
127120
OS_CLOUD: openstack
128-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
121+
TF_VAR_cluster_name: ci${{ github.run_id }}
122+
if: ${{ success() || cancelled() }}
129123

130-
- name: Delete infrastructure
124+
- name: Delete images
131125
run: |
132126
. venv/bin/activate
133127
. environments/smslabs/activate
134-
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
135-
terraform destroy -auto-approve
128+
ansible-playbook -vv ansible/ci/delete_images.yml
136129
env:
137130
OS_CLOUD: openstack
138-
TF_VAR_cluster_name: ci${{ github.run_id }}
139-
if: ${{ success() || cancelled() }}
131+
ANSIBLE_FORCE_COLOR: True
Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
1-
# Reimage login nodes via OpenStack
2-
3-
- hosts: login
1+
- hosts: login:!builder
42
become: no
3+
gather_facts: no
54
tasks:
65
- name: Read packer build manifest
76
set_fact:
87
manifest: "{{ lookup('file', manifest_path) | from_json }}"
98
vars:
109
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
1110
delegate_to: localhost
12-
13-
- name: Get latest login image build
11+
12+
- name: Get latest image builds
1413
set_fact:
1514
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
15+
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
1616

17-
- name: Reimage node via openstack
17+
- name: Delete images
1818
shell:
19-
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
19+
cmd: |
20+
openstack image delete {{ login_build.artifact_id }}
21+
openstack image delete {{ compute_build.artifact_id }}
2022
delegate_to: localhost
21-
22-
- name: Wait for connection
23-
wait_for_connection:
23+

ansible/ci/test_reimage.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
- hosts: login:!builder
2+
become: no
3+
tasks:
4+
- name: Read packer build manifest
5+
set_fact:
6+
manifest: "{{ lookup('file', manifest_path) | from_json }}"
7+
vars:
8+
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
9+
delegate_to: localhost
10+
11+
- name: Get latest image builds
12+
set_fact:
13+
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
14+
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
15+
16+
- name: Reimage login node via openstack
17+
shell:
18+
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
19+
delegate_to: localhost
20+
21+
- name: Check login node rebuild completed
22+
shell:
23+
cmd: openstack server show {{ inventory_hostname }} --format value -c image
24+
register: openstack_login
25+
delegate_to: localhost
26+
retries: 5
27+
delay: 30
28+
until: login_build.artifact_id in openstack_login.stdout
29+
changed_when: false
30+
31+
- name: Wait for login connection
32+
wait_for_connection:
33+
timeout: 800
34+
35+
- name: Check slurm up after reimaging login node
36+
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
37+
38+
# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
39+
- name: Request compute node rebuild via Slurm
40+
shell:
41+
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
42+
become: yes
43+
44+
- name: Check compute node rebuild completed
45+
shell:
46+
cmd: openstack server show {{ item }} --format value -c image
47+
register: openstack_compute
48+
delegate_to: localhost
49+
loop: "{{ groups['compute'] }}"
50+
retries: 5
51+
delay: 30
52+
until: compute_build.artifact_id in openstack_compute.stdout
53+
changed_when: false
54+
55+
- hosts: compute:!builder
56+
become: no
57+
gather_facts: no
58+
tasks:
59+
- name: Wait for compute connection
60+
wait_for_connection:
61+
timeout: 800
62+
63+
- name: Check slurm up after reimaging compute nodes
64+
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
65+
run_once: true

environments/smslabs/ci/reimage-compute.yml

Lines changed: 0 additions & 37 deletions
This file was deleted.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
- name: Run sinfo
2+
shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,partition,partition_state,max_jobtime,num_nodes,node_state (lines are sorted for a stable order)
3+
register: sinfo
4+
changed_when: false
5+
until: "'boot' not in sinfo.stdout" # substring test on the raw output; against stdout_lines (a list) 'in' only matches a whole line, so the retry never waited
6+
retries: 5
7+
delay: 10
8+
- name: Check nodes have expected slurm state
9+
assert:
10+
that: sinfo.stdout_lines == expected_sinfo
11+
fail_msg: |
12+
sinfo output not as expected:
13+
actual:
14+
{{ sinfo.stdout_lines }}
15+
expected:
16+
{{ expected_sinfo }}
17+
<end>
18+
vars:
19+
expected_sinfo:
20+
- "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle"

environments/smslabs/hooks/post.yml

Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,19 @@
1-
- hosts: login
1+
- hosts: login:!builder # won't have a slurm control daemon when in build
22
become: no
33
gather_facts: false
4-
tags: checks
54
tasks:
6-
- block:
7-
- name: Run sinfo
8-
shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
9-
register: sinfo
10-
changed_when: false
11-
- name: Check nodes have expected slurm state
12-
assert:
13-
that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them
14-
fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}"
15-
when: "'builder' not in group_names" # won't have a slurm control daemon when in build
5+
- name: Check slurm up after direct deploy
6+
import_tasks: check_slurm.yml
167

17-
- hosts: openondemand
18-
name: Check Open Ondemand is running
19-
tags:
20-
- checks
21-
- openondemand
22-
- openondemand_server
8+
- hosts: localhost
9+
become: false
10+
tags: build
2311
tasks:
24-
- uri:
25-
url: https://localhost
26-
validate_certs: false # selfsigned
27-
force_basic_auth: yes # as otherwise we get 401
28-
url_username: testuser
29-
url_password: "{{ test_user_password }}"
30-
status_code: 200
12+
- name: Check Packer build finished
13+
async_status:
14+
jid: "{{ packer_run.ansible_job_id }}"
15+
register: packer_result
16+
until: packer_result.finished
17+
retries: 30 # allow 15 mins
18+
delay: 30
19+
when: packer_run is defined # allows rerunning post.yml

environments/smslabs/hooks/pre.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
- hosts: localhost
2+
become: false
3+
tags: build
4+
tasks:
5+
- name: Ensure secrets generated
6+
include_role:
7+
name: passwords
8+
9+
- name: Build packer images
10+
shell:
11+
cmd: |
12+
cd packer
13+
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
14+
chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
15+
when: "'builder' not in group_names" # avoid recursion!
16+
register: packer_run
17+
async: 2700 # 45 minutes
18+
poll: 0
19+
20+
# For some reason squid shows TCP_MISS_ABORTED/200 on everything
21+
# - hosts: all
22+
# become: yes
23+
# gather_facts: no
24+
# tasks:
25+
# - name: Configure dnf proxy
26+
# community.general.ini_file:
27+
# path: /etc/dnf/dnf.conf
28+
# section: main
29+
# option: proxy
30+
# value: "{{ squid_proxy }}"
31+
# no_extra_spaces: true

0 commit comments

Comments
 (0)