16
16
fail-fast : false # as want clouds to continue independently
17
17
concurrency : ${{ github.ref }} # to branch/PR
18
18
runs-on : ubuntu-20.04
19
+ env :
20
+ ANSIBLE_FORCE_COLOR : True
21
+ OS_CLOUD : openstack
22
+ TF_VAR_cluster_name : ci${{ github.run_id }}
19
23
steps :
20
24
- uses : actions/checkout@v2
21
25
26
30
echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
27
31
chmod 0600 ~/.ssh/id_rsa
28
32
env :
29
- smslabs_SSH_KEY : ${{ secrets.SSH_KEY }}
30
33
arcus_SSH_KEY : ${{ secrets.ARCUS_SSH_KEY }}
31
34
32
35
- name : Add bastion's ssh key to known_hosts
@@ -49,19 +52,14 @@ jobs:
49
52
echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
50
53
shell : bash
51
54
env :
52
- smslabs_CLOUDS_YAML : ${{ secrets.CLOUDS_YAML }}
53
55
arcus_CLOUDS_YAML : ${{ secrets.ARCUS_CLOUDS_YAML }}
54
56
55
57
- name : Provision ports, inventory and other infrastructure apart from nodes
56
- id : provision_ports
57
58
run : |
58
59
. venv/bin/activate
59
60
. environments/${{ matrix.cloud }}/activate
60
61
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
61
62
TF_VAR_create_nodes=false terraform apply -auto-approve
62
- env :
63
- OS_CLOUD : openstack
64
- TF_VAR_cluster_name : ci${{ github.run_id }}
65
63
66
64
- name : Setup environment-specific inventory/terraform inputs
67
65
run : |
71
69
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
72
70
ansible-playbook ansible/adhoc/template-cloud-init.yml
73
71
env :
74
- ANSIBLE_FORCE_COLOR : True
75
72
TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
76
73
77
74
- name : Provision servers
81
78
. environments/${{ matrix.cloud }}/activate
82
79
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
83
80
terraform apply -auto-approve
84
- env :
85
- OS_CLOUD : openstack
86
- TF_VAR_cluster_name : ci${{ github.run_id }}
87
81
88
82
- name : Get server provisioning failure messages
89
83
id : provision_failure
94
88
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
95
89
echo TF failure messages: $TF_FAIL_MSGS
96
90
echo "::set-output name=messages::${TF_FAIL_MSGS}"
97
- env :
98
- OS_CLOUD : openstack
99
- TF_VAR_cluster_name : ci${{ github.run_id }}
100
91
if : always() && steps.provision_servers.outcome == 'failure'
101
92
102
93
- name : Delete infrastructure if failed due to lack of hosts
@@ -105,29 +96,21 @@ jobs:
105
96
. environments/${{ matrix.cloud }}/activate
106
97
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
107
98
terraform destroy -auto-approve
108
- env :
109
- OS_CLOUD : openstack
110
- TF_VAR_cluster_name : ci${{ github.run_id }}
111
99
# Run the cleanup only when server provisioning failed AND the failure text captured by the `provision_failure` step (its `messages` output) reports a lack of hosts. Step outputs must be read via `steps.<id>.outputs.<name>`.
if : ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}
112
100
113
101
- name : Directly configure cluster
114
102
run : |
115
103
. venv/bin/activate
116
104
. environments/${{ matrix.cloud }}/activate
117
105
ansible all -m wait_for_connection
118
- ansible-playbook -vv ansible/site.yml
119
- env :
120
- OS_CLOUD : openstack
121
- ANSIBLE_FORCE_COLOR : True
106
+ ansible-playbook -v ansible/site.yml
107
+ ansible-playbook -v ansible/ci/check_slurm.yml
122
108
123
109
- name : Run MPI-based tests
124
110
run : |
125
111
. venv/bin/activate
126
112
. environments/${{ matrix.cloud }}/activate
127
113
ansible-playbook -vv ansible/adhoc/hpctests.yml
128
- env :
129
- ANSIBLE_FORCE_COLOR : True
130
- OS_CLOUD : openstack
131
114
132
115
- name : Confirm Open Ondemand is up (via SOCKS proxy)
133
116
run : |
@@ -161,62 +144,62 @@ jobs:
161
144
TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
162
145
163
146
- name : Build packer images
147
+ id : packer_build
164
148
run : |
165
149
. venv/bin/activate
166
150
. environments/${{ matrix.cloud }}/activate
167
- ansible-playbook ansible/adhoc/generate-passwords.yml
168
- echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
169
151
cd packer/
170
152
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
171
- env :
172
- OS_CLOUD : openstack
173
- ANSIBLE_FORCE_COLOR : True
174
- TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
153
+ ../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
175
154
176
- - name : Test reimage of nodes
155
+ - name : Test reimage of login nodes (via rebuild adhoc)
177
156
run : |
178
157
. venv/bin/activate
179
158
. environments/${{ matrix.cloud }}/activate
180
- ansible all -m wait_for_connection
181
- ansible-playbook -vv ansible/ci/test_reimage.yml
182
- env :
183
- OS_CLOUD : openstack
184
- ANSIBLE_FORCE_COLOR : True
159
+ ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
160
+ ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
161
+ ansible-playbook -v ansible/ci/check_slurm.yml
162
+
163
+ - name : Test reimage of compute nodes (via slurm)
164
+ run : |
165
+ . venv/bin/activate
166
+ . environments/${{ matrix.cloud }}/activate
167
+ ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
168
+ ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
169
+ ansible-playbook -v ansible/ci/check_slurm.yml
170
+
171
+ - name : Test reimage of control node (via rebuild adhoc)
172
+ run : |
173
+ . venv/bin/activate
174
+ . environments/${{ matrix.cloud }}/activate
175
+ ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
176
+ ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
177
+ ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
178
+ ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
179
+ ansible-playbook -v ansible/ci/check_slurm.yml
185
180
186
181
- name : Check sacct state survived reimage
187
182
run : |
188
183
. venv/bin/activate
189
184
. environments/${{ matrix.cloud }}/activate
190
185
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
191
- env :
192
- ANSIBLE_FORCE_COLOR : True
193
- OS_CLOUD : openstack
194
186
195
187
- name : Check MPI-based tests are shown in Grafana
196
188
run : |
197
189
. venv/bin/activate
198
190
. environments/${{ matrix.cloud }}/activate
199
191
ansible-playbook -vv ansible/ci/check_grafana.yml
200
- env :
201
- ANSIBLE_FORCE_COLOR : True
202
- OS_CLOUD : openstack
203
192
204
193
- name : Delete infrastructure
205
194
run : |
206
195
. venv/bin/activate
207
196
. environments/${{ matrix.cloud }}/activate
208
197
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
209
198
terraform destroy -auto-approve
210
- env :
211
- OS_CLOUD : openstack
212
- TF_VAR_cluster_name : ci${{ github.run_id }}
213
199
if : ${{ success() || cancelled() }}
214
200
215
201
- name : Delete images
216
202
run : |
217
203
. venv/bin/activate
218
204
. environments/${{ matrix.cloud }}/activate
219
205
ansible-playbook -vv ansible/ci/delete_images.yml
220
- env :
221
- OS_CLOUD : openstack
222
- ANSIBLE_FORCE_COLOR : True
0 commit comments