1
1
2
- name : Test deployment and image build on OpenStack
2
+ name : Test deployment and reimage on OpenStack
3
3
on :
4
4
workflow_dispatch :
5
5
push :
8
8
pull_request :
9
9
jobs :
10
10
openstack :
11
- name : openstack-ci-${{ matrix.cloud }}
12
- strategy :
13
- matrix :
14
- cloud :
15
- - " arcus" # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
16
- fail-fast : false # as want clouds to continue independently
11
+ name : openstack-ci-arcus # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
17
12
concurrency : ${{ github.ref }} # to branch/PR
18
13
runs-on : ubuntu-20.04
19
14
env :
@@ -27,13 +22,13 @@ jobs:
27
22
run : |
28
23
set -x
29
24
mkdir ~/.ssh
30
- echo "${${{ matrix.cloud }}_SSH_KEY }" > ~/.ssh/id_rsa
25
+ echo "${arcus_SSH_KEY }" > ~/.ssh/id_rsa
31
26
chmod 0600 ~/.ssh/id_rsa
32
27
env :
33
28
arcus_SSH_KEY : ${{ secrets.ARCUS_SSH_KEY }}
34
29
35
30
# Appends the environment's bastion_fingerprint file to ~/.ssh/known_hosts.
# The path must be one word: a space before "/bastion_fingerprint" would make
# cat read the environment directory and a root-level file instead.
- name: Add bastion's ssh key to known_hosts
36
- run: cat environments/${{ matrix.cloud }}/bastion_fingerprint >> ~/.ssh/known_hosts
31
+ run: cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
37
32
shell: bash
38
33
39
34
- name : Install ansible etc
@@ -44,38 +39,38 @@ jobs:
44
39
45
40
# Runs `terraform init` inside the environment's terraform directory.
# working-directory is a single path, so it must not contain a space before
# "/terraform".
- name: Initialise terraform
46
41
run: terraform init
47
- working-directory: ${{ github.workspace }}/environments/${{ matrix.cloud }}/terraform
42
+ working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
48
43
49
44
# Writes the ARCUS_CLOUDS_YAML secret (exposed to the script through the step's
# env as arcus_CLOUDS_YAML) to ~/.config/openstack/clouds.yaml.
# No whitespace is allowed inside "${...}": bash rejects "${name }" as a bad
# substitution.
- name: Write clouds.yaml
50
45
run: |
51
46
mkdir -p ~/.config/openstack/
52
- echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
47
+ echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
53
48
shell: bash
54
49
env:
55
50
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
56
51
57
52
# Activates the Python venv and the environment, runs the generate-passwords
# playbook, then writes the test user's password (from the TEST_USER_PASSWORD
# secret) into the inventory's group_vars/all/test_user.yml.
# The activate script path must be one word so `.` sources the script itself.
- name: Setup environment-specific inventory/terraform inputs
58
53
run: |
59
54
. venv/bin/activate
60
- . environments/${{ matrix.cloud }}/activate
55
+ . environments/.stackhpc/activate
61
56
ansible-playbook ansible/adhoc/generate-passwords.yml
62
57
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
63
58
env:
64
59
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
65
60
66
# Applies the environment's terraform configuration non-interactively.
# The step id (provision_servers) is referenced by a later step's `if:`
# condition, so it must stay unchanged.
- - name: Provision servers
61
+ - name: Provision nodes using fat image
67
62
id: provision_servers
68
63
run: |
69
64
. venv/bin/activate
70
- . environments/${{ matrix.cloud }}/activate
65
+ . environments/.stackhpc/activate
71
66
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
72
67
terraform apply -auto-approve
73
68
74
69
- name : Get server provisioning failure messages
75
70
id : provision_failure
76
71
run : |
77
72
. venv/bin/activate
78
- . environments/${{ matrix.cloud }} /activate
73
+ . environments/.stackhpc /activate
79
74
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
80
75
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
81
76
echo TF failure messages: $TF_FAIL_MSGS
@@ -85,29 +80,29 @@ jobs:
85
80
# Destroys the terraform-managed infrastructure when provisioning failed and the
# collected failure messages contain 'not enough hosts available'.
# Step outputs are read as steps.<id>.outputs.<name>; a bare
# steps.provision_failure.messages is always empty, so the condition could
# never be true.
# NOTE(review): assumes the provision_failure step publishes an output named
# `messages` (that part of the step is not visible here) - confirm.
- name: Delete infrastructure if failed due to lack of hosts
86
81
run: |
87
82
. venv/bin/activate
88
- . environments/${{ matrix.cloud }}/activate
83
+ . environments/.stackhpc/activate
89
84
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
90
85
terraform destroy -auto-approve
91
86
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}
92
87
93
# Waits until every inventory host accepts connections, then runs site.yml
# followed by the Slurm check playbook.
# The activate script path must be one word so `.` sources the script itself.
- - name: Directly configure cluster
88
+ - name: Configure cluster
94
89
run: |
95
90
. venv/bin/activate
96
- . environments/${{ matrix.cloud }}/activate
91
+ . environments/.stackhpc/activate
97
92
ansible all -m wait_for_connection
98
93
ansible-playbook -v ansible/site.yml
99
94
ansible-playbook -v ansible/ci/check_slurm.yml
100
95
101
96
# Runs the hpctests adhoc playbook with the venv and environment activated.
# The activate script path must be one word so `.` sources the script itself.
- name: Run MPI-based tests
102
97
run: |
103
98
. venv/bin/activate
104
- . environments/${{ matrix.cloud }}/activate
99
+ . environments/.stackhpc/activate
105
100
ansible-playbook -vv ansible/adhoc/hpctests.yml
106
101
107
102
- name : Confirm Open Ondemand is up (via SOCKS proxy)
108
103
run : |
109
104
. venv/bin/activate
110
- . environments/${{ matrix.cloud }} /activate
105
+ . environments/.stackhpc /activate
111
106
112
107
# load ansible variables into shell:
113
108
ansible-playbook ansible/ci/output_vars.yml \
@@ -135,63 +130,55 @@ jobs:
135
130
env :
136
131
TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
137
132
138
- - name : Build packer images
139
- id : packer_build
140
- run : |
141
- . venv/bin/activate
142
- . environments/${{ matrix.cloud }}/activate
143
- cd packer/
144
- PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
145
- ../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
146
-
147
- - name : Test reimage of login nodes (via rebuild adhoc)
148
- run : |
149
- . venv/bin/activate
150
- . environments/${{ matrix.cloud }}/activate
151
- ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
152
- ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
153
- ansible-playbook -v ansible/ci/check_slurm.yml
154
-
155
- - name : Test reimage of compute nodes (via slurm)
156
- run : |
157
- . venv/bin/activate
158
- . environments/${{ matrix.cloud }}/activate
159
- ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
160
- ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
161
- ansible-playbook -v ansible/ci/check_slurm.yml
133
+ # - name: Build environment-specific compute image
134
+ # id: packer_build
135
+ # run: |
136
+ # . venv/bin/activate
137
+ # . environments/.stackhpc/activate
138
+ # cd packer/
139
+ # packer init .
140
+ # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
141
+ # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
142
+
143
+ # - name: Test reimage of compute nodes to new environment-specific image (via slurm)
144
+ # run: |
145
+ # . venv/bin/activate
146
+ # . environments/.stackhpc/activate
147
+ # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
148
+ # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
149
+ # ansible-playbook -v ansible/ci/check_slurm.yml
162
150
163
# New version: runs the rebuild adhoc playbook limited to the control and login
# groups (no rebuild_image override is passed), waits for all hosts to accept
# connections again, then re-runs site.yml and the Slurm check playbook.
# NOTE(review): the step name says "all nodes" but --limit names only
# control,login - presumably compute nodes are handled elsewhere; confirm.
- - name : Test reimage of control node (via rebuild adhoc)
151
+ - name : Test reimage of all nodes (via rebuild adhoc)
164
152
run : |
165
153
. venv/bin/activate
166
- . environments/${{ matrix.cloud }}/activate
167
- ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
168
- ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
169
- ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
170
- ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
154
+ . environments/.stackhpc/activate
155
+ ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
156
+ ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
157
+ ansible-playbook -v ansible/site.yml
171
158
ansible-playbook -v ansible/ci/check_slurm.yml
172
159
173
160
# Runs the check_sacct_hpctests playbook with the venv and environment
# activated.
# The activate script path must be one word so `.` sources the script itself.
- name: Check sacct state survived reimage
174
161
run: |
175
162
. venv/bin/activate
176
- . environments/${{ matrix.cloud }}/activate
163
+ . environments/.stackhpc/activate
177
164
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
178
165
179
166
# Runs the check_grafana playbook with the venv and environment activated.
# The activate script path must be one word so `.` sources the script itself.
- name: Check MPI-based tests are shown in Grafana
180
167
run: |
181
168
. venv/bin/activate
182
- . environments/${{ matrix.cloud }}/activate
169
+ . environments/.stackhpc/activate
183
170
ansible-playbook -vv ansible/ci/check_grafana.yml
184
171
185
172
# Destroys the terraform-managed infrastructure when the job succeeded or was
# cancelled; the condition does not cover failed runs, so their infrastructure
# is left in place.
# The activate script path must be one word so `.` sources the script itself.
- name: Delete infrastructure
186
173
run: |
187
174
. venv/bin/activate
188
- . environments/${{ matrix.cloud }}/activate
175
+ . environments/.stackhpc/activate
189
176
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
190
177
terraform destroy -auto-approve
191
178
if: ${{ success() || cancelled() }}
192
179
193
- - name : Delete images
194
- run : |
195
- . venv/bin/activate
196
- . environments/${{ matrix.cloud }} /activate
197
- ansible-playbook -vv ansible/ci/delete_images.yml
180
+ # - name: Delete images
181
+ # run: |
182
+ # . venv/bin/activate
183
+ # . environments/.stackhpc/activate
184
+ # ansible-playbook -vv ansible/ci/delete_images.yml
0 commit comments