Skip to content

Commit 416b440

Browse files
authored
Merge pull request #250 from stackhpc/feat/fatimage
Build fat image in appliance
2 parents c130ed9 + 7952e2c commit 416b440

File tree

34 files changed

+324
-170
lines changed

34 files changed

+324
-170
lines changed

.github/workflows/fatimage.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
2+
name: Build fat image
3+
on:
4+
workflow_dispatch:
5+
jobs:
6+
openstack:
7+
name: openstack-build-arcus
8+
concurrency: ${{ github.ref }} # to branch/PR
9+
runs-on: ubuntu-20.04
10+
env:
11+
ANSIBLE_FORCE_COLOR: True
12+
OS_CLOUD: openstack
13+
steps:
14+
- uses: actions/checkout@v2
15+
16+
- name: Setup ssh
17+
run: |
18+
set -x
19+
mkdir ~/.ssh
20+
echo "${arcus_SSH_KEY}" > ~/.ssh/id_rsa
21+
chmod 0600 ~/.ssh/id_rsa
22+
env:
23+
arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}
24+
25+
- name: Add bastion's ssh key to known_hosts
26+
run: cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
27+
shell: bash
28+
29+
- name: Install ansible etc
30+
run: dev/setup-env.sh
31+
32+
- name: Write clouds.yaml
33+
run: |
34+
mkdir -p ~/.config/openstack/
35+
echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
36+
shell: bash
37+
env:
38+
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
39+
40+
- name: Setup environment
41+
run: |
42+
. venv/bin/activate
43+
. environments/.stackhpc/activate
44+
45+
- name: Build fat image with packer  # builds only the openstack.openhpc source from packer/openstack.pkr.hcl
46+
id: packer_build
47+
run: |
48+
. venv/bin/activate
49+
. environments/.stackhpc/activate
50+
cd packer/
51+
packer init .  # init needs a template file or directory argument; with none it prints usage and exits non-zero
52+
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
53+
54+
- name: Get created image name from manifest  # exposes IMAGE_ID and IMAGE_NAME as step outputs
55+
id: manifest
56+
run: |
57+
. venv/bin/activate
58+
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)  # this step starts in the workspace root; the build step ran packer from packer/
59+
while ! openstack image show -f value -c name "$IMAGE_ID"; do  # poll until the image can be shown
60+
sleep 30
61+
done
62+
IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
63+
echo "::set-output name=IMAGE_ID::$IMAGE_ID"
64+
echo "::set-output name=IMAGE_NAME::$IMAGE_NAME"

.github/workflows/stackhpc.yml

Lines changed: 45 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
name: Test deployment and image build on OpenStack
2+
name: Test deployment and reimage on OpenStack
33
on:
44
workflow_dispatch:
55
push:
@@ -8,12 +8,7 @@ on:
88
pull_request:
99
jobs:
1010
openstack:
11-
name: openstack-ci-${{ matrix.cloud }}
12-
strategy:
13-
matrix:
14-
cloud:
15-
- "arcus" # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
16-
fail-fast: false # as want clouds to continue independently
11+
name: openstack-ci-arcus # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
1712
concurrency: ${{ github.ref }} # to branch/PR
1813
runs-on: ubuntu-20.04
1914
env:
@@ -27,13 +22,13 @@ jobs:
2722
run: |
2823
set -x
2924
mkdir ~/.ssh
30-
echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
25+
echo "${arcus_SSH_KEY}" > ~/.ssh/id_rsa
3126
chmod 0600 ~/.ssh/id_rsa
3227
env:
3328
arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}
3429

3530
- name: Add bastion's ssh key to known_hosts
36-
run: cat environments/${{ matrix.cloud }}/bastion_fingerprint >> ~/.ssh/known_hosts
31+
run: cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
3732
shell: bash
3833

3934
- name: Install ansible etc
@@ -44,38 +39,38 @@ jobs:
4439

4540
- name: Initialise terraform
4641
run: terraform init
47-
working-directory: ${{ github.workspace }}/environments/${{ matrix.cloud }}/terraform
42+
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
4843

4944
- name: Write clouds.yaml
5045
run: |
5146
mkdir -p ~/.config/openstack/
52-
echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
47+
echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
5348
shell: bash
5449
env:
5550
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
5651

5752
- name: Setup environment-specific inventory/terraform inputs
5853
run: |
5954
. venv/bin/activate
60-
. environments/${{ matrix.cloud }}/activate
55+
. environments/.stackhpc/activate
6156
ansible-playbook ansible/adhoc/generate-passwords.yml
6257
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
6358
env:
6459
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
6560

66-
- name: Provision servers
61+
- name: Provision nodes using fat image
6762
id: provision_servers
6863
run: |
6964
. venv/bin/activate
70-
. environments/${{ matrix.cloud }}/activate
65+
. environments/.stackhpc/activate
7166
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
7267
terraform apply -auto-approve
7368
7469
- name: Get server provisioning failure messages
7570
id: provision_failure
7671
run: |
7772
. venv/bin/activate
78-
. environments/${{ matrix.cloud }}/activate
73+
. environments/.stackhpc/activate
7974
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
8075
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
8176
echo TF failure messages: $TF_FAIL_MSGS
@@ -85,29 +80,29 @@ jobs:
8580
- name: Delete infrastructure if failed due to lack of hosts
8681
run: |
8782
. venv/bin/activate
88-
. environments/${{ matrix.cloud }}/activate
83+
. environments/.stackhpc/activate
8984
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
9085
terraform destroy -auto-approve
9186
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}  # step results are only reachable via .outputs; assumes provision_failure sets an output named `messages` - TODO confirm
9287

93-
- name: Directly configure cluster
88+
- name: Configure cluster
9489
run: |
9590
. venv/bin/activate
96-
. environments/${{ matrix.cloud }}/activate
91+
. environments/.stackhpc/activate
9792
ansible all -m wait_for_connection
9893
ansible-playbook -v ansible/site.yml
9994
ansible-playbook -v ansible/ci/check_slurm.yml
10095
10196
- name: Run MPI-based tests
10297
run: |
10398
. venv/bin/activate
104-
. environments/${{ matrix.cloud }}/activate
99+
. environments/.stackhpc/activate
105100
ansible-playbook -vv ansible/adhoc/hpctests.yml
106101
107102
- name: Confirm Open Ondemand is up (via SOCKS proxy)
108103
run: |
109104
. venv/bin/activate
110-
. environments/${{ matrix.cloud }}/activate
105+
. environments/.stackhpc/activate
111106
112107
# load ansible variables into shell:
113108
ansible-playbook ansible/ci/output_vars.yml \
@@ -135,63 +130,55 @@ jobs:
135130
env:
136131
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
137132

138-
- name: Build packer images
139-
id: packer_build
140-
run: |
141-
. venv/bin/activate
142-
. environments/${{ matrix.cloud }}/activate
143-
cd packer/
144-
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
145-
../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
146-
147-
- name: Test reimage of login nodes (via rebuild adhoc)
148-
run: |
149-
. venv/bin/activate
150-
. environments/${{ matrix.cloud }}/activate
151-
ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
152-
ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
153-
ansible-playbook -v ansible/ci/check_slurm.yml
154-
155-
- name: Test reimage of compute nodes (via slurm)
156-
run: |
157-
. venv/bin/activate
158-
. environments/${{ matrix.cloud }}/activate
159-
ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
160-
ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
161-
ansible-playbook -v ansible/ci/check_slurm.yml
133+
# - name: Build environment-specific compute image
134+
# id: packer_build
135+
# run: |
136+
# . venv/bin/activate
137+
# . environments/.stackhpc/activate
138+
# cd packer/
139+
# packer init
140+
# PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
141+
# ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
142+
143+
# - name: Test reimage of compute nodes to new environment-specific image (via slurm)
144+
# run: |
145+
# . venv/bin/activate
146+
# . environments/.stackhpc/activate
147+
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
148+
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
149+
# ansible-playbook -v ansible/ci/check_slurm.yml
162150

163-
- name: Test reimage of control node (via rebuild adhoc)
151+
- name: Test reimage of all nodes (via rebuild adhoc)
164152
run: |
165153
. venv/bin/activate
166-
. environments/${{ matrix.cloud }}/activate
167-
ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
168-
ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
169-
ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
170-
ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
154+
. environments/.stackhpc/activate
155+
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
156+
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
157+
ansible-playbook -v ansible/site.yml
171158
ansible-playbook -v ansible/ci/check_slurm.yml
172159
173160
- name: Check sacct state survived reimage
174161
run: |
175162
. venv/bin/activate
176-
. environments/${{ matrix.cloud }}/activate
163+
. environments/.stackhpc/activate
177164
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
178165
179166
- name: Check MPI-based tests are shown in Grafana
180167
run: |
181168
. venv/bin/activate
182-
. environments/${{ matrix.cloud }}/activate
169+
. environments/.stackhpc/activate
183170
ansible-playbook -vv ansible/ci/check_grafana.yml
184171
185172
- name: Delete infrastructure
186173
run: |
187174
. venv/bin/activate
188-
. environments/${{ matrix.cloud }}/activate
175+
. environments/.stackhpc/activate
189176
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
190177
terraform destroy -auto-approve
191178
if: ${{ success() || cancelled() }}
192179

193-
- name: Delete images
194-
run: |
195-
. venv/bin/activate
196-
. environments/${{ matrix.cloud }}/activate
197-
ansible-playbook -vv ansible/ci/delete_images.yml
180+
# - name: Delete images
181+
# run: |
182+
# . venv/bin/activate
183+
# . environments/.stackhpc/activate
184+
# ansible-playbook -vv ansible/ci/delete_images.yml

ansible/cleanup.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Clean up a Packer build VM
2+
3+
- meta: flush_handlers
4+
5+
- name: Remove dnf caches
6+
command: dnf clean all
7+
8+
- name: Delete /etc/resolv.conf
9+
# Required because if cloud-init (rather than NetworkManager) controls this file on next boot, it won't be entirely overwritten
10+
file:
11+
path: /etc/resolv.conf
12+
state: absent
13+
14+
- name: Delete any injected ssh config for rocky
15+
file:
16+
path: /home/rocky/.ssh/
17+
state: absent
18+
19+
- name: Run cloud-init cleanup
20+
command: cloud-init clean --logs --seed
21+

0 commit comments

Comments
 (0)