Skip to content

Commit 7fc8085

Browse files
committed
Merge branch 'main' into feature/control-images2
2 parents 3b037fd + 192edee commit 7fc8085

File tree

11 files changed

+243
-120
lines changed

11 files changed

+243
-120
lines changed

.github/workflows/smslabs.yml

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
pull_request:
88
concurrency: stackhpc-ci # openstack project
99
jobs:
10-
openstack-example:
10+
smslabs:
1111
runs-on: ubuntu-20.04
1212
steps:
1313
- uses: actions/checkout@v2
@@ -77,63 +77,56 @@ jobs:
7777
TF_VAR_cluster_name: ci${{ github.run_id }}
7878
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}
7979

80-
- name: Configure infrastructure
80+
- name: Directly configure cluster and build compute, login and control images
81+
# see pre-hook for the image build
8182
run: |
8283
. venv/bin/activate
8384
. environments/smslabs/activate
8485
ansible all -m wait_for_connection
8586
ansible-playbook ansible/adhoc/generate-passwords.yml
87+
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
8688
ansible-playbook -vv ansible/site.yml
8789
env:
90+
OS_CLOUD: openstack
8891
ANSIBLE_FORCE_COLOR: True
8992
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
9093

91-
- name: Run MPI-based tests
92-
run: |
93-
. venv/bin/activate
94-
. environments/smslabs/activate
95-
ansible-playbook -vv ansible/adhoc/hpctests.yml
96-
env:
97-
ANSIBLE_FORCE_COLOR: True
98-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
99-
100-
- name: Build control, login and compute images
94+
- name: Test reimage of login and compute nodes
95+
# TODO: test control node reimage
10196
run: |
10297
. venv/bin/activate
10398
. environments/smslabs/activate
104-
cd packer
105-
PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
99+
ansible all -m wait_for_connection
100+
ansible-playbook -vv ansible/ci/test_reimage.yml
106101
env:
107102
OS_CLOUD: openstack
108-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
103+
ANSIBLE_FORCE_COLOR: True
109104

110-
- name: Reimage compute nodes via slurm and check cluster still up
105+
- name: Run MPI-based tests
111106
run: |
112107
. venv/bin/activate
113108
. environments/smslabs/activate
114-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml
115-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
109+
ansible-playbook -vv ansible/adhoc/hpctests.yml
116110
env:
111+
ANSIBLE_FORCE_COLOR: True
117112
OS_CLOUD: openstack
118-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
119113

120-
- name: Reimage login nodes via openstack and check cluster still up
114+
- name: Delete infrastructure
121115
run: |
122116
. venv/bin/activate
123117
. environments/smslabs/activate
124-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml
125-
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
118+
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
119+
terraform destroy -auto-approve
126120
env:
127121
OS_CLOUD: openstack
128-
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
122+
TF_VAR_cluster_name: ci${{ github.run_id }}
123+
if: ${{ success() || cancelled() }}
129124

130-
- name: Delete infrastructure
125+
- name: Delete images
131126
run: |
132127
. venv/bin/activate
133128
. environments/smslabs/activate
134-
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
135-
terraform destroy -auto-approve
129+
ansible-playbook -vv ansible/ci/delete_images.yml
136130
env:
137131
OS_CLOUD: openstack
138-
TF_VAR_cluster_name: ci${{ github.run_id }}
139-
if: ${{ success() || cancelled() }}
132+
ANSIBLE_FORCE_COLOR: True
Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
1-
# Reimage login nodes via OpenStack
2-
3-
- hosts: login
1+
- hosts: login:!builder
42
become: no
3+
gather_facts: no
54
tasks:
65
- name: Read packer build manifest
76
set_fact:
87
manifest: "{{ lookup('file', manifest_path) | from_json }}"
98
vars:
109
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
1110
delegate_to: localhost
12-
13-
- name: Get latest login image build
11+
12+
- name: Get latest image builds
1413
set_fact:
1514
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
15+
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
1616

17-
- name: Reimage node via openstack
17+
- name: Delete images
1818
shell:
19-
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
19+
cmd: |
20+
openstack image delete {{ login_build.artifact_id }}
21+
openstack image delete {{ compute_build.artifact_id }}
2022
delegate_to: localhost
21-
22-
- name: Wait for connection
23-
wait_for_connection:
23+

ansible/ci/test_reimage.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Rebuild the login node and then the compute nodes from the most recent
# Packer-built images, checking that Slurm is up after the login rebuild.
- hosts: login:!builder
  become: no
  tasks:
    # Load the JSON manifest at packer/packer-manifest.json under
    # APPLIANCES_REPO_ROOT.
    - name: Read packer build manifest
      set_fact:
        manifest: "{{ lookup('file', manifest_path) | from_json }}"
      vars:
        manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
      delegate_to: localhost

    # Pick the last manifest entry whose custom_data marks it as a
    # login / compute build respectively.
    - name: Get latest image builds
      set_fact:
        login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
        compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"

    - name: Reimage login node via openstack
      shell:
        cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
      delegate_to: localhost

    # Poll the same server identifier that was passed to the rebuild above
    # until its image field contains the new login image ID
    # (up to 5 retries, 30 seconds apart).
    - name: Check login node rebuild completed
      shell:
        cmd: "openstack server show {{ instance_id | default(inventory_hostname) }} --format value -c image"
      register: openstack_login
      delegate_to: localhost
      retries: 5
      delay: 30
      until: login_build.artifact_id in openstack_login.stdout
      changed_when: false

    - name: Wait for login connection
      wait_for_connection:
        timeout: 800

    - name: Check slurm up after reimaging login node
      import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"

    # TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
    # The reboot reason carries the ID of the compute image to rebuild with.
    - name: Request compute node rebuild via Slurm
      shell:
        cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
      become: yes

    # For every host in the compute group, poll its image field until it
    # contains the new compute image ID (up to 5 retries, 30 seconds apart).
    - name: Check compute node rebuild completed
      shell:
        cmd: openstack server show {{ item }} --format value -c image
      register: openstack_compute
      delegate_to: localhost
      loop: "{{ groups['compute'] }}"
      retries: 5
      delay: 30
      until: compute_build.artifact_id in openstack_compute.stdout
      changed_when: false
54+
55+
# After the compute nodes have been reimaged, wait until they are reachable
# again and then check Slurm once for the whole play.
- hosts: compute:!builder
  become: no
  gather_facts: no
  tasks:
    - name: Wait for compute connection
      wait_for_connection:
        timeout: 800

    # run_once: the imported Slurm check is executed a single time rather
    # than once per compute host.
    - name: Check slurm up after reimaging compute nodes
      import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
      run_once: true

ansible/roles/block_devices/README.md

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ This is a convenience wrapper around the ansible modules:
1111

1212
It includes logic to handle OpenStack-provided volumes appropriately both for appliance instances and the Packer build VM.
1313

14+
To avoid issues with device names changing after e.g. reboots, devices are identified by serial number and mounted by filesystem UUID.
15+
1416
Requirements
1517
------------
1618

@@ -20,7 +22,7 @@ Role Variables
2022
--------------
2123

2224
- `block_devices_partition_state`: Optional. Partition state, 'present' or 'absent' (as for parted) or 'skip'. Defaults to 'present'.
23-
- `block_devices_device`: Required. Path to block device, e.g. '/dev/sda'. See `community.general.parted:device` and `community.general.filesystem:dev`.
25+
- `block_devices_serial`: Required. Serial number of block device. For an OpenStack volume this is the volume ID.
2426
- `block_devices_number`: Required. Partition number, e.g. 1 for "/dev/sda1". See `community.general.parted:number`.
2527
- `block_devices_fstype`: Required. Filesystem type, e.g. 'ext4'. See `community.general.filesystem:fstype`.
2628
- `block_devices_resizefs`: Optional. Grow filesystem into block device space, 'yes' or 'no' (default). See `community.general.filesystem:resizefs` for applicable filesystem types.
@@ -51,7 +53,7 @@ Example Playbook
5153
The example variables below create partition 1 with an `ext4` filesystem on the block device with the given serial number and mount it as `/mnt/files` with the default owner/group:
5254

5355
```yaml
54-
block_devices_device: /dev/sdb
56+
block_devices_serial: a1076455-da55-4e0c-bac8-ccc4698cff97
5557
block_devices_number: 1
5658
block_devices_fstype: ext4
5759
block_devices_path: /mnt/files
@@ -61,25 +63,12 @@ This does the same:
6163

6264
```yaml
6365
block_devices_configurations:
64-
- device: /dev/sdb
66+
- serial: a1076455-da55-4e0c-bac8-ccc4698cff97
6567
number: 1
6668
fstype: ext4
6769
path: /mnt/files
6870
```
6971

70-
This creates 'ext4' partitions on `/dev/sdb1` on `server` and `/dev/sdc1` on `server2`, both mounted at `/mnt/files`:
71-
72-
```yaml
73-
block_devices_fstype: ext4
74-
block_devices_path: /mnt/files
75-
block_devices_number: 1
76-
block_devices_configurations:
77-
- device: /dev/sdb
78-
hostnames: server1
79-
- device: /dev/sdc
80-
hostnames: server2
81-
```
82-
8372
License
8473
-------
8574

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/python
2+
3+
# Copyright: (c) 2021, StackHPC
4+
# Apache 2 License
5+
6+
# Module documentation block (YAML). This module takes no options, which is
# expressed as an empty mapping.
DOCUMENTATION = r'''
---
module: block_devices

short_description: Return block device paths by serial number.

options: {}

author:
    - Steve Brasier (@sjpb)
'''

# Return-value documentation block (YAML). The key stating when a value is
# returned is `returned`.
RETURN = r'''
devices:
    description: dict with device serial numbers as keys and full paths (e.g. /dev/sdb) as values
    type: dict
    returned: always
'''
24+
25+
import json
26+
27+
from ansible.module_utils.basic import AnsibleModule
28+
29+
def run_module():
    """Report block device paths keyed by device serial number.

    Runs ``lsblk --paths --json -O`` and exits the module with a ``devices``
    dict mapping the ``serial`` of each entry in lsblk's ``blockdevices``
    list to that entry's ``name``. The module takes no arguments and always
    reports ``changed: False``.

    Entries for which lsblk reports no serial are left out, since they
    cannot be looked up by serial and would otherwise all share one
    ``None`` key.
    """
    module_args = dict()
    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
    result = {"changed": False}
    # check_rc=True: a non-zero exit status from lsblk fails the module.
    _, stdout, _ = module.run_command("lsblk --paths --json -O", check_rc=True)

    device_info = json.loads(stdout)['blockdevices']
    result['devices'] = {
        item['serial']: item['name']
        for item in device_info
        if item.get('serial')
    }
    module.exit_json(**result)
38+
39+
def main():
    """Entry point: hand over to run_module()."""
    run_module()
41+
42+
43+
if __name__ == '__main__':
44+
main()

ansible/roles/block_devices/tasks/main.yml

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,55 @@
1+
- name: Enumerate block device paths by serial number
2+
block_devices:
3+
register: _block_devices
4+
15
- name: Create partitions
26
parted:
3-
device: "{{ item.get('device', block_devices_device) }}"
7+
device: "{{ _device }}"
48
number: "{{ item.get('number', block_devices_number) }}"
59
state: "{{ item.get('partition_state', block_devices_partition_state) }}"
610
when: "item.get('partition_state', block_devices_partition_state) != 'skip'"
711
loop: "{{ block_devices_configurations }}"
12+
vars:
13+
_device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}"
814

915
- name: Create filesystems
1016
filesystem:
1117
fstype: "{{ item.get('fstype', block_devices_fstype) }}"
12-
dev: "{{ item.get('device', block_devices_device) }}{{ item.get('number', block_devices_number) }}"
18+
dev: "{{ _device }}{{ item.get('number', block_devices_number) }}"
1319
resizefs: "{{ item.get('resizefs', block_devices_resizefs) }}"
1420
state: "{{ item.get('filesystem_state', block_devices_filesystem_state) }}"
1521
when: "item.get('filesystem_state', block_devices_filesystem_state) != 'skip'"
1622
loop: "{{ block_devices_configurations }}"
23+
vars:
24+
_device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}"
25+
26+
- name: Get filesystem UUIDs
27+
command:
28+
cmd: "lsblk {{ _device }}{{ item.get('number', block_devices_number) }} --noheadings --output UUID"
29+
loop: "{{ block_devices_configurations }}"
30+
vars:
31+
_device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}"
32+
register: block_devices_uuids
33+
changed_when: false
34+
check_mode: no
1735

1836
- name: Ensure mount point exists
1937
file:
2038
path: "{{ item.get('path', block_devices_path) }}"
2139
state: directory
2240
loop: "{{ block_devices_configurations }}"
2341

24-
- name: Mount filesystems
42+
- name: Mount filesystems by UUID
2543
mount:
2644
path: "{{ item.get('path', block_devices_path) }}"
27-
src: "{{ item.get('device', block_devices_device) }}{{ item.get('number', block_devices_number) }}"
45+
src: "UUID={{ _uuid }}"
2846
fstype: "{{ item.get('fstype', block_devices_fstype) }}"
2947
state: "{{ item.get('mount_state', block_devices_mount_state) }}"
48+
vars:
49+
_uuid: "{{ block_devices_uuids.results[block_devices_idx].stdout }}"
3050
loop: "{{ block_devices_configurations }}"
51+
loop_control:
52+
index_var: block_devices_idx
3153

3254
- name: Set owner/group for mounted directory
3355
file:

ansible/roles/openondemand/tasks/pam_auth.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,11 @@
2121
mode: 0640
2222
group: apache
2323

24+
- name: Allow httpd access to PAM in SELinux
25+
ansible.posix.seboolean:
26+
name: httpd_mod_auth_pam
27+
state: yes
28+
persistent: yes
29+
when: ansible_facts.selinux.status == 'enabled'
30+
2431
# TODO: do we need to restart OOD here??

0 commit comments

Comments
 (0)