Skip to content

Commit c7722c1

Browse files
authored
Merge pull request #518 from stackhpc/feat/compute-init-cookiecutter
Support and test "re-imageable" compute nodes via compute node metadata
2 parents 2cac614 + 9897f29 commit c7722c1

File tree

12 files changed

+52
-144
lines changed

12 files changed

+52
-144
lines changed

.github/workflows/stackhpc.yml

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -170,33 +170,21 @@ jobs:
170170
env:
171171
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
172172

173-
# - name: Build environment-specific compute image
174-
# id: packer_build
175-
# run: |
176-
# . venv/bin/activate
177-
# . environments/.stackhpc/activate
178-
# cd packer/
179-
# packer init
180-
# PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
181-
# ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
182-
183-
# - name: Test reimage of compute nodes to new environment-specific image (via slurm)
184-
# run: |
185-
# . venv/bin/activate
186-
# . environments/.stackhpc/activate
187-
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
188-
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
189-
# ansible-playbook -v ansible/ci/check_slurm.yml
190-
191173
- name: Test reimage of login and control nodes (via rebuild adhoc)
192174
run: |
193175
. venv/bin/activate
194176
. environments/.stackhpc/activate
195177
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
196-
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
197178
ansible-playbook -v ansible/site.yml
198179
ansible-playbook -v ansible/ci/check_slurm.yml
199180
181+
- name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
182+
run: |
183+
. venv/bin/activate
184+
. environments/.stackhpc/activate
185+
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
186+
ansible-playbook -v ansible/ci/check_slurm.yml
187+
200188
- name: Check sacct state survived reimage
201189
run: |
202190
. venv/bin/activate

ansible/ci/check_slurm.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
77
register: sinfo
88
changed_when: false
9-
until: "'boot' not in sinfo.stdout_lines"
10-
retries: 5
11-
delay: 10
9+
until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout)
10+
retries: 10
11+
delay: 5
1212
- name: Check nodes have expected slurm state
1313
assert:
1414
that: sinfo.stdout_lines == expected_sinfo

ansible/extras.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
# NB: has to be after eeesi and os-manila-mount
4545
tags: compute_init
4646
become: yes
47-
name: Export hostvars
4847
tasks:
4948
- include_role:
5049
name: compute_init

ansible/roles/compute_init/README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ The following roles/groups are currently fully functional:
4242
node and all compute nodes.
4343
- `openhpc`: all functionality
4444

45-
# Development/debugging
45+
The above may be enabled by setting the compute_init_enable property on the
46+
terraform compute variable.
4647

47-
To develop/debug this without actually having to build an image:
48+
# Development/debugging
4849

50+
To develop/debug changes to the compute script without actually having to build
51+
a new image:
4952

5053
1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
5154
additionally configure the control node to export compute hostvars over NFS.
@@ -103,7 +106,7 @@ as in step 3.
103106
available vs the current approach:
104107

105108
```
106-
[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
109+
[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
107110
"grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
108111
"grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
109112
"mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@
66
vars:
77
os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
88
server_node_ip: "{{ os_metadata.meta.control_address }}"
9-
enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}"
10-
enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}"
11-
enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
12-
enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}"
13-
enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}"
14-
enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}"
15-
enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}"
9+
enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}"
10+
enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}"
11+
enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}"
12+
enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}"
13+
enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}"
14+
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
15+
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
1616

1717
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
1818
resolv_conf_nameservers: []

docs/experimental/compute-init.md

Lines changed: 8 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -2,112 +2,17 @@
22

33
See the role README.md
44

5-
# Results/progress
5+
# CI workflow
66

7-
Without any metadata:
7+
The compute node rebuild is tested in CI after the tests for rebuilding the
8+
login and control nodes. The process is as follows:
89

9-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
10-
● ansible-init.service
11-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
12-
Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago
13-
Main PID: 16089 (ansible-init)
14-
Tasks: 8 (limit: 10912)
15-
Memory: 99.5M
16-
CPU: 11.687s
17-
CGroup: /system.slice/ansible-init.service
18-
├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init
19-
├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
20-
├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
21-
├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0"
22-
├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py
23-
├─16363 /usr/bin/mount /mnt/cluster
24-
└─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync
10+
1. Compute nodes are reimaged:
2511

26-
Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1]
27-
Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] **********************
28-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
29-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
30-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
31-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] ***************************************
32-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access>
33-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1]
34-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ******************************************************
35-
Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None
36-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
12+
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
3713

38-
Added metadata via horizon:
14+
2. Ansible-init runs against newly reimaged compute nodes
3915

40-
compute_groups ["compute"]
16+
3. Run sinfo and check nodes have expected slurm state
4117

42-
43-
OK:
44-
45-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
46-
● ansible-init.service
47-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
48-
Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago
49-
Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
50-
Main PID: 16089 (code=exited, status=0/SUCCESS)
51-
CPU: 13.003s
52-
53-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => {
54-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share"
55-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: }
56-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
57-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP *********************************************************************
58-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1
59-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post
60-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done
61-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully
62-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
63-
64-
Now run site.yml, then restart ansible-init again:
65-
66-
67-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
68-
● ansible-init.service
69-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
70-
Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago
71-
Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
72-
Main PID: 18921 (code=exited, status=0/SUCCESS)
73-
CPU: 8.240s
74-
75-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] **********************
76-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
77-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ********************************************************************
78-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
79-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP *********************************************************************
80-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
81-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post
82-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done
83-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully
84-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
85-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/host
86-
hosts hostvars/
87-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
88-
rl9-compute-0/ rl9-compute-1/
89-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
90-
rl9-compute-0/ rl9-compute-1/
91-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/
92-
hostvars.yml
93-
94-
This commit - shows that hostvars have loaded:
95-
96-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
97-
● ansible-init.service
98-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
99-
Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago
100-
Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
101-
Main PID: 27585 (code=exited, status=0/SUCCESS)
102-
CPU: 8.161s
103-
104-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] ****************************************
105-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => {
106-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0"
107-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: }
108-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP *********************************************************************
109-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
110-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post
111-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done
112-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully
113-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
18+
ansible-playbook -v ansible/ci/check_slurm.yml
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250109-1444-ecea8219",
4-
"RL9": "openhpc-RL9-250109-1444-ecea8219"
3+
"RL8": "openhpc-RL8-250114-1627-bccc88b5",
4+
"RL9": "openhpc-RL9-250114-1626-bccc88b5"
55
}
66
}

environments/.stackhpc/terraform/main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ module "cluster" {
8282
standard: { # NB: can't call this default!
8383
nodes: ["compute-0", "compute-1"]
8484
flavor: var.other_node_flavor
85+
compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
8586
}
8687
# Example of how to add another partition:
8788
# extra: {

environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ module "compute" {
2020
root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size)
2121
extra_volumes = lookup(each.value, "extra_volumes", {})
2222

23+
compute_init_enable = lookup(each.value, "compute_init_enable", [])
24+
2325
key_pair = var.key_pair
2426
environment_root = var.environment_root
2527
k3s_token = var.k3s_token

environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,14 @@ resource "openstack_compute_instance_v2" "compute" {
7474
access_network = true
7575
}
7676

77-
metadata = {
78-
environment_root = var.environment_root
79-
k3s_token = var.k3s_token
80-
control_address = var.control_address
81-
}
77+
metadata = merge(
78+
{
79+
environment_root = var.environment_root
80+
k3s_token = var.k3s_token
81+
control_address = var.control_address
82+
},
83+
{for e in var.compute_init_enable: e => true}
84+
)
8285

8386
user_data = <<-EOF
8487
#cloud-config

environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,9 @@ variable "control_address" {
8888
description = "Name/address of control node"
8989
type = string
9090
}
91+
92+
variable "compute_init_enable" {
93+
type = list(string)
94+
description = "Groups to activate for ansible-init compute rebuilds"
95+
default = []
96+
}

environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ variable "compute" {
5252
image_id: Overrides variable cluster_image_id
5353
vnic_type: Overrides variable vnic_type
5454
vnic_profile: Overrides variable vnic_profile
55+
compute_init_enable: Toggles compute-init rebuild (see compute-init role docs)
5556
volume_backed_instances: Overrides variable volume_backed_instances
5657
root_volume_size: Overrides variable root_volume_size
5758
extra_volumes: Mapping defining additional volumes to create and attach
@@ -142,4 +143,4 @@ variable "root_volume_size" {
142143
variable "k3s_token" {
143144
description = "K3s cluster authentication token, set automatically by Ansible"
144145
type = string
145-
}
146+
}

0 commit comments

Comments
 (0)