Enable build of environment-specific control images #160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 21 commits, May 11, 2022
3 changes: 2 additions & 1 deletion .github/workflows/smslabs.yml
@@ -77,7 +77,7 @@ jobs:
TF_VAR_cluster_name: ci${{ github.run_id }}
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}

- name: Directly configure cluster and build compute + login images
- name: Directly configure cluster and build compute, login and control images
# see pre-hook for the image build
run: |
. venv/bin/activate
@@ -123,6 +123,7 @@ jobs:
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Test reimage of login and compute nodes
# TODO: test control node reimage
run: |
. venv/bin/activate
. environments/smslabs/activate
17 changes: 3 additions & 14 deletions ansible/ci/delete_images.yml
@@ -2,22 +2,11 @@
become: no
gather_facts: no
tasks:
- name: Read packer build manifest
set_fact:
manifest: "{{ lookup('file', manifest_path) | from_json }}"
vars:
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
delegate_to: localhost

- name: Get latest image builds
set_fact:
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
- import_tasks: get_image_ids.yml

- name: Delete images
shell:
cmd: |
openstack image delete {{ login_build.artifact_id }}
openstack image delete {{ compute_build.artifact_id }}
openstack image delete {{ item.artifact_id }}
delegate_to: localhost

loop: "{{ manifest['builds'] }}"
12 changes: 12 additions & 0 deletions ansible/ci/get_image_ids.yml
@@ -0,0 +1,12 @@
- name: Read packer build manifest
set_fact:
manifest: "{{ lookup('file', manifest_path) | from_json }}"
vars:
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
delegate_to: localhost

- name: Get latest image builds
set_fact:
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
control_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'control'}) | last }}"
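
These lookups assume the layout written by Packer's `manifest` post-processor: one entry per build under `builds`, each tagged with the `custom_data.source` value set in `openstack.pkr.hcl`. A rough sketch of the structure being parsed (rendered as YAML for readability; the real file is JSON and the IDs are made up):

```yaml
# packer/packer-manifest.json, approximately (illustrative values only)
builds:
  - name: openhpc
    builder_type: openstack
    artifact_id: 0c5e63f2-...      # OpenStack image UUID of the built image
    custom_data:
      source: login                # set per-build in openstack.pkr.hcl
  - name: openhpc
    builder_type: openstack
    artifact_id: 7a1d90b4-...
    custom_data:
      source: compute
  - name: openhpc
    builder_type: openstack
    artifact_id: 3f8c2e7d-...
    custom_data:
      source: control
last_run_uuid: ...
```

As the manifest file can accumulate entries across runs, the `selectattr(...) | last` chain presumably selects the most recent build for each source.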
68 changes: 55 additions & 13 deletions ansible/ci/test_reimage.yml
@@ -1,18 +1,18 @@
- hosts: login:!builder
- hosts: all:!builder
become: no
gather_facts: no
tags:
- reimage_login
- reimage_compute
- reimage_control
tasks:
- name: Read packer build manifest
set_fact:
manifest: "{{ lookup('file', manifest_path) | from_json }}"
vars:
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
delegate_to: localhost

- name: Get latest image builds
set_fact:
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"
- import_tasks: get_image_ids.yml

- hosts: login:!builder
become: no
gather_facts: no
tags: reimage_login
tasks:
- name: Reimage login node via openstack
shell:
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
@@ -34,7 +34,12 @@

- name: Check slurm up after reimaging login node
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"


- hosts: login:!builder
become: no
gather_facts: no
tags: reimage_compute
tasks:
# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
- name: Request compute node rebuild via Slurm
shell:
@@ -55,6 +60,7 @@
- hosts: compute:!builder
become: no
gather_facts: no
tags: reimage_compute
tasks:
- name: Wait for compute connection
wait_for_connection:
@@ -63,3 +69,39 @@
- name: Check slurm up after reimaging compute nodes
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
run_once: true

- hosts: control:!builder
become: no
gather_facts: no
tags: reimage_control
tasks:
- name: Reimage control node via openstack
shell:
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ control_build.artifact_id }}"
delegate_to: localhost

- name: Check control node rebuild completed
shell:
cmd: openstack server show {{ inventory_hostname }} --format value -c image
register: openstack_control
delegate_to: localhost
retries: 5
delay: 30
until: control_build.artifact_id in openstack_control.stdout
changed_when: false

- name: Wait for control connection
wait_for_connection:
timeout: 800

- name: Run slurm playbook again to add partition info
import_playbook: ../slurm.yml
tags: reimage_control

- hosts: control:!builder
become: no
gather_facts: no
tags: reimage_control
tasks:
- name: Check slurm up after reimaging control node
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
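
The reimage checks above all import the environment's `check_slurm.yml` hook, which is not part of this diff. A minimal sketch of what such a hook might contain, assuming it simply polls `sinfo` until nodes report as idle (the field list and retry counts are guesses, not the actual hook):

```yaml
# hypothetical hooks/check_slurm.yml
- name: Check sinfo reports nodes as idle
  shell: 'sinfo --noheader --format="%N %P %a %l %D %t"'
  register: sinfo_out
  changed_when: false
  retries: 10
  delay: 5
  until: "'idle' in sinfo_out.stdout"

- name: Fail if any node is down or drained
  assert:
    that:
      - "'down' not in sinfo_out.stdout"
      - "'drain' not in sinfo_out.stdout"
    fail_msg: "Unexpected sinfo output: {{ sinfo_out.stdout }}"
```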
3 changes: 2 additions & 1 deletion ansible/roles/grafana-dashboards/tasks/main.yml
@@ -120,4 +120,5 @@
state: present
overwrite: true
#no_log: true
with_items: "{{ grafana_dashboards }}"
with_items: "{{ grafana_dashboards }}"
when: grafana_state | default('started') != 'stopped'
3 changes: 3 additions & 0 deletions environments/common/inventory/group_vars/builder/defaults.yml
@@ -10,3 +10,6 @@ block_devices_partition_state: skip
block_devices_filesystem_state: skip
block_devices_mount_state: present
basic_users_manage_homedir: false
grafana_state: stopped # as it tries to listen on the "real" grafana node
grafana_datasources: [] # as grafana won't be up - NB also dashboards are downloaded but not imported
block_devices_configurations: [] # as volumes will not be attached to Packer build VMs
38 changes: 24 additions & 14 deletions packer/README.md
@@ -1,12 +1,12 @@
# Packer-based image build

This workflow uses Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build compute and login node images. These images can be used during cluster creation or to update an existing cluster. Building images reduces the number of package downloads when deploying a large cluster, and ensures that nodes can be recreated even if packages have changed in repos (e.g. due to CentOS or OpenHPC updates).
This workflow uses Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. These images can be used during cluster creation or to update an existing cluster. Building images reduces the number of package downloads when deploying a large cluster, and ensures that nodes can be recreated even if packages are changed in repositories (e.g. due to Rocky Linux or OpenHPC updates).

Packer creates OpenStack VMs and configures them by running `ansible/site.yml`, as for direct configuration. However (by default) in Packer builds a `yum update *` step is run, which is not the default when running ansible directly to avoid modifying existing nodes. Packer will upload the resulting images to OpenStack with a name including a timestamp.
Packer creates OpenStack VMs and configures them by running `ansible/site.yml` in the same way as for direct configuration of instances using Ansible. However (by default) in Packer builds a `yum update *` step is run. This is not the default for direct configuration, to avoid modifying existing nodes. Packer will upload the resulting images to OpenStack with a name which includes a timestamp.

As configuring slurm deamons require the control hostname (as may other features such as NFS mounts), building login and control images requires that the control node is deployed, although it does not need to be configured. Note that control node images cannot [currently](https://github.com/stackhpc/ansible-slurm-appliance/issues/133) be created.
Building images is likely to require Ansible host/group variables to be set in inventory to provide required configuration information. This may (depending on the inventory generation approach) require nodes to be deployed before building images. See developer notes below for more information.

Steps:
# Build Process

- Create an application credential with sufficient authorisation to upload images (this may or may not be the `member` role, depending on your OpenStack configuration).
- Create a file `environments/<environment>/builder.pkrvars.hcl` containing at a minimum e.g.:
@@ -20,7 +20,7 @@ Steps:

The network(s) used for the Packer VMs must provide for outbound internet access but do not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). These items are configured but not enabled in the Packer VMs.

For additional options (e.g. non-default private key locations) see the variable descriptions in `./openstack.pkr.hcl`.
For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`.

- Activate the venv and the relevant environment.
- Ensure you have generated passwords using:
@@ -29,23 +29,33 @@

- Ensure you have the private part of the keypair `ssh_keypair_name` at `~/.ssh/id_rsa` (or set variable `ssh_private_key_file` in `builder.pkrvars.hcl`).

- Ensure a control node is deployed, following the main `README.md`. Note variable `openhpc_slurm_partitions` ([docs](https://github.com/stackhpc/ansible-role-openhpc/#slurmconf)) must define a (non-empty) partition configuration, but this partition configuration does not actually affect the compute/login node images so e.g. a smaller cluster may be deployed for development and image build.

- Build images using the variable definition file:

cd packer
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl

Note the builder VMs are added to the `builder` group to differentiate them from "real" nodes - see developer notes below.

This will build images for the `compute` and `login` ansible groups. To add additional builds add a new `source` in `openstack.pkr.hcl`.
This will build images for the `compute`, `login` and `control` ansible groups. To add additional builds add a new `source` in `openstack.pkr.hcl`.

To build only specific images use e.g. `-only openstack.login`.

Instances using built compute and login images should immediately join the cluster, as long as they are in the Slurm configuration. If reimaging existing nodes, consider doing this via Slurm - see [stackhpc.slurm_openstack_tools.rebuild/README.md](../ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md).

Instances using built control images will require re-running the `ansible/site.yml` playbook on the entire cluster, as the following aspects cannot be configured inside the image:
- Slurm configuration (the "slurm.conf" file)
- Grafana dashboard import (assuming default use of control node for Grafana)
- Prometheus scrape configuration (ditto)

# Notes for developers

The Packer build VMs are added to both the `builder` group and the `login` or `compute` groups as appropriate. The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the VM where the real cluster may not be contactable (depending on the build network used). Currently this means:
- Enabling but not starting `slurmd`.
- Setting NFS mounts to `present` but not `mounted`
The Packer build VMs are added to both the `builder` group and the appropriate `login`, `compute` or `control` group. The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started.

Note that hostnames in the Packer VMs are not the same as the equivalent "real" hosts. Therefore variables required inside a Packer VM must be defined as group vars, not hostvars.

Note that in this appliance the munge key is pre-created in the environment's "all" group vars, so this aspect needs no special handling.
Ansible may need to proxy to compute nodes. If the Packer build should not use the same proxy to connect to the builder VMs, note that proxy configuration should not be added to the `all` group.

Ansible may need to proxy to the compute nodes. If the Packer build should not use the same proxy to connect to the builder VMs, proxy configuration should not be added to the `all` group.
When using appliance defaults and an environment with an `inventory/groups` file matching `environments/common/layouts/everything` (as used by cookiecutter for new environment creation), the following inventory variables must be defined when running Packer builds (see the sketch after this list):
- `openhpc_cluster_name`
- `openondemand_servername`
- `inventory_hostname` for a host in the `control` group (provides `openhpc_slurm_control_host` and `nfs_server`)
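
A minimal sketch of how these could be supplied as inventory group vars (the file path and values are purely illustrative, not part of this PR):

```yaml
# e.g. environments/<environment>/inventory/group_vars/all/overrides.yml (hypothetical)
openhpc_cluster_name: testohpc
openondemand_servername: ondemand.example.org
# A host must also exist in the `control` group, as its inventory_hostname
# provides openhpc_slurm_control_host and nfs_server.
```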
3 changes: 3 additions & 0 deletions packer/extra_vars.yml
@@ -0,0 +1,3 @@
# Used to override anything defined in a concrete environment

openhpc_slurm_partitions: [] # as no compute nodes will be in play, but partition definition might exist in inventory
13 changes: 6 additions & 7 deletions packer/openstack.pkr.hcl
@@ -74,7 +74,7 @@ source "openstack" "openhpc" {
ssh_bastion_username = "${var.ssh_bastion_username}"
ssh_bastion_private_key_file = "${var.ssh_bastion_private_key_file}"
security_groups = "${var.security_groups}"
image_name = "ohpc-${source.name}-${local.timestamp}.qcow2"
image_name = "ohpc-${source.name}-${local.timestamp}" # also provides a unique legal instance hostname (in case of parallel packer builds)
image_visibility = "${var.image_visibility}"
}

@@ -88,17 +88,16 @@ build {
name = "login"
}

source "source.openstack.openhpc" {
name = "control"
}

provisioner "ansible" {
playbook_file = "${var.repo_root}/ansible/site.yml"
host_alias = "packer"
groups = concat(["builder"], split("-", "${source.name}"))
keep_inventory_file = true # for debugging
use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting
# TODO: use completely separate inventory, which just shares common? This will ensure
# we don't accidently run anything via delegate_to.
extra_arguments = ["--limit", "builder", "-i", "./ansible-inventory.sh", "-vv"]
# TODO: Support vault password
#ansible_env_vars = ["ANSIBLE_VAULT_PASSWORD_FILE=/home/stack/.kayobe-vault-pass"]
extra_arguments = ["--limit", "builder", "-i", "./ansible-inventory.sh", "-vv", "-e", "@extra_vars.yml"]
}

post-processor "manifest" {
7 changes: 4 additions & 3 deletions requirements.yml
@@ -3,7 +3,7 @@ roles:
- src: stackhpc.nfs
version: v21.2.1
- src: https://github.com/stackhpc/ansible-role-openhpc.git
version: v0.12.0
version: v0.13.0
name: stackhpc.openhpc
- src: https://github.com/stackhpc/ansible-node-exporter.git
version: feature/no-install
@@ -15,8 +15,9 @@
name: cloudalchemy.prometheus
- src: cloudalchemy.alertmanager
version: 0.19.1
- src: cloudalchemy.grafana
version: 0.18.0
- src: https://github.com/stackhpc/ansible-grafana.git
name: cloudalchemy.grafana
version: service-state
- src: geerlingguy.mysql
version: 3.3.2
- src: jriguera.configdrive