Skip to content

Update fatimage base to RL8.9 with robust volume mounts #341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 0 additions & 23 deletions ansible/roles/cluster_infra/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,29 +57,6 @@
- terraform_state == "present"
- cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages

- name: Detect volume device prefix from image metadata
block:
- name: Get image metadata from OpenStack API
openstack.cloud.image_info:
image: "{{ cluster_previous_image | default(cluster_image) }}"
register: cluster_image_info
- name: Check only single image found
assert:
that: cluster_image_info.images | length == 1
fail_msg: "Multiple images found for 'cluster_image' {{ cluster_image }}"
- name: Set volume_device_prefix fact
set_fact:
block_device_prefix: >-
{{
'sd' if (cluster_image_info.images | first).hw_scsi_model is defined and
(cluster_image_info.images | first).hw_scsi_model in scsi_models
else 'vd'
}}
# Only run when block_device_prefix isn't set as an extravar
when:
- block_device_prefix is not defined
- cluster_image is defined

- name: Template Terraform files into project directory
template:
src: >-
Expand Down
17 changes: 6 additions & 11 deletions ansible/roles/cluster_infra/templates/resources.tf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -358,18 +358,13 @@ resource "openstack_compute_instance_v2" "control" {
{%- for ssh_key in cluster_deploy_ssh_keys_extra %}
- {{ ssh_key }}
{%- endfor %}
fs_setup:
- label: state
filesystem: ext4
device: /dev/{{ block_device_prefix }}b
partition: auto
- label: home
filesystem: ext4
device: /dev/{{ block_device_prefix }}c
partition: auto
bootcmd:
%{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
%{endfor}
mounts:
- [LABEL=state, /var/lib/state, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"]
- [LABEL=home, /exports/home, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"]
- [LABEL=state, {{ appliances_state_dir }}, auto]
- [LABEL=home, /exports/home, auto]
EOF
}

Expand Down
2 changes: 1 addition & 1 deletion environments/.stackhpc/ARCUS.pkrvars.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ volume_size = 10 # GB
image_disk_format = "qcow2"
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2"
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
security_groups = ["default", "SSH"]
Expand Down
2 changes: 1 addition & 1 deletion environments/.stackhpc/SMS.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
flavor = "general.v1.tiny"
networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # stackhpc-ipv4-geneve
source_image_name = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2"
fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
security_groups = ["default", "SSH"]
Expand Down
2 changes: 0 additions & 2 deletions environments/.stackhpc/terraform/ARCUS.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,3 @@ cluster_subnet = "portal-internal"
vnic_type = "normal"
control_node_flavor = "vm.ska.cpu.general.quarter"
other_node_flavor = "vm.ska.cpu.general.small"
state_volume_device_path = "/dev/sdb"
home_volume_device_path = "/dev/sdc"
2 changes: 0 additions & 2 deletions environments/.stackhpc/terraform/SMS.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,3 @@ cluster_subnet = "stackhpc-ipv4-geneve-subnet"
vnic_type = "normal"
control_node_flavor = "general.v1.medium"
other_node_flavor = "general.v1.tiny"
state_volume_device_path = "/dev/vdb"
home_volume_device_path = "/dev/vdc"
10 changes: 2 additions & 8 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ variable "cluster_name" {
variable "cluster_image" {
description = "single image for all cluster nodes - a convenience for CI"
type = string
default = "openhpc-231206-1648-9d6aa4e4" # https://github.com/stackhpc/ansible-slurm-appliance/pull/340
# default = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2"
default = "openhpc-231208-1207-b69af6e2" # https://github.com/stackhpc/ansible-slurm-appliance/pull/341
# default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
}

variable "cluster_net" {}
Expand All @@ -31,10 +31,6 @@ variable "volume_backed_instances" {
default = false
}

variable "state_volume_device_path" {}

variable "home_volume_device_path" {}

module "cluster" {
source = "../../skeleton/{{cookiecutter.environment}}/terraform/"

Expand Down Expand Up @@ -76,6 +72,4 @@ module "cluster" {
state_volume_size = 10
home_volume_size = 20

state_volume_device_path = var.state_volume_device_path
home_volume_device_path = var.home_volume_device_path
}
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,14 @@ resource "openstack_compute_instance_v2" "control" {
#cloud-config
fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix}

fs_setup:
- label: state
filesystem: ext4
device: ${var.state_volume_device_path}
partition: auto
- label: home
filesystem: ext4
device: ${var.home_volume_device_path}
partition: auto
bootcmd:
%{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
%{endfor}

mounts:
- [LABEL=state, ${var.state_dir}]
- [LABEL=home, /exports/home, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"]
- [LABEL=home, /exports/home]
EOF

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,6 @@ variable "environment_root" {
description = "Path to environment root, automatically set by activate script"
}

variable "state_volume_device_path" {
type = string
description = "Path to block device for state"
default = "/dev/sdb"
}

variable "home_volume_device_path" {
type = string
description = "Path to block device name for home directories"
default = "/dev/sdc"
}

variable "state_dir" {
type = string
description = "Path to state directory on control node"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
resource "openstack_blockstorage_volume_v3" "state" {
name = "${var.cluster_name}-state"
description = "State for control node"
description = "State for control node" # first word used to label filesystem
size = var.state_volume_size
}

resource "openstack_blockstorage_volume_v3" "home" {
name = "${var.cluster_name}-home"
description = "Home for control node"
description = "Home for control node" # first word used to label filesystem
size = var.home_volume_size
}