Update manila #21

Closed · wants to merge 8 commits
10 changes: 8 additions & 2 deletions ansible/fatimage.yml
@@ -41,10 +41,16 @@
        tasks_from: client-install.yml
      when: "'freeipa_client' in group_names"

-    # - import_playbook: filesystems.yml
-    - name: nfs
+    # - import_playbook: filesystems.yml:
+    - name: Install nfs packages
      dnf:
        name: nfs-utils
      when: "'nfs' in group_names"
+    - name: Install Manila client packages
+      include_role:
+        name: stackhpc.os-manila-mount
+        tasks_from: install.yml
+      when: "'manila' in group_names"

- import_playbook: extras.yml

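Since fatimage.yml runs at image-build time, only the Manila client packages are installed here, via the role's install.yml entrypoint; mounting is left to filesystems.yml at deploy time. A minimal sketch of what an install-only entrypoint might contain, assuming the stackhpc.os-manila-mount role splits its install and mount tasks this way (package names are illustrative, not taken from the role's source):

```yaml
# tasks/install.yml (hypothetical sketch of the role's install entrypoint)
- name: Install CephFS client packages for Manila shares
  ansible.builtin.dnf:
    name:
      - ceph-common           # provides mount.ceph for the CEPHFS share protocol
      - python3-manilaclient  # queries the Manila API for share export locations
    state: present
```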
8 changes: 8 additions & 0 deletions ansible/filesystems.yml
@@ -16,3 +16,11 @@
  tasks:
    - include_role:
        name: stackhpc.nfs

+- name: Setup Manila share mounts
+  hosts: manila
+  become: true
+  tags: manila
+  tasks:
+    - include_role:
+        name: stackhpc.os-manila-mount
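This new play applies the full stackhpc.os-manila-mount role (install plus mount) to hosts in the manila group, and can be run on its own via the manila tag. The role is driven by os_manila_mount_shares, defined for caas deployments in the group_vars further down this diff; a minimal entry looks like this (field names from this PR's manila.yml, values illustrative):

```yaml
os_manila_mount_shares:
  - share_name: my-share       # name of the Manila share to look up
    mount_path: /mnt/my-share  # mount point created on the host
    mount_user: root
    mount_group: root
    mount_mode: u=rwX,go=rX
```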
19 changes: 0 additions & 19 deletions ansible/roles/cluster_infra/tasks/main.yml
@@ -38,25 +38,6 @@
      }
    dest: "{{ terraform_project_path }}/backend.tf"

-# Patching in this appliance is implemented as a switch to a new base image
-# So unless explicitly patching, we want to use the same image as last time
-# To do this, we query the previous Terraform state before updating
-- block:
-    - name: Get previous Terraform state
-      stackhpc.terraform.terraform_output:
-        binary_path: "{{ terraform_binary_path }}"
-        project_path: "{{ terraform_project_path }}"
-        backend_config: "{{ terraform_backend_config }}"
-      register: cluster_infra_terraform_output
-
-    - name: Extract image from Terraform state
-      set_fact:
-        cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}"
-      when: '"cluster_image" in cluster_infra_terraform_output.outputs'
-  when:
-    - terraform_state == "present"
-    - cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages

- name: Template Terraform files into project directory
  template:
    src: >-
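With this block removed, the appliance no longer pins node images to the one recorded in the previous Terraform state: the templates below now always receive the current cluster_image, so an image change in the deployment's extravars is applied (and triggers node rebuilds) on the next run. For example (placeholder value, illustrative only):

```yaml
# caas extravars: the image is now the single source of truth
cluster_image: "00000000-0000-0000-0000-000000000000"  # Glance image ID (placeholder)
```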
2 changes: 1 addition & 1 deletion ansible/roles/cluster_infra/templates/outputs.tf.j2
@@ -49,5 +49,5 @@ output "cluster_nodes" {

output "cluster_image" {
description = "The id of the image used to build the cluster nodes"
value = "{{ cluster_previous_image | default(cluster_image) }}"
value = "{{ cluster_image }}"
}
35 changes: 28 additions & 7 deletions ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -79,6 +79,22 @@ resource "openstack_blockstorage_volume_v3" "state" {
  size = "{{ state_volume_size }}"
}

+{% if cluster_home_manila_share | bool %}
+resource "openstack_sharedfilesystem_share_v2" "home" {
+  name = "{{ cluster_name }}-home"
+  description = "Home for cluster"
+  share_proto = "CEPHFS"
+  share_type = {{ ('"' + cluster_home_manila_share_type + '"') if cluster_home_manila_share_type is defined else 'null' }}
+  size = "{{ home_volume_size }}"
+}
+
+resource "openstack_sharedfilesystem_share_access_v2" "home" {
+  share_id = openstack_sharedfilesystem_share_v2.home.id
+  access_type = "cephx"
+  access_to = "cluster_{{ cluster_id }}"
+  access_level = "rw"
+}
+{% else %}
resource "openstack_blockstorage_volume_v3" "home" {
  name = "{{ cluster_name }}-home"
  description = "Home for control node"
@@ -89,6 +105,7 @@ resource "openstack_blockstorage_volume_v3" "home" {
{% endif %}
{% endif %}
}
+{% endif %}
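When cluster_home_manila_share is true, a CephFS share plus a cephx access rule (keyed to the cluster ID) replaces the home block volume. Enabling this from caas extravars might look like the following; the variable names are from this PR, while the share type name is site-specific and assumed:

```yaml
cluster_home_manila_share: true
cluster_home_manila_share_type: cephfs-type  # assumed Manila share type name
home_volume_size: 100                        # share size in GB
```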

######
###### Cluster network
@@ -249,7 +266,7 @@ resource "openstack_compute_keypair_v2" "cluster_keypair" {

resource "openstack_compute_instance_v2" "login" {
name = "{{ cluster_name }}-login-0"
image_id = "{{ cluster_previous_image | default(cluster_image) }}"
image_id = "{{ cluster_image }}"
{% if login_flavor_name is defined %}
flavor_name = "{{ login_flavor_name }}"
{% else %}
@@ -262,7 +279,7 @@ resource "openstack_compute_instance_v2" "login" {

  # root device:
  block_device {
-    uuid = "{{ cluster_previous_image | default(cluster_image) }}"
+    uuid = "{{ cluster_image }}"
    source_type = "image"
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
    volume_size = {{ cluster_root_volume_size | default("20") }}
@@ -298,7 +315,7 @@ resource "openstack_compute_instance_v2" "login" {

resource "openstack_compute_instance_v2" "control" {
name = "{{ cluster_name }}-control-0"
image_id = "{{ cluster_previous_image | default(cluster_image) }}"
image_id = "{{ cluster_image }}"
{% if control_flavor_name is defined %}
flavor_name = "{{ control_flavor_name }}"
{% else %}
@@ -311,7 +328,7 @@ resource "openstack_compute_instance_v2" "control" {

  # root device:
  block_device {
-    uuid = "{{ cluster_previous_image | default(cluster_image) }}"
+    uuid = "{{ cluster_image }}"
    source_type = "image"
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
    volume_size = {{ cluster_root_volume_size | default("20") }}
@@ -334,13 +351,15 @@ resource "openstack_compute_instance_v2" "control" {
    uuid = openstack_blockstorage_volume_v3.state.id
  }

+{% if not cluster_home_manila_share | bool %}
  # home volume:
  block_device {
    destination_type = "volume"
    source_type = "volume"
    boot_index = -1
    uuid = openstack_blockstorage_volume_v3.home.id
  }
+{% endif %}

  # Use cloud-init to a) inject SSH keys b) configure volumes
  user_data = <<-EOF
@@ -359,12 +378,14 @@ resource "openstack_compute_instance_v2" "control" {
    - {{ ssh_key }}
    {%- endfor %}
    bootcmd:
-    %{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
+    %{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]}
    - BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
    %{endfor}
    mounts:
    - [LABEL=state, {{ appliances_state_dir }}, auto]
+    {% if not cluster_home_manila_share | bool %}
    - [LABEL=home, /exports/home, auto]
+    {% endif %}
  EOF
}
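Two template languages meet in this heredoc: {% ... %} is Jinja2, evaluated when the appliance renders resources.tf.j2, while %{ ... } is Terraform's template syntax, evaluated at apply time. With cluster_home_manila_share enabled, the home volume drops out of both the bootcmd loop and the mounts list, so the rendered cloud-init only formats and mounts the state volume. A sketch of the resulting user_data (the device-lookup command is abbreviated; /var/lib/state is the appliances_state_dir default from this PR's group_vars):

```yaml
#cloud-config
bootcmd:
  # one entry per volume from the Terraform %{for} loop; only state remains
  - BLKDEV=$(readlink -f ...); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L state $BLKDEV
mounts:
  - [LABEL=state, /var/lib/state, auto]
```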

@@ -373,7 +394,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
  count = {{ partition.count }}

  name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
-  image_id = "{{ cluster_previous_image | default(cluster_image) }}"
+  image_id = "{{ cluster_image }}"
{% if 'flavor_name' in partition %}
  flavor_name = "{{ partition.flavor_name }}"
{% else %}
@@ -386,7 +407,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {

  # root device:
  block_device {
-    uuid = "{{ cluster_previous_image | default(cluster_image) }}"
+    uuid = "{{ cluster_image }}"
    source_type = "image"
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
    volume_size = {{ cluster_root_volume_size | default("20") }}
4 changes: 4 additions & 0 deletions environments/.caas/inventory/extra_groups
@@ -7,3 +7,7 @@ cluster
[zenith:children]
grafana
openondemand

+[manila:children]
+login
+compute
4 changes: 4 additions & 0 deletions environments/.caas/inventory/group_vars/all/cluster.yml
@@ -20,3 +20,7 @@ openondemand_servername_default: "{{ hostvars[groups['openstack'][0]].cluster_ga
openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}"

appliances_state_dir: /var/lib/state

+# Defaults for caas-provided extravars:
+cluster_project_manila_share: false
+cluster_home_manila_share: "{{ cluster_project_manila_share }}"
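These defaults chain: turning on the project share also moves home directories onto Manila unless the home flag is overridden explicitly. For example:

```yaml
# extravars for a deployment that wants the shared project space
# but keeps home directories on the NFS-exported block volume:
cluster_project_manila_share: true
cluster_home_manila_share: false  # override the inherited default of true
```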
16 changes: 16 additions & 0 deletions environments/.caas/inventory/group_vars/all/manila.yml
@@ -0,0 +1,16 @@
caas_manila_home:
  share_name: "{{ cluster_name }}-home"
  mount_path: /home
  mount_user: root
  mount_group: root
  mount_mode: u=rwX,go=rX

cluster_project_manila_share_name: azimuth-project-share
caas_manila_project:
  share_name: "{{ cluster_project_manila_share_name }}"
  mount_path: /project
  mount_user: root
  mount_group: root
  mount_mode: ugo=rwX

os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}"
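The mount list is assembled from the two optional entries above. For a deployment with cluster_name set to demo, the home share enabled, and the project share disabled, os_manila_mount_shares evaluates to (illustrative rendering):

```yaml
- share_name: demo-home
  mount_path: /home
  mount_user: root
  mount_group: root
  mount_mode: u=rwX,go=rX
```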
18 changes: 11 additions & 7 deletions environments/.caas/inventory/group_vars/all/nfs.yml
@@ -1,16 +1,20 @@
nfs_server: "{{ nfs_server_default }}"

-nfs_configurations:
-  - comment: Export /exports/home from Slurm control node as /home
-    nfs_enable:
-      server: "{{ inventory_hostname in groups['control'] }}"
-      clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
-    nfs_export: "/exports/home" # assumes skeleton TF is being used
-    nfs_client_mnt_point: "/home"
+caas_nfs_ood_state:
  - comment: Export /var/lib/state from Slurm control node to OOD
    nfs_enable:
      server: "{{ inventory_hostname in groups['control'] }}"
      clients: "{{ inventory_hostname in groups['openondemand'] }}"
    nfs_export: "{{ appliances_state_dir }}"
    nfs_client_mnt_point: "{{ appliances_state_dir }}"
    nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service"

+caas_nfs_home:
+  - comment: Export /exports/home from Slurm control node as /home
+    nfs_enable:
+      server: "{{ inventory_hostname in groups['control'] }}"
+      clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
+    nfs_export: "/exports/home" # assumes skeleton TF is being used
+    nfs_client_mnt_point: "/home"

+nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}"
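The OOD state export is now unconditional, while the home export only applies when home is not on a Manila share. With cluster_home_manila_share: true, nfs_configurations therefore reduces to the state entry; evaluated on the control node it renders roughly as:

```yaml
- comment: Export /var/lib/state from Slurm control node to OOD
  nfs_enable:
    server: true   # this host is in the control group
    clients: false # true only on openondemand hosts
  nfs_export: /var/lib/state
  nfs_client_mnt_point: /var/lib/state
  nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service"
```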
4 changes: 2 additions & 2 deletions environments/.caas/inventory/group_vars/all/prometheus.yml
@@ -1,4 +1,4 @@
---

-# Set Prometheus storage retention size
-prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB"
+# We reserve 10GB of the state volume for cluster state, the rest is for metrics
+prometheus_storage_retention_size: "{{ state_volume_size - 10 }}GB"
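Retention is now derived from the single state-volume parameter rather than a separate metrics size, matching the 10GB state reservation described in the ui-meta change below. For example:

```yaml
# a 50GB state volume leaves 40GB for Prometheus metrics:
state_volume_size: 50
prometheus_storage_retention_size: "40GB"  # rendered from "{{ state_volume_size - 10 }}GB"
```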
3 changes: 0 additions & 3 deletions environments/.caas/inventory/group_vars/openstack.yml
@@ -16,9 +16,6 @@ terraform_project_path: "{{ playbook_dir }}/terraform"
terraform_state: "{{ cluster_state | default('present') }}"
cluster_ssh_user: rocky

-# Set the size of the state volume to metrics_db_maximum_size + 10
-state_volume_size: "{{ metrics_db_maximum_size + 10 }}"

# Provision a single "standard" compute partition using the supplied
# node count and flavor
openhpc_slurm_partitions:
44 changes: 32 additions & 12 deletions environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml
@@ -12,6 +12,24 @@ parameters:
    kind: cloud.ip
    immutable: true

+  - name: login_flavor
+    label: Login node size
+    description: The size to use for the login node.
+    kind: cloud.size
+    immutable: true
+    options:
+      min_ram: 2048
+      min_disk: 20
+
+  - name: control_flavor
+    label: Control node size
+    description: The size to use for the control node.
+    kind: cloud.size
+    immutable: true
+    options:
+      min_ram: 2048
+      min_disk: 20

  - name: compute_count
    label: Compute node count
    description: The number of compute nodes in the cluster.
@@ -23,16 +41,17 @@
  - name: compute_flavor
    label: Compute node size
    description: The size to use for the compute node.
-    kind: "cloud.size"
+    kind: cloud.size
    immutable: true
    options:
+      count_parameter: compute_count
      min_ram: 2048
      min_disk: 20

  - name: home_volume_size
    label: Home volume size (GB)
-    description: The size of the cloud volume to use for home directories
-    kind: integer
+    description: The size of the cloud volume to use for home directories.
+    kind: cloud.volume_size
    immutable: true
    options:
      min: 10
@@ -51,19 +70,20 @@
    options:
      checkboxLabel: Put home directories on high-performance storage?

-  - name: metrics_db_maximum_size
-    label: Metrics database size (GB)
+  - name: state_volume_size
+    label: State volume size (GB)
    description: |
+      The size of the state volume, used to hold and persist important files and data. Of
+      this volume, 10GB is set aside for cluster state and the remaining space is used
+      to store cluster metrics.
+
      The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be
-      discarded to ensure that the database does not grow larger than this size.
-
-      **A cloud volume of this size +10GB will be created to hold and persist the metrics
-      database and important Slurm files.**
-    kind: integer
+      discarded to ensure that the database does not grow larger than this volume.
+    kind: cloud.volume_size
    immutable: true
    options:
-      min: 10
-      default: 10
+      min: 20
+      default: 20

  - name: cluster_run_validation
    label: Post-configuration validation