Skip to content

Commit 272c12f

Browse files
authored
Merge pull request #27 from stackhpc/prod2312-merger
Merge Stackhpc work
2 parents bc96d78 + 86344de commit 272c12f

File tree

51 files changed

+805
-554
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+805
-554
lines changed

ansible/fatimage.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,16 @@
4141
tasks_from: client-install.yml
4242
when: "'freeipa_client' in group_names"
4343

44-
# - import_playbook: filesystems.yml
45-
- name: nfs
44+
# - import_playbook: filesystems.yml:
45+
- name: Install nfs packages
4646
dnf:
4747
name: nfs-utils
48+
when: "'nfs' in group_names"
49+
- name: Install Manila client packages
50+
include_role:
51+
name: stackhpc.os-manila-mount
52+
tasks_from: install.yml
53+
when: "'manila' in group_names"
4854

4955
- import_playbook: extras.yml
5056

ansible/filesystems.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,11 @@
1616
tasks:
1717
- include_role:
1818
name: stackhpc.nfs
19+
20+
- name: Setup Manila share mounts
21+
hosts: manila
22+
become: true
23+
tags: manila
24+
tasks:
25+
- include_role:
26+
name: stackhpc.os-manila-mount

ansible/roles/cluster_infra/tasks/main.yml

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,25 +38,6 @@
3838
}
3939
dest: "{{ terraform_project_path }}/backend.tf"
4040

41-
# Patching in this appliance is implemented as a switch to a new base image
42-
# So unless explicitly patching, we want to use the same image as last time
43-
# To do this, we query the previous Terraform state before updating
44-
- block:
45-
- name: Get previous Terraform state
46-
stackhpc.terraform.terraform_output:
47-
binary_path: "{{ terraform_binary_path }}"
48-
project_path: "{{ terraform_project_path }}"
49-
backend_config: "{{ terraform_backend_config }}"
50-
register: cluster_infra_terraform_output
51-
52-
- name: Extract image from Terraform state
53-
set_fact:
54-
cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}"
55-
when: '"cluster_image" in cluster_infra_terraform_output.outputs'
56-
when:
57-
- terraform_state == "present"
58-
- cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages
59-
6041
- name: Template Terraform files into project directory
6142
template:
6243
src: >-

ansible/roles/cluster_infra/templates/outputs.tf.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,5 @@ output "cluster_nodes" {
4949

5050
output "cluster_image" {
5151
description = "The id of the image used to build the cluster nodes"
52-
value = "{{ cluster_previous_image | default(cluster_image) }}"
52+
value = "{{ cluster_image }}"
5353
}

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,22 @@ resource "openstack_blockstorage_volume_v3" "state" {
7979
size = "{{ state_volume_size }}"
8080
}
8181

82+
{% if cluster_home_manila_share | bool %}
83+
resource "openstack_sharedfilesystem_share_v2" "home" {
84+
name = "{{ cluster_name }}-home"
85+
description = "Home for cluster"
86+
share_proto = "CEPHFS"
87+
# Share type: a quoted string when cluster_home_manila_share_type is defined, otherwise a bare Terraform null.
share_type = {{ ('"' ~ cluster_home_manila_share_type ~ '"') if cluster_home_manila_share_type is defined else 'null' }}
88+
size = "{{ home_volume_size }}"
89+
}
90+
91+
resource "openstack_sharedfilesystem_share_access_v2" "home" {
92+
share_id = openstack_sharedfilesystem_share_v2.home.id
93+
access_type = "cephx"
94+
access_to = "cluster_{{ cluster_id }}"
95+
access_level = "rw"
96+
}
97+
{% else %}
8298
resource "openstack_blockstorage_volume_v3" "home" {
8399
name = "{{ cluster_name }}-home"
84100
description = "Home for control node"
@@ -89,6 +105,7 @@ resource "openstack_blockstorage_volume_v3" "home" {
89105
{% endif %}
90106
{% endif %}
91107
}
108+
{% endif %}
92109

93110
######
94111
###### Cluster network
@@ -249,7 +266,7 @@ resource "openstack_compute_keypair_v2" "cluster_keypair" {
249266

250267
resource "openstack_compute_instance_v2" "login" {
251268
name = "{{ cluster_name }}-login-0"
252-
image_id = "{{ cluster_previous_image | default(cluster_image) }}"
269+
image_id = "{{ cluster_image }}"
253270
{% if login_flavor_name is defined %}
254271
flavor_name = "{{ login_flavor_name }}"
255272
{% else %}
@@ -262,7 +279,7 @@ resource "openstack_compute_instance_v2" "login" {
262279

263280
# root device:
264281
block_device {
265-
uuid = "{{ cluster_previous_image | default(cluster_image) }}"
282+
uuid = "{{ cluster_image }}"
266283
source_type = "image"
267284
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
268285
volume_size = {{ cluster_root_volume_size | default("20") }}
@@ -298,7 +315,7 @@ resource "openstack_compute_instance_v2" "login" {
298315

299316
resource "openstack_compute_instance_v2" "control" {
300317
name = "{{ cluster_name }}-control-0"
301-
image_id = "{{ cluster_previous_image | default(cluster_image) }}"
318+
image_id = "{{ cluster_image }}"
302319
{% if control_flavor_name is defined %}
303320
flavor_name = "{{ control_flavor_name }}"
304321
{% else %}
@@ -311,7 +328,7 @@ resource "openstack_compute_instance_v2" "control" {
311328

312329
# root device:
313330
block_device {
314-
uuid = "{{ cluster_previous_image | default(cluster_image) }}"
331+
uuid = "{{ cluster_image }}"
315332
source_type = "image"
316333
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
317334
volume_size = {{ cluster_root_volume_size | default("20") }}
@@ -334,13 +351,15 @@ resource "openstack_compute_instance_v2" "control" {
334351
uuid = openstack_blockstorage_volume_v3.state.id
335352
}
336353

354+
{% if not cluster_home_manila_share | bool %}
337355
# home volume:
338356
block_device {
339357
destination_type = "volume"
340358
source_type = "volume"
341359
boot_index = -1
342360
uuid = openstack_blockstorage_volume_v3.home.id
343361
}
362+
{% endif %}
344363

345364
# Use cloud-init to a) inject SSH keys b) configure volumes
346365
user_data = <<-EOF
@@ -359,12 +378,14 @@ resource "openstack_compute_instance_v2" "control" {
359378
- {{ ssh_key }}
360379
{%- endfor %}
361380
bootcmd:
362-
%{for volume in [openstack_blockstorage_volume_v3.state, openstack_blockstorage_volume_v3.home]}
381+
%{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]}
363382
- BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV
364383
%{endfor}
365384
mounts:
366385
- [LABEL=state, {{ appliances_state_dir }}, auto]
386+
{% if not cluster_home_manila_share | bool %}
367387
- [LABEL=home, /exports/home, auto]
388+
{% endif %}
368389
EOF
369390
}
370391

@@ -373,7 +394,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
373394
count = {{ partition.count }}
374395

375396
name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
376-
image_id = "{{ cluster_previous_image | default(cluster_image) }}"
397+
image_id = "{{ cluster_image }}"
377398
{% if 'flavor_name' in partition %}
378399
flavor_name = "{{ partition.flavor_name }}"
379400
{% else %}
@@ -386,7 +407,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
386407

387408
# root device:
388409
block_device {
389-
uuid = "{{ cluster_previous_image | default(cluster_image) }}"
410+
uuid = "{{ cluster_image }}"
390411
source_type = "image"
391412
{% if cluster_use_root_volumes is defined and cluster_use_root_volumes %}
392413
volume_size = {{ cluster_root_volume_size | default("20") }}

environments/.caas/inventory/extra_groups

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,7 @@ cluster
77
[zenith:children]
88
grafana
99
openondemand
10+
11+
[manila:children]
12+
login
13+
compute

environments/.caas/inventory/group_vars/all/cluster.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@ openondemand_servername_default: "{{ hostvars[groups['openstack'][0]].cluster_ga
2020
openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}"
2121

2222
appliances_state_dir: /var/lib/state
23+
24+
# Defaults for caas-provided extravars:
25+
cluster_project_manila_share: false
26+
cluster_home_manila_share: "{{ cluster_project_manila_share }}"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
caas_manila_home:
2+
share_name: "{{ cluster_name }}-home"
3+
mount_path: /home
4+
mount_user: root
5+
mount_group: root
6+
mount_mode: u=rwX,go=rX
7+
8+
cluster_project_manila_share_name: azimuth-project-share
9+
caas_manila_project:
10+
share_name: "{{ cluster_project_manila_share_name }}"
11+
mount_path: /project
12+
mount_user: root
13+
mount_group: root
14+
mount_mode: ugo=rwX
15+
16+
os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}"
Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
nfs_server: "{{ nfs_server_default }}"
22

3-
nfs_configurations:
4-
- comment: Export /exports/home from Slurm control node as /home
5-
nfs_enable:
6-
server: "{{ inventory_hostname in groups['control'] }}"
7-
clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
8-
nfs_export: "/exports/home" # assumes skeleton TF is being used
9-
nfs_client_mnt_point: "/home"
3+
caas_nfs_ood_state:
104
- comment: Export /var/lib/state from Slurm control node to OOD
115
nfs_enable:
126
server: "{{ inventory_hostname in groups['control'] }}"
137
clients: "{{ inventory_hostname in groups['openondemand'] }}"
148
nfs_export: "{{ appliances_state_dir }}"
159
nfs_client_mnt_point: "{{ appliances_state_dir }}"
1610
nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service"
11+
12+
caas_nfs_home:
13+
- comment: Export /exports/home from Slurm control node as /home
14+
nfs_enable:
15+
server: "{{ inventory_hostname in groups['control'] }}"
16+
clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
17+
nfs_export: "/exports/home" # assumes skeleton TF is being used
18+
nfs_client_mnt_point: "/home"
19+
20+
nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}"
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
---
22

3-
# Set Prometheus storage retention size
4-
prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB"
3+
# We reserve 10GB of the state volume for cluster state, the rest is for metrics
4+
prometheus_storage_retention_size: "{{ state_volume_size - 10 }}GB"

environments/.caas/inventory/group_vars/openstack.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@ terraform_project_path: "{{ playbook_dir }}/terraform"
1616
terraform_state: "{{ cluster_state | default('present') }}"
1717
cluster_ssh_user: rocky
1818

19-
# Set the size of the state volume to metrics_db_maximum_size + 10
20-
state_volume_size: "{{ metrics_db_maximum_size + 10 }}"
21-
2219
# Provision a single "standard" compute partition using the supplied
2320
# node count and flavor
2421
openhpc_slurm_partitions:

environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,33 @@ description: >-
55
OnDemand web interface, and custom monitoring.
66
logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png
77

8+
requires_ssh_key: true
9+
810
parameters:
911
- name: cluster_floating_ip
1012
label: External IP
1113
description: The external IP to use for the login node.
1214
kind: cloud.ip
1315
immutable: true
1416

17+
- name: login_flavor
18+
label: Login node size
19+
description: The size to use for the login node.
20+
kind: cloud.size
21+
immutable: true
22+
options:
23+
min_ram: 2048
24+
min_disk: 20
25+
26+
- name: control_flavor
27+
label: Control node size
28+
description: The size to use for the control node.
29+
kind: cloud.size
30+
immutable: true
31+
options:
32+
min_ram: 2048
33+
min_disk: 20
34+
1535
- name: compute_count
1636
label: Compute node count
1737
description: The number of compute nodes in the cluster.
@@ -23,16 +43,17 @@ parameters:
2343
- name: compute_flavor
2444
label: Compute node size
2545
description: The size to use for the compute node.
26-
kind: "cloud.size"
46+
kind: cloud.size
2747
immutable: true
2848
options:
49+
count_parameter: compute_count
2950
min_ram: 2048
3051
min_disk: 20
3152

3253
- name: home_volume_size
3354
label: Home volume size (GB)
34-
description: The size of the cloud volume to use for home directories
35-
kind: integer
55+
description: The size of the cloud volume to use for home directories.
56+
kind: cloud.volume_size
3657
immutable: true
3758
options:
3859
min: 10
@@ -51,19 +72,20 @@ parameters:
5172
options:
5273
checkboxLabel: Put home directories on high-performance storage?
5374

54-
- name: metrics_db_maximum_size
55-
label: Metrics database size (GB)
75+
- name: state_volume_size
76+
label: State volume size (GB)
5677
description: |
78+
The size of the state volume, used to hold and persist important files and data. Of
79+
this volume, 10GB is set aside for cluster state and the remaining space is used
80+
to store cluster metrics.
81+
5782
The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be
58-
discarded to ensure that the database does not grow larger than this size.
59-
60-
**A cloud volume of this size +10GB will be created to hold and persist the metrics
61-
database and important Slurm files.**
62-
kind: integer
83+
discarded to ensure that the database does not grow larger than this volume.
84+
kind: cloud.volume_size
6385
immutable: true
6486
options:
65-
min: 10
66-
default: 10
87+
min: 20
88+
default: 20
6789

6890
- name: cluster_run_validation
6991
label: Post-configuration validation
@@ -107,6 +129,15 @@ usage_template: |-
107129
108130
SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`.
109131
132+
Note that home directories are deleted when the platform is destroyed.
133+
134+
If configured by the administrator, a project filesystem may be mounted at `/project`.
135+
Content stored there will be available to all such configured workstations and Slurm
136+
clusters, and will persist after deletion of workstations/clusters.
137+
138+
Other parts of the filesystem may be affected during a patch operation, including any
139+
packages that have been installed using `dnf`.
140+
110141
services:
111142
- name: ood
112143
label: Open OnDemand

0 commit comments

Comments
 (0)