
Commit 3a53585

Merge pull request #288 from stackhpc/ci/smslabs

Make CI cloud selectable between SMSlabs and Arcus

2 parents cfc57db + 4236f08

File tree

10 files changed: +129 −50 lines

.github/workflows/stackhpc.yml

Lines changed: 16 additions & 14 deletions
@@ -8,27 +8,31 @@ on:
   pull_request:
 jobs:
   openstack:
-    name: openstack-ci-arcus # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
+    name: openstack-ci
     concurrency: ${{ github.ref }} # to branch/PR
     runs-on: ubuntu-20.04
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
       TF_VAR_cluster_name: ci${{ github.run_id }}
+      CI_CLOUD: ${{ vars.CI_CLOUD }}
     steps:
       - uses: actions/checkout@v2

+      - name: Record which cloud CI is running on
+        run: |
+          echo CI_CLOUD: ${{ vars.CI_CLOUD }}
+
       - name: Setup ssh
         run: |
           set -x
           mkdir ~/.ssh
-          echo "${arcus_SSH_KEY}" > ~/.ssh/id_rsa
+          echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
           chmod 0600 ~/.ssh/id_rsa
-        env:
-          arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}
-
+        shell: bash
+
       - name: Add bastion's ssh key to known_hosts
-        run: cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
+        run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
         shell: bash

       - name: Install ansible etc
@@ -44,11 +48,9 @@ jobs:
       - name: Write clouds.yaml
         run: |
           mkdir -p ~/.config/openstack/
-          echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
+          echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
         shell: bash
-        env:
-          arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
-
+
       - name: Setup environment-specific inventory/terraform inputs
         run: |
           . venv/bin/activate
@@ -64,14 +66,14 @@ jobs:
           . venv/bin/activate
           . environments/.stackhpc/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
-          terraform apply -auto-approve
+          terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"

       - name: Delete infrastructure if provisioning failed
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
-          terraform destroy -auto-approve
+          terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
         if: failure() && steps.provision_servers.outcome == 'failure'

       - name: Configure cluster
@@ -143,7 +145,7 @@ jobs:
           # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
           # ansible-playbook -v ansible/ci/check_slurm.yml

-      - name: Test reimage of all nodes (via rebuild adhoc)
+      - name: Test reimage of login and control nodes (via rebuild adhoc)
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
@@ -169,7 +171,7 @@ jobs:
           . venv/bin/activate
           . environments/.stackhpc/activate
           cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
-          terraform destroy -auto-approve
+          terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
         if: ${{ success() || cancelled() }}

       # - name: Delete images
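
The selection mechanism in the workflow above is the CI_CLOUD repository variable used as an index into the secrets context. A minimal sketch of the pattern, assuming secrets named ARCUS_SSH_KEY and SMS_SSH_KEY (and the matching *_CLOUDS_YAML secrets) exist in the repository:

    env:
      CI_CLOUD: ${{ vars.CI_CLOUD }}   # repository variable, e.g. ARCUS or SMS
    steps:
      - name: Setup ssh (sketch)
        run: |
          # format() builds the secret name at runtime, so this resolves to
          # secrets.ARCUS_SSH_KEY when CI_CLOUD=ARCUS and secrets.SMS_SSH_KEY when CI_CLOUD=SMS
          echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
        shell: bash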
Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 |1|BwhEZQPqvZcdf9Phmh2mTPmIivU=|bHi1Nf8dYI8z1C+qsqQFPAty1xA= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQChxwhZggdwj55gNzfDBzah0G8IeTPQjgMZrpboxp2BO4J+o1iZSwDj+2fqyhBGTE43vCJR13uEygz49XIy+t17qBNwHz4fVVR7jdMNymtbZoOsq9oAoBdGEICHrMzQsYZmT9+Wt74ZP2PKOOn+a+f2vg7YdeSy1UhT08iJlbXwCx56fCQnMJMOnZM9MXVLd4NUFN1TeOCIBQHwRiMJyJ7S7CdUKpyUqHOG85peKiPJ07C0RZ/W5HkYKqltwtvPGQd262p5eLC9j3nhOYSG2meRV8yTxYz3lDIPDx0+189CZ5NaxFSPCgqSYA24zavhPVLQqoct7nd7fcEw9JiTs+abZC6GckCONSHDLM+iRtWC/i5u21ZZDLxM9SIqPI96cYFszGeqyZoXxS5qPaIDHbQNAEqJp9ygNXgh9vuBo7E+aWYbFDTG0RuvW02fbmFfZw2/yXIr37+cQX+GPOnkfIRuHE3Hx5eN8C04v+BMrAfK2minawhG3A2ONJs9LI6QoeE=
 |1|whGSPLhKW4xt/7PWOZ1treg3PtA=|F5gwV8j0JYWDzjb6DvHHaqO+sxs= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCpCG881Gt3dr+nuVIC2uGEQkeVwG6WDdS1WcCoxXC7AG+Oi5bfdqtf4IfeLpWmeuEaAaSFH48ODFr76ViygSjU=
-|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ
+|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ
+|1|u3QVAK9R2x7Z3uKNj+0vDEIekl0=|yy09Q0Kw472+J7bjFkmir28x3lE= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV
Lines changed: 11 additions & 2 deletions
@@ -1,3 +1,12 @@
-bastion_user: slurm-app-ci
-bastion_ip: 128.232.222.183
+ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}"
+bastion_config:
+  ARCUS:
+    user: slurm-app-ci
+    ip: 128.232.222.183
+  SMS:
+    user: steveb
+    ip: 185.45.78.150
+# NB: The bastion_{user,ip} variables are used directly in the CI workflow too
+bastion_user: "{{ bastion_config[ci_cloud].user }}"
+bastion_ip: "{{ bastion_config[ci_cloud].ip }}"
 ansible_ssh_common_args: '-o ProxyCommand="ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p"'
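
For illustration, with the repository variable CI_CLOUD set to SMS the lookups above resolve as follows (values taken from the bastion_config mapping; ARCUS selects slurm-app-ci@128.232.222.183 instead):

    # Resolved values when CI_CLOUD=SMS (sketch):
    ci_cloud: SMS
    bastion_user: steveb
    bastion_ip: 185.45.78.150
    ansible_ssh_common_args: '-o ProxyCommand="ssh steveb@185.45.78.150 -W %h:%p"'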
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# rebuilding volume-backed instances requires the image to be specified
+# this is a bit hacky and can go away if/when ansible templates terraform
+rebuild_image: "{{ lookup('file', appliances_environment_root + '/terraform/main.tf') | regex_search('openhpc-[0-9a-z-]*') }}"
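
The regex_search filter pulls the first openhpc-* image name out of the environment's terraform/main.tf, so with the cluster_image default changed later in this PR it yields "openhpc-230503-0944-bf8c3f63". A hypothetical ad-hoc check (an assumed debug task, not part of this PR):

    # With main.tf containing: default = "openhpc-230503-0944-bf8c3f63"
    # the regex_search('openhpc-[0-9a-z-]*') filter returns "openhpc-230503-0944-bf8c3f63"
    - name: Show the image name used for rebuilds (sketch)
      debug:
        var: rebuild_image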
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+cluster_net = "WCDC-iLab-60"
+cluster_subnet = "WCDC-iLab-60"
+vnic_type = "direct"
+control_node_flavor = "vm.ska.cpu.general.quarter"
+other_node_flavor = "vm.ska.cpu.general.small"
+volume_backed_instances = false
+state_volume_device_path = "/dev/sdb"
+home_volume_device_path = "/dev/sdc"
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+cluster_net = "stackhpc-ipv4-geneve"
+cluster_subnet = "stackhpc-ipv4-geneve-subnet"
+vnic_type = "normal"
+control_node_flavor = "general.v1.medium"
+other_node_flavor = "general.v1.tiny"
+volume_backed_instances = true
+state_volume_device_path = "/dev/vdb"
+home_volume_device_path = "/dev/vdc"
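
These two per-cloud variable files pair with the -var-file arguments added to the workflow; assuming they are named after the CI_CLOUD values used elsewhere in this PR (ARCUS.tfvars and SMS.tfvars), the provision step effectively runs, for example:

    # Effective command when CI_CLOUD=SMS (sketch; file names assumed to match the CI_CLOUD values)
    cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
    terraform apply -auto-approve -var-file="SMS.tfvars"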
Lines changed: 30 additions & 15 deletions
@@ -1,3 +1,5 @@
+# This terraform configuration uses the "skeleton" terraform, so that is checked by CI.
+
 variable "environment_root" {
   type = string
   description = "Path to environment root, automatically set by activate script"
@@ -8,45 +10,55 @@ variable "cluster_name" {
   description = "Name for cluster, used as prefix for resources - set by environment var in CI"
 }

-variable "create_nodes" {
-  description = "Whether to create nodes (servers) or just ports and other infra"
-  type = bool # can't use bool as want to pass from command-line
-  default = true
-}
-
 variable "cluster_image" {
   description = "single image for all cluster nodes - a convenience for CI"
   type = string
-  default = "openhpc-230503-0944-bf8c3f63.qcow2" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
+  default = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
   # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2"
   # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
 }

+variable "cluster_net" {}
+
+variable "cluster_subnet" {}
+
+variable "vnic_type" {}
+
+variable "control_node_flavor" {}
+
+variable "other_node_flavor" {}
+
+variable "volume_backed_instances" {}
+
+variable "state_volume_device_path" {}
+
+variable "home_volume_device_path" {}
+
 module "cluster" {
   source = "../../skeleton/{{cookiecutter.environment}}/terraform/"

   cluster_name = var.cluster_name
-  cluster_net = "WCDC-iLab-60"
-  cluster_subnet = "WCDC-iLab-60"
-  vnic_type = "direct"
+  cluster_net = var.cluster_net
+  cluster_subnet = var.cluster_subnet
+  vnic_type = var.vnic_type
   key_pair = "slurm-app-ci"
   control_node = {
-    flavor: "vm.ska.cpu.general.quarter"
+    flavor: var.control_node_flavor
     image: var.cluster_image
   }
   login_nodes = {
     login-0: {
-      flavor: "vm.ska.cpu.general.small"
+      flavor: var.other_node_flavor
       image: var.cluster_image
     }
   }
   compute_types = {
     small: {
-      flavor: "vm.ska.cpu.general.small"
+      flavor: var.other_node_flavor
       image: var.cluster_image
     }
     extra: {
-      flavor: "vm.ska.cpu.general.small"
+      flavor: var.other_node_flavor
       image: var.cluster_image
     }
   }
@@ -56,10 +68,13 @@ module "cluster" {
     compute-2: "extra"
     compute-3: "extra"
   }
-  create_nodes = var.create_nodes
+  volume_backed_instances = var.volume_backed_instances

   environment_root = var.environment_root
   # Can reduce volume size a lot for short-lived CI clusters:
   state_volume_size = 10
   home_volume_size = 20
+
+  state_volume_device_path = var.state_volume_device_path
+  home_volume_device_path = var.home_volume_device_path
 }

environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf

Lines changed: 41 additions & 4 deletions
@@ -7,6 +7,18 @@ data "openstack_images_image_v2" "control" {
   name = var.control_node.image
 }

+data "openstack_images_image_v2" "login" {
+  for_each = var.login_nodes
+
+  name = each.value.image
+}
+
+data "openstack_images_image_v2" "compute" {
+  for_each = var.compute_nodes
+
+  name = lookup(var.compute_images, each.key, var.compute_types[each.value].image)
+}
+
 resource "openstack_networking_port_v2" "login" {

   for_each = toset(keys(var.login_nodes))
@@ -68,7 +80,7 @@ resource "openstack_networking_port_v2" "compute" {

 resource "openstack_compute_instance_v2" "control" {

-  for_each = var.create_nodes ? toset(["control"]) : toset([])
+  for_each = toset(["control"])

   name = "${var.cluster_name}-${each.key}"
   image_name = data.openstack_images_image_v2.control.name
@@ -79,7 +91,8 @@ resource "openstack_compute_instance_v2" "control" {
   block_device {
     uuid = data.openstack_images_image_v2.control.id
     source_type = "image"
-    destination_type = "local"
+    destination_type = var.volume_backed_instances ? "volume" : "local"
+    volume_size = var.volume_backed_instances ? var.root_volume_size : null
     boot_index = 0
     delete_on_termination = true
   }
@@ -136,12 +149,24 @@ resource "openstack_compute_instance_v2" "control" {

 resource "openstack_compute_instance_v2" "login" {

-  for_each = var.create_nodes ? var.login_nodes : {}
+  for_each = var.login_nodes

   name = "${var.cluster_name}-${each.key}"
   image_name = each.value.image
   flavor_name = each.value.flavor
   key_pair = var.key_pair
+
+  dynamic "block_device" {
+    for_each = var.volume_backed_instances ? [1]: []
+    content {
+      uuid = data.openstack_images_image_v2.login[each.key].id
+      source_type = "image"
+      destination_type = "volume"
+      volume_size = var.root_volume_size
+      boot_index = 0
+      delete_on_termination = true
+    }
+  }

   network {
     port = openstack_networking_port_v2.login[each.key].id
@@ -162,12 +187,24 @@ resource "openstack_compute_instance_v2" "login" {

 resource "openstack_compute_instance_v2" "compute" {

-  for_each = var.create_nodes ? var.compute_nodes : {}
+  for_each = var.compute_nodes

   name = "${var.cluster_name}-${each.key}"
   image_name = lookup(var.compute_images, each.key, var.compute_types[each.value].image)
   flavor_name = var.compute_types[each.value].flavor
   key_pair = var.key_pair
+
+  dynamic "block_device" {
+    for_each = var.volume_backed_instances ? [1]: []
+    content {
+      uuid = data.openstack_images_image_v2.compute[each.key].id
+      source_type = "image"
+      destination_type = "volume"
+      volume_size = var.root_volume_size
+      boot_index = 0
+      delete_on_termination = true
+    }
+  }

   network {
     port = openstack_networking_port_v2.compute[each.key].id
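
The root-disk handling for login and compute nodes above uses a dynamic block with a conditional for_each, so the block_device block is generated exactly once when volume_backed_instances is true and not at all when it is false. A standalone sketch of that pattern with hypothetical names and values (the real resources also wire in ports, images and metadata):

    variable "volume_backed_instances" {
      type    = bool
      default = false
    }

    resource "openstack_compute_instance_v2" "example" {
      name        = "example"
      flavor_name = "general.v1.tiny"               # hypothetical flavor
      image_name  = "openhpc-230503-0944-bf8c3f63"  # hypothetical image
      key_pair    = "slurm-app-ci"

      # for_each over a one- or zero-element list: one block_device when volume-backed, none otherwise
      dynamic "block_device" {
        for_each = var.volume_backed_instances ? [1] : []
        content {
          uuid                  = "00000000-0000-0000-0000-000000000000"  # image UUID in practice
          source_type           = "image"
          destination_type      = "volume"
          volume_size           = 40
          boot_index            = 0
          delete_on_termination = true
        }
      }
    }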

environments/skeleton/{{cookiecutter.environment}}/terraform/terraform.tfvars

Lines changed: 0 additions & 10 deletions
This file was deleted.

environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf

Lines changed: 10 additions & 4 deletions
@@ -109,8 +109,14 @@ variable "nonlogin_security_groups" {
   ]
 }

-variable "create_nodes" {
-  description = "Whether to create nodes (servers) or just ports and other infra"
-  type = bool # can't use bool as want to pass from command-line
-  default = true
+variable "volume_backed_instances" {
+  description = "Whether to use volumes for root disks"
+  type = bool
+  default = false
+}
+
+variable "root_volume_size" {
+  description = "Size of volume for root volumes if using volume backed instances, in Gb"
+  type = number
+  default = 40
 }
