Skip to content

Make CI cloud selectable between SMSlabs and Arcus #288

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,31 @@ on:
pull_request:
jobs:
openstack:
name: openstack-ci-arcus # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
name: openstack-ci
concurrency: ${{ github.ref }} # to branch/PR
runs-on: ubuntu-20.04
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: ci${{ github.run_id }}
CI_CLOUD: ${{ vars.CI_CLOUD }}
steps:
- uses: actions/checkout@v2

# Print the value of the CI_CLOUD repository/organisation variable into the job log.
- name: Record which cloud CI is running on
run: |
echo CI_CLOUD: ${{ vars.CI_CLOUD }}

- name: Setup ssh
  # The private key reaches the script through an environment variable instead
  # of being interpolated into the script text, so the shell never parses the
  # key's content. Command tracing (set -x) is deliberately not enabled in this
  # step, as it would print the expanded key into the job log.
  run: |
    mkdir ~/.ssh
    echo "${SSH_KEY}" > ~/.ssh/id_rsa
    chmod 0600 ~/.ssh/id_rsa
  env:
    # Secret name is built from the selected cloud: <CI_CLOUD>_SSH_KEY
    SSH_KEY: ${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}
  shell: bash

- name: Add bastion's ssh key to known_hosts
  # Append the host keys kept in the repository to the runner's known_hosts.
  run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
  shell: bash

- name: Install ansible etc
Expand All @@ -44,11 +48,9 @@ jobs:
- name: Write clouds.yaml
  # The credentials file is passed via an environment variable so that
  # characters such as $, backticks or double quotes inside it are written
  # verbatim rather than being interpreted by the shell.
  run: |
    mkdir -p ~/.config/openstack/
    echo "${CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
  shell: bash
  env:
    # Secret name is built from the selected cloud: <CI_CLOUD>_CLOUDS_YAML
    CLOUDS_YAML: ${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}

- name: Setup environment-specific inventory/terraform inputs
run: |
. venv/bin/activate
Expand All @@ -64,14 +66,14 @@ jobs:
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"

- name: Delete infrastructure if provisioning failed
  # Runs only when the provision_servers step itself failed.
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
    # The per-cloud var-file is required: the root module declares variables
    # (e.g. cluster_net) that have no defaults.
    terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
  if: failure() && steps.provision_servers.outcome == 'failure'

- name: Configure cluster
Expand Down Expand Up @@ -143,7 +145,7 @@ jobs:
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
# ansible-playbook -v ansible/ci/check_slurm.yml

- name: Test reimage of all nodes (via rebuild adhoc)
- name: Test reimage of login and control nodes (via rebuild adhoc)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
Expand All @@ -169,7 +171,7 @@ jobs:
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}

# - name: Delete images
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
|1|BwhEZQPqvZcdf9Phmh2mTPmIivU=|bHi1Nf8dYI8z1C+qsqQFPAty1xA= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQChxwhZggdwj55gNzfDBzah0G8IeTPQjgMZrpboxp2BO4J+o1iZSwDj+2fqyhBGTE43vCJR13uEygz49XIy+t17qBNwHz4fVVR7jdMNymtbZoOsq9oAoBdGEICHrMzQsYZmT9+Wt74ZP2PKOOn+a+f2vg7YdeSy1UhT08iJlbXwCx56fCQnMJMOnZM9MXVLd4NUFN1TeOCIBQHwRiMJyJ7S7CdUKpyUqHOG85peKiPJ07C0RZ/W5HkYKqltwtvPGQd262p5eLC9j3nhOYSG2meRV8yTxYz3lDIPDx0+189CZ5NaxFSPCgqSYA24zavhPVLQqoct7nd7fcEw9JiTs+abZC6GckCONSHDLM+iRtWC/i5u21ZZDLxM9SIqPI96cYFszGeqyZoXxS5qPaIDHbQNAEqJp9ygNXgh9vuBo7E+aWYbFDTG0RuvW02fbmFfZw2/yXIr37+cQX+GPOnkfIRuHE3Hx5eN8C04v+BMrAfK2minawhG3A2ONJs9LI6QoeE=
|1|whGSPLhKW4xt/7PWOZ1treg3PtA=|F5gwV8j0JYWDzjb6DvHHaqO+sxs= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCpCG881Gt3dr+nuVIC2uGEQkeVwG6WDdS1WcCoxXC7AG+Oi5bfdqtf4IfeLpWmeuEaAaSFH48ODFr76ViygSjU=
|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ
|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ
|1|u3QVAK9R2x7Z3uKNj+0vDEIekl0=|yy09Q0Kw472+J7bjFkmir28x3lE= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV
13 changes: 11 additions & 2 deletions environments/.stackhpc/inventory/group_vars/all/bastion.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# Cloud selector, read from the CI_CLOUD environment variable of the process
# running ansible. It must match one of the keys of bastion_config below.
ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}"
# Per-cloud bastion (ssh jump host) login details.
bastion_config:
  ARCUS:
    user: slurm-app-ci
    ip: 128.232.222.183
  SMS:
    user: steveb
    ip: 185.45.78.150
# NB: The bastion_{user,ip} variables are used directly in the CI workflow too
bastion_user: "{{ bastion_config[ci_cloud].user }}"
bastion_ip: "{{ bastion_config[ci_cloud].ip }}"
# All ssh connections are proxied through the selected bastion.
ansible_ssh_common_args: '-o ProxyCommand="ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p"'
3 changes: 3 additions & 0 deletions environments/.stackhpc/inventory/group_vars/all/rebuild.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Rebuilding a volume-backed instance requires the image to be given explicitly.
# The name is scraped from the environment's terraform/main.tf: the first text
# matching `openhpc-[0-9a-z-]*`. This is a workaround which can be dropped
# if/when ansible templates terraform.
rebuild_image: >-
  {{ lookup('file', appliances_environment_root + '/terraform/main.tf')
  | regex_search('openhpc-[0-9a-z-]*') }}
8 changes: 8 additions & 0 deletions environments/.stackhpc/terraform/ARCUS.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Terraform variable values used when CI runs on the ARCUS cloud.
# The CI workflow selects this file with -var-file="<CI_CLOUD>.tfvars".
cluster_net = "WCDC-iLab-60"
cluster_subnet = "WCDC-iLab-60"
vnic_type = "direct"
# Flavor for the control node
control_node_flavor = "vm.ska.cpu.general.quarter"
# Flavor for login and compute nodes
other_node_flavor = "vm.ska.cpu.general.small"
# Instances boot from local (non-volume) root disks on this cloud
volume_backed_instances = false
state_volume_device_path = "/dev/sdb"
home_volume_device_path = "/dev/sdc"
8 changes: 8 additions & 0 deletions environments/.stackhpc/terraform/SMS.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Terraform variable values used when CI runs on the SMS cloud.
# The CI workflow selects this file with -var-file="<CI_CLOUD>.tfvars".
cluster_net = "stackhpc-ipv4-geneve"
cluster_subnet = "stackhpc-ipv4-geneve-subnet"
vnic_type = "normal"
# Flavor for the control node
control_node_flavor = "general.v1.medium"
# Flavor for login and compute nodes
other_node_flavor = "general.v1.tiny"
# Instances boot from volumes on this cloud
volume_backed_instances = true
state_volume_device_path = "/dev/vdb"
home_volume_device_path = "/dev/vdc"
45 changes: 30 additions & 15 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# This terraform configuration uses the "skeleton" terraform, so that is checked by CI.

variable "environment_root" {
type = string
description = "Path to environment root, automatically set by activate script"
Expand All @@ -8,45 +10,55 @@ variable "cluster_name" {
description = "Name for cluster, used as prefix for resources - set by environment var in CI"
}

variable "create_nodes" {
description = "Whether to create nodes (servers) or just ports and other infra"
# NOTE(review): an earlier comment here said bool could not be used because the
# value is passed on the command line, yet the declared type is bool - confirm
# which is intended.
type = bool
default = true
}

# Image used for every node in the CI cluster.
# NB: group_vars/all/rebuild.yml extracts an image name from terraform/main.tf
# with the regex `openhpc-[0-9a-z-]*` (presumably this file - confirm), so the
# default below should keep to that form.
variable "cluster_image" {
  description = "single image for all cluster nodes - a convenience for CI"
  type        = string
  default     = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
  # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2"
  # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
}

# Per-cloud settings. None of these have defaults: values are supplied by the
# <CI_CLOUD>.tfvars file passed with -var-file. Type constraints make a wrong
# or mistyped value fail at plan time instead of inside the module.

variable "cluster_net" {
  description = "Network for the cluster, passed to the cluster module as cluster_net"
  type        = string
}

variable "cluster_subnet" {
  description = "Subnet for the cluster, passed to the cluster module as cluster_subnet"
  type        = string
}

variable "vnic_type" {
  description = "VNIC type, passed to the cluster module as vnic_type"
  type        = string
}

variable "control_node_flavor" {
  description = "Flavor for the control node"
  type        = string
}

variable "other_node_flavor" {
  description = "Flavor for login and compute nodes"
  type        = string
}

variable "volume_backed_instances" {
  description = "Whether to use volumes for root disks"
  type        = bool
}

variable "state_volume_device_path" {
  description = "Device path for the state volume, passed to the cluster module"
  type        = string
}

variable "home_volume_device_path" {
  description = "Device path for the home volume, passed to the cluster module"
  type        = string
}

module "cluster" {
source = "../../skeleton/{{cookiecutter.environment}}/terraform/"

cluster_name = var.cluster_name
cluster_net = "WCDC-iLab-60"
cluster_subnet = "WCDC-iLab-60"
vnic_type = "direct"
cluster_net = var.cluster_net
cluster_subnet = var.cluster_subnet
vnic_type = var.vnic_type
key_pair = "slurm-app-ci"
control_node = {
flavor: "vm.ska.cpu.general.quarter"
flavor: var.control_node_flavor
image: var.cluster_image
}
login_nodes = {
login-0: {
flavor: "vm.ska.cpu.general.small"
flavor: var.other_node_flavor
image: var.cluster_image
}
}
compute_types = {
small: {
flavor: "vm.ska.cpu.general.small"
flavor: var.other_node_flavor
image: var.cluster_image
}
extra: {
flavor: "vm.ska.cpu.general.small"
flavor: var.other_node_flavor
image: var.cluster_image
}
}
Expand All @@ -56,10 +68,13 @@ module "cluster" {
compute-2: "extra"
compute-3: "extra"
}
create_nodes = var.create_nodes
volume_backed_instances = var.volume_backed_instances

environment_root = var.environment_root
# Can reduce volume size a lot for short-lived CI clusters:
state_volume_size = 10
home_volume_size = 20

state_volume_device_path = var.state_volume_device_path
home_volume_device_path = var.home_volume_device_path
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ data "openstack_images_image_v2" "control" {
name = var.control_node.image
}

# Looks up one image per entry in var.login_nodes, by the image name given for
# that node. The resulting id is used as the boot volume source of the login
# instances when var.volume_backed_instances is true.
data "openstack_images_image_v2" "login" {
for_each = var.login_nodes

name = each.value.image
}

# Looks up one image per entry in var.compute_nodes. The resulting id is used as
# the boot volume source of the compute instances when
# var.volume_backed_instances is true.
data "openstack_images_image_v2" "compute" {
for_each = var.compute_nodes

# A per-node entry in var.compute_images wins; otherwise fall back to the
# image of the node's compute type.
name = lookup(var.compute_images, each.key, var.compute_types[each.value].image)
}

resource "openstack_networking_port_v2" "login" {

for_each = toset(keys(var.login_nodes))
Expand Down Expand Up @@ -68,7 +80,7 @@ resource "openstack_networking_port_v2" "compute" {

resource "openstack_compute_instance_v2" "control" {

for_each = var.create_nodes ? toset(["control"]) : toset([])
for_each = toset(["control"])

name = "${var.cluster_name}-${each.key}"
image_name = data.openstack_images_image_v2.control.name
Expand All @@ -79,7 +91,8 @@ resource "openstack_compute_instance_v2" "control" {
block_device {
uuid = data.openstack_images_image_v2.control.id
source_type = "image"
destination_type = "local"
destination_type = var.volume_backed_instances ? "volume" : "local"
volume_size = var.volume_backed_instances ? var.root_volume_size : null
boot_index = 0
delete_on_termination = true
}
Expand Down Expand Up @@ -136,12 +149,24 @@ resource "openstack_compute_instance_v2" "control" {

resource "openstack_compute_instance_v2" "login" {

for_each = var.create_nodes ? var.login_nodes : {}
for_each = var.login_nodes

name = "${var.cluster_name}-${each.key}"
image_name = each.value.image
flavor_name = each.value.flavor
key_pair = var.key_pair

dynamic "block_device" {
for_each = var.volume_backed_instances ? [1]: []
content {
uuid = data.openstack_images_image_v2.login[each.key].id
source_type = "image"
destination_type = "volume"
volume_size = var.root_volume_size
boot_index = 0
delete_on_termination = true
}
}

network {
port = openstack_networking_port_v2.login[each.key].id
Expand All @@ -162,12 +187,24 @@ resource "openstack_compute_instance_v2" "login" {

resource "openstack_compute_instance_v2" "compute" {

for_each = var.create_nodes ? var.compute_nodes : {}
for_each = var.compute_nodes

name = "${var.cluster_name}-${each.key}"
image_name = lookup(var.compute_images, each.key, var.compute_types[each.value].image)
flavor_name = var.compute_types[each.value].flavor
key_pair = var.key_pair

dynamic "block_device" {
for_each = var.volume_backed_instances ? [1]: []
content {
uuid = data.openstack_images_image_v2.compute[each.key].id
source_type = "image"
destination_type = "volume"
volume_size = var.root_volume_size
boot_index = 0
delete_on_termination = true
}
}

network {
port = openstack_networking_port_v2.compute[each.key].id
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,14 @@ variable "nonlogin_security_groups" {
]
}

variable "create_nodes" {
description = "Whether to create nodes (servers) or just ports and other infra"
type = bool # can't use bool as want to pass from command-line
default = true
variable "volume_backed_instances" {
description = "Whether to use volumes for root disks"
type = bool
default = false
}

# Only read when var.volume_backed_instances is true, where it sets the
# volume_size of each instance's boot block_device.
variable "root_volume_size" {
  description = "Size of volume for root volumes if using volume backed instances, in GB"
  type        = number
  default     = 40
}