Skip to content

Commit 5c49061

Browse files
committed
Merge branch 'main' into rl8-ofed-fixes
2 parents afc7677 + 6ec3a73 commit 5c49061

File tree

13 files changed

+297
-34
lines changed

13 files changed

+297
-34
lines changed

.github/bin/create-merge-branch.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env bash
2+
3+
#####
4+
# This script creates a branch that merges the latest release
5+
#####
6+
7+
set -ex
8+
9+
# Only allow running on main
10+
CURRENT_BRANCH="$(git branch --show-current)"
11+
if [ "$CURRENT_BRANCH" != "main" ]; then
12+
echo "[ERROR] This script can only be run on the main branch" >&2
13+
exit 1
14+
fi
15+
16+
if [ -n "$(git status --short)" ]; then
17+
echo "[ERROR] This script cannot run with uncommitted changes" >&2
18+
exit 1
19+
fi
20+
21+
UPSTREAM_REPO="${UPSTREAM_REPO:-"stackhpc/ansible-slurm-appliance"}"
22+
echo "[INFO] Using upstream repo - $UPSTREAM_REPO"
23+
24+
# Fetch the tag for the latest release from the upstream repository
25+
RELEASE_TAG="$(curl -fsSL "https://api.github.com/repos/${UPSTREAM_REPO}/releases/latest" | jq -r '.tag_name')"
# The pipeline's status is jq's, so a failed curl leaves the tag empty, and a
# response without a tag_name makes jq print the literal "null". Stop here
# rather than carry a bogus tag into the branch name and the merge below.
if [ -z "$RELEASE_TAG" ] || [ "$RELEASE_TAG" = "null" ]; then
    echo "[ERROR] Could not determine the latest release tag for $UPSTREAM_REPO" >&2
    exit 1
fi
26+
echo "[INFO] Found latest release tag - $RELEASE_TAG"
27+
28+
# Add the repository as an upstream
29+
echo "[INFO] Adding upstream remote..."
30+
# `git remote add` fails if the remote already exists (e.g. when the script is
# re-run in the same clone), which would abort the script under `set -e`.
# Re-point an existing remote at the requested repo instead.
if git remote get-url upstream >/dev/null 2>&1; then
    git remote set-url upstream "https://github.com/${UPSTREAM_REPO}.git"
else
    git remote add upstream "https://github.com/${UPSTREAM_REPO}.git"
fi
31+
git remote show upstream
32+
33+
echo "[INFO] Fetching remote tags..."
34+
git remote update
35+
36+
# Use a branch that is named for the release
37+
BRANCH_NAME="upgrade/$RELEASE_TAG"
38+
39+
# Check if the branch already exists on the origin
40+
# If it does, there is nothing more to do as the branch can be rebased from the MR
41+
if git show-branch "remotes/origin/$BRANCH_NAME" >/dev/null 2>&1; then
42+
echo "[INFO] Merge branch already created for $RELEASE_TAG"
43+
exit
44+
fi
45+
46+
echo "[INFO] Merging release tag - $RELEASE_TAG"
47+
# Quote the tag so it reaches git as one literal argument (no word splitting or globbing)
git merge --strategy recursive -X theirs --no-commit "$RELEASE_TAG"
48+
49+
# Check if the merge resulted in any changes being staged
50+
if [ -n "$(git status --short)" ]; then
51+
echo "[INFO] Merge resulted in the following changes"
52+
git status
53+
54+
# NOTE(scott): The GitHub create-pull-request action does
55+
# the committing for us, so we only need to make branches
56+
# and commits if running outside of GitHub actions.
57+
# True when GITHUB_ACTIONS is unset or empty, i.e. when not running in GitHub Actions
if [ -z "${GITHUB_ACTIONS:-}" ]; then
58+
echo "[INFO] Checking out temporary branch '$BRANCH_NAME'..."
59+
git checkout -b "$BRANCH_NAME"
60+
61+
echo "[INFO] Committing changes"
62+
git commit -m "Upgrade ansible-slurm-appliance to $RELEASE_TAG"
63+
64+
echo "[INFO] Pushing changes to origin"
65+
git push --set-upstream origin "$BRANCH_NAME"
66+
67+
# Go back to the main branch at the end
68+
echo "[INFO] Reverting back to main"
69+
git checkout main
70+
71+
echo "[INFO] Removing temporary branch"
72+
git branch -d "$BRANCH_NAME"
73+
fi
74+
75+
# Write a file containing the branch name and tag
76+
# for automatic PR or MR creation that follows
77+
echo "BRANCH_NAME=\"$BRANCH_NAME\"" > .mergeenv
78+
echo "RELEASE_TAG=\"$RELEASE_TAG\"" >> .mergeenv
79+
else
80+
echo "[INFO] Merge resulted in no changes"
81+
fi

.github/bin/get-s3-image.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
3+
#####
4+
# This script looks for an image in OpenStack and if not found, downloads from
5+
# S3 bucket, and then uploads to OpenStack
6+
#####
7+
8+
set -ex
9+
10+
image_name=$1
11+
bucket_name=$2
12+
echo "Checking if image $image_name exists in OpenStack"
13+
image_exists=$(openstack image list --name "$image_name" -f value -c Name)
14+
15+
if [ -n "$image_exists" ]; then
16+
echo "Image $image_name already exists in OpenStack."
17+
else
18+
echo "Image $image_name not found in OpenStack. Getting it from S3."
19+
20+
# Quote the URL so bucket/image names are never word-split or glob-expanded
wget "https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/${bucket_name}/${image_name}" --progress=dot:giga
21+
22+
echo "Uploading image $image_name to OpenStack..."
23+
# The downloaded file and the created image share the same name; quote both uses
openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress
24+
25+
echo "Image $image_name has been uploaded to OpenStack."
26+
fi

.github/workflows/fatimage.yml

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,20 @@ jobs:
1010
name: openstack-imagebuild
1111
runs-on: ubuntu-22.04
1212
strategy:
13-
matrix:
13+
fail-fast: false # allow other matrix jobs to continue even if one fails
14+
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
1415
os_version:
1516
- RL8
1617
- RL9
1718
build:
1819
- openstack.openhpc
1920
- openstack.openhpc-ofed
21+
- openstack.openhpc-cuda
2022
exclude:
2123
- os_version: RL8
2224
build: openstack.openhpc-ofed
25+
- os_version: RL8
26+
build: openstack.openhpc-cuda
2327
- os_version: RL9
2428
build: openstack.openhpc
2529
env:
@@ -81,7 +85,9 @@ jobs:
8185
- name: Download image
8286
run: |
8387
. venv/bin/activate
84-
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
88+
sudo mkdir /mnt/images
89+
sudo chmod 777 /mnt/images
90+
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
8591
8692
- name: Set up QEMU
8793
uses: docker/setup-qemu-action@v3
@@ -95,13 +101,13 @@ jobs:
95101
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
96102

97103
- name: mount qcow2 file
98-
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
104+
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
99105

100106
- name: Run Trivy vulnerability scanner
101107
uses: aquasecurity/[email protected]
102108
with:
103109
scan-type: fs
104-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
110+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
105111
scanners: "vuln"
106112
format: sarif
107113
output: "${{ steps.manifest.outputs.image-name }}.sarif"
@@ -117,7 +123,7 @@ jobs:
117123
uses: aquasecurity/[email protected]
118124
with:
119125
scan-type: fs
120-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
126+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
121127
scanners: "vuln"
122128
format: table
123129
exit-code: '1'
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# This workflow compares a downstream ansible-slurm-appliance repository for a specific site with the upstream
2+
# stackhpc/ansible-slurm-appliance repository to check whether there is a new upstream version available. If a
3+
# newer tag is found in the upstream repository then a pull request is created to the downstream repo
4+
# in order to merge in the changes from the new upstream release.
5+
#
6+
# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows
7+
# and give it an appropriate name, e.g.
8+
# cp .github/workflows/upgrade-check.yml.sample .github/workflows/upgrade-check.yml
9+
#
10+
# Workflow uses https://github.com/peter-evans/create-pull-request to handle the pull request action.
11+
# See the docs for action inputs.
12+
#
13+
# In order for GitHub actions to create pull requests that make changes to workflows in `.github/workflows`,
14+
# a token for each deployment must be provided. Both user PAT and fine-grained tokens should work, but it was tested
15+
# with a PAT. Fine-grained repo-scoped token is preferred if possible but requires organisation admin privileges.
16+
#
17+
# See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
18+
# for security considerations around tokens. TREAT YOUR ACCESS TOKENS LIKE PASSWORDS.
19+
#
20+
# The following repository permissions must be set for the PAT:
21+
# - `Workflows: Read and write`
22+
# - `Actions: Read and write`
23+
# - `Pull requests: Read and write`
24+
# The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`.
25+
26+
name: Check for upstream updates
27+
on:
28+
schedule:
29+
- cron: "0 9 * * *"
30+
workflow_dispatch:
31+
jobs:
32+
check_for_update:
33+
runs-on: ubuntu-22.04
34+
35+
steps:
36+
- name: Checkout the config repo
37+
uses: actions/checkout@v4
38+
with:
39+
fetch-depth: 0
40+
fetch-tags: true
41+
42+
# Based on equivalent azimuth-config job
43+
- name: Check for new release
44+
shell: bash
45+
run: |
46+
set -xe
47+
48+
# Tell git who we are for commits
49+
git config user.email "${{ github.actor }}[email protected]"
50+
git config user.name "${{ github.actor }} CI"
51+
52+
# Create the merge branch and write vars to .mergeenv file
53+
.github/bin/create-merge-branch.sh
54+
55+
- name: Set release tag output
56+
id: release_tag
57+
if: ${{ hashFiles('.mergeenv') }}
58+
run: source .mergeenv && echo value=$RELEASE_TAG >> $GITHUB_OUTPUT
59+
60+
- name: Set branch name output
61+
id: branch_name
62+
if: ${{ hashFiles('.mergeenv') }}
63+
run: source .mergeenv && echo value=$BRANCH_NAME >> $GITHUB_OUTPUT
64+
65+
- name: Remove tmp file
66+
run: rm .mergeenv
67+
if: ${{ hashFiles('.mergeenv') }}
68+
69+
- name: Create Pull Request
70+
if: ${{ steps.release_tag.outputs.value }}
71+
uses: peter-evans/create-pull-request@v6
72+
with:
73+
base: main
74+
branch: ${{ steps.branch_name.outputs.value }}
75+
title: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}"
76+
body: This PR was automatically generated by GitHub Actions.
77+
commit-message: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}"
78+
delete-branch: true
79+
token: ${{ secrets.WORKFLOW_TOKEN }}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# This workflow can be used to fetch images published by StackHPC and upload them to a client's OpenStack.
2+
# The workflow takes two inputs:
3+
# - image name
4+
# - s3 bucket name
5+
# and first checks to see if the image exists in the target OpenStack. If the image doesn't exist, it is downloaded
6+
# from StackHPC's public S3 bucket and then uploaded to the target OpenStack.
7+
#
8+
# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows
9+
# and give it an appropriate name, e.g.
10+
# cp .github/workflows/upload-s3-image.yml.sample .github/workflows/upload-s3-image.yml
11+
#
12+
# In order for the workflow to access the target OpenStack, an application credential clouds.yaml file must be
13+
# added as a repository secret named OS_CLOUD_YAML.
14+
# Details on the contents of the clouds.yaml file can be found at https://docs.openstack.org/keystone/latest/user/application_credentials.html
15+
16+
name: Upload release images to client sites from s3
17+
on:
18+
workflow_dispatch:
19+
inputs:
20+
image_name:
21+
type: string
22+
# Quoted: an unquoted scalar containing ": " is not valid YAML
description: "Image name from: (https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{BUCKET_NAME})"
23+
required: true
24+
bucket_name:
25+
type: choice
26+
required: true
27+
description: Bucket name
28+
options:
29+
- openhpc-images
30+
# - openhpc-images-prerelease
31+
32+
jobs:
33+
image_upload:
34+
runs-on: ubuntu-22.04
35+
concurrency: ${{ github.ref }}
36+
env:
37+
OS_CLOUD: openstack
38+
steps:
39+
- uses: actions/checkout@v4
40+
41+
- name: Write clouds.yaml
42+
run: |
43+
mkdir -p ~/.config/openstack/
44+
echo "${{ secrets.OS_CLOUD_YAML }}" > ~/.config/openstack/clouds.yaml
45+
shell: bash
46+
47+
- name: Upload latest image if missing
48+
run: |
49+
python3 -m venv venv
50+
. venv/bin/activate
51+
pip install -U pip
52+
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
53+
bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }}
54+
55+
- name: Cleanup OpenStack Image (on error or cancellation)
56+
# Run when an earlier step failed or the run was cancelled, matching the step name
if: failure() || cancelled()
57+
run: |
58+
. venv/bin/activate
59+
image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}')
60+
if [ -n "$image_hanging" ]; then
61+
echo "Cleaning up OpenStack image with ID: $image_hanging"
62+
openstack image delete $image_hanging
63+
else
64+
echo "No image ID found, skipping cleanup."
65+
fi
66+
shell: bash

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,11 @@ Please contact us for specific advice, but in outline this generally involves:
144144
## Monitoring and logging
145145

146146
Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.README.md) for details.
147+
148+
## CI/CD automation
149+
150+
The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintenance tasks. These include:
151+
152+
- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published.
153+
154+
- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud.

ansible/extras.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
- name: Setup CUDA
2222
hosts: cuda
2323
become: yes
24-
gather_facts: no
24+
gather_facts: yes
2525
tags: cuda
2626
tasks:
2727
- import_role:

ansible/roles/cuda/defaults/main.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
cuda_distro: rhel8
1+
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
33
cuda_driver_stream: default
4+
cuda_package_version: 'latest'
45
cuda_packages:
5-
- cuda
6+
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
67
- nvidia-gds
78
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8-
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
9+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
910
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1011
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1112
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,13 @@
2424
failed_when: false
2525
register: _cuda_driver_module_enabled
2626

27-
- name: List nvidia driver dnf module stream versions
28-
shell:
29-
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30-
# Output of interest from command is something like (some whitespace removed):
31-
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32-
changed_when: false
33-
register: _cuda_driver_module_streams
34-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35-
3627
- name: Enable nvidia driver module
37-
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
28+
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
3829
register: _cuda_driver_module_enable
3930
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
4031
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
4132

42-
- name: Install nvidia drivers # TODO: make removal possible?
33+
- name: Install nvidia drivers
4334
ansible.builtin.command: dnf module install -y nvidia-driver
4435
register: _cuda_driver_install
4536
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

environments/.stackhpc/ARCUS.pkrvars.hcl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
flavor = "vm.ska.cpu.general.small"
2-
use_blockstorage_volume = true
3-
volume_size = 15 # GB
4-
image_disk_format = "qcow2"
52
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
63
ssh_keypair_name = "slurm-app-ci"
74
ssh_private_key_file = "~/.ssh/id_rsa"

environments/.stackhpc/LEAFCLOUD.pkrvars.hcl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
flavor = "ec1.large"
2-
use_blockstorage_volume = true
3-
volume_size = 15 # GB
42
volume_type = "unencrypted"
5-
image_disk_format = "qcow2"
63
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
74
ssh_keypair_name = "slurm-app-ci"
85
ssh_private_key_file = "~/.ssh/id_rsa"

0 commit comments

Comments
 (0)