Skip to content

Add role to install NVIDIA DOCA on top of an existing "fat" image #492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions .github/workflows/doca.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Test that NVIDIA DOCA can be installed on top of the current "fat" images.
# Runs on manual dispatch, and automatically when the fat image definitions,
# the doca role, or this workflow file itself change.
name: Test DOCA extra build
on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      # Path filters must name the workflow file exactly, including its extension.
      - '.github/workflows/doca.yml'
  pull_request:
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      - '.github/workflows/doca.yml'

jobs:
  # Build one DOCA image per OS release with Packer, using the current fat
  # image for that release as the source image.
  doca:
    name: doca-build
    concurrency:
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
      cancel-in-progress: true
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false # allow other matrix jobs to continue even if one fails
      matrix: # build RL8, RL9
        build:
          - image_name: openhpc-doca-RL8
            source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
            inventory_groups: doca
          - image_name: openhpc-doca-RL9
            source_image_name_key: RL9
            inventory_groups: doca
    env:
      ANSIBLE_FORCE_COLOR: True
      OS_CLOUD: openstack
      CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

    steps:
      - uses: actions/checkout@v4

      # Expose the fat image definitions as a multiline environment variable so
      # that later steps can read them with fromJSON(env.FAT_IMAGES).
      - name: Load current fat images into GITHUB_ENV
        # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
        run: |
          {
            echo 'FAT_IMAGES<<EOF'
            cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
            echo EOF
          } >> "$GITHUB_ENV"

      - name: Record settings
        run: |
          echo CI_CLOUD: ${{ env.CI_CLOUD }}
          echo FAT_IMAGES: ${FAT_IMAGES}

      # Command tracing (set -x) is deliberately NOT enabled here: it would
      # print the echo command, including the private key, to the job log.
      - name: Setup ssh
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
          chmod 0600 ~/.ssh/id_rsa
        shell: bash

      - name: Add bastion's ssh key to known_hosts
        run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
        shell: bash

      - name: Install ansible etc
        run: dev/setup-env.sh

      - name: Write clouds.yaml
        run: |
          mkdir -p ~/.config/openstack/
          echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
        shell: bash

      # NOTE(review): later steps re-source both scripts themselves, so this
      # step presumably only checks that they load cleanly - confirm whether
      # it is still needed.
      - name: Setup environment
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate

      # The source image name is looked up in FAT_IMAGES under 'cluster_image'
      # using the matrix entry's source_image_name_key.
      - name: Build fat image with packer
        id: packer_build
        run: |
          set -x
          . venv/bin/activate
          . environments/.stackhpc/activate
          cd packer/
          packer init .

          PACKER_LOG=1 packer build \
            -on-error=${{ vars.PACKER_ON_ERROR }} \
            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
            -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
            -var "image_name=${{ matrix.build.image_name }}" \
            -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
            openstack.pkr.hcl

      # Take the ID of the last build in the Packer manifest, poll until the
      # image can be shown, then publish its ID and name as step outputs and
      # as text files for the artifact upload below.
      # NOTE(review): the polling loop has no upper bound; consider a retry
      # limit or a step-level timeout-minutes.
      - name: Get created image names from manifest
        id: manifest
        run: |
          . venv/bin/activate
          IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
          while ! openstack image show -f value -c name "$IMAGE_ID"; do
            sleep 5
          done
          IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
          echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
          echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
          echo "$IMAGE_ID" > image-id.txt
          echo "$IMAGE_NAME" > image-name.txt

      - name: Make image usable for further builds
        run: |
          . venv/bin/activate
          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"

      # Images are only kept for manually dispatched runs; push and
      # pull_request runs delete the image they built.
      - name: Delete image for automatically-run workflows
        run: |
          . venv/bin/activate
          openstack image delete "${{ steps.manifest.outputs.image-id }}"
        if: ${{ github.event_name != 'workflow_dispatch' }}

      - name: Upload manifest artifact
        uses: actions/upload-artifact@v4
        with:
          name: image-details-${{ matrix.build.image_name }}
          path: |
            ./image-id.txt
            ./image-name.txt
          overwrite: true
38 changes: 18 additions & 20 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,23 @@ jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- image_name: openhpc-RL8
source_image_name: rocky-latest-RL8
inventory_groups: control,compute,login
- image_name: openhpc-RL9
source_image_name: rocky-latest-RL9
inventory_groups: control,compute,login
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
SOURCE_IMAGES_MAP: |
{
"RL8": {
"openstack.openhpc": "rocky-latest-RL8"
},
"RL9": {
"openstack.openhpc": "rocky-latest-RL9"
}
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

steps:
Expand Down Expand Up @@ -85,13 +78,11 @@ jobs:

PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}

- name: Get created image names from manifest
id: manifest
Expand All @@ -102,13 +93,20 @@ jobs:
sleep 5
done
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
echo $IMAGE_ID > image-id.txt
echo $IMAGE_NAME > image-name.txt

- name: Make image usable for further builds
run: |
. venv/bin/activate
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"

- name: Upload manifest artifact
uses: actions/upload-artifact@v4
with:
name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
name: image-details-${{ matrix.build.image_name }}
path: |
./image-id.txt
./image-name.txt
Expand Down
54 changes: 20 additions & 34 deletions .github/workflows/nightlybuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,29 @@ on:
- SMS
- ARCUS
schedule:
- cron: '0 0 * * *' # Run at midnight
- cron: '0 0 * * *' # Run at midnight on default branch

jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.rocky-latest
- image_name: rocky-latest-RL8
source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
inventory_groups: update
- image_name: rocky-latest-RL9
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
inventory_groups: update
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
SOURCE_IMAGES_MAP: |
{
"RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
"RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

steps:
Expand Down Expand Up @@ -83,15 +80,12 @@ jobs:

PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl

env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }}

- name: Get created image names from manifest
id: manifest
run: |
Expand Down Expand Up @@ -125,7 +119,7 @@ jobs:
name: upload-nightly-targets
needs: openstack
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }}
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
Expand All @@ -135,18 +129,15 @@ jobs:
- LEAFCLOUD
- SMS
- ARCUS
os_version:
- RL8
- RL9
image:
- rocky-latest
build:
- image_name: rocky-latest-RL8
- image_name: rocky-latest-RL9
exclude:
- target_cloud: LEAFCLOUD
env:
OS_CLOUD: openstack
SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
TARGET_CLOUD: ${{ matrix.target_cloud }}
IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}"
steps:
- uses: actions/checkout@v2

Expand All @@ -161,42 +152,37 @@ jobs:
. venv/bin/activate
pip install -U pip
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
shell: bash

- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml
shell: bash

- name: Download source image
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml
openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }}
shell: bash
openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }}

- name: Upload to target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml

openstack image create "${{ env.IMAGE_NAME }}" \
--file "${{ env.IMAGE_NAME }}" \
openstack image create "${{ matrix.build.image_name }}" \
--file "${{ matrix.build.image_name }}" \
--disk-format qcow2 \
shell: bash

- name: Delete old latest image from target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml

IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l)
IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l)
if [ "$IMAGE_COUNT" -gt 1 ]; then
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1)
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1)
openstack image delete "$OLD_IMAGE_ID"
else
echo "Only one image exists, skipping deletion."
fi
shell: bash
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,5 @@ roles/*
!roles/lustre/**
!roles/dnf_repos/
!roles/dnf_repos/**
!roles/doca/
!roles/doca/**
5 changes: 5 additions & 0 deletions ansible/cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,10 @@
os: "{{ ansible_distribution }} {{ ansible_distribution_version }}"
kernel: "{{ ansible_kernel }}"
ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}"
doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}"
cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}"
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"

- name: Show image summary
debug:
var: image_info
11 changes: 11 additions & 0 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
tasks:
- name: Report hostname (= final image name)
command: hostname
- name: Report inventory groups
debug:
var: group_names

- name: Run pre.yml hook
vars:
Expand Down Expand Up @@ -199,6 +202,14 @@
name: cloudalchemy.grafana
tasks_from: install.yml

- hosts: doca
become: yes
gather_facts: yes
tasks:
- name: Install NVIDIA DOCA
import_role:
name: doca

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand Down
Loading
Loading