Skip to content

Commit 80c4ceb

Browse files
MoteHue and sjpb
authored and committed
Build RL8+OFED image in CI (#427)
* Check major version for RL8 package installs * Gather facts on ofed role * Support kernel checks with mismatching version length, e.g. 4.18.0-553.16.1.el8_9.x86_64 vs 4.18.0-553.el8_9.x86_64. These would fail with the error: '<' not supported between instances of 'str' and 'int', as the community.general.version_sort was trying to compare the `el8_9` of the latter with the `16` of the former. Strip the last two chunks so we just compare numbers. * Move to LTS version now RL9.4 is supported * Fail when any inventory source cannot be parsed * Always reboot after selinux and package updates * Clear facts before OFED so install will match newest kernel * Clear facts after reboot so OFED install will match newest kernel * fail caas and stackhpc if any inventory can't be read * make reboot conditional on package or SELinux changes again * include OFED in both RL8 and RL9 builds * always run CI tests on RL8 and RL9 * allow concurrent RL8/RL9 CI tests * mark pending reboot check as not a change * fix workflow matrix definitions * bump CI images - now both OFED * use reboot hint for checking reboot required --------- Co-authored-by: Steve Brasier <[email protected]>
1 parent 9e53ce6 commit 80c4ceb

File tree

9 files changed

+45
-51
lines changed

9 files changed

+45
-51
lines changed

.github/workflows/fatimage.yml

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,26 @@
11

22
name: Build fat image
3-
'on':
3+
on:
44
workflow_dispatch:
5-
concurrency:
6-
group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build
7-
cancel-in-progress: true
85
jobs:
96
openstack:
107
name: openstack-imagebuild
8+
concurrency:
9+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
10+
cancel-in-progress: true
1111
runs-on: ubuntu-22.04
1212
strategy:
1313
fail-fast: false # allow other matrix jobs to continue even if one fails
14-
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
14+
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
1515
os_version:
1616
- RL8
1717
- RL9
1818
build:
19-
- openstack.openhpc
2019
- openstack.openhpc-ofed
2120
- openstack.openhpc-cuda
2221
exclude:
23-
- os_version: RL8
24-
build: openstack.openhpc-ofed
2522
- os_version: RL8
2623
build: openstack.openhpc-cuda
27-
- os_version: RL9
28-
build: openstack.openhpc
2924
env:
3025
ANSIBLE_FORCE_COLOR: True
3126
OS_CLOUD: openstack

.github/workflows/stackhpc.yml

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,29 @@
22
name: Test deployment and reimage on OpenStack
33
on:
44
workflow_dispatch:
5-
inputs:
6-
use_RL8:
7-
required: true
8-
description: Include RL8 tests
9-
type: boolean
10-
default: false
115
push:
126
branches:
137
- main
148
pull_request:
159
jobs:
1610
openstack:
1711
name: openstack-ci
18-
concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
12+
concurrency:
13+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
14+
cancel-in-progress: true
1915
runs-on: ubuntu-22.04
2016
strategy:
17+
fail-fast: false # allow other matrix jobs to continue even if one fails
2118
matrix:
22-
os_version: [RL8, RL9]
23-
rl8_selected:
24-
- ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
25-
rl8_branch:
26-
- ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge
27-
rl8_label:
28-
- ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created
29-
exclude:
30-
- os_version: RL8
31-
rl8_selected: false
32-
rl8_branch: false
33-
rl8_label: false
19+
os_version:
20+
- RL8
21+
- RL9
3422
env:
3523
ANSIBLE_FORCE_COLOR: True
3624
OS_CLOUD: openstack
3725
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
3826
CI_CLOUD: ${{ vars.CI_CLOUD }}
27+
TF_VAR_os_version: ${{ matrix.os_version }}
3928
steps:
4029
- uses: actions/checkout@v2
4130

@@ -89,8 +78,6 @@ jobs:
8978
. environments/.stackhpc/activate
9079
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
9180
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
92-
env:
93-
TF_VAR_os_version: ${{ matrix.os_version }}
9481
9582
- name: Delete infrastructure if provisioning failed
9683
run: |
@@ -99,8 +86,6 @@ jobs:
9986
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
10087
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
10188
if: failure() && steps.provision_servers.outcome == 'failure'
102-
env:
103-
TF_VAR_os_version: ${{ matrix.os_version }}
10489

10590
- name: Configure cluster
10691
run: |
@@ -199,8 +184,6 @@ jobs:
199184
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
200185
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
201186
if: ${{ success() || cancelled() }}
202-
env:
203-
TF_VAR_os_version: ${{ matrix.os_version }}
204187

205188
# - name: Delete images
206189
# run: |

ansible/bootstrap.yml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -227,24 +227,25 @@
227227
- update
228228
tasks:
229229
- name: Check for pending reboot from package updates
230-
stat:
231-
path: /var/run/reboot-required
230+
command:
231+
cmd: dnf needs-restarting -r
232232
register: update_reboot_required
233-
- debug:
234-
msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}"
235-
- name: Reboot if required from SELinux state change or package upgrades
233+
failed_when: "update_reboot_required.rc not in [0, 1]"
234+
changed_when: false
235+
- name: Reboot to cover SELinux state change or package upgrades
236236
reboot:
237237
post_reboot_delay: 30
238-
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool)
238+
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1)
239239
- name: Wait for hosts to be reachable
240240
wait_for_connection:
241241
sleep: 15
242-
- name: update facts
242+
- name: Clear facts
243+
meta: clear_facts
244+
- name: Update facts
243245
setup:
244-
when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false))
245246

246247
- hosts: ofed
247-
gather_facts: no
248+
gather_facts: yes
248249
become: yes
249250
tags: ofed
250251
tasks:

ansible/roles/ofed/defaults/main.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4
1+
ofed_version: '23.10-3.2.2.0' # LTS
22
ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
33
ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
44
ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
5+
ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8'
56
ofed_arch: "{{ ansible_architecture }}"
67
ofed_tmp_dir: /tmp
78
ofed_update_firmware: false

ansible/roles/ofed/tasks/install.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010

1111
- name: Check current kernel is newest installed
1212
assert:
13-
that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest
13+
that: _ofed_kernel_current == _ofed_dnf_kernels_newest
1414
fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
1515
vars:
16+
_ofed_kernel_current: >-
17+
{{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
1618
_ofed_dnf_kernels_newest: >-
17-
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }}
19+
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
1820
# dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
1921

2022
- name: Enable epel
@@ -31,7 +33,7 @@
3133

3234
- name: Install build prerequisites
3335
dnf:
34-
name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}"
36+
name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}"
3537
when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
3638
# don't want to install a load of prereqs unnecessarily
3739

environments/.caas/ansible.cfg

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins
1313
[ssh_connection]
1414
ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
1515
pipelining = True
16+
17+
[inventory]
18+
# Fail when any inventory source cannot be parsed.
19+
any_unparsed_is_failed = True

environments/.stackhpc/ansible.cfg

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@ filter_plugins = ../../ansible/filter_plugins
1414
[ssh_connection]
1515
ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
1616
pipelining = True
17+
18+
[inventory]
19+
# Fail when any inventory source cannot be parsed.
20+
any_unparsed_is_failed = True

environments/.stackhpc/terraform/main.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ variable "cluster_image" {
2929
description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
3030
type = map(string)
3131
default = {
32-
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
33-
RL8: "openhpc-RL8-240904-1509-1687368f"
34-
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
32+
# https://github.com/stackhpc/ansible-slurm-appliance/pull/427
33+
RL8: "openhpc-ofed-RL8-240906-1042-32568dbb"
34+
RL9: "openhpc-ofed-RL9-240906-1041-32568dbb"
3535
}
3636
}
3737

environments/skeleton/{{cookiecutter.environment}}/ansible.cfg

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins
1313
[ssh_connection]
1414
ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
1515
pipelining = True
16+
17+
[inventory]
18+
# Fail when any inventory source cannot be parsed.
19+
any_unparsed_is_failed = True

0 commit comments

Comments
 (0)