Speedup smslabs CI #162


Merged · 1 commit · Mar 31, 2022
48 changes: 20 additions & 28 deletions .github/workflows/smslabs.yml
@@ -7,7 +7,7 @@ on:
pull_request:
concurrency: stackhpc-ci # openstack project
jobs:
openstack-example:
smslabs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
@@ -77,63 +77,55 @@ jobs:
TF_VAR_cluster_name: ci${{ github.run_id }}
if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }}

- name: Configure infrastructure
- name: Directly configure cluster and build compute + login images
# see pre-hook for the image build
run: |
. venv/bin/activate
. environments/smslabs/activate
ansible all -m wait_for_connection
ansible-playbook ansible/adhoc/generate-passwords.yml
echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml
ansible-playbook -vv ansible/site.yml
env:
OS_CLOUD: openstack
ANSIBLE_FORCE_COLOR: True
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Run MPI-based tests
run: |
. venv/bin/activate
. environments/smslabs/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
env:
ANSIBLE_FORCE_COLOR: True
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Build control and compute images
- name: Test reimage of login and compute nodes
run: |
. venv/bin/activate
. environments/smslabs/activate
cd packer
PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
ansible all -m wait_for_connection
ansible-playbook -vv ansible/ci/test_reimage.yml
env:
OS_CLOUD: openstack
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
ANSIBLE_FORCE_COLOR: True

- name: Reimage compute nodes via slurm and check cluster still up
- name: Run MPI-based tests
run: |
. venv/bin/activate
. environments/smslabs/activate
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
ansible-playbook -vv ansible/adhoc/hpctests.yml
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Reimage login nodes via openstack and check cluster still up
- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/smslabs/activate
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml
ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve
env:
OS_CLOUD: openstack
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
TF_VAR_cluster_name: ci${{ github.run_id }}
if: ${{ success() || cancelled() }}

- name: Delete infrastructure
- name: Delete images
run: |
. venv/bin/activate
. environments/smslabs/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve
ansible-playbook -vv ansible/ci/delete_images.yml
env:
OS_CLOUD: openstack
TF_VAR_cluster_name: ci${{ github.run_id }}
if: ${{ success() || cancelled() }}
ANSIBLE_FORCE_COLOR: True
environments/smslabs/ci/reimage-login.yml → ansible/ci/delete_images.yml
@@ -1,23 +1,23 @@
# Reimage login nodes via OpenStack

- hosts: login
- hosts: login:!builder
become: no
gather_facts: no
tasks:
- name: Read packer build manifest
set_fact:
manifest: "{{ lookup('file', manifest_path) | from_json }}"
vars:
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
delegate_to: localhost
- name: Get latest login image build

- name: Get latest image builds
set_fact:
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"

- name: Reimage node via openstack
- name: Delete images
shell:
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
cmd: |
openstack image delete {{ login_build.artifact_id }}
openstack image delete {{ compute_build.artifact_id }}
delegate_to: localhost

- name: Wait for connection
wait_for_connection:
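
A note on the image selection above (the same lookup is reused in ansible/ci/test_reimage.yml below): it relies on Packer's manifest post-processor (presumably configured in packer/openstack.pkr.hcl, not shown in this diff) appending one entry per build to packer/packer-manifest.json with custom_data.source set to login or compute, so selectattr('custom_data', 'eq', {...}) | last picks the most recent image of each type. A minimal standalone sketch with invented artifact IDs:

- hosts: localhost
  gather_facts: no
  vars:
    # Hypothetical manifest contents; the real data is read from packer/packer-manifest.json
    manifest:
      builds:
        - {artifact_id: "img-login-old", custom_data: {source: "login"}}
        - {artifact_id: "img-compute-1", custom_data: {source: "compute"}}
        - {artifact_id: "img-login-new", custom_data: {source: "login"}}
  tasks:
    - name: Show which image each selection resolves to
      debug:
        msg:
          login: "{{ (manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last).artifact_id }}"     # -> img-login-new
          compute: "{{ (manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last).artifact_id }}" # -> img-compute-1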

65 changes: 65 additions & 0 deletions ansible/ci/test_reimage.yml
@@ -0,0 +1,65 @@
- hosts: login:!builder
become: no
tasks:
- name: Read packer build manifest
set_fact:
manifest: "{{ lookup('file', manifest_path) | from_json }}"
vars:
manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json"
delegate_to: localhost

- name: Get latest image builds
set_fact:
login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}"
compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}"

- name: Reimage login node via openstack
shell:
cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}"
delegate_to: localhost

- name: Check login node rebuild completed
shell:
cmd: openstack server show {{ inventory_hostname }} --format value -c image
register: openstack_login
delegate_to: localhost
retries: 5
delay: 30
until: login_build.artifact_id in openstack_login.stdout
changed_when: false

- name: Wait for login connection
wait_for_connection:
timeout: 800

- name: Check slurm up after reimaging login node
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"

# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
- name: Request compute node rebuild via Slurm
shell:
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
become: yes

- name: Check compute node rebuild completed
shell:
cmd: openstack server show {{ item }} --format value -c image
register: openstack_compute
delegate_to: localhost
loop: "{{ groups['compute'] }}"
retries: 5
delay: 30
until: compute_build.artifact_id in openstack_compute.stdout
changed_when: false

- hosts: compute:!builder
become: no
gather_facts: no
tasks:
- name: Wait for compute connection
wait_for_connection:
timeout: 800

- name: Check slurm up after reimaging compute nodes
import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml"
run_once: true
37 changes: 0 additions & 37 deletions environments/smslabs/ci/reimage-compute.yml

This file was deleted.

20 changes: 20 additions & 0 deletions environments/smslabs/hooks/check_slurm.yml
@@ -0,0 +1,20 @@
- name: Run sinfo
shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,Partition,partition_state,max_jobtime,num_nodes,node_state
register: sinfo
changed_when: false
until: "'boot' not in sinfo.stdout_lines"
retries: 5
delay: 10
- name: Check nodes have expected slurm state
assert:
that: sinfo.stdout_lines == expected_sinfo
fail_msg: |
sinfo output not as expected:
actual:
{{ sinfo.stdout_lines }}
expected:
{{ expected_sinfo }}
<end>
vars:
expected_sinfo:
- "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle"
39 changes: 14 additions & 25 deletions environments/smslabs/hooks/post.yml
@@ -1,30 +1,19 @@
- hosts: login
- hosts: login:!builder # won't have a slurm control daemon when in build
become: no
gather_facts: false
tags: checks
tasks:
- block:
- name: Run sinfo
shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
register: sinfo
changed_when: false
- name: Check nodes have expected slurm state
assert:
that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them
fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}"
when: "'builder' not in group_names" # won't have a slurm control daemon when in build
- name: Check slurm up after direct deploy
import_tasks: check_slurm.yml

- hosts: openondemand
name: Check Open Ondemand is running
tags:
- checks
- openondemand
- openondemand_server
- hosts: localhost
become: false
tags: build
tasks:
- uri:
url: https://localhost
validate_certs: false # selfsigned
force_basic_auth: yes # as otherwise we get 401
url_username: testuser
url_password: "{{ test_user_password }}"
status_code: 200
- name: Check Packer build finished
async_status:
jid: "{{ packer_run.ansible_job_id }}"
register: packer_result
until: packer_result.finished
retries: 30 # allow 15 mins
delay: 30
when: packer_run is defined # allows rerunning post.yml
31 changes: 31 additions & 0 deletions environments/smslabs/hooks/pre.yml
@@ -0,0 +1,31 @@
- hosts: localhost
become: false
tags: build
tasks:
- name: Ensure secrets generated
include_role:
name: passwords

- name: Build packer images
shell:
cmd: |
cd packer
PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
when: "'builder' not in group_names" # avoid recursion!
register: packer_run
async: 2700 # 45 minutes
poll: 0

# For some reason squid shows TCP_MISS_ABORTED/200 on everything
# - hosts: all
# become: yes
# gather_facts: no
# tasks:
# - name: Configure dnf proxy
# community.general.ini_file:
# path: /etc/dnf/dnf.conf
# section: main
# option: proxy
# value: "{{ squid_proxy }}"
# no_extra_spaces: true
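
A note on the fire-and-forget pattern used for the Packer build above: async: 2700 with poll: 0 starts the build and returns immediately with an ansible_job_id, so the rest of site.yml configures the cluster while the images build, and the async_status task in post.yml (shown earlier) polls that job id until it finishes. A minimal generic sketch of the same pattern, independent of this repo:

- hosts: localhost
  gather_facts: no
  tasks:
    - name: Start a long-running command without waiting for it
      shell: sleep 60                   # stand-in for the packer build
      async: 300                        # allow up to 5 minutes in the background
      poll: 0                           # do not block here
      register: long_job

    - name: Unrelated work runs while the background job continues
      debug:
        msg: cluster configuration would happen here

    - name: Collect the background result
      async_status:
        jid: "{{ long_job.ansible_job_id }}"
      register: job_result
      until: job_result.finished
      retries: 30                       # allow up to 5 minutes of polling
      delay: 10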