Skip to content

Commit be6eafc

Browse files
authored
Merge pull request #173 from stackhpc/feature/offboard-state
Support moving state to persistent storage
2 parents fd3b805 + 7b206fc commit be6eafc

File tree

44 files changed

+592
-87
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+592
-87
lines changed

.github/workflows/stackhpc.yml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,16 @@ jobs:
100100
OS_CLOUD: openstack
101101
ANSIBLE_FORCE_COLOR: True
102102
TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
103-
103+
104+
- name: Run MPI-based tests
105+
run: |
106+
. venv/bin/activate
107+
. environments/${{ matrix.cloud }}/activate
108+
ansible-playbook -vv ansible/adhoc/hpctests.yml
109+
env:
110+
ANSIBLE_FORCE_COLOR: True
111+
OS_CLOUD: openstack
112+
104113
- name: Confirm Open Ondemand is up (via SOCKS proxy)
105114
run: |
106115
. venv/bin/activate
@@ -154,11 +163,11 @@ jobs:
154163
OS_CLOUD: openstack
155164
ANSIBLE_FORCE_COLOR: True
156165

157-
- name: Run MPI-based tests
166+
- name: Check sacct state survived reimage
158167
run: |
159168
. venv/bin/activate
160169
. environments/${{ matrix.cloud }}/activate
161-
ansible-playbook -vv ansible/adhoc/hpctests.yml
170+
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
162171
env:
163172
ANSIBLE_FORCE_COLOR: True
164173
OS_CLOUD: openstack

ansible/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,7 @@ roles/*
2626
!roles/slurm_exporter/**
2727
!roles/firewalld/
2828
!roles/firewalld/**
29+
!roles/mysql/
30+
!roles/mysql/**
31+
!roles/systemd/
32+
!roles/systemd/**

ansible/bootstrap.yml

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,36 @@
1616
- hosts: cluster
1717
gather_facts: false
1818
tasks:
19+
- name: Add groups
20+
ansible.builtin.group: "{{ item.group }}"
21+
loop: "{{ appliances_local_users }}"
22+
when:
23+
- item.enable | default(true) | bool
24+
- "'group' in item"
25+
become_method: "sudo"
26+
# Need to change working directory otherwise we try to switch back to non-existent directory.
27+
become_flags: '-i'
28+
become: true
1929
- name: Add users
20-
ansible.builtin.user: "{{ item }}"
21-
with_items: "{{ appliances_local_users }}"
30+
ansible.builtin.user: "{{ item.user }}"
31+
loop: "{{ appliances_local_users }}"
32+
when: item.enable | default(true) | bool
2233
become_method: "sudo"
2334
# Need to change working directory otherwise we try to switch back to non-existent directory.
2435
become_flags: '-i'
2536
become: true
2637
- name: Reset ssh connection to allow user changes to affect ansible_user
2738
meta: reset_connection
2839

40+
- hosts: systemd
41+
become: yes
42+
gather_facts: false
43+
tags: systemd
44+
tasks:
45+
- name: Make systemd unit modifications
46+
import_role:
47+
name: systemd
48+
2949
- hosts: selinux
3050
gather_facts: false
3151
become: yes

ansible/ci/check_sacct_hpctests.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
- hosts: control
2+
gather_facts: false
3+
become: true
4+
vars:
5+
sacct_stdout_expected: |- # based on CI running hpctests as the first job - NB note no trailing newline
6+
JobID,JobName,State
7+
2,pingpong.sh,COMPLETED
8+
3,pingmatrix.sh,COMPLETED
9+
4,hpl-build-linux64.sh,COMPLETED
10+
5_0,hpl-solo.sh,COMPLETED
11+
5_1,hpl-solo.sh,COMPLETED
12+
tasks:
13+
- name: Get info for ended jobs
14+
shell:
15+
cmd: sacct --format=jobid,jobname,state --allocations --parsable2 --delimiter=, --starttime=now-1days --endtime=now
16+
# by default start/end time is midnight/now which is not robust
17+
changed_when: false
18+
register: sacct
19+
- name: Check info for ended jobs
20+
assert:
21+
that: sacct.stdout == sacct_stdout_expected
22+
fail_msg: |
23+
Expected:
24+
--{{ sacct_stdout_expected }}--
25+
Got:
26+
--{{ sacct.stdout }}--
27+
success_msg: sacct shows hpctests jobs as first and only jobs

ansible/filter_plugins/utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# Apache 2 License
55

66
from ansible.errors import AnsibleError, AnsibleFilterError
7+
from ansible.utils.display import Display
78
from collections import defaultdict
89
import jinja2
910
from ansible.module_utils.six import string_types
@@ -36,10 +37,15 @@ def exists(fpath):
3637
class FilterModule(object):
3738
''' Ansible core jinja2 filters '''
3839

40+
def warn(self, message, **kwargs):
41+
Display().warning(message)
42+
return message
43+
3944
def filters(self):
4045
return {
4146
# jinja2 overrides
4247
'readfile': readfile,
4348
'prometheus_node_exporter_targets': prometheus_node_exporter_targets,
44-
'exists': exists
45-
}
49+
'exists': exists,
50+
'warn': self.warn
51+
}

ansible/roles/block_devices/README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
block_devices
22
=============
33

4-
Manage filesystems on block devices, including creating partitions, creating filesystems and mounting filesystems.
4+
Manage filesystems on block devices (such as OpenStack volumes), including creating partitions, creating filesystems and mounting filesystems.
55

66
This is a convenience wrapper around the ansible modules:
77
- community.general.parted
88
- community.general.filesystem
99
- ansible.builtin.file
1010
- ansible.posix.mount
1111

12-
It includes logic to handle OpenStack-provided volumes appropriately both for appliance instances and the Packer build VM.
12+
To avoid issues with device names changing after e.g. reboots, devices are identified by serial number and mounted by filesystem UUID.
1313

14-
To avoid issues with device names changing after e.g. reboots, devices are identified by serial number and mounted by filesystem UUID.
14+
**NB:** This role is ignored[^1] during Packer builds as block devices will not be attached to the Packer build VMs. This role is therefore deprecated and it is suggested that `cloud-init` is used instead. See e.g. `environments/skeleton/{{cookiecutter.environment}}/terraform/control.userdata.tpl`.
15+
16+
[^1]: See `environments/common/inventory/group_vars/builder/defaults.yml`
1517

1618
Requirements
1719
------------

ansible/roles/block_devices/tasks/main.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
- name: Warn role is deprecated
2+
debug:
3+
msg: "{{ 'Role block_devices is deprecated, see ansible/roles/block_devices/README.md' | warn }}"
4+
when: block_devices_configurations | length > 0
5+
16
- name: Enumerate block device paths by serial number
27
block_devices:
38
register: _block_devices

ansible/roles/mysql/README.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
mysql
2+
=====
3+
4+
Deploy containerised `mysql` server using Podman.
5+
6+
7+
Requirements
8+
------------
9+
10+
None.
11+
12+
Role Variables
13+
--------------
14+
15+
- `mysql_root_password`: Required str. Password to set for `root` mysql user. **NB** This cannot be changed by this role once mysql server has initialised.
16+
- `mysql_tag`: Optional str. Tag for version of `mysql` container image to use. Default `8.0.30`.
17+
- `mysql_systemd_service_enabled`: Optional bool. Whether `mysql` service starts on boot. Default `yes`.
18+
- `mysql_state`: Optional str. As per `ansible.builtin.systemd:state`. Default is `started` or `restarted` as required.
19+
- `mysql_podman_user`: Optional str. User running `podman`. Default `{{ ansible_user }}`.
20+
- `mysql_datadir`: Optional str. Path to data directory on the host to store databases etc. Default `/var/lib/mysql`. Note that if this path does not exist, all path components will be created and the owner set appropriately.
21+
- `mysql_host`: Optional str. Address of host. Default `{{ inventory_hostname }}`.
22+
- `mysql_users`: Optional list of dicts defining users as per `community.mysql.mysql_user`. Default `[]`.
23+
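- `mysql_databases`: Optional list of dicts defining databases as per `community.mysql.mysql_db`. Default `[]`.
- `mysql_mysqld_options`: Optional list of str. Options passed to `mysqld`, each given without the leading `--`. Default `[]`.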
24+
25+
Dependencies
26+
------------
27+
28+
None.
29+
30+
Example Playbook
31+
----------------
32+
33+
```yaml
34+
- name: Setup DB
35+
hosts: mysql
36+
become: true
37+
tags:
38+
- mysql
39+
tasks:
40+
- include_role:
41+
name: mysql
42+
```
43+
44+
License
45+
-------
46+
47+
Apache v2
48+
49+
Author Information
50+
------------------
51+
52+
Steve Brasier [email protected]

ansible/roles/mysql/defaults/main.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# required:
2+
# mysql_root_password: # TODO: make it possible to CHANGE root password
3+
4+
mysql_tag: 8.0.30
5+
mysql_systemd_service_enabled: yes
6+
#mysql_state: # default is started or restarted as required
7+
mysql_podman_user: "{{ ansible_user }}"
8+
mysql_datadir: /var/lib/mysql
9+
mysql_mysqld_options: [] # list of str options to mysqld, see `run -it --rm mysql:tag --verbose --help`
10+
mysql_users: [] # list of dicts for community.mysql.mysql_user
11+
mysql_databases: [] # list of dicts for community.mysql.mysql_db
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
- name: Create environment file for mysql server root password
2+
# NB: This doesn't trigger a restart on changes as it will be ignored once mysql is initialised
3+
copy:
4+
dest: /etc/sysconfig/mysqld
5+
content: |
6+
MYSQL_INITIAL_ROOT_PASSWORD='{{ mysql_root_password }}'
7+
owner: root
8+
group: root
9+
mode: u=rw,go=
10+
11+
- name: Ensure mysql service state
12+
systemd:
13+
name: mysql
14+
state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}"
15+
enabled: "{{ mysql_systemd_service_enabled }}"
16+
daemon_reload: "{{ _mysql_unitfile.changed }}"
17+
18+
- block:
19+
- name: Wait for mysql to initialise
20+
# NB: It is not sufficient to wait_for the port
21+
community.mysql.mysql_info:
22+
login_user: root
23+
login_password: "{{ mysql_root_password }}"
24+
# no_log: true # TODO: FIXME
25+
register: _mysql_info
26+
until: "'version' in _mysql_info"
27+
retries: 60
28+
delay: 2
29+
30+
- name: Ensure mysql databases created
31+
community.mysql.mysql_db: "{{ item }}"
32+
loop: "{{ mysql_databases}}"
33+
34+
- name: Ensure mysql users present
35+
community.mysql.mysql_user: "{{ item }}"
36+
loop: "{{ mysql_users }}"
37+
when: "mysql_state | default('unspecified') != 'stopped'"

ansible/roles/mysql/tasks/install.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- name: Install python mysql client
2+
pip:
3+
name: pymysql
4+
state: present
5+
6+
- name: Create systemd mysql container unit file
7+
template:
8+
dest: /etc/systemd/system/mysql.service
9+
src: mysql.service.j2
10+
register: _mysql_unitfile

ansible/roles/mysql/tasks/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
- import_tasks: install.yml
2+
- import_tasks: configure.yml
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# mysql.service
2+
3+
[Unit]
4+
Description=Podman container mysql.service
5+
Documentation=man:podman-generate-systemd(1)
6+
# Pull in network-online.target as well as ordering after it: After= only orders units and does not activate the target by itself.
Wants=network-online.target
7+
After=network-online.target
8+
RequiresMountsFor={{ mysql_datadir }} /etc/sysconfig/mysqld
9+
10+
[Service]
11+
Environment=PODMAN_SYSTEMD_UNIT=%n
12+
Restart=always
13+
EnvironmentFile=/etc/sysconfig/mysqld
14+
# The above EnvironmentFile must define MYSQL_INITIAL_ROOT_PASSWORD
15+
ExecStartPre=+install -d -o {{ mysql_podman_user }} -g {{ mysql_podman_user }} -Z container_file_t {{ mysql_datadir }}
16+
ExecStart=/usr/bin/podman run \
17+
--network slirp4netns:cidr={{ podman_cidr }} \
18+
--sdnotify=conmon --cgroups=no-conmon \
19+
--detach --replace --name mysql --restart=no \
20+
--user mysql \
21+
--volume {{ mysql_datadir }}:/var/lib/mysql:U \
22+
--publish 3306:3306 \
23+
-e MYSQL_ROOT_PASSWORD=${MYSQL_INITIAL_ROOT_PASSWORD} \
24+
mysql:{{ mysql_tag }}{%- for opt in mysql_mysqld_options %} \
25+
--{{ opt }}{% endfor %}
26+
27+
ExecStop=/usr/bin/podman stop --ignore mysql -t 10
28+
# note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped
29+
ExecStopPost=/usr/bin/podman rm --ignore -f mysql
30+
SuccessExitStatus=143 SIGTERM
31+
KillMode=none
32+
Type=notify
33+
NotifyAccess=all
34+
LimitNOFILE=65536
35+
LimitMEMLOCK=infinity
36+
User={{ mysql_podman_user }}
37+
Group={{ mysql_podman_user }}
38+
TimeoutStartSec=180
39+
40+
[Install]
41+
WantedBy=multi-user.target default.target

ansible/roles/opendistro/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
#opendistro_internal_users_path:
44

55
opendistro_podman_user: "{{ ansible_user }}"
6+
opendistro_data_path: "/usr/share/elasticsearch/data" # path to host data directory

ansible/roles/opendistro/templates/opendistro.service.j2

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,21 @@ After=network-online.target
99
[Service]
1010
Environment=PODMAN_SYSTEMD_UNIT=%n
1111
Restart=always
12-
ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon -d --replace --name opendistro --restart=no --user elasticsearch --ulimit memlock=-1:-1 --ulimit nofile=65536:65536 --volume opendistro:/usr/share/elasticsearch/data --volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro --env node.name=opendistro --env discovery.type=single-node --env bootstrap.memory_lock=true --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" --publish 9200:9200 amazon/opendistro-for-elasticsearch:1.12.0
12+
ExecStartPre=+install -d -o {{ opendistro_podman_user }} -g {{ opendistro_podman_user }} -Z container_file_t {{ opendistro_data_path }}
13+
ExecStart=/usr/bin/podman run \
14+
--network slirp4netns:cidr={{ podman_cidr }} \
15+
--sdnotify=conmon --cgroups=no-conmon \
16+
--detach --replace --name opendistro --restart=no \
17+
--user elasticsearch \
18+
--ulimit memlock=-1:-1 --ulimit nofile=65536:65536 \
19+
--volume {{ opendistro_data_path }}:/usr/share/elasticsearch/data:U \
20+
--volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro \
21+
--env node.name=opendistro \
22+
--env discovery.type=single-node \
23+
--env bootstrap.memory_lock=true \
24+
--env "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
25+
--publish 9200:9200 \
26+
amazon/opendistro-for-elasticsearch:1.12.0
1327
ExecStop=/usr/bin/podman stop --ignore opendistro -t 10
1428
# note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped
1529
ExecStopPost=/usr/bin/podman rm --ignore -f opendistro

ansible/roles/podman/tasks/config.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
dest: /etc/security/limits.d/custom.conf
1414
become: true
1515

16+
- name: Up default keys permitted
17+
ansible.posix.sysctl:
18+
name: kernel.keys.maxkeys # /proc/sys/kernel/keys/maxkeys
19+
value: 50000
20+
become: true
21+
1622
- name: reset ssh connection to allow user changes to affect 'current login user'
1723
meta: reset_connection
1824

@@ -60,9 +66,6 @@
6066
become: yes
6167
register: podman_tmp
6268

63-
- debug:
64-
var: podman_tmp
65-
6669
- name: Reset podman database
6770
# otherwise old config overrides!
6871
command:

ansible/roles/systemd/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# systemd
2+
3+
Create drop-in files for systemd services.
4+
5+
# Role Variables
6+
- `systemd_dropins`: Required. A mapping where each key is a systemd service name and each value is a dict with the following keys:
7+
- `group`: Required str. Inventory group this drop-in applies to.
8+
- `comment`: Optional str. Comment describing reason for drop-in.
9+
- `content`: Required str. Content of drop-in file.
10+
# systemd
11+
12+
Create drop-in files for systemd services.
13+
14+
# Role Variables
15+
- `systemd_dropins`: Required. A mapping where each key is a systemd service name and each value is a dict with the following keys:
16+
- `group`: Required str. Inventory group this drop-in applies to.
17+
- `comment`: Optional str. Comment describing reason for drop-in.
18+
- `content`: Required str. Content of drop-in file.
19+
- `systemd_restart`: Optional bool. Whether to reload unit definitions and restart services. Default `false`.

0 commit comments

Comments
 (0)