Skip to content

Commit d78a913

Browse files
authored
Merge branch 'main' into fix/skelton-ssh-persist
2 parents b69d2a9 + b13b98d commit d78a913

File tree

16 files changed

+102
-41
lines changed

16 files changed

+102
-41
lines changed

.github/workflows/fatimage.yml

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,26 +4,26 @@ on:
44
workflow_dispatch:
55
jobs:
66
openstack:
7-
name: openstack-build-arcus
7+
name: openstack-imagebuild
88
concurrency: ${{ github.ref }} # to branch/PR
99
runs-on: ubuntu-20.04
1010
env:
1111
ANSIBLE_FORCE_COLOR: True
1212
OS_CLOUD: openstack
13+
CI_CLOUD: ${{ vars.CI_CLOUD }}
1314
steps:
1415
- uses: actions/checkout@v2
1516

1617
- name: Setup ssh
1718
run: |
1819
set -x
1920
mkdir ~/.ssh
20-
echo "${arcus_SSH_KEY}" > ~/.ssh/id_rsa
21+
echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
2122
chmod 0600 ~/.ssh/id_rsa
22-
env:
23-
arcus_SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }}
23+
shell: bash
2424

2525
- name: Add bastion's ssh key to known_hosts
26-
run: cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
26+
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
2727
shell: bash
2828

2929
- name: Install ansible etc
@@ -32,11 +32,9 @@ jobs:
3232
- name: Write clouds.yaml
3333
run: |
3434
mkdir -p ~/.config/openstack/
35-
echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
35+
echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
3636
shell: bash
37-
env:
38-
arcus_CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }}
39-
37+
4038
- name: Setup environment
4139
run: |
4240
. venv/bin/activate
@@ -49,7 +47,7 @@ jobs:
4947
. environments/.stackhpc/activate
5048
cd packer/
5149
packer init .
52-
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
50+
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
5351
5452
- name: Get created image name from manifest
5553
id: manifest

.github/workflows/stackhpc.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ jobs:
4040

4141
- name: Install terraform
4242
uses: hashicorp/setup-terraform@v1
43+
with:
44+
terraform: v1.5.5
4345

4446
- name: Initialise terraform
4547
run: terraform init

ansible/fatimage.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@
5959
tasks_from: vnc_compute.yml
6060

6161
# - import_playbook: monitoring.yml:
62+
- import_role:
63+
name: opensearch
64+
tasks_from: install.yml
65+
become: true
66+
6267
# opensearch - containerised, nothing to do
6368
# slurm_stats - nothing to do
6469
# filebeat - containerised - nothing to do

ansible/monitoring.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
tasks:
88
- import_role:
99
name: opensearch
10+
tasks_from: install.yml
11+
become: true
12+
- import_role:
13+
name: opensearch
14+
tasks_from: runtime.yml
1015
become: true
1116

1217
- name: Setup slurm stats

ansible/roles/opensearch/defaults/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#opensearch_internal_users_path:
44

55
opensearch_podman_user: "{{ ansible_user }}"
6-
opensearch_version: '2.4.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags
6+
opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags
77
opensearch_config_path: /usr/share/opensearch/config
88
opensearch_data_path: /usr/share/opensearch/data
99
opensearch_state: started # will be restarted if required

ansible/roles/opensearch/handlers/main.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,4 @@
55
name: opensearch.service
66
state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}"
77
enabled: "{{ opensearch_systemd_service_enabled }}"
8-
daemon_reload: "{{ 'started' in opensearch_state }}"
98
become: true
Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,17 @@
1+
# Remove data which was NOT indexed by Slurm Job ID
2+
# It will be re-ingested by filebeat from the slurmdbd, with that index
3+
4+
- name: Ensure opensearch stopped
5+
systemd:
6+
name: opensearch
7+
state: stopped
8+
register: _opensearch_stop
9+
until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']"
10+
retries: 15
11+
delay: 5
12+
13+
- name: Archive existing data
14+
community.general.archive:
15+
path: "{{ opensearch_data_path }}"
16+
dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz"
17+
remove: true
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# safe to use during build
2+
3+
- name: Increase maximum number of virtual memory maps
4+
# see https://opensearch.org/docs/2.0/opensearch/install/important-settings/
5+
ansible.posix.sysctl:
6+
name: vm.max_map_count
7+
value: '262144'
8+
state: present
9+
reload: yes
10+
11+
- name: Create systemd unit file
12+
template:
13+
dest: /etc/systemd/system/opensearch.service
14+
src: opensearch.service.j2
15+
register: _opensearch_unit
16+
17+
- name: Reload opensearch unit file
18+
command: systemctl daemon-reload
19+
when: _opensearch_unit.changed

ansible/roles/opensearch/tasks/main.yml renamed to ansible/roles/opensearch/tasks/runtime.yml

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,16 @@
1515
path: /etc/systemd/system/opendistro.service
1616
state: absent
1717

18-
- name: Increase maximum number of virtual memory maps
19-
# see https://opensearch.org/docs/2.0/opensearch/install/important-settings/
20-
ansible.posix.sysctl:
21-
name: vm.max_map_count
22-
value: '262144'
23-
state: present
24-
reload: yes
25-
become: true
18+
- name: Enumerate files in data directory
19+
find:
20+
path: "{{ opensearch_data_path }}"
21+
register: _find_opensearch_data
22+
23+
- name: Archive incorrectly indexed data
24+
import_tasks: archive_data.yml
25+
when:
26+
- _find_opensearch_data.files | length > 0
27+
- "'slurm_jobid_index' not in _find_opensearch_data.files | map(attribute='path') | map('basename')"
2628

2729
- name: Ensure required opensearch host directories exist
2830
file:
@@ -35,11 +37,18 @@
3537
loop:
3638
- "{{ opensearch_config_path }}"
3739
- "{{ opensearch_data_path }}"
38-
when: "'started' in opensearch_state" # don't run during image build
40+
41+
- name: Set indexed data flag
42+
copy:
43+
dest: "{{ opensearch_data_path }}/slurm_jobid_index"
44+
content: |
45+
This is a flag file to indicate that filebeat is pushing data
46+
indexed by Slurm JobID to prevent duplicate OpenSearch records
47+
owner: "{{ opensearch_podman_user }}"
48+
group: "{{ opensearch_podman_user }}"
3949

4050
- name: Create certs
4151
import_tasks: certs.yml
42-
when: "'started' in opensearch_state" # don't run during image build
4352

4453
- name: Template general configuration
4554
ansible.builtin.template:
@@ -52,7 +61,6 @@
5261
mode: 0660
5362
notify: Restart opensearch service
5463
become: true
55-
when: "'started' in opensearch_state" # don't run during image build
5664

5765
- name: Template internal user configuration
5866
template:
@@ -65,14 +73,11 @@
6573
mode: 0660
6674
notify: Restart opensearch service
6775
become: true
68-
when: "'started' in opensearch_state" # don't run during image build
6976

70-
- name: Create systemd unit file
71-
template:
72-
dest: /etc/systemd/system/opensearch.service
73-
src: opensearch.service.j2
74-
become: true
75-
notify: Restart opensearch service
77+
- name: Pull container
78+
containers.podman.podman_image:
79+
name: "opensearchproject/opensearch:{{ opensearch_version }}"
80+
become_user: "{{ opensearch_podman_user }}"
7681

7782
- name: Flush handlers
7883
meta: flush_handlers

environments/.stackhpc/builder.pkrvars.hcl renamed to environments/.stackhpc/ARCUS.pkrvars.hcl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
flavor = "vm.ska.cpu.general.small"
22
networks = ["a262aabd-e6bf-4440-a155-13dbc1b5db0e"] # WCDC-iLab-60
3-
source_image_name = "openhpc-230503-0944-bf8c3f63.qcow2" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
3+
source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298
44
fatimage_source_image_name = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
55
ssh_keypair_name = "slurm-app-ci"
66
ssh_private_key_file = "~/.ssh/id_rsa"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
flavor = "general.v1.tiny"
2+
networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # stackhpc-ipv4-geneve
3+
source_image_name = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
4+
fatimage_source_image_name = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
5+
ssh_keypair_name = "slurm-app-ci"
6+
ssh_private_key_file = "~/.ssh/id_rsa"
7+
security_groups = ["default", "SSH"]
8+
ssh_bastion_host = "185.45.78.150"
9+
ssh_bastion_username = "steveb"

environments/.stackhpc/inventory/group_vars/all/rebuild.yml

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
cluster_net = "WCDC-iLab-60"
2-
cluster_subnet = "WCDC-iLab-60"
3-
vnic_type = "direct"
1+
cluster_net = "portal-internal"
2+
cluster_subnet = "portal-internal"
3+
vnic_type = "normal"
44
control_node_flavor = "vm.ska.cpu.general.quarter"
55
other_node_flavor = "vm.ska.cpu.general.small"
6-
volume_backed_instances = false
76
state_volume_device_path = "/dev/sdb"
87
home_volume_device_path = "/dev/sdc"

environments/.stackhpc/terraform/SMS.tfvars

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,5 @@ cluster_subnet = "stackhpc-ipv4-geneve-subnet"
33
vnic_type = "normal"
44
control_node_flavor = "general.v1.medium"
55
other_node_flavor = "general.v1.tiny"
6-
volume_backed_instances = true
76
state_volume_device_path = "/dev/vdb"
87
home_volume_device_path = "/dev/vdc"

environments/.stackhpc/terraform/main.tf

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ variable "cluster_name" {
1313
variable "cluster_image" {
1414
description = "single image for all cluster nodes - a convenience for CI"
1515
type = string
16-
default = "openhpc-230503-0944-bf8c3f63" # https://github.com/stackhpc/ansible-slurm-appliance/pull/252
16+
default = "openhpc-230811-1548-a49164d1" # https://github.com/stackhpc/ansible-slurm-appliance/pull/301
1717
# default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2"
1818
# default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
1919
}
@@ -28,7 +28,9 @@ variable "control_node_flavor" {}
2828

2929
variable "other_node_flavor" {}
3030

31-
variable "volume_backed_instances" {}
31+
variable "volume_backed_instances" {
32+
default = false
33+
}
3234

3335
variable "state_volume_device_path" {}
3436

environments/common/files/filebeat/filebeat.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ filebeat.inputs:
2222
fields_under_root: true
2323

2424
processors:
25+
# Want to use the Slurm JobID as the ElasticSearch id to avoid duplicated records
26+
# Don't use filebeat.inputs:json.document_id as this removes the JobID from the record
27+
- fingerprint:
28+
fields: ["json.JobID"]
29+
target_field: "@metadata._id"
2530
- timestamp:
2631
field: json.End
2732
layouts:

0 commit comments

Comments
 (0)