Skip to content

Commit 679edee

Browse files
committed
Merge remote-tracking branch 'origin/stackhpc/2023.1' into sync-caracal-antelope
2 parents e4b6fbe + 6281412 commit 679edee

15 files changed

+149
-9
lines changed

doc/source/operations/upgrading-ceph.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Place the host or batch of hosts into maintenance mode:
6363

6464
.. code-block:: console
6565
66-
sudo cephadm shell -- ceph orch host maintenance enter <host>
66+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-enter-maintenance.yml -l <host>
6767
6868
To update all eligible packages, use ``*``, escaping if necessary:
6969

@@ -72,7 +72,8 @@ To update all eligible packages, use ``*``, escaping if necessary:
7272
kayobe overcloud host package update --packages "*" --limit <host>
7373
7474
If the kernel has been upgraded, reboot the host or batch of hosts to pick up
75-
the change:
75+
the change. While running this playbook, consider setting ``ANSIBLE_SERIAL`` to
76+
the maximum number of hosts that can safely reboot concurrently.
7677

7778
.. code-block:: console
7879
@@ -82,7 +83,7 @@ Remove the host or batch of hosts from maintenance mode:
8283

8384
.. code-block:: console
8485
85-
sudo cephadm shell -- ceph orch host maintenance exit <host>
86+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-exit-maintenance.yml -l <host>
8687
8788
Wait for Ceph health to return to ``HEALTH_OK``:
8889

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Places Ceph hosts into maintenance using the enter_maintenance role from
# the stackhpc.cephadm collection.
- name: Ensure a Ceph host has entered maintenance
  hosts: ceph
  become: true
  gather_facts: true
  any_errors_fatal: true
  # Handle one host at a time: whether it is OK to stop a host can only be
  # checked once the previous hosts have entered maintenance.
  serial: 1
  tasks:
    - name: Ensure a Ceph host has entered maintenance
      ansible.builtin.import_role:
        name: stackhpc.cephadm.enter_maintenance
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
---
# Removes Ceph hosts from maintenance using the exit_maintenance role from
# the stackhpc.cephadm collection.
- name: Ensure a Ceph host has exited maintenance
  hosts: ceph
  become: true
  gather_facts: true
  any_errors_fatal: true
  # One host at a time: the role currently requires hosts to exit
  # maintenance serially.
  serial: 1
  tasks:
    - name: Ensure a Ceph host has exited maintenance
      ansible.builtin.import_role:
        name: stackhpc.cephadm.exit_maintenance

etc/kayobe/ansible/cis.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,7 @@
3535
- include_role:
3636
name: ansible-lockdown.rhel9_cis
3737
when: ansible_facts.os_family == 'RedHat' and ansible_facts.distribution_major_version == '9'
38-
tags: always
3938

4039
- include_role:
4140
name: ansible-lockdown.ubuntu22_cis
4241
when: ansible_facts.distribution == 'Ubuntu' and ansible_facts.distribution_major_version == '22'
43-
tags: always

etc/kayobe/ansible/prometheus-network-names.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
---
12
- name: Prometheus friendly network names
23
hosts: overcloud
34
gather_facts: no

etc/kayobe/ansible/reboot.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,26 @@
# Reboots the targeted hosts in batches and waits for each batch to return.
# The batch size is taken from the ANSIBLE_SERIAL environment variable and
# falls back to 1 when that variable is unset or empty.
- name: Reboot the host
  hosts: seed-hypervisor:seed:overcloud:infra-vms
  gather_facts: false
  serial: "{{ lookup('env', 'ANSIBLE_SERIAL') | default(1, true) }}"
  tags:
    - reboot
  vars:
    ansible_python_interpreter: "/usr/bin/python3"
    # Twenty minutes; passed to the reboot module as its reboot_timeout.
    reboot_timeout_s: "{{ 20 * 60 }}"
    # Set to true to connect as bootstrap_user instead of kayobe_ansible_user.
    reboot_with_bootstrap_user: false
    ansible_user: >-
      {{ bootstrap_user if reboot_with_bootstrap_user | bool
      else kayobe_ansible_user }}
    # Host key checking is only relaxed when connecting as the bootstrap user.
    ansible_ssh_common_args: >-
      {{ '-o StrictHostKeyChecking=no' if reboot_with_bootstrap_user | bool
      else '' }}
  tasks:
    - name: Reboot and wait
      become: true
      reboot:
        reboot_timeout: "{{ reboot_timeout_s }}"
        search_paths:
          # Without this entry, systems running molly-guard hang waiting for
          # confirmation before rebooting.
          - "/lib/molly-guard"
          # The remaining entries are the default list:
          - "/sbin"
          - "/bin"
          - "/usr/sbin"
          - "/usr/bin"
          - "/usr/local/sbin"

etc/kayobe/ansible/requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
collections:
33
- name: stackhpc.cephadm
4-
version: 1.15.1
4+
version: 1.18.0
55
# NOTE: Pinning pulp.squeezer to 0.0.13 because 0.0.14+ depends on the
66
# pulp_glue Python library being installed.
77
- name: pulp.squeezer

etc/kayobe/ansible/stackhpc-openstack-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
depth: 1
3232
single_branch: true
3333

34-
- name: Ensure the latest versions of pip and setuptools are installed # noqa package-latest
34+
- name: Ensure the latest versions of pip and setuptools are installed # noqa package-latest
3535
ansible.builtin.pip:
3636
name: "{{ item.name }}"
3737
state: latest

etc/kayobe/ansible/templates/wazuh-secrets.yml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ secrets_wazuh:
77
# Strengthen default wazuh api user pass
88
wazuh_api_users:
99
- username: "wazuh"
10-
password: "{{ secrets_wazuh.wazuh_api_users[0].password | default(lookup('community.general.random_string', min_lower=1, min_upper=1, min_special=1, min_numeric=1, length=30)) }}"
10+
password: "{{ secrets_wazuh.wazuh_api_users[0].password | default(lookup('community.general.random_string', min_lower=1, min_upper=1, min_special=1, min_numeric=1, length=30, override_special=override_special_characters)) }}"
1111
# OpenSearch 'admin' user pass
1212
opendistro_admin_password: "{{ secrets_wazuh.opendistro_admin_password | default(lookup('password', '/dev/null'), true) }}"
1313
# OpenSearch 'kibanaserver' user pass

etc/kayobe/ansible/ubuntu-upgrade.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,15 @@
4040
reboot:
4141
reboot_timeout: "{{ reboot_timeout_s }}"
4242
connect_timeout: 600
43+
search_paths:
44+
# Systems running molly-guard hang waiting for confirmation before rebooting without this.
45+
- "/lib/molly-guard"
46+
# Default list:
47+
- "/sbin"
48+
- "/bin"
49+
- "/usr/sbin"
50+
- "/usr/bin"
51+
- "/usr/local/sbin"
4352
become: true
4453
when: file_status.stat.exists
4554

@@ -101,6 +110,15 @@
101110
reboot:
102111
reboot_timeout: "{{ reboot_timeout_s }}"
103112
connect_timeout: 600
113+
search_paths:
114+
# Systems running molly-guard hang waiting for confirmation before rebooting without this.
115+
- "/lib/molly-guard"
116+
# Default list:
117+
- "/sbin"
118+
- "/bin"
119+
- "/usr/sbin"
120+
- "/usr/bin"
121+
- "/usr/local/sbin"
104122
become: true
105123

106124
- name: Update distribution facts

etc/kayobe/ansible/wazuh-secrets.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
gather_facts: false
44
vars:
55
wazuh_secrets_path: "{{ kayobe_env_config_path }}/wazuh-secrets.yml"
6+
override_special_characters: '"#$%&()*+,-./:;<=>?@[\]^_{|}~'
67
tasks:
78
- name: install passlib[bcrypt]
89
pip:
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
features:
3+
- |
4+
Adds two new custom playbooks for placing Ceph hosts into and removing them
5+
from maintenance:
6+
7+
- ``ceph-enter-maintenance.yml``
8+
- ``ceph-exit-maintenance.yml``
9+
upgrade:
10+
- |
11+
Updates the ``stackhpc.cephadm`` collection to version ``1.18.0``.
12+
fixes:
13+
- |
14+
Fixes an issue with idempotency in the ``stackhpc.cephadm.cephadm_keys``
15+
plugin.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
Added a script to automate RabbitMQ quorum queue migrations.

terraform/aio/vm.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ variable "aio_vm_subnet" {
3535

3636
variable "aio_vm_volume_size" {
3737
type = number
38-
default = 35
38+
default = 40
3939
}
4040

4141
variable "aio_vm_tags" {

tools/rabbitmq-quorum-migration.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
#! /usr/bin/bash

# Automates the migration of RabbitMQ to quorum queues.
#
# Usage: rabbitmq-quorum-migration.sh [--skip-checks]
#
#   --skip-checks   Skip the pre-, mid- and post-migration checks and only
#                   run the migration steps.
#
# Steps:
#   1. Prechecks: controller clocks are synchronised, quorum queues are
#      enabled and RabbitMQ high availability is disabled in globals.yml.
#   2. Generate new config, stop the services that use RabbitMQ and reset
#      the RabbitMQ state.
#   3. Check that no queues or non-default exchanges remain.
#   4. Redeploy the stopped services.
#   5. Post-check: quorum queues exist on the controllers.
#
# Exit status: 2 if $KAYOBE_CONFIG_PATH is unset, 1 if any check fails.
# Because of 'set -e', a failing kayobe command aborts the script with that
# command's exit status.

set -ex

RABBITMQ_SERVICES_TO_RESTART=barbican,blazar,cinder,cloudkitty,designate,heat,ironic,keystone,magnum,manila,neutron,nova,octavia
RABBITMQ_CONTAINER_NAME=rabbitmq
# Kayobe falls back to /etc/kolla when KOLLA_CONFIG_PATH is not set; do the
# same here rather than looking for /globals.yml.
KOLLA_GLOBALS_FILE="${KOLLA_CONFIG_PATH:-/etc/kolla}/globals.yml"

if [[ ! $KAYOBE_CONFIG_PATH ]]; then
    echo "Environment variable \$KAYOBE_CONFIG_PATH is not defined"
    echo "Ensure your environment is set up to run kayobe commands"
    exit 2
fi

if [[ ! "$1" = "--skip-checks" ]]; then
    # Fail if clocks are not synced
    if ! kayobe overcloud host command run -l controllers -b --command "timedatectl status | grep 'synchronized: yes'"; then
        echo "Failed precheck: Time not synced on controllers"
        echo "Use 'timedatectl status' to check sync state"
        echo "Either wait for sync or use 'chronyc makestep'"
        exit 1
    fi
    kayobe overcloud service configuration generate --node-config-dir /tmp/rabbit-migration --kolla-tags none
    # Fail if HA is set or quorum is not
    if ! grep 'om_enable_rabbitmq_quorum_queues: true' "$KOLLA_GLOBALS_FILE" || grep 'om_enable_rabbitmq_high_availability: true' "$KOLLA_GLOBALS_FILE"; then
        echo "Failed precheck: om_enable_rabbitmq_quorum_queues must be enabled, om_enable_rabbitmq_high_availability must be disabled"
        exit 1
    fi
fi

# Generate new config, stop services using rabbit, and reset rabbit state
kayobe overcloud service configuration generate --node-config-dir /etc/kolla --kolla-skip-tags rabbitmq-ha-precheck
kayobe kolla ansible run "stop --yes-i-really-really-mean-it" -kt "$RABBITMQ_SERVICES_TO_RESTART"
kayobe kolla ansible run rabbitmq-reset-state

if [[ ! "$1" = "--skip-checks" ]]; then
    # Fail if any queues still exist
    sleep 20
    if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues name --silent | grep -v '^$'"; then
        echo "Failed check: RabbitMQ has not stopped properly, queues still exist"
        exit 1
    fi
    # Fail if any exchanges still exist (excluding those starting with 'amq.').
    # The dot is escaped so that only the literal 'amq.' prefix is excluded.
    if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_exchanges name --silent | grep -v '^$' | grep -v '^amq\.'"; then
        echo "Failed check: RabbitMQ has not stopped properly, exchanges still exist"
        exit 1
    fi
fi

# Redeploy with quorum queues enabled
kayobe kolla ansible run deploy-containers -kt "$RABBITMQ_SERVICES_TO_RESTART"

if [[ ! "$1" = "--skip-checks" ]]; then
    sleep 20
    # Assert that at least one quorum queue exists on each controller
    if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues type | grep quorum"; then
        echo "Queues migrated successfully"
    else
        echo "Failed post-check: A controller does not have any quorum queues"
        # Report the failure to the caller, consistent with the other checks.
        exit 1
    fi
fi

0 commit comments

Comments
 (0)