Skip to content

Commit 410e54b

Browse files
authored
Merge pull request #1212 from stackhpc/yoga-diagnostics
yoga: Backport diagnostics collection
2 parents 1e63262 + 57e321c commit 410e54b

File tree

4 files changed

+244
-2
lines changed

4 files changed

+244
-2
lines changed

.github/workflows/stackhpc-all-in-one.yml

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,17 @@ jobs:
116116
run: ssh-keygen -f id_rsa -N ''
117117
working-directory: ${{ github.workspace }}/terraform/aio
118118

119+
# TODO: Remove the following step in Antelope.
120+
# NOTE: In Ansible 2.10 and lower the synchronize module used in the
121+
# ansible/diagnostics.yml playbook does not respect SSH connection
122+
# variables. This may result in Permission Denied issues if using an SSH
123+
# key that is not in ~/.ssh.
124+
- name: Copy SSH keypair to .ssh/
125+
run: |
126+
install -d ~/.ssh -m 700 &&
127+
cp id_rsa* ~/.ssh/
128+
working-directory: ${{ github.workspace }}/terraform/aio
129+
119130
- name: Generate clouds.yaml
120131
run: |
121132
cat << EOF > clouds.yaml
@@ -156,6 +167,7 @@ jobs:
156167
OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
157168

158169
- name: Terraform Apply
170+
id: tf_apply
159171
run: |
160172
for attempt in $(seq 5); do
161173
if terraform apply -auto-approve; then
@@ -290,6 +302,7 @@ jobs:
290302
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
291303

292304
- name: Tempest tests
305+
id: tempest
293306
run: |
294307
mkdir -p tempest-artifacts
295308
docker run -t --rm \
@@ -301,11 +314,28 @@ jobs:
301314
env:
302315
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
303316

317+
# Runs the ansible/diagnostics.yml playbook inside the Kayobe image. The
# ./diagnostics directory created here is bind-mounted into the container at
# /stack/diagnostics.
# The playbook path is single-quoted, so the runner's shell passes
# $KAYOBE_CONFIG_PATH through unexpanded; presumably playbook-run.sh resolves
# it inside the container - TODO confirm.
# The `if` expression uses always() so that this step still runs after an
# earlier step has failed, as long as the step with id tf_apply succeeded.
- name: Collect diagnostic information
318+
id: diagnostics
319+
run: |
320+
mkdir -p diagnostics
321+
sudo -E docker run -t --rm \
322+
-v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
323+
-v $(pwd)/diagnostics:/stack/diagnostics \
324+
-e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
325+
$KAYOBE_IMAGE \
326+
/stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
327+
env:
328+
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
329+
if: ${{ always() && steps.tf_apply.outcome == 'success' }}
330+
304331
- name: Upload test result artifacts
305332
uses: actions/upload-artifact@v4
306333
with:
307-
name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}
308-
path: tempest-artifacts/*
334+
name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}
335+
path: |
336+
diagnostics/
337+
tempest-artifacts/
338+
if: ${{ always() && (steps.tempest.outcome == 'success' || steps.diagnostics.outcome == 'success') }}
309339

310340
- name: Fail if any Tempest tests failed
311341
run: |

etc/kayobe/ansible/diagnostics.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
---
2+
# This playbook runs a script that collects diagnostic information from hosts.
3+
# The diagnostics are aggregated to a directory
4+
# (diagnostics_path_local/inventory_hostname) on localhost.
5+
#
6+
# NOTE: The diagnostic information contains sensitive information such as
7+
# passwords in configuration files.
8+
9+
# Play: on each targeted host, gather diagnostics into a temporary directory,
# pull that directory to diagnostics_path_local/<inventory_hostname> on
# localhost, and remove the temporary directory afterwards.
- name: Collect diagnostic information
10+
hosts: seed-hypervisor:seed:overcloud:infra-vms
11+
vars:
12+
diagnostics_path_local: "{{ lookup('env', 'PWD') }}/diagnostics"
13+
tasks:
14+
- block:
15+
- name: Create a temporary directory for diagnostics
16+
ansible.builtin.tempfile:
17+
state: directory
18+
suffix: diagnostics
19+
register: diagnostics_tmpdir
20+
21+
- name: Write host variables to a file
22+
ansible.builtin.copy:
23+
content: "{{ hostvars[inventory_hostname].ansible_facts | to_nice_json }}"
24+
dest: "{{ diagnostics_tmpdir.path }}/facts.json"
25+
26+
# A non-zero exit status from the script does not fail this task:
# failed_when only trips when no return code was recorded. The return code is
# checked later by the "Fail if diagnostics collection failed" task, after the
# logs have been downloaded.
- name: Run diagnostics script
27+
ansible.builtin.script: "{{ kayobe_config_path }}/../../tools/diagnostics.sh"
28+
become: true
29+
failed_when: diagnostics_result.rc is not defined
30+
register: diagnostics_result
31+
environment:
32+
LOG_DIR: "{{ diagnostics_tmpdir.path }}"
33+
CONFIG_DIR: "{{ kayobe_config_path }}/../.."
34+
35+
- name: Download diagnostic logs to localhost
36+
ansible.posix.synchronize:
37+
src: "{{ diagnostics_tmpdir.path }}/"
38+
dest: "{{ diagnostics_path_local }}/{{ inventory_hostname }}"
39+
mode: pull
40+
# NOTE(review): `no` is a YAML 1.1 boolean spelling; `false` would match the
# other options of this task.
archive: no
41+
recursive: true
42+
copy_links: true
43+
verify_host: true
44+
# For jump host
45+
use_ssh_args: true
46+
vars:
47+
# FIXME: The synchronize module fails on Yoga, due to not templating
48+
# the SSH user.
49+
ansible_user: stack
50+
always:
51+
# NOTE(review): diagnostics_tmpdir.path is presumably unset if the tempfile
# task itself failed - confirm this cleanup tolerates that case.
- name: Clean up temporary directory
52+
ansible.builtin.file:
53+
path: "{{ diagnostics_tmpdir.path }}"
54+
state: absent
55+
56+
- name: Display diagnostics collection stdout
57+
ansible.builtin.debug:
58+
msg: "{{ diagnostics_result.stdout }}"
59+
when: diagnostics_result.stdout is defined
60+
61+
- name: Display diagnostics collection stderr
62+
ansible.builtin.debug:
63+
msg: "{{ diagnostics_result.stderr }}"
64+
when: diagnostics_result.stderr is defined
65+
66+
# Turns a non-zero exit status of the diagnostics script into a failure.
# NOTE(review): diagnostics_result is only registered by the "Run diagnostics
# script" task - confirm this condition cannot be evaluated without it.
- name: Fail if diagnostics collection failed
67+
ansible.builtin.fail:
68+
msg: Diagnostics collection failed
69+
when: diagnostics_result.rc != 0
70+
71+
- name: Display location of diagnostics archive
72+
ansible.builtin.debug:
73+
msg: >-
74+
Wrote diagnostics to {{ diagnostics_path_local }} on localhost
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
features:
3+
- |
4+
Adds a new ``diagnostics.yml`` playbook that collects diagnostic
5+
information from hosts. The diagnostics are aggregated to a directory
6+
(``$PWD/diagnostics/`` by default) on localhost. The diagnostics include:
7+
8+
* Docker container logs
9+
* Kolla configuration files
10+
* Log files
11+
12+
*The collected diagnostic information contains sensitive information such
13+
as passwords in configuration files.*
14+

tools/diagnostics.sh

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#!/bin/bash

# NOTE(mgoddard): This has been adapted from
# roles/kayobe-diagnostics/files/get_logs.sh in Kayobe.

# Collects configuration files, logs and system state from the local host
# into a directory tree under $LOG_DIR.
#
# Environment variables:
# $LOG_DIR is the directory to copy logs to.

# TODO: Make this script more robust and use set -e.
# Collection is best-effort: errexit stays off so that one failing command
# does not stop the remaining ones. The exit status of the script is therefore
# that of the last command run by copy_logs.
set +o errexit
set -u

# Copy those of the given files that exist into a directory.
#   $1    destination directory (created when at least one file exists)
#   $2... candidate source files
copy_existing_files() {
    local dest=$1
    shift

    local src
    for src in "$@"; do
        if [[ -f ${src} ]]; then
            mkdir -p "${dest}"
            cp "${src}" "${dest}/"
        fi
    done
}

# Gather everything into $LOG_DIR and hand ownership to the stack user.
copy_logs() {
    local dir container service d

    mkdir -p "${LOG_DIR}"/{docker_logs,kolla_node_configs,system_logs}

    # Hosts without /etc/kolla are skipped rather than handing an unexpanded
    # glob to cp.
    if [[ -d /etc/kolla ]]; then
        cp -rnL /etc/kolla/* "${LOG_DIR}/kolla_node_configs"
        # Don't save the IPA images. -f: they are absent on most hosts.
        rm -f "${LOG_DIR}"/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs}
        rm -f "${LOG_DIR}"/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs}
    fi

    if [[ -d /opt/kayobe/etc/kolla ]]; then
        mkdir -p "${LOG_DIR}/kolla_build_configs"
        cp -rnL /opt/kayobe/etc/kolla/* "${LOG_DIR}/kolla_build_configs/"
    fi

    cp -rvnL /var/log/* "${LOG_DIR}/system_logs/"

    journalctl --no-pager > "${LOG_DIR}/system_logs/syslog.log"
    journalctl --no-pager -u docker.service > "${LOG_DIR}/system_logs/docker.log"
    journalctl --no-pager -u vbmcd.service > "${LOG_DIR}/system_logs/vbmcd.log"
    journalctl --no-pager -u NetworkManager.service > "${LOG_DIR}/system_logs/NetworkManager.log"

    # Network and package repository configuration, where present.
    for dir in \
        /etc/sysconfig/network-scripts/ \
        /etc/NetworkManager/system-connections/ \
        /etc/yum.repos.d/ \
        /etc/apt/sources.list.d/; do
        if [[ -d ${dir} ]]; then
            cp -r "${dir}" "${LOG_DIR}/system_logs/"
        fi
    done

    # Copied with -L so that symlinks are followed and their targets copied.
    if [[ -d /etc/systemd/ ]]; then
        cp -rL /etc/systemd/ "${LOG_DIR}/system_logs/"
    fi

    df -h > "${LOG_DIR}/system_logs/df.txt"
    # Gather disk usage statistics for files and directories larger than 1MB
    du -d 5 -hx / | sort -hr | grep '^[0-9\.]*[MGT]' > "${LOG_DIR}/system_logs/du.txt"
    free > "${LOG_DIR}/system_logs/free.txt"
    cat /etc/hosts > "${LOG_DIR}/system_logs/hosts.txt"
    parted -l > "${LOG_DIR}/system_logs/parted-l.txt"
    mount > "${LOG_DIR}/system_logs/mount.txt"
    env > "${LOG_DIR}/system_logs/env.txt"
    ip address > "${LOG_DIR}/system_logs/ip-address.txt"
    ip route > "${LOG_DIR}/system_logs/ip-route.txt"
    ip route show table all > "${LOG_DIR}/system_logs/ip-route-all-tables.txt"
    ip rule list > "${LOG_DIR}/system_logs/ip-rule-list.txt"
    pvs > "${LOG_DIR}/system_logs/pvs.txt"
    vgs > "${LOG_DIR}/system_logs/vgs.txt"
    lvs > "${LOG_DIR}/system_logs/lvs.txt"

    iptables-save > "${LOG_DIR}/system_logs/iptables.txt"

    # Installed packages, using whichever package manager is available.
    if command -v dpkg > /dev/null; then
        dpkg -l > "${LOG_DIR}/system_logs/dpkg-l.txt"
    fi
    if command -v rpm > /dev/null; then
        rpm -qa > "${LOG_DIR}/system_logs/rpm-qa.txt"
    fi

    # final memory usage and process list
    ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > "${LOG_DIR}/system_logs/ps.txt"

    # available entropy
    cat /proc/sys/kernel/random/entropy_avail > "${LOG_DIR}/system_logs/entropy_avail.txt"

    # docker related information
    (docker info && docker images && docker ps -a) > "${LOG_DIR}/system_logs/docker-info.txt"

    for container in $(docker ps -a --format "{{.Names}}"); do
        docker logs --tail all "${container}" &> "${LOG_DIR}/docker_logs/${container}.txt"
    done

    # Bifrost: grab config files and logs from the container.
    if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then
        mkdir -p "${LOG_DIR}/bifrost"
        for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do
            mkdir -p "${LOG_DIR}/bifrost/${service}"
            docker exec bifrost_deploy \
                systemctl status "${service}" -l -n 10000 > "${LOG_DIR}/bifrost/${service}/${service}-systemd-status.txt"
            docker exec bifrost_deploy \
                journalctl -u "${service}" --no-pager > "${LOG_DIR}/bifrost/${service}/${service}-journal.txt"
        done
        # No -it here, matching the docker exec calls above: the output is
        # redirected to a file, and -it would require a terminal on stdin.
        docker exec bifrost_deploy \
            journalctl --no-pager > "${LOG_DIR}/bifrost/bifrost-journal.log"
        # docker cp needs the destination directory to exist.
        mkdir -p "${LOG_DIR}/kolla_node_configs/bifrost"
        for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do
            docker cp "bifrost_deploy:/etc/${d}" "${LOG_DIR}/kolla_node_configs/bifrost/"
        done
        docker cp bifrost_deploy:/var/log/mariadb/mariadb.log "${LOG_DIR}/bifrost/mariadb/"
    fi

    # IPA build logs
    copy_existing_files "${LOG_DIR}/ipa" \
        /opt/kayobe/images/ipa/ipa.stderr \
        /opt/kayobe/images/ipa/ipa.stdout

    # Overcloud host image build logs
    copy_existing_files "${LOG_DIR}/deployment_image" \
        /opt/kayobe/images/deployment_image/deployment_image.stderr \
        /opt/kayobe/images/deployment_image/deployment_image.stdout

    chown -R stack: "${LOG_DIR}"
}

copy_logs

0 commit comments

Comments
 (0)