Skip to content

zed: yoga merge #1240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions .github/workflows/stackhpc-all-in-one.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,17 @@ jobs:
run: ssh-keygen -f id_rsa -N ''
working-directory: ${{ github.workspace }}/terraform/aio

# TODO: Remove the following step in Antelope.
# NOTE: In Ansible 2.10 and lower the synchronize module used in the
# ansible/diagnostics.yml playbook does not respect SSH connection
# variables. This may result in Permission Denied issues if using an SSH
# key that is not in ~/.ssh.
- name: Copy SSH keypair to .ssh/
run: |
install -d ~/.ssh -m 700 &&
cp id_rsa* ~/.ssh/
working-directory: ${{ github.workspace }}/terraform/aio

- name: Generate clouds.yaml
run: |
cat << EOF > clouds.yaml
Expand Down Expand Up @@ -179,6 +190,7 @@ jobs:
OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}

- name: Terraform Apply
id: tf_apply
run: |
for attempt in $(seq 5); do
if terraform apply -auto-approve; then
Expand Down Expand Up @@ -355,6 +367,7 @@ jobs:
if: inputs.upgrade

- name: Tempest tests
id: tempest
run: |
mkdir -p tempest-artifacts
docker run -t --rm \
Expand All @@ -366,11 +379,28 @@ jobs:
env:
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}

- name: Collect diagnostic information
id: diagnostics
run: |
mkdir -p diagnostics
sudo -E docker run -t --rm \
-v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
-v $(pwd)/diagnostics:/stack/diagnostics \
-e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
$KAYOBE_IMAGE \
/stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
env:
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
if: ${{ always() && steps.tf_apply.outcome == 'success' }}

- name: Upload test result artifacts
uses: actions/upload-artifact@v4
with:
name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
path: tempest-artifacts/*
name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
path: |
diagnostics/
tempest-artifacts/
if: ${{ always() && (steps.tempest.outcome == 'success' || steps.diagnostics.outcome == 'success') }}

- name: Fail if any Tempest tests failed
run: |
Expand Down
2 changes: 1 addition & 1 deletion doc/source/configuration/monitoring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ on the overcloud hosts:
.. code-block:: console

(kayobe) [stack@node ~]$ cd etc/kayobe
(kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmontools.yml
(kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmon-tools.yml

SMART reporting should now be enabled along with a Prometheus alert for
unhealthy disks and a Grafana dashboard called ``Hardware Overview``.
Expand Down
74 changes: 74 additions & 0 deletions etc/kayobe/ansible/diagnostics.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---
# This playbook runs a script that collects diagnostic information from hosts.
# The diagnostics are aggregated to a directory
# (diagnostics_path_local/inventory_hostname) on localhost.
#
# NOTE: The diagnostic information contains sensitive information such as
# passwords in configuration files.

- name: Collect diagnostic information
  hosts: seed-hypervisor:seed:overcloud:infra-vms
  vars:
    # Destination directory on localhost under which per-host diagnostics
    # are aggregated.
    diagnostics_path_local: "{{ lookup('env', 'PWD') }}/diagnostics"
  tasks:
    - block:
        - name: Create a temporary directory for diagnostics
          ansible.builtin.tempfile:
            state: directory
            suffix: diagnostics
          register: diagnostics_tmpdir

        - name: Write host variables to a file
          ansible.builtin.copy:
            content: "{{ hostvars[inventory_hostname].ansible_facts | to_nice_json }}"
            dest: "{{ diagnostics_tmpdir.path }}/facts.json"
            # The fact dump may contain sensitive data (see NOTE above), so
            # restrict access to the owner.
            mode: "0600"

        - name: Run diagnostics script
          ansible.builtin.script: "{{ kayobe_config_path }}/../../tools/diagnostics.sh"
          become: true
          # Fail at this point only if the script could not be executed at
          # all (no return code). A non-zero exit code is reported by the
          # "Fail if diagnostics collection failed" task below, after the
          # logs have been downloaded and the script output displayed.
          failed_when: diagnostics_result.rc is not defined
          register: diagnostics_result
          environment:
            LOG_DIR: "{{ diagnostics_tmpdir.path }}"
            CONFIG_DIR: "{{ kayobe_config_path }}/../.."

        - name: Download diagnostic logs to localhost
          ansible.posix.synchronize:
            src: "{{ diagnostics_tmpdir.path }}/"
            dest: "{{ diagnostics_path_local }}/{{ inventory_hostname }}"
            mode: pull
            archive: false
            recursive: true
            copy_links: true
            verify_host: true
            # For jump host
            use_ssh_args: true
          vars:
            # FIXME: The synchronize module fails on Yoga, due to not
            # templating the SSH user.
            ansible_user: stack
      always:
        # Remove the remote staging directory even if collection or
        # download failed.
        - name: Clean up temporary directory
          ansible.builtin.file:
            path: "{{ diagnostics_tmpdir.path }}"
            state: absent

    - name: Display diagnostics collection stdout
      ansible.builtin.debug:
        msg: "{{ diagnostics_result.stdout }}"
      when: diagnostics_result.stdout is defined

    - name: Display diagnostics collection stderr
      ansible.builtin.debug:
        msg: "{{ diagnostics_result.stderr }}"
      when: diagnostics_result.stderr is defined

    # Deferred failure for a non-zero script exit code - see the
    # failed_when comment on the "Run diagnostics script" task.
    - name: Fail if diagnostics collection failed
      ansible.builtin.fail:
        msg: Diagnostics collection failed
      when: diagnostics_result.rc != 0

    - name: Display location of diagnostics archive
      ansible.builtin.debug:
        msg: >-
          Wrote diagnostics to {{ diagnostics_path_local }} on localhost
6 changes: 3 additions & 3 deletions etc/kayobe/environments/aufn-ceph/tenks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ node_types:
volumes:
# There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent:
# https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290
- capacity: 10GiB
- capacity: 15GiB
# Ceph volume
- capacity: 10GiB
- capacity: 20GiB
physical_networks:
- provision-net
- cloud-net
Expand All @@ -34,7 +34,7 @@ node_types:
volumes:
# There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent:
# https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290
- capacity: 10GiB
- capacity: 15GiB
physical_networks:
- provision-net
- cloud-net
Expand Down
14 changes: 14 additions & 0 deletions releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
# Reno release note announcing the diagnostics.yml playbook added by this
# change. Rendered into the release notes under the "features" section.
features:
  - |
    Adds a new ``diagnostics.yml`` playbook that collects diagnostic
    information from hosts. The diagnostics are aggregated to a directory
    (``$PWD/diagnostics/`` by default) on localhost. The diagnostics include:

    * Docker container logs
    * Kolla configuration files
    * Log files

    *The collected diagnostic information contains sensitive information such
    as passwords in configuration files.*

124 changes: 124 additions & 0 deletions tools/diagnostics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash

# NOTE(mgoddard): This has been adapted from
# roles/kayobe-diagnostics/files/get_logs.sh in Kayobe.
#
# Collects configuration, logs and system state from a host into $LOG_DIR
# for later aggregation. Best-effort: individual failures (missing files,
# absent services) must not abort the collection, hence errexit is disabled.

# Environment variables:
# $LOG_DIR is the directory to copy logs to.

# TODO: Make this script more robust and use set -e.
set +o errexit
set -u

copy_logs() {
    mkdir -p "${LOG_DIR}"/{docker_logs,kolla_node_configs,system_logs}

    cp -rnL /etc/kolla/* "${LOG_DIR}/kolla_node_configs"
    # Don't save the IPA images. Use -f since these files only exist on
    # hosts running the ironic-http/ironic-tftp services.
    rm -f "${LOG_DIR}"/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs}
    rm -f "${LOG_DIR}"/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs}

    # Kolla image build configuration, present on build hosts only.
    if [[ -d /opt/kayobe/etc/kolla ]]; then
        mkdir -p "${LOG_DIR}/kolla_build_configs"
        cp -rnL /opt/kayobe/etc/kolla/* "${LOG_DIR}/kolla_build_configs/"
    fi

    cp -rvnL /var/log/* "${LOG_DIR}/system_logs/"

    journalctl --no-pager > "${LOG_DIR}/system_logs/syslog.log"
    journalctl --no-pager -u docker.service > "${LOG_DIR}/system_logs/docker.log"
    journalctl --no-pager -u vbmcd.service > "${LOG_DIR}/system_logs/vbmcd.log"
    journalctl --no-pager -u NetworkManager.service > "${LOG_DIR}/system_logs/NetworkManager.log"

    # Network, package repository and systemd configuration, where present.
    if [[ -d /etc/sysconfig/network-scripts/ ]]; then
        cp -r /etc/sysconfig/network-scripts/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/NetworkManager/system-connections/ ]]; then
        cp -r /etc/NetworkManager/system-connections/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/yum.repos.d/ ]]; then
        cp -r /etc/yum.repos.d/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/apt/sources.list.d/ ]]; then
        cp -r /etc/apt/sources.list.d/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/systemd/ ]]; then
        cp -rL /etc/systemd/ "${LOG_DIR}/system_logs/"
    fi

    # General system state: storage, memory, network, firewall.
    df -h > "${LOG_DIR}/system_logs/df.txt"
    # Gather disk usage statistics for files and directories larger than 1MB
    du -d 5 -hx / | sort -hr | grep '^[0-9\.]*[MGT]' > "${LOG_DIR}/system_logs/du.txt"
    free > "${LOG_DIR}/system_logs/free.txt"
    cat /etc/hosts > "${LOG_DIR}/system_logs/hosts.txt"
    parted -l > "${LOG_DIR}/system_logs/parted-l.txt"
    mount > "${LOG_DIR}/system_logs/mount.txt"
    env > "${LOG_DIR}/system_logs/env.txt"
    ip address > "${LOG_DIR}/system_logs/ip-address.txt"
    ip route > "${LOG_DIR}/system_logs/ip-route.txt"
    ip route show table all > "${LOG_DIR}/system_logs/ip-route-all-tables.txt"
    ip rule list > "${LOG_DIR}/system_logs/ip-rule-list.txt"
    pvs > "${LOG_DIR}/system_logs/pvs.txt"
    vgs > "${LOG_DIR}/system_logs/vgs.txt"
    lvs > "${LOG_DIR}/system_logs/lvs.txt"

    iptables-save > "${LOG_DIR}/system_logs/iptables.txt"

    # Installed package lists, depending on the package manager in use.
    if command -v dpkg >/dev/null 2>&1; then
        dpkg -l > "${LOG_DIR}/system_logs/dpkg-l.txt"
    fi
    if command -v rpm >/dev/null 2>&1; then
        rpm -qa > "${LOG_DIR}/system_logs/rpm-qa.txt"
    fi

    # final memory usage and process list
    ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > "${LOG_DIR}/system_logs/ps.txt"

    # available entropy
    cat /proc/sys/kernel/random/entropy_avail > "${LOG_DIR}/system_logs/entropy_avail.txt"

    # docker related information
    (docker info && docker images && docker ps -a) > "${LOG_DIR}/system_logs/docker-info.txt"

    for container in $(docker ps -a --format "{{.Names}}"); do
        docker logs --tail all "${container}" &> "${LOG_DIR}/docker_logs/${container}.txt"
    done

    # Bifrost: grab config files and logs from the container.
    if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then
        mkdir -p "${LOG_DIR}/bifrost"
        for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do
            mkdir -p "${LOG_DIR}/bifrost/$service"
            docker exec bifrost_deploy \
                systemctl status $service -l -n 10000 > "${LOG_DIR}/bifrost/$service/${service}-systemd-status.txt"
            docker exec bifrost_deploy \
                journalctl -u $service --no-pager > "${LOG_DIR}/bifrost/$service/${service}-journal.txt"
        done
        # NOTE: No -it here - this script runs via the Ansible script
        # module, without a TTY, and docker exec -it would fail.
        docker exec bifrost_deploy \
            journalctl --no-pager > "${LOG_DIR}/bifrost/bifrost-journal.log"
        # Ensure the docker cp destination directory exists.
        mkdir -p "${LOG_DIR}/kolla_node_configs/bifrost"
        for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do
            docker cp bifrost_deploy:/etc/$d "${LOG_DIR}/kolla_node_configs/bifrost/"
        done
        docker cp bifrost_deploy:/var/log/mariadb/mariadb.log "${LOG_DIR}/bifrost/mariadb/"
    fi

    # IPA build logs
    if [[ -f /opt/kayobe/images/ipa/ipa.stderr ]] || [[ -f /opt/kayobe/images/ipa/ipa.stdout ]]; then
        mkdir -p "${LOG_DIR}/ipa"
        cp /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout "${LOG_DIR}/ipa/"
    fi

    # Overcloud host image build logs
    if [[ -f /opt/kayobe/images/deployment_image/deployment_image.stderr ]] || [[ -f /opt/kayobe/images/deployment_image/deployment_image.stdout ]]; then
        mkdir -p "${LOG_DIR}/deployment_image"
        cp /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout "${LOG_DIR}/deployment_image/"
    fi

    # Allow the unprivileged stack user to download the collected logs.
    chown -R stack: "${LOG_DIR}"
}

copy_logs
Loading