Skip to content

Commit c5d8484

Browse files
INFRA-709 Rated dwpd alerts (#1077)
* Update SMART metrics to include rated DWPD
* Add release note
* Update alert to use new metric
* Update hardware overview dashboard with more NVMe metrics
* Add NVMe drive drill-down dashboard

---------

Co-authored-by: Doug Szumski <[email protected]>
1 parent 09fa0e0 commit c5d8484

File tree

7 files changed

+2177
-75
lines changed

7 files changed

+2177
-75
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
---
2+
# Play 1: ask every overcloud host for its NVMe devices and record the
# de-duplicated list of model names as the host fact `unique_nvme_models`.
- name: Gather unique NVMe disk models on all hosts
  hosts: overcloud
  gather_facts: false
  tasks:
    # Read-only query, so it never reports a change.
    - name: Retrieve NVMe device information
      ansible.builtin.command: "nvme list -o json"
      register: nvme_list
      changed_when: false
      become: true

    # Build the whole list in one expression instead of appending per loop
    # iteration; this also avoids the json_query filter. A missing or null
    # `Devices` key yields an empty list.
    - name: Parse NVMe device model names
      ansible.builtin.set_fact:
        nvme_models: >-
          {{ (nvme_list.stdout | from_json).Devices | default([], true)
          | map(attribute='ModelNumber') | list }}

    - name: Set unique NVMe models as host facts
      ansible.builtin.set_fact:
        unique_nvme_models: "{{ (nvme_models | default([])) | unique }}"

    - name: Show unique NVMe models per host
      ansible.builtin.debug:
        var: unique_nvme_models
25+
26+
# Play 2: merge the per-host model lists on the control host and make sure
# every model has an entry in dwpd-ratings.yml. Models that are not listed yet
# are appended with a placeholder rated_dwpd of 1 for the operator to review.
- name: Aggregate all unique NVMe models from all hosts
  hosts: localhost
  gather_facts: false
  vars:
    dwpd_ratings_path: "{{ kayobe_env_config_path }}/dwpd-ratings.yml"
    # Entries already present in the file; empty when the file is missing,
    # empty, or has a null stackhpc_dwpd_ratings key.
    existing_dwpd_ratings: "{{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([], true) }}"
  tasks:
    # select('defined') drops hosts that never set unique_nvme_models.
    - name: Aggregate unique NVMe models from all overcloud hosts
      ansible.builtin.set_fact:
        all_nvme_models: >-
          {{ groups['overcloud'] | map('extract', hostvars, 'unique_nvme_models')
          | select('defined') | sum(start=[]) | unique }}

    - name: Show all unique NVMe models
      ansible.builtin.debug:
        var: all_nvme_models

    - name: Check whether dwpd-ratings.yml exists
      ansible.builtin.stat:
        path: "{{ dwpd_ratings_path }}"
      register: dwpd_ratings_stat

    - name: Load existing dwpd-ratings.yml
      ansible.builtin.set_fact:
        existing_dwpd_yml: "{{ lookup('file', dwpd_ratings_path) | from_yaml }}"
      when: dwpd_ratings_stat.stat.exists

    # model_name -> rated_dwpd mapping of the entries already on disk.
    - name: Convert existing YAML array into a dictionary
      ansible.builtin.set_fact:
        dwpd_lookup: >-
          {{ existing_dwpd_ratings
          | items2dict(key_name='model_name', value_name='rated_dwpd') }}

    - name: Get list of existing model names
      ansible.builtin.set_fact:
        existing_model_names: "{{ existing_dwpd_ratings | map(attribute='model_name') | list }}"

    - name: Identify new models not already in the configuration
      ansible.builtin.set_fact:
        new_models: "{{ all_nvme_models | default([]) | reject('in', existing_model_names) | list }}"

    # Looping over an empty list is a no-op, so no extra length guard is needed.
    - name: Create entry dictionary for new models
      ansible.builtin.set_fact:
        new_entries: "{{ new_entries | default([]) + [{'model_name': item, 'rated_dwpd': 1}] }}"
      loop: "{{ new_models }}"

    - name: Build updated list for stackhpc_dwpd_ratings
      ansible.builtin.set_fact:
        new_dwpd_list: "{{ existing_dwpd_ratings + (new_entries | default([])) }}"

    # Serialise with to_nice_yaml rather than hand-built string interpolation
    # so that model names containing quotes, colons or other YAML specials are
    # escaped correctly.
    - name: Write updated dwpd-ratings.yml
      ansible.builtin.copy:
        content: |-
          ---
          {{ {'stackhpc_dwpd_ratings': new_dwpd_list} | to_nice_yaml(indent=2) }}
        dest: "{{ dwpd_ratings_path }}"
        mode: "0644"
      notify: Show updated dwpd-ratings.yml contents
      when: new_dwpd_list | length > 0

  handlers:
    # Runs only when the file content actually changed.
    - name: Show updated dwpd-ratings.yml contents
      ansible.builtin.debug:
        msg:
          - "Updated local dwpd-ratings.yml contents"
          - "{{ {'stackhpc_dwpd_ratings': new_dwpd_list} | to_nice_yaml }}"
          - "PLEASE REVIEW AND COMMIT {{ kayobe_env_config_path }}/dwpd-ratings.yml TO VERSION CONTROL."

etc/kayobe/ansible/scripts/nvmemon.sh

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,43 @@ if ! command -v nvme >/dev/null 2>&1; then
2121
exit 1
2222
fi
2323

24+
if ! command -v jq >/dev/null 2>&1; then
  echo "${0##*/}: jq is required but not installed. Aborting." >&2
  exit 1
fi

# Path to the DWPD ratings JSON file
dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json"

# Maps an NVMe model name to its rated DWPD value
declare -A rated_dwpd

# Populate the rated_dwpd associative array from $dwpd_file.
#
# The file is read as a JSON array of objects with "model_name" and
# "rated_dwpd" fields. Entries without a model name are ignored. When the file
# is absent a warning is written to stderr and the array is left empty, so
# callers fall back to their own default.
load_dwpd_ratings() {
  local dwpd_json line key value

  if [[ ! -f "$dwpd_file" ]]; then
    echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." >&2
    return 0
  fi

  # Read the JSON; if it fails, default to empty array
  dwpd_json="$(jq '.' "$dwpd_file" || echo '[]')"

  # One compact JSON object per line, one line per array element
  while IFS= read -r line; do
    key="$(jq -r '.model_name' <<< "$line")"
    value="$(jq -r '.rated_dwpd' <<< "$line")"

    # Strip trailing whitespace only. Model names routinely contain internal
    # spaces, which must be kept or the later lookup by model name can never
    # match.
    key="${key%"${key##*[![:space:]]}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # If we have a valid key, store it in the dictionary
    if [[ -n "$key" && "$key" != "null" ]]; then
      rated_dwpd["$key"]="$value"
    fi
  done < <(jq -c '.[]' <<< "$dwpd_json")
}

load_dwpd_ratings
60+
2461
output_format_awk="$(
2562
cat <<'OUTPUTAWK'
2663
BEGIN { v = "" }
@@ -44,58 +81,70 @@ format_output() {
4481
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
4582
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output
4683

47-
# Get devices (DevicePath and PhysicalSize)
48-
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')"
84+
# Get devices (DevicePath, PhysicalSize, ModelNumber and SerialNumber), one
# compact JSON object per line
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber, SerialNumber}')"

# Convert device_info to an array
device_info_array=()
while IFS= read -r line; do
  device_info_array+=("$line")
done <<< "$device_info"

# Loop through the NVMe devices and emit one sample per metric per device
for device_data in "${device_info_array[@]}"; do
  # When no devices are listed, device_info is empty and the here-string above
  # still produces a single empty element; skip it instead of querying an
  # empty device path.
  [[ -n "$device_data" ]] || continue

  device="$(echo "$device_data" | jq -r '.DevicePath')"
  json_check="$(nvme smart-log -o json "${device}")"
  disk="${device##*/}"
  model_name="$(echo "$device_data" | jq -r '.ModelNumber')"
  serial_number="$(echo "$device_data" | jq -r '.SerialNumber')"

  # Label set shared by every metric of this device
  labels="device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\""

  physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')"
  echo "physical_size_bytes{${labels}} ${physical_size}"

  # The temperature value in JSON is in Kelvin, we want Celsius
  value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
  echo "temperature_celsius{${labels}} ${value_temperature}"

  # Get the rated DWPD from the dictionary or default to 1 if not found.
  # The lookup key has trailing whitespace removed, since the keys stored in
  # rated_dwpd carry none (presumably nvme-cli may pad the model string;
  # TODO confirm). An empty key is not a valid associative-array subscript,
  # so it goes straight to the default.
  model_key="${model_name%"${model_name##*[![:space:]]}"}"
  value_rated_dwpd=1
  if [[ -n "$model_key" ]]; then
    value_rated_dwpd="${rated_dwpd[$model_key]:-1}"
  fi
  echo "rated_dwpd{${labels}} ${value_rated_dwpd}"

  # Percentages reported by the drive are converted to 0-1 ratios
  value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
  echo "available_spare_ratio{${labels}} ${value_available_spare}"

  value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
  echo "available_spare_threshold_ratio{${labels}} ${value_available_spare_threshold}"

  value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
  echo "percentage_used_ratio{${labels}} ${value_percentage_used}"

  value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
  echo "critical_warning_total{${labels}} ${value_critical_warning}"

  value_media_errors="$(echo "$json_check" | jq '.media_errors')"
  echo "media_errors_total{${labels}} ${value_media_errors}"

  value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
  echo "num_err_log_entries_total{${labels}} ${value_num_err_log_entries}"

  value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
  echo "power_cycles_total{${labels}} ${value_power_cycles}"

  value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
  echo "power_on_hours_total{${labels}} ${value_power_on_hours}"

  value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
  echo "controller_busy_time_seconds{${labels}} ${value_controller_busy_time}"

  value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
  echo "data_units_written_total{${labels}} ${value_data_units_written}"

  value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
  echo "data_units_read_total{${labels}} ${value_data_units_read}"

  value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
  echo "host_read_commands_total{${labels}} ${value_host_read_commands}"

  value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
  echo "host_write_commands_total{${labels}} ${value_host_write_commands}"
done | format_output

etc/kayobe/ansible/smartmon-tools.yml

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
---
2-
- name: Install and set up smartmon-tools
2+
- name: Install and set up SMART monitoring tools
33
hosts: overcloud
4-
54
tasks:
65
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
76
ansible.builtin.package:
@@ -13,11 +12,23 @@
1312
state: present
1413
become: true
1514

16-
- name: Ensure Python 3, venv, and pip are installed
17-
ansible.builtin.package:
18-
name: >
19-
{{ ['python3', 'python3-pip'] + (['python3-venv'] if ansible_facts['distribution'] == 'Ubuntu' else []) }}
15+
- name: Ensure Python 3, venv, and pip are installed on Debian/Ubuntu
16+
ansible.builtin.apt:
17+
name:
18+
- python3
19+
- python3-venv
20+
- python3-pip
21+
state: present
22+
when: ansible_facts.os_family == 'Debian'
23+
become: true
24+
25+
- name: Ensure Python 3, and pip are installed on RedHat/CentOS
26+
ansible.builtin.yum:
27+
name:
28+
- python3
29+
- python3-pip
2030
state: present
31+
when: ansible_facts.os_family == 'RedHat'
2132
become: true
2233

2334
- name: Create smartmon Python virtual environment
@@ -31,6 +42,7 @@
3142
name:
3243
- prometheus_client
3344
- pySMART
45+
state: present
3446
virtualenv: /opt/smartmon-venv
3547
virtualenv_python: python3
3648
become: true
@@ -98,3 +110,35 @@
98110
path: /usr/local/bin/smartmon.sh
99111
state: absent
100112
become: true
113+
114+
# Optional step, enabled by setting create_dwpd_ratings. The bool filter makes
# string values such as "false" passed with -e evaluate as false instead of as
# a non-empty (truthy) string.
- name: Gather NVMe drives and generate dwpd ratings
  import_playbook: get-nvme-drives.yml
  when: create_dwpd_ratings | default(false) | bool
117+
118+
# Publish stackhpc_dwpd_ratings as a JSON file on each overcloud host. Hosts
# for which the variable is not defined are left untouched.
- name: Copy DWPD ratings to overcloud hosts
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Deploy DWPD ratings file
      when: stackhpc_dwpd_ratings is defined
      block:
        # Evaluated per host (no run_once) so that the fact is always set on
        # exactly the hosts where the copy task below runs.
        - name: Convert the stackhpc_dwpd_ratings variable to JSON
          ansible.builtin.set_fact:
            dwpd_ratings_json: "{{ stackhpc_dwpd_ratings | default([]) | to_json }}"

        - name: Ensure /opt/kayobe/etc/monitoring directory exists
          ansible.builtin.file:
            path: /opt/kayobe/etc/monitoring
            state: directory
            mode: "0755"
          become: true

        - name: Copy JSON file to remote
          ansible.builtin.copy:
            content: "{{ dwpd_ratings_json }}"
            dest: "/opt/kayobe/etc/monitoring/dwpd_ratings.json"
            owner: root
            group: root
            mode: "0644"
          become: true

0 commit comments

Comments
 (0)