Skip to content

Commit 533ee57

Browse files
authored
Merge pull request #621 from stackhpc/DWPD
Add DWPD to Hardware Overview dashboard
2 parents 4fb099b + 02eb8cd commit 533ee57

File tree

3 files changed

+118
-3
lines changed

3 files changed

+118
-3
lines changed

etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -637,8 +637,8 @@
637637
"overrides": []
638638
},
639639
"gridPos": {
640-
"h": 12,
641-
"w": 20,
640+
"h": 13,
641+
"w": 9,
642642
"x": 0,
643643
"y": 17
644644
},
@@ -674,6 +674,95 @@
674674
],
675675
"title": "Disk Temperatures",
676676
"type": "timeseries"
677+
},
678+
{
679+
"datasource": {
680+
"type": "prometheus",
681+
"uid": "${datasource}"
682+
},
683+
"description": "The data written to the disk in the last 24h period divided by the physical capacity of the disk",
684+
"fieldConfig": {
685+
"defaults": {
686+
"color": {
687+
"mode": "palette-classic"
688+
},
689+
"custom": {
690+
"axisCenteredZero": false,
691+
"axisColorMode": "text",
692+
"axisLabel": "",
693+
"axisPlacement": "auto",
694+
"barAlignment": 0,
695+
"drawStyle": "line",
696+
"fillOpacity": 0,
697+
"gradientMode": "none",
698+
"hideFrom": {
699+
"legend": false,
700+
"tooltip": false,
701+
"viz": false
702+
},
703+
"lineInterpolation": "linear",
704+
"lineWidth": 1,
705+
"pointSize": 5,
706+
"scaleDistribution": {
707+
"type": "linear"
708+
},
709+
"showPoints": "auto",
710+
"spanNulls": false,
711+
"stacking": {
712+
"group": "A",
713+
"mode": "none"
714+
},
715+
"thresholdsStyle": {
716+
"mode": "off"
717+
}
718+
},
719+
"mappings": [],
720+
"thresholds": {
721+
"mode": "absolute",
722+
"steps": [
723+
{
724+
"color": "green",
725+
"value": null
726+
}
727+
]
728+
}
729+
},
730+
"overrides": []
731+
},
732+
"gridPos": {
733+
"h": 13,
734+
"w": 10,
735+
"x": 9,
736+
"y": 17
737+
},
738+
"id": 9,
739+
"options": {
740+
"legend": {
741+
"calcs": [],
742+
"displayMode": "list",
743+
"placement": "bottom",
744+
"showLegend": true
745+
},
746+
"tooltip": {
747+
"mode": "single",
748+
"sort": "none"
749+
}
750+
},
751+
"targets": [
752+
{
753+
"datasource": {
754+
"type": "prometheus",
755+
"uid": "${datasource}"
756+
},
757+
"editorMode": "code",
758+
"expr": "delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000 / nvme_physical_size_bytes{instance=~\"$node\"}",
759+
"legendFormat": "{{instance}} - {{device}}",
760+
"range": true,
761+
"refId": "A"
762+
}
763+
],
764+
"title": "DWPD",
765+
"type": "timeseries"
677766
}
678767
],
679768
"refresh": false,

etc/kayobe/kolla/config/prometheus/smart.rules

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,20 @@ groups:
1313
summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})"
1414
description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}"
1515

16-
{% endraw %}
16+
- alert: DWPDTooHigh
17+
expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > 1
18+
labels:
19+
severity: alert
20+
annotations:
21+
summary: "High 30-Day Average DWPD for {{ $labels.instance }}"
22+
description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"
23+
24+
- alert: DWPDTooHighWarning
25+
expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > 1
26+
labels:
27+
severity: warning
28+
annotations:
29+
summary: "High 7-Day Average DWPD for {{ $labels.instance }}"
30+
description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"
31+
32+
{% endraw %}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
features:
3+
- |
4+
Adds a panel in the Hardware Overview dashboard to show DWPD (Drive Writes
5+
Per Day) for NVMe drives. This is calculated by dividing the total bytes written
6+
in the past 24 hours by the drive capacity. This is currently only
7+
supported on NVMe drives.
8+
- |
9+
Adds a warning alert that fires when the 7-day average exceeds 1 DWPD, and a
10+
higher-severity alert that fires when the 30-day average exceeds 1 DWPD.

0 commit comments

Comments
 (0)