Skip to content

Commit 02d06fd

Browse files
authored
Merge pull request #1173 from stackhpc/ping-alerts
Make packet drop alert configurable
2 parents 97cc295 + ac0f360 commit 02d06fd

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

etc/kayobe/kolla/config/prometheus/ceph.rules

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -137,25 +137,25 @@ groups:
137137
annotations:
138138
description: "Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free."
139139

140-
# alert on nic packet errors and drops rates > 1 packet/s
140+
# alert on NIC packet drop rates > alertmanager_packet_drop_threshold packets/s and packet error rates > alertmanager_packet_errors_threshold packets/s
141141
- alert: NetworkPacketsDropped
142-
expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > 1
142+
expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > {% endraw %}{{ alertmanager_packet_drop_threshold }}{% raw %}
143143
labels:
144144
severity: warning
145145
annotations:
146146
description: >
147-
Node {{ $labels.instance }} experiences packet drop > 1
147+
Node {{ $labels.instance }} experiences packet drop > {% endraw %}{{ alertmanager_packet_drop_threshold }}{% raw %}
148148
packet/s on interface {{ $labels.device }}.
149149

150150
- alert: NetworkPacketErrors
151151
expr: |
152152
irate(node_network_receive_errs_total{device!="lo"}[5m]) +
153-
irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
153+
irate(node_network_transmit_errs_total{device!="lo"}[5m]) > {% endraw %}{{ alertmanager_packet_errors_threshold }}{% raw %}
154154
labels:
155155
severity: warning
156156
annotations:
157157
description: >
158-
Node {{ $labels.instance }} experiences packet errors > 1
158+
Node {{ $labels.instance }} experiences packet errors > {% endraw %}{{ alertmanager_packet_errors_threshold }}{% raw %}
159159
packet/s on interface {{ $labels.device }}.
160160

161161
- alert: StorageFillingUp

etc/kayobe/stackhpc-monitoring.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,14 @@ alertmanager_warn_network_bond_single_link: true
1818
alertmanager_node_free_swap_warning_threshold_ratio: 0.25
1919
alertmanager_node_free_swap_critical_threshold_ratio: 0.1
2020

21+
# Threshold to trigger an alert for dropped packets, measured in packets/s
22+
# as an instantaneous rate (irate: last two samples in a 5-minute window).
23+
alertmanager_packet_drop_threshold: 1
24+
25+
# Threshold to trigger an alert for packet receive/transmit errors, measured in
26+
# packets/s averaged over 5 minutes.
27+
alertmanager_packet_errors_threshold: 1
28+
2129
###############################################################################
2230
# Exporter configuration
2331

0 commit comments

Comments
 (0)