Skip to content

Commit 43f4a87

Browse files
committed
change alerts to ignore compute
1 parent 72245f9 commit 43f4a87

File tree

2 files changed

+9
-8
lines changed

2 files changed

+9
-8
lines changed

environments/common/files/prometheus/rules/node-exporter.rules

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
# Mostly derived from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware
1+
# Mostly taken from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware
2+
# If modified, this is noted in a comment.
3+
#
24
# In general we have ignored lack of resources (memory, CPU) on compute nodes as
35
# this is expected, and ignored things which will be hard to threshold due to
46
# the nature of a Slurm cluster.
@@ -7,11 +9,11 @@ groups:
79
- name: node-exporter
810
rules:
911

10-
# Modified: only on login/control nodes
12+
# Modified: ignore compute nodes
1113
- alert: HostOutOfMemory
1214
expr: (
13-
node_memory_MemAvailable_bytes{group=~"login|control"} /
14-
node_memory_MemTotal_bytes{group=~"login|control"}
15+
node_memory_MemAvailable_bytes{group!~"compute"} /
16+
node_memory_MemTotal_bytes{group!~"compute"}
1517
< .10
1618
)
1719
for: 2m
@@ -88,9 +90,9 @@ groups:
8890
summary: Host software RAID disk failure (instance {{ $labels.instance }})
8991
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
9092

91-
# Modified: only on login/control nodes
93+
# Modified: ignore compute nodes
9294
- alert: HostOomKillDetected
93-
expr: (increase(node_vmstat_oom_kill{group=~"login|control"}[1m]) > 0)
95+
expr: (increase(node_vmstat_oom_kill{group!~"compute"}[1m]) > 0)
9496
for: 0m
9597
labels:
9698
severity: warning

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,8 @@ prometheus_targets:
2727
control: "{{ groups.get('node_exporter', []) | intersect(groups['control']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'control') }}"
2828
login: "{{ groups.get('node_exporter', []) | intersect(groups['login']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'login') }}"
2929
compute: "{{ groups.get('node_exporter', []) | intersect(groups['compute']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'compute') }}"
30-
# openhpc is defined as control+login+compute so this gets anything else:
30+
# openhpc is defined as control+login+compute so this gets any other node exporter targets:
3131
other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}"
32-
# TODO: check empty list gets coped with correctly!
3332

3433
prometheus_scrape_configs_default:
3534
- job_name: "prometheus"

0 commit comments

Comments
 (0)