Skip to content

Commit a9cd55e

Browse files
committed
add node-exporter rules
1 parent 16b3a9b commit a9cd55e

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed

environments/common/files/prometheus/rules/node-exporter.rules

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,26 @@
1+
# Mostly derived from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware
2+
# In general, alerts for lack of resources (memory, CPU) on compute nodes have
3+
# been omitted as this is expected, as have alerts which would be hard to
4+
# threshold due to the nature of a Slurm cluster.
5+
16
groups:
27
- name: node-exporter
38
rules:
9+
10+
# Modified: only on login/control nodes
11+
- alert: HostOutOfMemory
  # Warns once the available/total memory ratio of a series labelled
  # group=login or group=control has stayed below 10% for two minutes.
  # The folded scalar below collapses to a single-line PromQL expression.
  expr: >-
    ( node_memory_MemAvailable_bytes{group=~"login|control"}
    / node_memory_MemTotal_bytes{group=~"login|control"}
    < .10 )
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host out of memory (instance {{ $labels.instance }})"
    description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
23+
424
# Please add ignored mountpoints in node_exporter parameters like
525
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
626
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
@@ -12,3 +32,140 @@ groups:
1232
annotations:
  summary: Host out of disk space (instance {{ $labels.instance }})
  # Double-quoted so that YAML turns each \n into a real line break; inside
  # single quotes the backslash and the "n" are kept as two literal characters
  # and would show up verbatim in the rendered alert text.
  description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
35+
36+
- alert: HostOutOfInodes
  # Critical when fewer than 10% of a filesystem's inodes are free for two
  # minutes. The `and ON (...)` clause keeps only series whose matching
  # node_filesystem_readonly sample equals 0.
  expr: >-
    (node_filesystem_files_free / node_filesystem_files < .10
    and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Host out of inodes (instance {{ $labels.instance }})"
    description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
44+
45+
- alert: HostFilesystemDeviceError
  # Critical when node_filesystem_device_error has been 1 for two minutes on a
  # filesystem whose fstype is not one of the excluded types.
  # Prometheus regex matchers are fully anchored, so a bare `nfs` alternative
  # would only exclude fstype="nfs" exactly; `nfs.*` also excludes values such
  # as "nfs4", in the same way `fuse.*` already covers the fuse variants.
  expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs.*)"} == 1
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Host filesystem device error (instance {{ $labels.instance }})
    description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
53+
54+
# TODO: make tunable
55+
- alert: HostUnusualDiskWriteLatency
  # Warns when write time divided by completed writes (both as 1m rates)
  # exceeds 0.1 for two minutes. The trailing `and` clause restricts the
  # result to devices that actually completed writes in that window.
  expr: >-
    (rate(node_disk_write_time_seconds_total[1m])
    / rate(node_disk_writes_completed_total[1m]) > 0.1
    and rate(node_disk_writes_completed_total[1m]) > 0)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
63+
64+
- alert: HostCpuHighIowait
  # Warns as soon as (for: 0m) the per-instance average of the 5m iowait rate
  # across node_cpu_seconds_total series is above 0.10.
  expr: >-
    avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
    > .10
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host CPU high iowait (instance {{ $labels.instance }})"
    description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
72+
73+
- alert: HostSystemdServiceCrashed
  # Warns with no hold-off (for: 0m) for every node_systemd_unit_state series
  # with state="failed" whose value is 1.
  expr: >-
    (node_systemd_unit_state{state="failed"} == 1)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host systemd service crashed (instance {{ $labels.instance }})"
    description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
81+
82+
- alert: HostSoftwareRaidDiskFailure
  # Warns when node_md_disks reports a non-zero value for state="failed"
  # continuously for two minutes.
  expr: >-
    (node_md_disks{state="failed"} > 0)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host software RAID disk failure (instance {{ $labels.instance }})"
    description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
90+
91+
# Modified: only on login/control nodes
92+
- alert: HostOomKillDetected
  # Warns immediately (for: 0m) when node_vmstat_oom_kill increased within the
  # last minute on a series labelled group=login or group=control.
  # NOTE(review): increase() needs at least two samples inside the 1m window;
  # confirm the scrape interval for these targets is short enough.
  expr: >-
    (increase(node_vmstat_oom_kill{group=~"login|control"}[1m]) > 0)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host OOM kill detected (instance {{ $labels.instance }})"
    description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
100+
101+
- alert: HostEdacUncorrectableErrorsDetected
  # Warns immediately (for: 0m) whenever the cumulative
  # node_edac_uncorrectable_errors_total counter is above zero.
  # The expression applies no time window, so $value is the running counter
  # value rather than a per-interval count; the description says so instead of
  # claiming "in the last 5 minutes".
  expr: (node_edac_uncorrectable_errors_total > 0)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
109+
110+
- alert: HostNetworkReceiveErrors
  # Warns when, for two minutes, the 2m rate of receive errors divided by the
  # 2m rate of received packets is above 0.01.
  # $value is therefore a ratio, not an error count: formatting it with "%.0f"
  # as a number of errors would usually print "0", so the description reports
  # the ratio to three decimals instead.
  expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf \"%.3f\" $value }} (errors per received packet) over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
118+
119+
- alert: HostNetworkTransmitErrors
  # Warns when, for two minutes, the 2m rate of transmit errors divided by the
  # 2m rate of transmitted packets is above 0.01.
  # $value is therefore a ratio, not an error count: formatting it with "%.0f"
  # as a number of errors would usually print "0", so the description reports
  # the ratio to three decimals instead.
  expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf \"%.3f\" $value }} (errors per transmitted packet) over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
127+
128+
- alert: HostNetworkBondDegraded
  # Warns when node_bonding_active and node_bonding_slaves have differed for
  # two minutes.
  expr: >-
    ((node_bonding_active - node_bonding_slaves) != 0)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host Network Bond Degraded (instance {{ $labels.instance }})"
    description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
136+
137+
- alert: HostConntrackLimit
  # Warns when conntrack entries have exceeded 80% of
  # node_nf_conntrack_entries_limit for five minutes.
  expr: >-
    (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Host conntrack limit (instance {{ $labels.instance }})"
    description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
145+
146+
- alert: HostClockSkew
  # Warns after ten minutes in either of two cases: the offset is above
  # +0.05 with a non-negative 5m derivative, or below -0.05 with a
  # non-positive 5m derivative.
  expr: >-
    ((node_timex_offset_seconds > 0.05
    and deriv(node_timex_offset_seconds[5m]) >= 0)
    or (node_timex_offset_seconds < -0.05
    and deriv(node_timex_offset_seconds[5m]) <= 0))
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Host clock skew (instance {{ $labels.instance }})"
    description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
154+
155+
- alert: HostClockNotSynchronising
  # Warns when, for two minutes, the minimum of node_timex_sync_status over
  # the last minute is 0 and node_timex_maxerror_seconds is at least 16.
  expr: >-
    (min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host clock not synchronising (instance {{ $labels.instance }})"
    description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
163+
164+
- alert: HostRequiresReboot
  # Informational alert raised once node_reboot_required has been above zero
  # for four hours.
  expr: >-
    (node_reboot_required > 0)
  for: 4h
  labels:
    severity: info
  annotations:
    summary: "Host requires reboot (instance {{ $labels.instance }})"
    description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

0 commit comments

Comments
 (0)