+# Mostly derived from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware
+# In general have ignored lack of resources (memory, cpu) on compute nodes as
+# this is expected, and ignored things which will be hard to threshold due to
+# the nature of a Slurm cluster.
+
groups:
  - name: node-exporter
    rules:
+
+      # Modified: only on login/control nodes
+      - alert: HostOutOfMemory
+        expr: (
+            node_memory_MemAvailable_bytes{group=~"login|control"} /
+            node_memory_MemTotal_bytes{group=~"login|control"}
+            < .10
+          )
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
@@ -12,3 +32,140 @@ groups:
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: 'Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'
+
+      - alert: HostOutOfInodes
+        expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostFilesystemDeviceError
+        expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host filesystem device error (instance {{ $labels.instance }})
+          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      # TODO: make tunable
+      - alert: HostUnusualDiskWriteLatency
+        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostCpuHighIowait
+        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSystemdServiceCrashed
+        expr: (node_systemd_unit_state{state="failed"} == 1)
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSoftwareRaidDiskFailure
+        expr: (node_md_disks{state="failed"} > 0)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host software RAID disk failure (instance {{ $labels.instance }})
+          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      # Modified: only on login/control nodes
+      - alert: HostOomKillDetected
+        expr: (increase(node_vmstat_oom_kill{group=~"login|control"}[1m]) > 0)
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostEdacUncorrectableErrorsDetected
+        expr: (node_edac_uncorrectable_errors_total > 0)
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} has {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostNetworkReceiveErrors
+        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Receive Errors (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error rate above 1% over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostNetworkTransmitErrors
+        expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error rate above 1% over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostNetworkBondDegraded
+        expr: ((node_bonding_active - node_bonding_slaves) != 0)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostConntrackLimit
+        expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host conntrack limit (instance {{ $labels.instance }})
+          description: "The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostClockSkew
+        expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock skew (instance {{ $labels.instance }})
+          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostClockNotSynchronising
+        expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostRequiresReboot
+        expr: (node_reboot_required > 0)
+        for: 4h
+        labels:
+          severity: info
+        annotations:
+          summary: Host requires reboot (instance {{ $labels.instance }})
+          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
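
Note: the group=~"login|control" matchers in the rules above assume every scrape target already carries a group label; node_exporter does not set one itself. A minimal sketch of how that label might be attached in prometheus.yml, where the job name, target addresses and rules path are illustrative assumptions rather than anything taken from this commit:

# prometheus.yml (sketch only, not part of this commit)
rule_files:
  - /etc/prometheus/rules/node-exporter.yml    # assumed location of the rules above

scrape_configs:
  - job_name: node                             # assumed job name
    static_configs:
      - targets: ["login1:9100", "login2:9100"]
        labels:
          group: login
      - targets: ["control1:9100"]
        labels:
          group: control
      - targets: ["compute001:9100", "compute002:9100"]
        labels:
          group: compute

With the label attached this way, the memory and OOM-kill rules select only login/control targets while the remaining rules apply to every node, and running "promtool check rules" against the rules file is a quick way to confirm the YAML and PromQL still parse after local edits.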
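
The rules above only assign severity labels (info, warning, critical); what actually happens to a firing alert is decided by Alertmanager routing, which this file does not configure. A hedged sketch of a matching route tree, with receiver names invented purely for illustration:

# alertmanager.yml (sketch only, receiver names are placeholders)
route:
  receiver: default-email
  routes:
    - matchers:
        - severity="critical"
      receiver: oncall-pager
    - matchers:
        - severity="info"
      receiver: low-priority-inbox

receivers:
  - name: default-email
  - name: oncall-pager
  - name: low-priority-inbox

Anything not matched by the child routes (the warning-level alerts that make up most of this file) falls through to the default receiver.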