File tree Expand file tree Collapse file tree 3 files changed +8
-5
lines changed Expand file tree Collapse file tree 3 files changed +8
-5
lines changed Original file line number Diff line number Diff line change @@ -14,6 +14,3 @@ groups:
14
14
description: '{{ $value }} Slurm nodes are in fail status'
15
15
summary: 'At least one Slurm node is failed.'
16
16
expr: "slurm_nodes_fail > 0\n"
17
-
18
- # TODO: alert on slurm_scheduler_dbd_queue_size - see vpenso exporter, man sdiag, and MaxDBDMsgs
19
- # but note it's dynamic
Original file line number Diff line number Diff line change @@ -20,7 +20,13 @@ prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if gr
20
20
prometheus_alert_rules_files_inventory_glob : ../files/prometheus/rules/*.rules
21
21
prometheus_alert_rules_files : " {{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}"
22
22
23
- prometheus_alert_rules : []
23
+ prometheus_alert_rules :
24
+ - alert : SlurmDBDQueueLarge
25
+ # NB: {{ templates }} in annotations.description are interpolated by prometheus, in expr by ansible. The expr threshold is half of MaxDBDMsgs (integer division), matching the description: the queue cannot usefully exceed MaxDBDMsgs itself.
26
+ annotations :
27
+ description : ' {% raw %}Slurm DBD message queue size {{ $value }} is larger than half Slurm parameter MaxDBDMsgs - check database health{% endraw %}'
28
+ summary : ' Slurm DBD message queue is large.'
29
+ expr : " slurm_scheduler_dbd_queue_size > {{ hostvars[groups['control'].0].ansible_local.slurm.MaxDBDMsgs | int // 2 }}"
24
30
25
31
# Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack.
26
32
prometheus_targets :
Original file line number Diff line number Diff line change 4
4
version : v25.3.2
5
5
name : stackhpc.nfs
6
6
- src : https://github.com/stackhpc/ansible-role-openhpc.git
7
- version : v0.27.0
7
+ version : feat/facts # TODO: bump to release
8
8
name : stackhpc.openhpc
9
9
- src : https://github.com/stackhpc/ansible-node-exporter.git
10
10
version : stackhpc
You can’t perform that action at this time.
0 commit comments