Skip to content

Commit 99df07f

Browse files
committed
alert on large Slurmdbd queue
1 parent a2c07e0 commit 99df07f

File tree

3 files changed

+8
-5
lines changed

3 files changed

+8
-5
lines changed

environments/common/files/prometheus/rules/slurm.rules

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,3 @@ groups:
1414
description: '{{ $value }} Slurm nodes are in fail status'
1515
summary: 'At least one Slurm node is failed.'
1616
expr: "slurm_nodes_fail > 0\n"
17-
18-
# TODO: alert on slurm_scheduler_dbd_queue_size - see vpenso exporter, man sdiag, and MaxDBDMsgs
19-
# but note it's dynamic

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,13 @@ prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if gr
2020
prometheus_alert_rules_files_inventory_glob: ../files/prometheus/rules/*.rules
2121
prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}"
2222

23-
prometheus_alert_rules: []
23+
prometheus_alert_rules:
24+
- alert: SlurmDBDQueueLarge
25+
# NB: {{ templates }} in annotations.description are interpolated by prometheus, in expr by ansible
26+
annotations:
27+
description: '{% raw %}Slurm DBD message queue size {{ $value }} is larger than half Slurm parameter MaxDBDMsgs - check database health{% endraw %}'
28+
summary: 'Slurm DBD message queue is large.'
29+
expr: "slurm_scheduler_dbd_queue_size > {{ hostvars[groups['control'].0].ansible_local.slurm.MaxDBDMsgs | int // 2 }}"
2430

2531
# Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack.
2632
prometheus_targets:

requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ roles:
44
version: v25.3.2
55
name: stackhpc.nfs
66
- src: https://github.com/stackhpc/ansible-role-openhpc.git
7-
version: v0.27.0
7+
version: feat/facts # TODO: bump to release
88
name: stackhpc.openhpc
99
- src: https://github.com/stackhpc/ansible-node-exporter.git
1010
version: stackhpc

0 commit comments

Comments
 (0)