Skip to content

Commit 9fed915

Browse files
committed
Add alarms prometheus collector.
close #2653
1 parent 4e91a96 commit 9fed915

File tree

3 files changed

+91
-1
lines changed

3 files changed

+91
-1
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
%% This Source Code Form is subject to the terms of the Mozilla Public
2+
%% License, v. 2.0. If a copy of the MPL was not distributed with this
3+
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
%%
5+
%% Copyright (c) 2007-2021 VMware, Inc. or its affiliates. All rights reserved.
6+
%%
7+
-module(prometheus_rabbitmq_alarm_metrics_collector).
8+
9+
-export([register/0, deregister_cleanup/1, collect_mf/2]).
10+
11+
-import(prometheus_model_helpers, [create_mf/4, untyped_metric/1]).
12+
13+
-include_lib("prometheus/include/prometheus.hrl").
14+
15+
-behaviour(prometheus_collector).
16+
17+
-define(METRIC_NAME_PREFIX, "rabbitmq_alarms_").
18+
19+
%%====================================================================
20+
%% Collector API
21+
%%====================================================================
22+
23+
%% Registers this module as a collector by calling
%% prometheus_registry:register_collector/1 with ?MODULE.
%% Crashes with a badmatch unless that call returns ok.
register() ->
    Outcome = prometheus_registry:register_collector(?MODULE),
    ok = Outcome.
25+
26+
%% Collector callback run on deregistration. Nothing is cleaned up here:
%% the argument is ignored and ok is returned.
deregister_cleanup(_Registry) ->
    ok.
28+
29+
%% Collector callback that reports which node-local alarms are in effect.
%%
%% Calls rabbit_alarm:get_local_alarms/1 and, when it returns a list, passes
%% three untyped metric families to Callback (names are built with
%% ?METRIC_NAME/1):
%%   file_descriptor_limit, disk_limit, memory_limit
%% Each carries the value 1 when a matching alarm is present in the returned
%% list and 0 otherwise.
%%
%% Always returns ok. If the alarm query returns a non-list value, or an
%% exit of the form {timeout, _} is raised anywhere inside the try body, the
%% failure is logged and no alarm metrics are emitted for that scrape.
-spec collect_mf(_Registry, Callback) -> ok
    when _Registry :: prometheus_registry:registry(),
         Callback :: prometheus_collector:callback().
collect_mf(_Registry, Callback) ->
    try
        %% NOTE(review): 500 is presumably a timeout in milliseconds -- confirm
        %% against rabbit_alarm:get_local_alarms/1.
        case rabbit_alarm:get_local_alarms(500) %% TODO: figure out timeout
        of
            Alarms when is_list(Alarms) ->
                %% Collapse the alarm list into a map holding 1 for every
                %% alarm kind that is currently present.
                ActiveAlarms =
                    lists:foldl(fun ({{resource_limit, disk, _}, _}, Acc) ->
                                        maps:put(disk_limit, 1, Acc);
                                    ({{resource_limit, memory, _}, _}, Acc) ->
                                        maps:put(memory_limit, 1, Acc);
                                    ({file_descriptor_limit, _}, Acc) ->
                                        maps:put(file_descriptor_limit, 1, Acc);
                                    (_UnexportedAlarm, Acc) ->
                                        %% Alarm shapes this collector does not
                                        %% export are skipped. Without this
                                        %% clause they would raise
                                        %% function_clause, which the catch
                                        %% below does not handle.
                                        Acc
                                end,
                                #{},
                                Alarms),

                %% Kinds absent from the map default to 0, so every metric is
                %% emitted on every successful scrape.
                Callback(create_mf(?METRIC_NAME(<<"file_descriptor_limit">>),
                                   <<"is 1 if file descriptor limit alarm is in effect">>,
                                   untyped,
                                   [untyped_metric(maps:get(file_descriptor_limit,
                                                            ActiveAlarms,
                                                            0))])),
                Callback(create_mf(?METRIC_NAME(<<"disk_limit">>),
                                   <<"is 1 if disk alarm is in effect">>,
                                   untyped,
                                   [untyped_metric(maps:get(disk_limit, ActiveAlarms, 0))])),
                Callback(create_mf(?METRIC_NAME(<<"memory_limit">>),
                                   <<"is 1 if memory alarm is in effect">>,
                                   untyped,
                                   [untyped_metric(maps:get(memory_limit, ActiveAlarms, 0))])),
                ok;
            Error ->
                rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
                                 "rabbit_alarm:get_local_alarms returned ~p",
                                 [Error]),
                %% We are not going to render any alarm metrics here.
                %% Breaks continuity but at least doesn't crash the
                %% whole scraping endpoint
                ok
        end
    catch
        exit:{timeout, _} ->
            rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
                             "rabbit_alarm:get_local_alarms timed out"),
            %% We are not going to render any alarm metrics here.
            %% Breaks continuity but at least doesn't crash the
            %% whole scraping endpoint
            ok
    end.

deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ build_dispatcher() ->
1515
{ok, _} = application:ensure_all_started(prometheus),
1616
prometheus_registry:register_collectors([
1717
prometheus_rabbitmq_core_metrics_collector,
18-
prometheus_rabbitmq_global_metrics_collector]),
18+
prometheus_rabbitmq_global_metrics_collector,
19+
prometheus_rabbitmq_alarm_metrics_collector]),
1920
prometheus_registry:register_collectors('per-object', [
2021
prometheus_vm_system_info_collector,
2122
prometheus_vm_dist_collector,

release-notes/3.9.4.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,15 @@ consistent release schedule.
1515

1616
#### Enhancements
1717

18+
* New Prometheus metrics for alarms:
19+
* `rabbitmq_alarms_file_descriptor_limit` 1|0
20+
* `rabbitmq_alarms_disk_limit` 1|0
21+
* `rabbitmq_alarms_memory_limit` 1|0
22+
23+
While some of the alarms have cluster-wide effect, these metrics are node-local.
24+
25+
GitHub issue: [#2653](https://github.com/rabbitmq/rabbitmq-server/pull/2653)
26+
1827
* Nodes will now use four more environment variables, if set: `RABBITMQ_DEFAULT_USER` (overrides `default_user` in `rabbitmq.conf`), `RABBITMQ_DEFAULT_PASS` (overrides `default_pass`), `RABBITMQ_DEFAULT_VHOST` (overrides `default_vhost`) and `RABBITMQ_ERLANG_COOKIE` (sets [shared authentication secret value](https://www.rabbitmq.com/clustering.html#erlang-cookie)).
1928
These variables **are not recommended to be used in production** but can be the only realistic option in some environments, such as service containers, ECS, and so on.
2029
Most users should continue using `rabbitmq.conf` and a securely generated local cookie file.

0 commit comments

Comments
 (0)