Skip to content

Add health checks for testing readiness to serve clients #13879

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions deps/rabbitmq_management/priv/www/api/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1252,6 +1252,41 @@ <h2>Reference</h2>
Service Unavailable.
</td>
</tr>
<tr>
<td>X</td>
<td></td>
<td></td>
<td></td>
<td class="path">/api/health/checks/below-node-connection-limit</td>
<td>
Responds a 200 OK if the target node has fewer connections to the AMQP
and AMQPS ports than the configured maximum, otherwise responds with a
503 Service Unavailable.
</td>
</tr>
<tr>
<td>X</td>
<td></td>
<td></td>
<td></td>
<td class="path">/api/health/checks/ready-to-serve-clients</td>
<td>
<p>
Responds a 200 OK if the target node is ready to serve clients, otherwise
responds with a 503 Service Unavailable. This check combines:
</p>
<ol>
<li>/api/health/checks/is-in-service</li>
<li>/api/health/checks/protocol-listener/amqp or /api/health/checks/protocol-listener/amqps</li>
<li>/api/health/checks/below-node-connection-limit</li>
</ol>
<p>
So this check will only return 200 OK if the target node is in service,
an AMQP or AMQPS listener is available and the target node has fewer active
AMQP and AMQPS connections than its configured limit.
</p>
</td>
</tr>
<tr>
<td>X</td>
<td></td>
Expand Down
2 changes: 2 additions & 0 deletions deps/rabbitmq_management/src/rabbit_mgmt_dispatcher.erl
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ dispatcher() ->
{"/health/checks/quorum-queues-without-elected-leaders/vhost/:vhost/pattern/:pattern", rabbit_mgmt_wm_health_check_quorum_queues_without_elected_leaders, []},
{"/health/checks/node-is-quorum-critical", rabbit_mgmt_wm_health_check_node_is_quorum_critical, []},
{"/health/checks/is-in-service", rabbit_mgmt_wm_health_check_is_in_service, []},
{"/health/checks/below-node-connection-limit", rabbit_mgmt_wm_health_check_below_node_connection_limit, []},
{"/health/checks/ready-to-serve-clients", rabbit_mgmt_wm_health_check_ready_to_serve_clients, []},
{"/reset", rabbit_mgmt_wm_reset, []},
{"/reset/:node", rabbit_mgmt_wm_reset, []},
{"/rebalance/queues", rabbit_mgmt_wm_rebalance_queues, [{queues, all}]},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
%%

-module(rabbit_mgmt_wm_health_check_below_node_connection_limit).

-export([init/2]).
-export([to_json/2, content_types_provided/2]).
-export([variances/2]).

-include("rabbit_mgmt.hrl").
-include_lib("rabbitmq_management_agent/include/rabbit_mgmt_records.hrl").

%% Handler initialisation: passes the request through
%% rabbit_mgmt_headers:set_common_permission_headers/2 and then
%% rabbit_mgmt_headers:set_no_cache_headers/2, and hands control to
%% cowboy_rest with an empty #context{} as the handler state.
init(Req, _State) ->
    WithPermissionHeaders =
        rabbit_mgmt_headers:set_common_permission_headers(Req, ?MODULE),
    WithNoCacheHeaders =
        rabbit_mgmt_headers:set_no_cache_headers(WithPermissionHeaders, ?MODULE),
    {cowboy_rest, WithNoCacheHeaders, #context{}}.

%% Returns the header names `accept-encoding` and `origin` as the variances
%% of this resource; the request and the context are passed through untouched.
variances(Req, Context) ->
    VaryingHeaders = [<<"accept-encoding">>, <<"origin">>],
    {VaryingHeaders, Req, Context}.

%% Advertises the content types this resource can produce: whatever
%% rabbit_mgmt_util:responder_map/1 returns for the to_json/2 callback.
content_types_provided(ReqData, Context) ->
    Responders = rabbit_mgmt_util:responder_map(to_json),
    {Responders, ReqData, Context}.

%% Produces the health check response.
%%
%% Sums the active connection counts of the `amqp` and `amqp/ssl` listeners
%% and compares the total against the `connection_max` setting of the
%% `rabbit` application (default: `infinity`).
%%
%% * below the limit: a regular reply with `status => ok`;
%% * otherwise: the request is stopped with ?HEALTH_CHECK_FAILURE_STATUS and
%%   a body that carries `status => failed` and a reason.
%%
%% Both bodies include the `limit` and `connections` values.
to_json(ReqData, Context) ->
    Connections = lists:sum(
                    [protocol_connection_count(Protocol)
                     || Protocol <- [amqp, 'amqp/ssl']]),
    Max = rabbit_misc:get_env(rabbit, connection_max, infinity),
    Counters = #{limit => Max, connections => Connections},
    if
        %% In Erlang's term order every number sorts before every atom, so
        %% this also holds for any count when Max is the atom `infinity`.
        Connections < Max ->
            rabbit_mgmt_util:reply(Counters#{status => ok}, ReqData, Context);
        true ->
            Failure = Counters#{status => failed,
                                reason => <<"node connection limit is reached">>},
            {Payload, ReqData1, Context1} =
                rabbit_mgmt_util:reply(Failure, ReqData, Context),
            Stopped = cowboy_req:reply(
                        ?HEALTH_CHECK_FAILURE_STATUS, #{}, Payload, ReqData1),
            {stop, Stopped, Context1}
    end.

%% Number of active connections on the listener of `Protocol`.
%% A protocol for which rabbit_networking:ranch_ref_of_protocol/1 returns
%% `undefined` contributes 0.
protocol_connection_count(Protocol) ->
    active_connections(rabbit_networking:ranch_ref_of_protocol(Protocol)).

%% Reads the `active_connections` entry of ranch:info/1 for a listener ref.
active_connections(undefined) ->
    0;
active_connections(Listener) ->
    #{active_connections := Active} = ranch:info(Listener),
    Active.
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
%%

%% A composite health check that combines:
%% * GET /api/health/checks/is-in-service
%% * GET /api/health/checks/protocol-listener/amqp or
%%   GET /api/health/checks/protocol-listener/amqps
%% * GET /api/health/checks/below-node-connection-limit

-module(rabbit_mgmt_wm_health_check_ready_to_serve_clients).

-export([init/2]).
-export([to_json/2, content_types_provided/2]).
-export([variances/2]).

-include("rabbit_mgmt.hrl").
-include_lib("rabbitmq_management_agent/include/rabbit_mgmt_records.hrl").

%% Handler initialisation: passes the request through
%% rabbit_mgmt_headers:set_common_permission_headers/2 and then
%% rabbit_mgmt_headers:set_no_cache_headers/2, and hands control to
%% cowboy_rest with an empty #context{} as the handler state.
init(Req, _State) ->
    WithPermissionHeaders =
        rabbit_mgmt_headers:set_common_permission_headers(Req, ?MODULE),
    WithNoCacheHeaders =
        rabbit_mgmt_headers:set_no_cache_headers(WithPermissionHeaders, ?MODULE),
    {cowboy_rest, WithNoCacheHeaders, #context{}}.

%% Returns the header names `accept-encoding` and `origin` as the variances
%% of this resource; the request and the context are passed through untouched.
variances(Req, Context) ->
    VaryingHeaders = [<<"accept-encoding">>, <<"origin">>],
    {VaryingHeaders, Req, Context}.

%% Advertises the content types this resource can produce: whatever
%% rabbit_mgmt_util:responder_map/1 returns for the to_json/2 callback.
content_types_provided(ReqData, Context) ->
    Responders = rabbit_mgmt_util:responder_map(to_json),
    {Responders, ReqData, Context}.

%% Produces the health check response from the outcome of check/0.
%%
%% * `{ok, Body}`: a regular reply carrying Body;
%% * `{error, Body}`: the request is stopped with
%%   ?HEALTH_CHECK_FAILURE_STATUS and Body as the response payload.
to_json(ReqData, Context) ->
    {Outcome, Body} = check(),
    Replied = rabbit_mgmt_util:reply(Body, ReqData, Context),
    case Outcome of
        ok ->
            Replied;
        error ->
            {Payload, ReqData1, Context1} = Replied,
            Stopped = cowboy_req:reply(
                        ?HEALTH_CHECK_FAILURE_STATUS, #{}, Payload, ReqData1),
            {stop, Stopped, Context1}
    end.

%% Runs the composite readiness check and returns `{ok, Body}` or
%% `{error, Body}`, where Body is the map to send to the client.
%%
%% The sub-checks run in order and the first failure wins:
%%   1. the node reports that it is serving (rabbit:is_serving/0);
%%   2. at least one of the `amqp` / `amqp/ssl` listeners is known;
%%   3. the active connections on those listeners are below `connection_max`.
check() ->
    case rabbit:is_serving() of
        true ->
            check_listeners();
        false ->
            {error, #{status => failed,
                      reason => <<"the rabbit node is not currently available to serve">>}}
    end.

%% Collects the ranch refs of the AMQP and AMQPS listeners, dropping the
%% protocols for which no ref is registered (`undefined`). Fails when neither
%% is present, otherwise continues with the connection limit check.
check_listeners() ->
    RanchRefs = [Ref || Ref <- [rabbit_networking:ranch_ref_of_protocol(amqp),
                                rabbit_networking:ranch_ref_of_protocol('amqp/ssl')],
                        Ref =/= undefined],
    case RanchRefs of
        [_ | _] ->
            check_connection_limit(RanchRefs);
        [] ->
            {error, #{status => failed,
                      reason => <<"no active listeners for AMQP/AMQPS">>}}
    end.

%% Compares the total number of active connections across RanchRefs with the
%% `connection_max` setting of the `rabbit` application (default `infinity`).
check_connection_limit(RanchRefs) ->
    ActiveConns = lists:sum([listener_connection_count(Ref) || Ref <- RanchRefs]),
    Limit = rabbit_misc:get_env(rabbit, connection_max, infinity),
    %% In Erlang's term order every number sorts before every atom, so the
    %% comparison is true for any count when Limit is the atom `infinity`.
    case ActiveConns < Limit of
        true ->
            {ok, #{status => ok,
                   limit => Limit,
                   connections => ActiveConns}};
        false ->
            {error, #{status => failed,
                      reason => <<"node connection limit is reached">>,
                      limit => Limit,
                      connections => ActiveConns}}
    end.

%% Reads the `active_connections` entry of ranch:info/1 for one listener.
listener_connection_count(RanchRef) ->
    #{active_connections := Count} = ranch:info(RanchRef),
    Count.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ all_tests() -> [
protocol_listener_test,
port_listener_test,
certificate_expiration_test,
is_in_service_test
is_in_service_test,
below_node_connection_limit_test,
ready_to_serve_clients_test
].

%% -------------------------------------------------------------------
Expand Down Expand Up @@ -470,8 +472,66 @@ is_in_service_test(Config) ->

passed.

%% Exercises GET /health/checks/below-node-connection-limit:
%%   * with the default configuration the check passes and reports zero
%%     connections and an "infinity" limit;
%%   * once `connection_max` connections are open the check fails and reports
%%     the limit and the connection count.
%%
%% The cleanup runs in an `after` section so that a failing assertion does not
%% leave open connections or a lowered `connection_max` behind for the test
%% cases that run afterwards.
below_node_connection_limit_test(Config) ->
    Path = "/health/checks/below-node-connection-limit",
    Check0 = http_get(Config, Path, ?OK),
    ?assertEqual(<<"ok">>, maps:get(status, Check0)),
    ?assertEqual(0, maps:get(connections, Check0)),
    ?assertEqual(<<"infinity">>, maps:get(limit, Check0)),

    %% Set the connection limit low and open 'limit' connections.
    Limit = 10,
    rabbit_ct_broker_helpers:rpc(
      Config, 0, application, set_env, [rabbit, connection_max, Limit]),
    Connections = [rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0)
                   || _ <- lists:seq(1, Limit)],
    try
        true = lists:all(fun(E) -> is_pid(E) end, Connections),
        %% One more connection than the limit must be refused.
        {error, not_allowed} =
            rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0),

        Body0 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body0)),
        ?assertEqual(Limit, maps:get(<<"limit">>, Body0)),
        ?assertEqual(Limit, maps:get(<<"connections">>, Body0))
    after
        %% Clean up the connections and reset the limit. Closing is best
        %% effort: an entry may be an error tuple rather than a pid.
        lists:foreach(
          fun(C) -> catch rabbit_ct_client_helpers:close_connection(C) end,
          Connections),
        rabbit_ct_broker_helpers:rpc(
          Config, 0, application, set_env, [rabbit, connection_max, infinity])
    end,

    passed.

%% Exercises GET /health/checks/ready-to-serve-clients:
%%   * passes on a node in its default state;
%%   * fails while the node is marked as being drained;
%%   * fails once `connection_max` connections are open, reporting the limit
%%     and the connection count.
%%
%% Each state change is undone in an `after` section so that a failing
%% assertion does not leave the node drained, connections open or
%% `connection_max` lowered for the test cases that run afterwards.
ready_to_serve_clients_test(Config) ->
    Path = "/health/checks/ready-to-serve-clients",
    Check0 = http_get(Config, Path, ?OK),
    ?assertEqual(<<"ok">>, maps:get(status, Check0)),

    true = rabbit_ct_broker_helpers:mark_as_being_drained(Config, 0),
    try
        Body0 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body0))
    after
        true = rabbit_ct_broker_helpers:unmark_as_being_drained(Config, 0)
    end,

    %% Set the connection limit low and open 'limit' connections.
    Limit = 10,
    rabbit_ct_broker_helpers:rpc(
      Config, 0, application, set_env, [rabbit, connection_max, Limit]),
    Connections = [rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0)
                   || _ <- lists:seq(1, Limit)],
    try
        true = lists:all(fun(E) -> is_pid(E) end, Connections),
        %% One more connection than the limit must be refused.
        {error, not_allowed} =
            rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0),

        Body1 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body1)),
        ?assertEqual(Limit, maps:get(<<"limit">>, Body1)),
        ?assertEqual(Limit, maps:get(<<"connections">>, Body1))
    after
        %% Clean up the connections and reset the limit. Closing is best
        %% effort: an entry may be an error tuple rather than a pid.
        lists:foreach(
          fun(C) -> catch rabbit_ct_client_helpers:close_connection(C) end,
          Connections),
        rabbit_ct_broker_helpers:rpc(
          Config, 0, application, set_env, [rabbit, connection_max, infinity])
    end,

    passed.

%% Issues an authenticated (guest/guest) GET for Path, asserts that the
%% response status is ?HEALTH_CHECK_FAILURE_STATUS and returns the decoded
%% JSON body.
http_get_failed(Config, Path) ->
    {ok, {{_, Code, _}, _, ResBody}} = req(Config, get, Path, [auth_header("guest", "guest")]),
    ct:pal("GET ~s: ~w ~w", [Path, Code, ResBody]),
    %% ?assertEqual takes the expected value first; with the arguments the
    %% other way round a failure report labels the two values incorrectly.
    ?assertEqual(?HEALTH_CHECK_FAILURE_STATUS, Code),
    rabbit_json:decode(rabbit_data_coercion:to_binary(ResBody)).

Expand Down