Skip to content

Commit 4632a5a

Browse files
committed
Handle timeouts in transient queue deletion
Transient queue deletion previously caused a crash if Khepri was enabled and a node with a transient queue went down while its cluster was in a minority. We need to handle the `{error,timeout}` value that `rabbit_db_queue:delete_transient/1` can return. In the `rabbit_amqqueue:on_node_down/1` callback we log a warning when we see this return value. We then retry the deletion during that node's `rabbit_khepri:init/0`, which is called from a boot step after `rabbit_khepri:setup/0`. At that point we can return an error and halt the node's boot if the command times out. The cluster is very likely to have a majority at that point, since `rabbit_khepri:setup/0` waits for a leader to be elected (which requires a majority). This fixes a crash report found in the `cluster_minority_SUITE`'s `end_per_group`. (cherry picked from commit 3f734ef) (cherry picked from commit 006f517)
1 parent 62bb6ec commit 4632a5a

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

deps/rabbit/src/rabbit_amqqueue.erl

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
-export([queue/1, queue_names/1]).
7676

7777
-export([kill_queue/2, kill_queue/3, kill_queue_hard/2, kill_queue_hard/3]).
78+
-export([delete_transient_queues_on_node/1]).
7879

7980
%% internal
8081
-export([internal_declare/2, internal_delete/2, run_backing_queue/3,
@@ -2055,13 +2056,39 @@ maybe_clear_recoverable_node(Node) ->
20552056
-spec on_node_down(node()) -> 'ok'.
20562057

20572058
%% Node-down callback: asks `delete_transient_queues_on_node/1' to delete the
%% transient queue records for `Node'. Always returns `ok'; an
%% `{error, timeout}' result is logged as a warning instead of being
%% propagated to the caller.
on_node_down(Node) ->
2059+
case delete_transient_queues_on_node(Node) of
2060+
ok ->
2061+
ok;
2062+
{error, timeout} ->
2063+
%% This case is possible when running Khepri. The node going down
2064+
%% could leave the cluster in a minority so the command to delete
2065+
%% the transient queue records would fail. Also see
2066+
%% `rabbit_khepri:init/0': we also try this deletion when the node
2067+
%% restarts - a time at which the cluster is very likely to have a
2068+
%% majority - to ensure these records are deleted.
2069+
rabbit_log:warning("transient queues for node '~ts' could not be "
2070+
"deleted because of a timeout. These queues "
2071+
"will be removed when node '~ts' restarts or "
2072+
"is removed from the cluster.", [Node, Node]),
2073+
ok
2074+
end.
2075+
2076+
-spec delete_transient_queues_on_node(Node) -> Ret when
2077+
Node :: node(),
2078+
Ret :: ok | rabbit_khepri:timeout_error().
2079+
2080+
delete_transient_queues_on_node(Node) ->
20582081
{Time, Ret} = timer:tc(fun() -> rabbit_db_queue:delete_transient(filter_transient_queues_to_delete(Node)) end),
20592082
case Ret of
2060-
ok -> ok;
2061-
{QueueNames, Deletions} ->
2083+
ok ->
2084+
ok;
2085+
{error, timeout} = Err ->
2086+
Err;
2087+
{QueueNames, Deletions} when is_list(QueueNames) ->
20622088
case length(QueueNames) of
20632089
0 -> ok;
2064-
N -> rabbit_log:info("~b transient queues from an old incarnation of node ~tp deleted in ~fs",
2090+
N -> rabbit_log:info("~b transient queues from node '~ts' "
2091+
"deleted in ~fs",
20652092
[N, Node, Time / 1_000_000])
20662093
end,
20672094
notify_queue_binding_deletions(Deletions),

deps/rabbit/src/rabbit_khepri.erl

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ wait_for_register_projections(Timeout, Retries) ->
325325
%% @private
326326

327327
-spec init() -> Ret when
328-
Ret :: ok.
328+
Ret :: ok | timeout_error().
329329

330330
init() ->
331331
case members() of
@@ -336,7 +336,14 @@ init() ->
336336
?LOG_NOTICE(
337337
"Found the following metadata store members: ~p", [Members],
338338
#{domain => ?RMQLOG_DOMAIN_DB}),
339-
ok
339+
%% Delete transient queues on init.
340+
%% Note that we also do this in the
341+
%% `rabbit_amqqueue:on_node_down/1' callback. We must try this
342+
%% deletion during init because the cluster may have been in a
343+
%% minority when this node went down. We wait for a majority while
344+
%% booting (via `rabbit_khepri:setup/0') though so this deletion is
345+
%% likely to succeed.
346+
rabbit_amqqueue:delete_transient_queues_on_node(node())
340347
end.
341348

342349
%% @private

0 commit comments

Comments
 (0)