Skip to content

Commit 3f734ef

Browse files
committed
Handle timeouts in transient queue deletion
Transient queue deletion previously caused a crash if Khepri was enabled and a node with a transient queue went down while its cluster was in a minority. We need to handle the `{error, timeout}` value that `rabbit_db_queue:delete_transient/1` can return. In the `rabbit_amqqueue:on_node_down/1` callback we now log a warning when we see this return value. We then retry the deletion during that node's `rabbit_khepri:init/0`, which is called from a boot step after `rabbit_khepri:setup/0`. At that point we can return an error and halt the node's boot if the command times out. The cluster is very likely to have a majority by then, since `rabbit_khepri:setup/0` waits for a leader to be elected (which requires a majority). This fixes a crash report found in the `cluster_minority_SUITE`'s `end_per_group`.
1 parent 0dd26f0 commit 3f734ef

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

deps/rabbit/src/rabbit_amqqueue.erl

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
-export([queue/1, queue_names/1]).
7171

7272
-export([kill_queue/2, kill_queue/3, kill_queue_hard/2, kill_queue_hard/3]).
73+
-export([delete_transient_queues_on_node/1]).
7374

7475
%% internal
7576
-export([internal_declare/2, internal_delete/2, run_backing_queue/3,
@@ -1839,13 +1840,39 @@ on_node_up(_Node) ->
18391840
-spec on_node_down(node()) -> 'ok'.
18401841

18411842
on_node_down(Node) ->
1843+
case delete_transient_queues_on_node(Node) of
1844+
ok ->
1845+
ok;
1846+
{error, timeout} ->
1847+
%% This case is possible when running Khepri. The node going down
1848+
%% could leave the cluster in a minority so the command to delete
1849+
%% the transient queue records would fail. Also see
1850+
%% `rabbit_khepri:init/0': we also try this deletion when the node
1851+
%% restarts - a time that the cluster is very likely to have a
1852+
%% majority - to ensure these records are deleted.
1853+
rabbit_log:warning("transient queues for node '~ts' could not be "
1854+
"deleted because of a timeout. These queues "
1855+
"will be removed when node '~ts' restarts or "
1856+
"is removed from the cluster.", [Node, Node]),
1857+
ok
1858+
end.
1859+
1860+
-spec delete_transient_queues_on_node(Node) -> Ret when
1861+
Node :: node(),
1862+
Ret :: ok | rabbit_khepri:timeout_error().
1863+
1864+
delete_transient_queues_on_node(Node) ->
18421865
{Time, Ret} = timer:tc(fun() -> rabbit_db_queue:delete_transient(filter_transient_queues_to_delete(Node)) end),
18431866
case Ret of
1844-
ok -> ok;
1845-
{QueueNames, Deletions} ->
1867+
ok ->
1868+
ok;
1869+
{error, timeout} = Err ->
1870+
Err;
1871+
{QueueNames, Deletions} when is_list(QueueNames) ->
18461872
case length(QueueNames) of
18471873
0 -> ok;
1848-
N -> rabbit_log:info("~b transient queues from an old incarnation of node ~tp deleted in ~fs",
1874+
N -> rabbit_log:info("~b transient queues from node '~ts' "
1875+
"deleted in ~fs",
18491876
[N, Node, Time / 1_000_000])
18501877
end,
18511878
notify_queue_binding_deletions(Deletions),

deps/rabbit/src/rabbit_khepri.erl

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ wait_for_register_projections(Timeout, Retries) ->
325325
%% @private
326326

327327
-spec init() -> Ret when
328-
Ret :: ok.
328+
Ret :: ok | timeout_error().
329329

330330
init() ->
331331
case members() of
@@ -336,7 +336,14 @@ init() ->
336336
?LOG_NOTICE(
337337
"Found the following metadata store members: ~p", [Members],
338338
#{domain => ?RMQLOG_DOMAIN_DB}),
339-
ok
339+
%% Delete transient queues on init.
340+
%% Note that we also do this in the
341+
%% `rabbit_amqqueue:on_node_down/1' callback. We must try this
342+
%% deletion during init because the cluster may have been in a
343+
%% minority when this node went down. We wait for a majority while
344+
%% booting (via `rabbit_khepri:setup/0') though so this deletion is
345+
%% likely to succeed.
346+
rabbit_amqqueue:delete_transient_queues_on_node(node())
340347
end.
341348

342349
%% @private

0 commit comments

Comments
 (0)