Skip to content

Commit 0d9d3d7

Browse files
mkuratczyk and Loïc Hoguin committed
Use erlang:system_info(creation) as GUID
The node GUID makes it possible to differentiate between different incarnations of a node. However, since rabbit may take some time to start (many queues/bindings, etc.), there could be a significant delay between the Erlang VM being up and responding to RPC requests and the new GUID being announced. During that time, the node monitor could incorrectly assume there was a network partition, while in fact the node was simply restarted. With this change, as soon as the Erlang VM is up, we can tell whether it was restarted and avoid false positives. Additionally, we now log if any queues were deleted on behalf of the restarted node. This can take quite a long time if there are many transient queues (e.g. auto-delete queues). The longer this takes, the higher the odds of a restarted node being up again by the time check_partial_partition is called. We may need to reconsider this logic as well, but for now we just log this activity. Co-authored-by: Loïc Hoguin <[email protected]>
1 parent f69aaf3 commit 0d9d3d7

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

deps/rabbit/src/rabbit_amqqueue.erl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1905,7 +1905,11 @@ maybe_clear_recoverable_node(Node, Q) ->
19051905
-spec on_node_down(node()) -> 'ok'.
19061906

19071907
on_node_down(Node) ->
1908-
{QueueNames, QueueDeletions} = delete_queues_on_node_down(Node),
1908+
{Time, {QueueNames, QueueDeletions}} = timer:tc(fun() -> delete_queues_on_node_down(Node) end),
1909+
case length(QueueNames) of
1910+
0 -> ok;
1911+
_ -> rabbit_log:info("~p transient queues from an old incarnation of node ~p deleted in ~fs", [length(QueueNames), Node, Time/1000000])
1912+
end,
19091913
notify_queue_binding_deletions(QueueDeletions),
19101914
rabbit_core_metrics:queues_deleted(QueueNames),
19111915
notify_queues_deleted(QueueNames),

deps/rabbit/src/rabbit_node_monitor.erl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ init([]) ->
366366
{ok, ensure_keepalive_timer(#state{monitors = Monitors,
367367
subscribers = pmon:new(),
368368
partitions = [],
369-
guid = rabbit_guid:gen(),
369+
guid = erlang:system_info(creation),
370370
node_guids = maps:new(),
371371
autoheal = rabbit_autoheal:init()})}.
372372

@@ -416,6 +416,13 @@ handle_cast(notify_node_up, State = #state{guid = GUID}) ->
416416
%% disconnected, it would become a minority, pause, realise it's not
417417
%% in a minority any more, and come back, still partitioned (albeit no
418418
%% longer partially).
419+
%%
420+
%% UPDATE: The GUID is actually not a GUID anymore - it is the value
421+
%% returned by erlang:system_info(creation). This prevents false positives
422+
%% in a situation when a node is restarted (Erlang VM is up) but the rabbit
423+
%% app is not yet up. The GUID was only generated and announced upon rabbit
424+
%% startup; creation is available immediately. Therefore we can tell that
425+
%% the node was restarted, before it announces the new value.
419426
%% ----------------------------------------------------------------------------
420427

421428
handle_cast({node_up, Node, NodeType, GUID},
@@ -435,15 +442,18 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID},
435442
maps:find(Node, GUIDs) =:= {ok, NodeGUID} of
436443
true -> spawn_link( %%[1]
437444
fun () ->
438-
case rpc:call(Node, rabbit, is_running, []) of
445+
case rpc:call(Node, erlang, system_info, [creation]) of
439446
{badrpc, _} -> ok;
440-
_ ->
447+
NodeGUID ->
441448
rabbit_log:warning("Received a 'DOWN' message"
442449
" from ~p but still can"
443450
" communicate with it ",
444451
[Node]),
445452
cast(Rep, {partial_partition,
446-
Node, node(), RepGUID})
453+
Node, node(), RepGUID});
454+
_ ->
455+
rabbit_log:warning("Node ~p was restarted", [Node]),
456+
ok
447457
end
448458
end);
449459
false -> ok

0 commit comments

Comments
 (0)