Skip to content

Commit e72d911

Browse files
committed
rabbit_peer_discovery: Compute start time once
... and cache it. [Why] It happens at least in CI that the computed start time varies by a few seconds. I think this comes from the Erlang time offset which might be adjusted over time. This affects peer discovery's sorting of RabbitMQ nodes which uses that start time to determine the oldest node. When the start time of a node changes, it could be considered the seed node to join by some nodes but ignored by the other nodes, leading to troubles with cluster formation.
1 parent 09f1ab4 commit e72d911

File tree

3 files changed

+75
-14
lines changed

3 files changed

+75
-14
lines changed

deps/rabbit/src/rabbit_peer_discovery.erl

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -637,8 +637,7 @@ query_node_props2([{Node, Members} | Rest], NodesAndProps, FromNode) ->
637637
["Peer discovery: temporary hidden node '~ts' "
638638
"queries properties from node '~ts'",
639639
[node(), Node]], FromNode),
640-
StartTime = get_node_start_time(
641-
Node, microsecond, FromNode),
640+
StartTime = get_node_start_time(Node, FromNode),
642641
IsReady = is_node_db_ready(Node, FromNode),
643642
NodeAndProps = {Node, Members, StartTime, IsReady},
644643
NodesAndProps1 = [NodeAndProps | NodesAndProps],
@@ -666,9 +665,8 @@ query_node_props2([], NodesAndProps, _FromNode) ->
666665
?assert(length(NodesAndProps1) =< length(nodes(hidden))),
667666
NodesAndProps1.
668667

669-
-spec get_node_start_time(Node, Unit, FromNode) -> StartTime when
668+
-spec get_node_start_time(Node, FromNode) -> StartTime when
670669
Node :: node(),
671-
Unit :: erlang:time_unit(),
672670
FromNode :: node(),
673671
StartTime :: non_neg_integer().
674672
%% @doc Returns the start time of the given `Node' in `Unit'.
@@ -689,15 +687,21 @@ query_node_props2([], NodesAndProps, _FromNode) ->
689687
%%
690688
%% @private
691689

692-
get_node_start_time(Node, Unit, FromNode) ->
693-
NativeStartTime = erpc_call(
694-
Node, erlang, system_info, [start_time], FromNode),
695-
TimeOffset = erpc_call(Node, erlang, time_offset, [], FromNode),
696-
SystemStartTime = NativeStartTime + TimeOffset,
697-
StartTime = erpc_call(
698-
Node, erlang, convert_time_unit,
699-
[SystemStartTime, native, Unit], FromNode),
700-
StartTime.
690+
get_node_start_time(Node, FromNode) ->
691+
try
692+
erpc_call(Node,rabbit_boot_state, get_start_time, [], FromNode)
693+
catch
694+
error:{exception, _, _} ->
695+
NativeStartTime = erpc_call(
696+
Node, erlang, system_info, [start_time],
697+
FromNode),
698+
TimeOffset = erpc_call(Node, erlang, time_offset, [], FromNode),
699+
SystemStartTime = NativeStartTime + TimeOffset,
700+
StartTime = erpc_call(
701+
Node, erlang, convert_time_unit,
702+
[SystemStartTime, native, microsecond], FromNode),
703+
StartTime
704+
end.
701705

702706
-spec is_node_db_ready(Node, FromNode) -> IsReady when
703707
Node :: node(),

deps/rabbitmq_prelaunch/src/rabbit_boot_state.erl

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@
1717
set/1,
1818
wait_for/2,
1919
has_reached/1,
20-
has_reached_and_is_active/1]).
20+
has_reached_and_is_active/1,
21+
get_start_time/0,
22+
record_start_time/0]).
2123

2224
-define(PT_KEY_BOOT_STATE, {?MODULE, boot_state}).
25+
-define(PT_KEY_START_TIME, {?MODULE, start_time}).
2326

2427
-type boot_state() :: stopped |
2528
booting |
@@ -95,3 +98,56 @@ has_reached_and_is_active(TargetBootState) ->
9598
andalso
9699
not has_reached(CurrentBootState, stopping)
97100
end.
101+
102+
-spec get_start_time() -> StartTime when
103+
StartTime :: integer().
104+
%% @doc Returns the start time of the Erlang VM.
105+
%%
106+
%% This time was recorded by {@link record_start_time/0} as early as possible
107+
%% and is immutable.
108+
109+
get_start_time() ->
110+
persistent_term:get(?PT_KEY_START_TIME).
111+
112+
-spec record_start_time() -> ok.
113+
%% @doc Records the start time of the Erlang VM.
114+
%%
115+
%% The time is expressed in microseconds since Epoch. It can be compared to
116+
%% other non-native times. This is used by the Peer Discovery subsystem to
117+
%% sort nodes and select a seed node if the peer discovery backend did not
118+
%% select one.
119+
%%
120+
%% This time is recorded once. Calling this function multiple times won't
121+
%% overwrite the value.
122+
123+
record_start_time() ->
124+
Key = ?PT_KEY_START_TIME,
125+
try
126+
%% Check if the start time was recorded.
127+
_ = persistent_term:get(Key),
128+
ok
129+
catch
130+
error:badarg ->
131+
%% The start time was not recorded yet. Acquire a lock and check
132+
%% again in case another process got the lock first and recorded
133+
%% the start time.
134+
Node = node(),
135+
LockId = {?PT_KEY_START_TIME, self()},
136+
true = global:set_lock(LockId, [Node]),
137+
try
138+
_ = persistent_term:get(Key),
139+
ok
140+
catch
141+
error:badarg ->
142+
%% We are really the first to get the lock and we can
143+
%% record the start time.
144+
NativeStartTime = erlang:system_info(start_time),
145+
TimeOffset = erlang:time_offset(),
146+
SystemStartTime = NativeStartTime + TimeOffset,
147+
StartTime = erlang:convert_time_unit(
148+
SystemStartTime, native, microsecond),
149+
persistent_term:put(Key, StartTime)
150+
after
151+
global:del_lock(LockId, [Node])
152+
end
153+
end.

deps/rabbitmq_prelaunch/src/rabbit_prelaunch.erl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ run_prelaunch_first_phase() ->
4848
do_run() ->
4949
%% Indicate RabbitMQ is booting.
5050
clear_stop_reason(),
51+
rabbit_boot_state:record_start_time(),
5152
rabbit_boot_state:set(booting),
5253

5354
%% Configure dbg if requested.

0 commit comments

Comments
 (0)