Skip to content

Commit ed0ba6a

Browse files
Merge pull request #3075 from rabbitmq/remove-randomized-startup-delays
Remove randomized startup delays
2 parents 49a70a0 + 0876746 commit ed0ba6a

19 files changed

+305
-260
lines changed

deps/rabbit/priv/schema/rabbit.schema

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -945,31 +945,37 @@ fun(Conf) ->
945945
end}.
946946

947947
%% Cluster formation: Randomized startup delay
948+
%%
949+
%% DEPRECATED: This is a no-op. Old configs are still allowed, but a warning will be printed.
948950

949-
{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range",
950-
[{datatype, integer}]}.
951-
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range",
952-
[{datatype, integer}]}.
951+
{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range", []}.
952+
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range", []}.
953953

954954
{translation, "rabbit.cluster_formation.randomized_startup_delay_range",
955955
fun(Conf) ->
956956
Min = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.min", Conf, undefined),
957957
Max = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.max", Conf, undefined),
958958

959959
case {Min, Max} of
960-
{undefined, undefined} ->
961-
cuttlefish:unset();
962-
{undefined, Max} ->
963-
%% fallback default
964-
{5, Max};
965-
{Min, undefined} ->
966-
%% fallback default
967-
{Min, 60};
968-
{Min, Max} ->
969-
{Min, Max}
970-
end
960+
{undefined, undefined} ->
961+
ok;
962+
_ ->
963+
cuttlefish:warn("cluster_formation.randomized_startup_delay_range.min and "
964+
"cluster_formation.randomized_startup_delay_range.max are deprecated")
965+
end,
966+
cuttlefish:unset()
971967
end}.
972968

969+
%% Cluster formation: number of lock acquisition retries, passed as the Retries
%% argument to global:set_lock/3 (https://erlang.org/doc/man/global.html#set_lock-3).
970+
%%
971+
%% Currently used in classic, k8s, and aws peer discovery backends.
%% The value is parsed as an integer and checked by the
%% "non_zero_positive_integer" validator; it is stored under the
%% internal_lock_retries key of the rabbit application's cluster_formation
%% environment.
972+
973+
{mapping, "cluster_formation.internal_lock_retries", "rabbit.cluster_formation.internal_lock_retries",
974+
[
975+
{datatype, integer},
976+
{validators, ["non_zero_positive_integer"]}
977+
]}.
978+
973979
%% Cluster formation: discovery failure retries
974980

975981
{mapping, "cluster_formation.lock_retry_limit", "rabbit.cluster_formation.lock_retry_limit",

deps/rabbit/src/rabbit_mnesia.erl

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,18 @@ init_with_lock(Retries, Timeout, RunPeerDiscovery) ->
105105
rabbit_log:debug("rabbit_peer_discovery:lock returned ~p", [LockResult]),
106106
case LockResult of
107107
not_supported ->
108-
rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"),
109-
%% See rabbitmq/rabbitmq-server#1202 for details.
110-
rabbit_peer_discovery:maybe_inject_randomized_delay(),
111108
RunPeerDiscovery(),
112109
rabbit_peer_discovery:maybe_register();
113-
{error, _Reason} ->
114-
timer:sleep(Timeout),
115-
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery);
116110
{ok, Data} ->
117111
try
118112
RunPeerDiscovery(),
119113
rabbit_peer_discovery:maybe_register()
120114
after
121115
rabbit_peer_discovery:unlock(Data)
122-
end
116+
end;
117+
{error, _Reason} ->
118+
timer:sleep(Timeout),
119+
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery)
123120
end.
124121

125122
-spec run_peer_discovery() -> ok | {[node()], node_type()}.
@@ -178,7 +175,7 @@ join_discovered_peers(TryNodes, NodeType) ->
178175
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval).
179176

180177
join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) ->
181-
rabbit_log:warning(
178+
rabbit_log:info(
182179
"Could not successfully contact any node of: ~s (as in Erlang distribution). "
183180
"Starting as a blank standalone node...",
184181
[string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
@@ -193,7 +190,7 @@ join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterva
193190
rabbit_node_monitor:notify_joined_cluster();
194191
none ->
195192
RetriesLeft1 = RetriesLeft - 1,
196-
rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
193+
rabbit_log:info("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
197194
[DelayInterval, RetriesLeft1]),
198195
timer:sleep(DelayInterval),
199196
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval)

deps/rabbit/src/rabbit_nodes.erl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
boot/0]).
1616
-export([persistent_cluster_id/0, seed_internal_cluster_id/0, seed_user_provided_cluster_name/0]).
1717
-export([all_running_with_hashes/0]).
18+
-export([lock_id/1, lock_retries/0]).
1819

1920
-include_lib("kernel/include/inet.hrl").
2021
-include_lib("rabbit_common/include/rabbit.hrl").
@@ -23,6 +24,12 @@
2324

2425
-define(INTERNAL_CLUSTER_ID_PARAM_NAME, internal_cluster_id).
2526

27+
% Retries as passed to https://erlang.org/doc/man/global.html#set_lock-3
28+
% To understand how retries map to the timeout, read
29+
% https://github.com/erlang/otp/blob/d256ae477014158a49bb860b283df9c040011197/lib/kernel/src/global.erl#L2062-L2075
30+
% 80 retries correspond to a timeout of approximately 300 seconds.
31+
-define(DEFAULT_LOCK_RETRIES, 80).
32+
2633
%%----------------------------------------------------------------------------
2734
%% API
2835
%%----------------------------------------------------------------------------
@@ -162,3 +169,16 @@ await_running_count_with_retries(TargetCount, Retries) ->
162169
-spec all_running_with_hashes() -> #{non_neg_integer() => node()}.
163170
all_running_with_hashes() ->
164171
maps:from_list([{erlang:phash2(Node), Node} || Node <- all_running()]).
172+
173+
%% Builds the lock identifier for Node in the {ResourceId, LockRequesterId}
%% shape accepted by global:set_lock/3: the resource is the value of
%% cookie_hash() and the requester is Node itself.
%% NOTE(review): cookie_hash/0 is defined outside this hunk; presumably nodes
%% sharing an Erlang cookie compute the same ResourceId -- confirm.
-spec lock_id(Node :: node()) -> {ResourceId :: string(), LockRequesterId :: node()}.
174+
lock_id(Node) ->
175+
{cookie_hash(), Node}.
176+
177+
%% Returns the number of lock acquisition retries to use.
%% Reads the internal_lock_retries entry from the rabbit application's
%% cluster_formation proplist and falls back to ?DEFAULT_LOCK_RETRIES when
%% either the cluster_formation environment key or the entry is missing.
-spec lock_retries() -> integer().
178+
lock_retries() ->
179+
case application:get_env(rabbit, cluster_formation) of
180+
{ok, PropList} ->
181+
%% cluster_formation is configured: use its value, or the default if
%% internal_lock_retries is absent from the proplist.
proplists:get_value(internal_lock_retries, PropList, ?DEFAULT_LOCK_RETRIES);
182+
undefined ->
183+
%% cluster_formation is not set at all.
?DEFAULT_LOCK_RETRIES
184+
end.

deps/rabbit/src/rabbit_peer_discovery.erl

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
-export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0,
1515
normalize/1, format_discovered_nodes/1, log_configured_backend/0,
1616
register/0, unregister/0, maybe_register/0, maybe_unregister/0,
17-
maybe_inject_randomized_delay/0, lock/0, unlock/1,
18-
discovery_retries/0]).
17+
lock/0, unlock/1, discovery_retries/0]).
1918
-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0,
2019
lock_acquisition_failure_mode/0]).
2120

@@ -28,9 +27,6 @@
2827
%% default node prefix to attach to discovered hostnames
2928
-define(DEFAULT_PREFIX, "rabbit").
3029

31-
%% default randomized delay range, in seconds
32-
-define(DEFAULT_STARTUP_RANDOMIZED_DELAY, {5, 60}).
33-
3430
%% default discovery retries and interval.
3531
-define(DEFAULT_DISCOVERY_RETRY_COUNT, 10).
3632
-define(DEFAULT_DISCOVERY_RETRY_INTERVAL_MS, 500).
@@ -159,61 +155,6 @@ discovery_retries() ->
159155
{?DEFAULT_DISCOVERY_RETRY_COUNT, ?DEFAULT_DISCOVERY_RETRY_INTERVAL_MS}
160156
end.
161157

162-
163-
-spec maybe_inject_randomized_delay() -> ok.
164-
maybe_inject_randomized_delay() ->
165-
Backend = backend(),
166-
case Backend:supports_registration() of
167-
true ->
168-
rabbit_log:info("Peer discovery backend ~s supports registration.", [Backend]),
169-
inject_randomized_delay();
170-
false ->
171-
rabbit_log:info("Peer discovery backend ~s does not support registration, skipping randomized startup delay.", [Backend]),
172-
ok
173-
end.
174-
175-
-spec inject_randomized_delay() -> ok.
176-
177-
inject_randomized_delay() ->
178-
{Min, Max} = randomized_delay_range_in_ms(),
179-
case {Min, Max} of
180-
%% When the max value is set to 0, consider the delay to be disabled.
181-
%% In addition, `rand:uniform/1` will fail with a "no function clause"
182-
%% when the argument is 0.
183-
{_, 0} ->
184-
rabbit_log:info("Randomized delay range's upper bound is set to 0. Considering it disabled."),
185-
ok;
186-
{_, N} when is_number(N) ->
187-
rand:seed(exsplus),
188-
RandomVal = rand:uniform(round(N)),
189-
rabbit_log:debug("Randomized startup delay: configured range is from ~p to ~p milliseconds, PRNG pick: ~p...",
190-
[Min, Max, RandomVal]),
191-
Effective = case RandomVal < Min of
192-
true -> Min;
193-
false -> RandomVal
194-
end,
195-
rabbit_log:info("Will wait for ~p milliseconds before proceeding with registration...", [Effective]),
196-
timer:sleep(Effective),
197-
ok
198-
end.
199-
200-
-spec randomized_delay_range_in_ms() -> {integer(), integer()}.
201-
202-
randomized_delay_range_in_ms() ->
203-
Backend = backend(),
204-
Default = case erlang:function_exported(Backend, randomized_startup_delay_range, 0) of
205-
true -> Backend:randomized_startup_delay_range();
206-
false -> ?DEFAULT_STARTUP_RANDOMIZED_DELAY
207-
end,
208-
{Min, Max} = case application:get_env(rabbit, cluster_formation) of
209-
{ok, Proplist} ->
210-
proplists:get_value(randomized_startup_delay_range, Proplist, Default);
211-
undefined ->
212-
Default
213-
end,
214-
{Min * 1000, Max * 1000}.
215-
216-
217158
-spec register() -> ok.
218159

219160
register() ->

deps/rabbit/src/rabbit_peer_discovery_classic_config.erl

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,36 @@ list_nodes() ->
2626
Nodes when is_list(Nodes) -> {ok, {Nodes, disc}}
2727
end.
2828

29+
%% Tries to acquire a lock on behalf of Node across the nodes returned by
%% list_nodes(), using global:set_lock/3.
%%
%% Returns {ok, LockId} when the lock is obtained, where LockId comes from
%% rabbit_nodes:lock_id(Node). Returns {error, Reason} when global:set_lock/3
%% returns false after rabbit_nodes:lock_retries() retries.
%%
%% NOTE(review): Reason is built with io_lib:format/2, which returns chardata
%% (possibly a nested list) rather than the flat string() the spec states --
%% confirm callers only log or format it.
-spec lock(Node :: node()) -> {ok, {ResourceId :: string(), LockRequesterId :: node()}} | {error, Reason :: string()}.
30+
31+
lock(Node) ->
32+
%% Assertive match: any result from list_nodes() other than
%% {ok, {Nodes, _}} crashes here with badmatch.
{ok, {Nodes, _NodeType}} = list_nodes(),
33+
case lists:member(Node, Nodes) of
34+
%% Warn only when peers are configured but the local node is not among
%% them; an empty node list produces no warning.
false when Nodes =/= [] ->
35+
rabbit_log:warning("Local node ~s is not part of configured nodes ~p. "
36+
"This might lead to incorrect cluster formation.", [Node, Nodes]);
37+
_ -> ok
38+
end,
39+
LockId = rabbit_nodes:lock_id(Node),
40+
Retries = rabbit_nodes:lock_retries(),
41+
%% The lock is requested on the configured nodes, with a bounded number of
%% retries.
case global:set_lock(LockId, Nodes, Retries) of
42+
true ->
43+
%% LockId is handed back so the caller can pass it to unlock/1.
{ok, LockId};
44+
false ->
45+
{error, io_lib:format("Acquiring lock taking too long, bailing out after ~b retries", [Retries])}
46+
end.
47+
48+
%% Releases the lock identified by LockId on the nodes returned by
%% list_nodes(), using global:del_lock/2. The result of global:del_lock/2 is
%% ignored and ok is always returned.
%% NOTE(review): the node list is re-read here rather than reused from lock/1;
%% presumably the configured nodes do not change in between -- confirm.
-spec unlock({ResourceId :: string(), LockRequesterId :: node()}) -> ok.
49+
50+
unlock(LockId) ->
51+
%% Assertive match: crashes with badmatch if list_nodes() does not return
%% {ok, {Nodes, _}}.
{ok, {Nodes, _NodeType}} = list_nodes(),
52+
global:del_lock(LockId, Nodes),
53+
ok.
54+
2955
-spec supports_registration() -> boolean().
3056

3157
supports_registration() ->
32-
%% If we don't have any nodes configured, skip randomized delay and similar operations
33-
%% as we don't want to delay startup for no reason. MK.
34-
has_any_peer_nodes_configured().
58+
false.
3559

3660
-spec register() -> ok.
3761

@@ -47,29 +71,3 @@ unregister() ->
4771

4872
post_registration() ->
4973
ok.
50-
51-
-spec lock(Node :: atom()) -> not_supported.
52-
53-
lock(_Node) ->
54-
not_supported.
55-
56-
-spec unlock(Data :: term()) -> ok.
57-
58-
unlock(_Data) ->
59-
ok.
60-
61-
%%
62-
%% Helpers
63-
%%
64-
65-
has_any_peer_nodes_configured() ->
66-
case application:get_env(rabbit, cluster_nodes, []) of
67-
{[], _NodeType} ->
68-
false;
69-
{Nodes, _NodeType} when is_list(Nodes) ->
70-
true;
71-
[] ->
72-
false;
73-
Nodes when is_list(Nodes) ->
74-
true
75-
end.

deps/rabbit/test/config_schema_SUITE_data/rabbit.snippets

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -540,23 +540,22 @@ tcp_listen_options.exit_on_close = false",
540540
{cluster_formation_randomized_startup_delay_both_values,
541541
"cluster_formation.randomized_startup_delay_range.min = 10
542542
cluster_formation.randomized_startup_delay_range.max = 30",
543-
[{rabbit, [{cluster_formation, [
544-
{randomized_startup_delay_range, {10, 30}}
545-
]}]}],
543+
[],
546544
[]},
547545

548546
{cluster_formation_randomized_startup_delay_min_only,
549547
"cluster_formation.randomized_startup_delay_range.min = 10",
550-
[{rabbit, [{cluster_formation, [
551-
{randomized_startup_delay_range, {10, 60}}
552-
]}]}],
548+
[],
553549
[]},
554550

555551
{cluster_formation_randomized_startup_delay_max_only,
556552
"cluster_formation.randomized_startup_delay_range.max = 30",
557-
[{rabbit, [{cluster_formation, [
558-
{randomized_startup_delay_range, {5, 30}}
559-
]}]}],
553+
[],
554+
[]},
555+
556+
{cluster_formation_internal_lock_retries,
557+
"cluster_formation.internal_lock_retries = 10",
558+
[{rabbit,[{cluster_formation,[{internal_lock_retries,10}]}]}],
560559
[]},
561560

562561
{cluster_formation_dns,

0 commit comments

Comments
 (0)