Skip to content

Commit a0ba0ad

Browse files
authored
Merge pull request #2276 from rabbitmq/mk-peer-discovery-retries
Introduce peer discovery retries
2 parents 141f6bf + 17890d5 commit a0ba0ad

7 files changed

+268
-25
lines changed

priv/schema/rabbit.schema

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,30 @@ fun(Conf) ->
952952
end
953953
end}.
954954

955+
%% Cluster formation: retry settings for startup lock acquisition and for
%% peer discovery failures.
956+
957+
%% Number of lock acquisition attempts. rabbit_peer_discovery reads the
%% lock_retry_limit key from the cluster_formation proplist and falls back
%% to 10 when it is absent.
{mapping, "cluster_formation.lock_retry_limit", "rabbit.cluster_formation.lock_retry_limit",
958+
[
959+
{datatype, integer},
960+
{validators, ["non_zero_positive_integer"]}
961+
]}.
%% Delay between lock acquisition attempts.
%% NOTE(review): presumably milliseconds, since the timeout returned by
%% rabbit_peer_discovery:locking_retry_timeout/0 is handed to timer:sleep/1
%% in rabbit_mnesia:init_with_lock/3 -- confirm this key feeds that value.
962+
{mapping, "cluster_formation.lock_retry_timeout", "rabbit.cluster_formation.lock_retry_timeout",
963+
[
964+
{datatype, integer},
965+
{validators, ["non_zero_positive_integer"]}
966+
]}.
967+
968+
%% Number of peer discovery attempts. rabbit_peer_discovery:discovery_retries/0
%% reads discovery_retry_limit and falls back to 10 when it is absent.
{mapping, "cluster_formation.discovery_retry_limit", "rabbit.cluster_formation.discovery_retry_limit",
969+
[
970+
{datatype, integer},
971+
{validators, ["non_zero_positive_integer"]}
972+
]}.
%% Delay between peer discovery attempts, in milliseconds: the value is
%% passed to timer:sleep/1 by rabbit_mnesia. Falls back to 500 when absent.
973+
{mapping, "cluster_formation.discovery_retry_interval", "rabbit.cluster_formation.discovery_retry_interval",
974+
[
975+
{datatype, integer},
976+
{validators, ["non_zero_positive_integer"]}
977+
]}.
978+
955979
%% Classic config-driven peer discovery backend.
956980
%%
957981
%% Make clustering happen *automatically* at startup - only applied

src/rabbit_mnesia.erl

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -97,48 +97,61 @@ init() ->
9797
ok.
9898

9999
init_with_lock() ->
100-
{Retries, Timeout} = rabbit_peer_discovery:retry_timeout(),
101-
init_with_lock(Retries, Timeout, fun init_from_config/0).
100+
{Retries, Timeout} = rabbit_peer_discovery:locking_retry_timeout(),
101+
init_with_lock(Retries, Timeout, fun run_peer_discovery/0).
102102

103-
init_with_lock(0, _, InitFromConfig) ->
103+
init_with_lock(0, _, RunPeerDiscovery) ->
104104
case rabbit_peer_discovery:lock_acquisition_failure_mode() of
105105
ignore ->
106106
rabbit_log:warning("Cannot acquire a lock during clustering", []),
107-
InitFromConfig(),
107+
RunPeerDiscovery(),
108108
rabbit_peer_discovery:maybe_register();
109109
fail ->
110110
exit(cannot_acquire_startup_lock)
111111
end;
112-
init_with_lock(Retries, Timeout, InitFromConfig) ->
112+
init_with_lock(Retries, Timeout, RunPeerDiscovery) ->
113113
case rabbit_peer_discovery:lock() of
114114
not_supported ->
115115
rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"),
116116
%% See rabbitmq/rabbitmq-server#1202 for details.
117117
rabbit_peer_discovery:maybe_inject_randomized_delay(),
118-
InitFromConfig(),
118+
RunPeerDiscovery(),
119119
rabbit_peer_discovery:maybe_register();
120120
{error, _Reason} ->
121121
timer:sleep(Timeout),
122-
init_with_lock(Retries - 1, Timeout, InitFromConfig);
122+
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery);
123123
{ok, Data} ->
124124
try
125-
InitFromConfig(),
125+
RunPeerDiscovery(),
126126
rabbit_peer_discovery:maybe_register()
127127
after
128128
rabbit_peer_discovery:unlock(Data)
129129
end
130130
end.
131131

132-
init_from_config() ->
132+
%% Runs peer discovery using the retry budget configured for the node.
%% The number of attempts and the pause between them come from
%% rabbit_peer_discovery:discovery_retries/0; the actual work, including
%% retrying on discovery errors, is done by run_peer_discovery_with_retries/2.
-spec run_peer_discovery() -> ok | {[node()], node_type()}.
run_peer_discovery() ->
    {Attempts, PauseMs} = rabbit_peer_discovery:discovery_retries(),
    run_peer_discovery_with_retries(Attempts, PauseMs).
136+
137+
-spec run_peer_discovery_with_retries(non_neg_integer(), non_neg_integer()) -> ok | {[node()], node_type()}.
138+
run_peer_discovery_with_retries(0, _DelayInterval) ->
139+
ok;
140+
run_peer_discovery_with_retries(RetriesLeft, DelayInterval) ->
133141
FindBadNodeNames = fun
134142
(Name, BadNames) when is_atom(Name) -> BadNames;
135143
(Name, BadNames) -> [Name | BadNames]
136144
end,
137145
{DiscoveredNodes, NodeType} =
138146
case rabbit_peer_discovery:discover_cluster_nodes() of
147+
{error, Reason} ->
148+
RetriesLeft1 = RetriesLeft - 1,
149+
rabbit_log:error("Peer discovery returned an error: ~p. Will retry after a delay of ~b, ~b retries left...",
150+
[Reason, DelayInterval, RetriesLeft1]),
151+
timer:sleep(DelayInterval),
152+
run_peer_discovery_with_retries(RetriesLeft1, DelayInterval);
139153
{ok, {Nodes, Type} = Config}
140-
when is_list(Nodes) andalso
141-
(Type == disc orelse Type == disk orelse Type == ram) ->
154+
when is_list(Nodes) andalso (Type == disc orelse Type == disk orelse Type == ram) ->
142155
case lists:foldr(FindBadNodeNames, [], Nodes) of
143156
[] -> Config;
144157
BadNames -> e({invalid_cluster_node_names, BadNames})
@@ -167,6 +180,16 @@ init_from_config() ->
167180
%% reachable and compatible (in terms of Mnesia internal protocol version and such)
168181
%% cluster peers in order.
169182
join_discovered_peers(TryNodes, NodeType) ->
183+
{RetriesLeft, DelayInterval} = rabbit_peer_discovery:discovery_retries(),
184+
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval).
185+
186+
join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) ->
187+
rabbit_log:warning(
188+
"Could not successfully contact any node of: ~s (as in Erlang distribution). "
189+
"Starting as a blank standalone node...~n",
190+
[string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
191+
init_db_and_upgrade([node()], disc, false, _Retry = true);
192+
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval) ->
170193
case find_reachable_peer_to_cluster_with(nodes_excl_me(TryNodes)) of
171194
{ok, Node} ->
172195
rabbit_log:info("Node '~s' selected for auto-clustering~n", [Node]),
@@ -175,11 +198,11 @@ join_discovered_peers(TryNodes, NodeType) ->
175198
rabbit_connection_tracking:boot(),
176199
rabbit_node_monitor:notify_joined_cluster();
177200
none ->
178-
rabbit_log:warning(
179-
"Could not successfully contact any node of: ~s (as in Erlang distribution). "
180-
"Starting as a blank standalone node...~n",
181-
[string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
182-
init_db_and_upgrade([node()], disc, false, _Retry = true)
201+
RetriesLeft1 = RetriesLeft - 1,
202+
rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b, ~b retries left...",
203+
[DelayInterval, RetriesLeft1]),
204+
timer:sleep(DelayInterval),
205+
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval)
183206
end.
184207

185208
%% Make the node join a cluster. The node will be reset automatically

src/rabbit_nodes.erl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
-module(rabbit_nodes).
1919

20-
-export([names/1, diagnostics/1, make/1, parts/1, cookie_hash/0,
20+
-export([names/1, diagnostics/1, make/1, make/2, parts/1, cookie_hash/0,
2121
is_running/2, is_process_running/2,
2222
cluster_name/0, set_cluster_name/1, set_cluster_name/2, ensure_epmd/0,
2323
all_running/0, name_type/0, running_count/0,
@@ -55,8 +55,11 @@ names(Hostname) ->
5555
diagnostics(Nodes) ->
5656
rabbit_nodes_common:diagnostics(Nodes).
5757

58-
make(NodeStr) ->
59-
rabbit_nodes_common:make(NodeStr).
58+
%% Delegates to rabbit_nodes_common:make/1 with the argument unchanged.
%% NOTE(review): the argument is presumably either a node name or a
%% {ShortName, Hostname} pair (make/2 below passes such a tuple) -- confirm
%% against rabbit_nodes_common.
make(NameOrParts) ->
59+
rabbit_nodes_common:make(NameOrParts).
60+
61+
%% Two-argument convenience form: wraps both parts in a tuple and calls make/1.
make(ShortName, Hostname) ->
62+
make({ShortName, Hostname}).
6063

6164
parts(NodeStr) ->
6265
rabbit_nodes_common:parts(NodeStr).

src/rabbit_peer_discovery.erl

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
-export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0,
2424
normalize/1, format_discovered_nodes/1, log_configured_backend/0,
2525
register/0, unregister/0, maybe_register/0, maybe_unregister/0,
26-
maybe_inject_randomized_delay/0, lock/0, unlock/1]).
27-
-export([append_node_prefix/1, node_prefix/0, retry_timeout/0,
26+
maybe_inject_randomized_delay/0, lock/0, unlock/1,
27+
discovery_retries/0]).
28+
-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0,
2829
lock_acquisition_failure_mode/0]).
2930

3031
-define(DEFAULT_BACKEND, rabbit_peer_discovery_classic_config).
@@ -61,9 +62,9 @@ node_type() ->
6162
?DEFAULT_NODE_TYPE
6263
end.
6364

64-
-spec retry_timeout() -> {Retries :: integer(), Timeout :: integer()}.
65+
-spec locking_retry_timeout() -> {Retries :: integer(), Timeout :: integer()}.
6566

66-
retry_timeout() ->
67+
locking_retry_timeout() ->
6768
case application:get_env(rabbit, cluster_formation) of
6869
{ok, Proplist} ->
6970
Retries = proplists:get_value(lock_retry_limit, Proplist, 10),
@@ -146,6 +147,18 @@ maybe_unregister() ->
146147
ok
147148
end.
148149

150+
%% Returns the peer discovery retry settings as {Retries, Interval}.
%%
%% Both values are looked up in the `cluster_formation' proplist of the
%% `rabbit' application environment. A missing proplist is treated the same
%% as an empty one, so each key independently falls back to its default:
%% 10 retries and an interval of 500.
-spec discovery_retries() -> {Retries :: integer(), Interval :: integer()}.

discovery_retries() ->
    Settings = application:get_env(rabbit, cluster_formation, []),
    RetryLimit = proplists:get_value(discovery_retry_limit, Settings, 10),
    RetryInterval = proplists:get_value(discovery_retry_interval, Settings, 500),
    {RetryLimit, RetryInterval}.
161+
149162

150163
-spec maybe_inject_randomized_delay() -> ok.
151164
maybe_inject_randomized_delay() ->

src/rabbit_peer_discovery_classic_config.erl

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
%% API
2727
%%
2828

29-
-spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}}.
29+
-spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}} |
30+
{error, Reason :: string()}.
3031

3132
list_nodes() ->
3233
case application:get_env(rabbit, cluster_nodes, {[], disc}) of
@@ -37,7 +38,9 @@ list_nodes() ->
3738
-spec supports_registration() -> boolean().
3839

3940
supports_registration() ->
40-
false.
41+
%% If we don't have any nodes configured, skip randomized delay and similar operations
42+
%% as we don't want to delay startup for no reason. MK.
43+
has_any_peer_nodes_configured().
4144

4245
-spec register() -> ok.
4346

@@ -63,3 +66,19 @@ lock(_Node) ->
6366

6467
unlock(_Data) ->
6568
ok.
69+
70+
%%
71+
%% Helpers
72+
%%
73+
74+
%% Reports whether the `cluster_nodes' setting of the `rabbit' application
%% names at least one node.
%%
%% The setting may be either a {Nodes, NodeType} pair or a bare list of
%% nodes; an unset value is read as an empty list. Any other shape is not
%% handled and raises a case_clause error.
has_any_peer_nodes_configured() ->
    Configured = application:get_env(rabbit, cluster_nodes, []),
    case Configured of
        {Peers, _NodeType} when is_list(Peers) ->
            Peers =/= [];
        Peers when is_list(Peers) ->
            Peers =/= []
    end.

src/rabbit_table.erl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ wait(TableNames, Timeout, Retries) ->
114114
end,
115115
case {Retries, Result} of
116116
{_, ok} ->
117+
rabbit_log:info("Successfully synced tables from a peer"),
117118
ok;
118119
{1, {error, _} = Error} ->
119120
throw(Error);

0 commit comments

Comments
 (0)