Skip to content

Commit 8749c60

Browse files
committed
rabbit_feature_flags: Retry after erpc:call() fails with noconnection
[Why] There could be a transient network issue. Let's give a few more chances to perform the requested RPC call. [How] We retry until the given timeout is reached, if any. To honor that timeout, we measure the time taken by the RPC call itself. We also sleep between retries. Before each retry, the timeout is reduced by the total of the time taken by the RPC call and the sleep. References #8346. V2: Treat `infinity` timeout differently. In this case, we never retry following a `noconnection` error. The reason is that this timeout is used specifically for callbacks executed remotely. We don't know how long they take (for instance if there is a lot of data to migrate). We don't want an infinite retry loop either, so in this case, we don't retry.
1 parent 3f9e93c commit 8749c60

File tree

2 files changed

+48
-0
lines changed

2 files changed

+48
-0
lines changed

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,9 +1186,32 @@ this_node_first(Nodes) ->
11861186
Ret :: term() | {error, term()}.
11871187

11881188
rpc_call(Node, Module, Function, Args, Timeout) ->
1189+
SleepBetweenRetries = 5000,
1190+
T0 = erlang:monotonic_time(),
11891191
try
11901192
erpc:call(Node, Module, Function, Args, Timeout)
11911193
catch
1194+
%% In case of `noconnection' with `Timeout'=infinity, we don't retry
1195+
%% at all. This is because the infinity "timeout" is used to run
1196+
%% callbacks on remote node and they can last an indefinite amount of
1197+
%% time, for instance, if there is a lot of data to migrate.
1198+
error:{erpc, noconnection} = Reason
1199+
when is_integer(Timeout) andalso Timeout > SleepBetweenRetries ->
1200+
?LOG_ERROR(
1201+
"Feature flags: no connection to node `~ts`; "
1202+
"retrying in ~b milliseconds",
1203+
[Node, SleepBetweenRetries],
1204+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
1205+
timer:sleep(SleepBetweenRetries),
1206+
T1 = erlang:monotonic_time(),
1207+
TDiff = erlang:convert_time_unit(T1 - T0, native, millisecond),
1208+
Remaining = Timeout - TDiff,
1209+
Timeout1 = erlang:max(Remaining, 0),
1210+
case Timeout1 of
1211+
0 -> {error, Reason};
1212+
_ -> rpc_call(Node, Module, Function, Args, Timeout1)
1213+
end;
1214+
11921215
Class:Reason:Stacktrace ->
11931216
Message0 = erl_error:format_exception(Class, Reason, Stacktrace),
11941217
Message1 = lists:flatten(Message0),

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
mf_wait_and_count_runs_v2_post_enable/1,
2929
mf_crash_on_joining_node/1,
3030

31+
rpc_calls/1,
3132
enable_unknown_feature_flag_on_a_single_node/1,
3233
enable_supported_feature_flag_on_a_single_node/1,
3334
enable_unknown_feature_flag_in_a_3node_cluster/1,
@@ -36,6 +37,7 @@
3637
enable_unsupported_feature_flag_in_a_3node_cluster/1,
3738
enable_feature_flag_in_cluster_and_add_member_after/1,
3839
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1,
40+
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/0,
3941
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1,
4042
enable_feature_flag_with_post_enable/1,
4143
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1,
@@ -54,6 +56,10 @@ all() ->
5456
groups() ->
5557
Groups =
5658
[
59+
{direct, [parallel],
60+
[
61+
rpc_calls
62+
]},
5763
{cluster_size_1, [parallel],
5864
[
5965
enable_unknown_feature_flag_on_a_single_node,
@@ -94,6 +100,8 @@ end_per_suite(Config) ->
94100

95101
init_per_group(feature_flags_v2, Config) ->
96102
rabbit_ct_helpers:set_config(Config, {enable_feature_flags_v2, true});
103+
init_per_group(direct, Config) ->
104+
Config;
97105
init_per_group(cluster_size_1, Config) ->
98106
rabbit_ct_helpers:set_config(Config, {nodes_count, 1});
99107
init_per_group(cluster_size_3, Config) ->
@@ -104,11 +112,15 @@ init_per_group(_Group, Config) ->
104112
end_per_group(_Group, Config) ->
105113
Config.
106114

115+
init_per_testcase(rpc_calls, Config) ->
116+
Config;
107117
init_per_testcase(Testcase, Config) ->
108118
rabbit_ct_helpers:run_steps(
109119
Config,
110120
[fun(Cfg) -> start_slave_nodes(Cfg, Testcase) end]).
111121

122+
end_per_testcase(rpc_calls, Config) ->
123+
Config;
112124
end_per_testcase(_Testcase, Config) ->
113125
rabbit_ct_helpers:run_steps(
114126
Config,
@@ -328,6 +340,16 @@ get_peer_proc() ->
328340
%% Testcases.
329341
%% -------------------------------------------------------------------
330342

343+
rpc_calls(_Config) ->
344+
List = [1, 2, 3],
345+
?assertEqual(
346+
lists:sum(List),
347+
rabbit_ff_controller:rpc_call(node(), lists, sum, [List], 11000)),
348+
?assertEqual(
349+
{error, {erpc, noconnection}},
350+
rabbit_ff_controller:rpc_call(
351+
nonode@non_existing_host, lists, sum, [List], 11000)).
352+
331353
enable_unknown_feature_flag_on_a_single_node(Config) ->
332354
[Node] = ?config(nodes, Config),
333355
ok = run_on_node(
@@ -865,6 +887,9 @@ enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2(Config) ->
865887
|| Node <- AllNodes],
866888
ok.
867889

890+
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2() ->
891+
[{timetrap, {minutes, 3}}].
892+
868893
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2(Config) ->
869894
AllNodes = [LeavingNode | [FirstNode | _] = Nodes] = ?config(
870895
nodes, Config),

0 commit comments

Comments
 (0)