Skip to content

Commit 7f06a08

Browse files
authored
Merge pull request #8411 from rabbitmq/retry-feature-flags-rpcs
rabbit_feature_flags: Retry after `erpc:call()` fails with `noconnection`
2 parents 3f9e93c + f09544b commit 7f06a08

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

deps/rabbit/src/rabbit_feature_flags.erl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,6 @@
126126
get_overriden_running_nodes/0]).
127127
-endif.
128128

129-
%% Default timeout for operations on remote nodes.
130-
-define(TIMEOUT, 60000).
131-
132129
-type feature_flag_modattr() :: {feature_name(),
133130
feature_props()}.
134131
%% The value of a `-rabbitmq_feature_flag()' module attribute used to

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,9 +1186,32 @@ this_node_first(Nodes) ->
11861186
Ret :: term() | {error, term()}.
11871187

11881188
rpc_call(Node, Module, Function, Args, Timeout) ->
1189+
SleepBetweenRetries = 5000,
1190+
T0 = erlang:monotonic_time(),
11891191
try
11901192
erpc:call(Node, Module, Function, Args, Timeout)
11911193
catch
1194+
%% In case of `noconnection' with `Timeout'=infinity, we don't retry
1195+
%% at all. This is because the infinity "timeout" is used to run
1196+
%% callbacks on remote node and they can last an indefinite amount of
1197+
%% time, for instance, if there is a lot of data to migrate.
1198+
error:{erpc, noconnection} = Reason
1199+
when is_integer(Timeout) andalso Timeout > SleepBetweenRetries ->
1200+
?LOG_ERROR(
1201+
"Feature flags: no connection to node `~ts`; "
1202+
"retrying in ~b milliseconds",
1203+
[Node, SleepBetweenRetries],
1204+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
1205+
timer:sleep(SleepBetweenRetries),
1206+
T1 = erlang:monotonic_time(),
1207+
TDiff = erlang:convert_time_unit(T1 - T0, native, millisecond),
1208+
Remaining = Timeout - TDiff,
1209+
Timeout1 = erlang:max(Remaining, 0),
1210+
case Timeout1 of
1211+
0 -> {error, Reason};
1212+
_ -> rpc_call(Node, Module, Function, Args, Timeout1)
1213+
end;
1214+
11921215
Class:Reason:Stacktrace ->
11931216
Message0 = erl_error:format_exception(Class, Reason, Stacktrace),
11941217
Message1 = lists:flatten(Message0),

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
mf_wait_and_count_runs_v2_post_enable/1,
2929
mf_crash_on_joining_node/1,
3030

31+
rpc_calls/1,
3132
enable_unknown_feature_flag_on_a_single_node/1,
3233
enable_supported_feature_flag_on_a_single_node/1,
3334
enable_unknown_feature_flag_in_a_3node_cluster/1,
@@ -36,6 +37,7 @@
3637
enable_unsupported_feature_flag_in_a_3node_cluster/1,
3738
enable_feature_flag_in_cluster_and_add_member_after/1,
3839
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1,
40+
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/0,
3941
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1,
4042
enable_feature_flag_with_post_enable/1,
4143
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1,
@@ -54,6 +56,10 @@ all() ->
5456
groups() ->
5557
Groups =
5658
[
59+
{direct, [parallel],
60+
[
61+
rpc_calls
62+
]},
5763
{cluster_size_1, [parallel],
5864
[
5965
enable_unknown_feature_flag_on_a_single_node,
@@ -94,6 +100,8 @@ end_per_suite(Config) ->
94100

95101
init_per_group(feature_flags_v2, Config) ->
96102
rabbit_ct_helpers:set_config(Config, {enable_feature_flags_v2, true});
103+
init_per_group(direct, Config) ->
104+
Config;
97105
init_per_group(cluster_size_1, Config) ->
98106
rabbit_ct_helpers:set_config(Config, {nodes_count, 1});
99107
init_per_group(cluster_size_3, Config) ->
@@ -104,11 +112,15 @@ init_per_group(_Group, Config) ->
104112
end_per_group(_Group, Config) ->
105113
Config.
106114

115+
init_per_testcase(rpc_calls, Config) ->
116+
Config;
107117
init_per_testcase(Testcase, Config) ->
108118
rabbit_ct_helpers:run_steps(
109119
Config,
110120
[fun(Cfg) -> start_slave_nodes(Cfg, Testcase) end]).
111121

122+
end_per_testcase(rpc_calls, Config) ->
123+
Config;
112124
end_per_testcase(_Testcase, Config) ->
113125
rabbit_ct_helpers:run_steps(
114126
Config,
@@ -328,6 +340,16 @@ get_peer_proc() ->
328340
%% Testcases.
329341
%% -------------------------------------------------------------------
330342

343+
rpc_calls(_Config) ->
344+
List = [1, 2, 3],
345+
?assertEqual(
346+
lists:sum(List),
347+
rabbit_ff_controller:rpc_call(node(), lists, sum, [List], 11000)),
348+
?assertEqual(
349+
{error, {erpc, noconnection}},
350+
rabbit_ff_controller:rpc_call(
351+
nonode@non_existing_host, lists, sum, [List], 11000)).
352+
331353
enable_unknown_feature_flag_on_a_single_node(Config) ->
332354
[Node] = ?config(nodes, Config),
333355
ok = run_on_node(
@@ -865,6 +887,9 @@ enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2(Config) ->
865887
|| Node <- AllNodes],
866888
ok.
867889

890+
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2() ->
891+
[{timetrap, {minutes, 3}}].
892+
868893
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2(Config) ->
869894
AllNodes = [LeavingNode | [FirstNode | _] = Nodes] = ?config(
870895
nodes, Config),

0 commit comments

Comments
 (0)