Skip to content

Commit e65ba83

Browse files
author
dcorbacho
committed
Fix delete_replica bug
It caused a lot of flakiness on the rabbit_stream_queue_SUITE, both on `delete_replica` and `delete_last_replica` test cases.
1 parent 6052ecd commit e65ba83

File tree

3 files changed

+83
-8
lines changed

3 files changed

+83
-8
lines changed

deps/rabbit/src/rabbit_stream_coordinator.erl

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,16 +1018,14 @@ update_stream0(#{system_time := _Ts},
10181018
_ ->
10191019
false
10201020
end,
1021-
10221021
case maps:get(Node, Members0) of
10231022
#member{role = {replica, Epoch},
10241023
current = {stopping, Idx},
10251024
state = _} = Member0
10261025
when IsLeaderInCurrent ->
10271026
%% A leader has already been selected so skip straight to ready state
1028-
Member = Member0#member{state = {ready, Epoch},
1029-
target = Target,
1030-
current = undefined},
1027+
Member = update_target(Member0#member{state = {ready, Epoch},
1028+
current = undefined}, Target),
10311029
Members1 = Members0#{Node => Member},
10321030
Stream0#stream{members = Members1};
10331031
#member{role = {_, Epoch},
@@ -1037,9 +1035,8 @@ update_stream0(#{system_time := _Ts},
10371035
%% epoch
10381036
Member = case StoppedEpoch of
10391037
Epoch ->
1040-
Member0#member{state = {stopped, StoppedEpoch, Tail},
1041-
target = Target,
1042-
current = undefined};
1038+
update_target(Member0#member{state = {stopped, StoppedEpoch, Tail},
1039+
current = undefined}, Target);
10431040
_ ->
10441041
%% if stopped epoch is from another epoch
10451042
%% leave target as is to retry stop in current term
@@ -1518,3 +1515,8 @@ set_running_to_stopped(Members) ->
15181515
M
15191516
end, Members).
15201517

1518+
update_target(#member{target = deleted} = Member, _) ->
1519+
%% A deleted member can never transition to another state
1520+
Member;
1521+
update_target(Member, Target) ->
1522+
Member#member{target = Target}.

deps/rabbit/test/rabbit_stream_coordinator_SUITE.erl

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ all_tests() ->
3030
delete_stream,
3131
delete_replica_leader,
3232
delete_replica,
33+
delete_two_replicas,
3334
delete_replica_2,
3435
leader_start_failed
3536
].
@@ -907,6 +908,79 @@ delete_replica(_) ->
907908
{S4, []} = evaluate_stream(meta(?LINE), S4, []),
908909
ok.
909910

911+
delete_two_replicas(_) ->
912+
%% There was a race condition on the rabbit_stream_queue_SUITE testcases delete_replica
913+
%% and delete_last_replica. A replica can sometimes restart after deletion as it transitions
914+
%% again to running state. This test reproduces it. See `rabbit_stream_coordinator.erl`
915+
%% line 1039, the processing of `member_stopped` command. The new function `update_target`
916+
%% ensures this transition never happens.
917+
%% This test reproduces the trace that leads to that error.
918+
E = 1,
919+
StreamId = atom_to_list(?FUNCTION_NAME),
920+
LeaderPid = fake_pid(n1),
921+
[Replica1, Replica2] = [fake_pid(n2), fake_pid(n3)],
922+
N1 = node(LeaderPid),
923+
N2 = node(Replica1),
924+
%% this is to be added
925+
N3 = node(Replica2),
926+
927+
S0 = started_stream(StreamId, LeaderPid, [Replica1, Replica2]),
928+
From = {self(), make_ref()},
929+
Idx1 = ?LINE,
930+
Meta1 = (meta(Idx1))#{from => From},
931+
S1 = update_stream(Meta1, {delete_replica, StreamId, #{node => N3}}, S0),
932+
?assertMatch(#stream{target = running,
933+
nodes = [N1, N2],
934+
members = #{N1 := #member{target = stopped,
935+
current = undefined,
936+
state = {running, _, _}},
937+
N2 := #member{target = stopped,
938+
current = undefined,
939+
state = {running, _, _}},
940+
N3 := #member{target = deleted,
941+
current = undefined,
942+
state = {running, _, _}}
943+
}},
944+
S1),
945+
{S2, Actions1} = evaluate_stream(Meta1, S1, []),
946+
?assertMatch([{aux, {delete_member, StreamId, #{node := N3}, _}},
947+
{aux, {stop, StreamId, #{node := N1, epoch := E}, _}},
948+
{aux, {stop, StreamId, #{node := N2, epoch := E}, _}}],
949+
lists:sort(Actions1)),
950+
951+
Idx2 = ?LINE,
952+
Meta2 = (meta(Idx2))#{from => From},
953+
S3 = update_stream(Meta2, {delete_replica, StreamId, #{node => N2}}, S2),
954+
?assertMatch(#stream{target = running,
955+
nodes = [N1],
956+
members = #{N1 := #member{target = stopped,
957+
current = {stopping, _},
958+
state = {running, _, _}},
959+
N2 := #member{target = deleted,
960+
current = {stopping, _},
961+
state = {running, _, _}},
962+
N3 := #member{target = deleted,
963+
current = {deleting, _},
964+
state = {running, _, _}}
965+
}},
966+
S3),
967+
{S4, []} = evaluate_stream(Meta2, S3, []),
968+
969+
970+
Idx3 = ?LINE,
971+
S5 = update_stream(meta(Idx3),
972+
{member_stopped, StreamId, #{node => N2,
973+
index => Idx1,
974+
epoch => E,
975+
tail => {E, 101}}},
976+
S4),
977+
%% A deleted member can never transition to another target.
978+
?assertMatch(#stream{members = #{N2 := #member{target = deleted,
979+
current = undefined,
980+
state = {stopped, _, _}}}},
981+
S5),
982+
ok.
983+
910984
delete_replica_2(_) ->
911985
%% replica is deleted before it has been fully started
912986
E = 1,

deps/rabbit/test/rabbit_stream_queue_SUITE.erl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ merge_app_env(Config) ->
186186
{rabbit, [{core_metrics_gc_interval, 100}]}).
187187

188188
end_per_testcase(Testcase, Config) ->
189-
Q = ?config(queue_name, Config),
190189
Config1 = rabbit_ct_helpers:run_steps(
191190
Config,
192191
rabbit_ct_client_helpers:teardown_steps()),

0 commit comments

Comments
 (0)