Skip to content

Commit 66b831d

Browse files
authored
Merge pull request #1260 from rabbitmq/improve-partitions_SUITE
Improve `partitions_SUITE`
2 parents 910446f + 699c455 commit 66b831d

File tree

2 files changed

+58
-20
lines changed

2 files changed

+58
-20
lines changed

src/rabbit_node_monitor.erl

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -344,8 +344,8 @@ init([]) ->
344344
Nodes = possibly_partitioned_nodes(),
345345
startup_log(Nodes),
346346
Monitors = lists:foldl(fun(Node, Monitors0) ->
347-
pmon:monitor({rabbit, Node}, Monitors0)
348-
end, pmon:new(), Nodes),
347+
pmon:monitor({rabbit, Node}, Monitors0)
348+
end, pmon:new(), Nodes),
349349
{ok, ensure_keepalive_timer(#state{monitors = Monitors,
350350
subscribers = pmon:new(),
351351
partitions = [],
@@ -420,12 +420,12 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID},
420420
fun () ->
421421
case rpc:call(Node, rabbit, is_running, []) of
422422
{badrpc, _} -> ok;
423-
_ ->
424-
rabbit_log:warning("Received a 'DOWN' message"
425-
" from ~p but still can"
426-
" communicate with it ~n",
427-
[Node]),
428-
cast(Rep, {partial_partition,
423+
_ ->
424+
rabbit_log:warning("Received a 'DOWN' message"
425+
" from ~p but still can"
426+
" communicate with it ~n",
427+
[Node]),
428+
cast(Rep, {partial_partition,
429429
Node, node(), RepGUID})
430430
end
431431
end);
@@ -499,18 +499,18 @@ handle_cast({node_up, Node, NodeType},
499499
rabbit_log:info("rabbit on node ~p up~n", [Node]),
500500
{AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
501501
write_cluster_status({add_node(Node, AllNodes),
502-
case NodeType of
503-
disc -> add_node(Node, DiscNodes);
504-
ram -> DiscNodes
505-
end,
506-
add_node(Node, RunningNodes)}),
502+
case NodeType of
503+
disc -> add_node(Node, DiscNodes);
504+
ram -> DiscNodes
505+
end,
506+
add_node(Node, RunningNodes)}),
507507
ok = handle_live_rabbit(Node),
508508
Monitors1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
509-
true ->
510-
Monitors;
511-
false ->
512-
pmon:monitor({rabbit, Node}, Monitors)
513-
end,
509+
true ->
510+
Monitors;
511+
false ->
512+
pmon:monitor({rabbit, Node}, Monitors)
513+
end,
514514
{noreply, maybe_autoheal(State#state{monitors = Monitors1})};
515515

516516
handle_cast({joined_cluster, Node, NodeType}, State) ->
@@ -584,7 +584,7 @@ handle_info({mnesia_system_event,
584584
State1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
585585
true -> State;
586586
false -> State#state{
587-
monitors = pmon:monitor({rabbit, Node}, Monitors)}
587+
monitors = pmon:monitor({rabbit, Node}, Monitors)}
588588
end,
589589
ok = handle_live_rabbit(Node),
590590
Partitions1 = lists:usort([Node | Partitions]),
@@ -893,4 +893,4 @@ startup_log([]) ->
893893
rabbit_log:info("Starting rabbit_node_monitor~n", []);
894894
startup_log(Nodes) ->
895895
rabbit_log:info("Starting rabbit_node_monitor, might be partitioned from ~p~n",
896-
[Nodes]).
896+
[Nodes]).

test/partitions_SUITE.erl

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,16 +335,26 @@ autoheal_unexpected_finish(Config) ->
335335

336336
partial_false_positive(Config) ->
337337
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
338+
suspend_node_monitor(Config, C),
338339
block([{A, B}]),
339340
timer:sleep(1000),
340341
block([{A, C}]),
341342
timer:sleep(?DELAY),
343+
resume_node_monitor(Config, C),
344+
timer:sleep(?DELAY),
342345
unblock([{A, B}, {A, C}]),
343346
timer:sleep(?DELAY),
344347
%% When B times out A's connection, it will check with C. C will
345348
%% not have timed out A yet, but already it can't talk to it. We
346349
%% need to not consider this a partial partition; B and C should
347350
%% still talk to each other.
351+
%%
352+
%% Because there is a chance that C can still talk to A when B
353+
%% requests to check for a partial partition, we suspend C's
354+
%% rabbit_node_monitor at the beginning and resume it after the
355+
%% link between A and C is blocked. This way, when B asks C about
356+
%% A, we make sure that the A<->C link is blocked before C's
357+
%% rabbit_node_monitor processes B's request.
348358
[B, C] = partitions(A),
349359
[A] = partitions(B),
350360
[A] = partitions(C),
@@ -369,7 +379,19 @@ partial_to_full(Config) ->
369379
partial_pause_minority(Config) ->
370380
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
371381
set_mode(Config, pause_minority),
382+
%% We suspend rabbit_node_monitor on C while we block the link
383+
%% between A and B. This should make sure C's rabbit_node_monitor
384+
%% processes both partial partition checks from A and B at about
385+
%% the same time, and thus increase the chance both A and B decides
386+
%% there is a partial partition.
387+
%%
388+
%% Without this, one node may see the partial partition and stop,
389+
%% before the other node sees it. In this case, the other node
390+
%% doesn't stop and this testcase fails.
391+
suspend_node_monitor(Config, C),
372392
block([{A, B}]),
393+
timer:sleep(?DELAY),
394+
resume_node_monitor(Config, C),
373395
[await_running(N, false) || N <- [A, B]],
374396
await_running(C, true),
375397
unblock([{A, B}]),
@@ -394,6 +416,22 @@ set_mode(Config, Mode) ->
394416
set_mode(Config, Nodes, Mode) ->
395417
rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode).
396418

419+
suspend_node_monitor(Config, Node) ->
420+
rabbit_ct_broker_helpers:rpc(
421+
Config, Node, ?MODULE, suspend_or_resume_node_monitor, [suspend]).
422+
423+
resume_node_monitor(Config, Node) ->
424+
rabbit_ct_broker_helpers:rpc(
425+
Config, Node, ?MODULE, suspend_or_resume_node_monitor, [resume]).
426+
427+
suspend_or_resume_node_monitor(SuspendOrResume) ->
428+
Action = case SuspendOrResume of
429+
suspend -> "Suspending";
430+
resume -> "Resuming"
431+
end,
432+
rabbit_log:info("(~s) ~s node monitor~n", [?MODULE, Action]),
433+
ok = sys:SuspendOrResume(rabbit_node_monitor).
434+
397435
block_unblock(Pairs) ->
398436
block(Pairs),
399437
timer:sleep(?DELAY),

0 commit comments

Comments
 (0)