Skip to content

Commit de2843d

Browse files
authored
Merge pull request #1217 from rabbitmq/improve-wait-when-clustering-1214
Check rabbit is running after the boot process finishes
2 parents (9e1235c + 61a1c88), commit de2843d

File tree

2 files changed: +71 -14 lines changed

src/rabbit.erl

Lines changed: 24 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -436,7 +436,7 @@ stop() ->
436436
undefined -> ok;
437437
_ ->
438438
rabbit_log:info("RabbitMQ hasn't finished starting yet. Waiting for startup to finish before stopping..."),
439-
await_startup(true)
439+
wait_to_finish_booting()
440440
end,
441441
rabbit_log:info("RabbitMQ is asked to stop...~n", []),
442442
Apps = ?APPS ++ rabbit_plugins:active(),
@@ -604,19 +604,32 @@ handle_app_error(Term) ->
604604
end.
605605

606606
await_startup() ->
607-
await_startup(false).
607+
case is_booting() of
608+
true -> wait_to_finish_booting();
609+
false ->
610+
case is_running() of
611+
true -> ok;
612+
false -> wait_to_start_booting(),
613+
wait_to_finish_booting()
614+
end
615+
end.
608616

609-
await_startup(HaveSeenRabbitBoot) ->
610-
%% We don't take absence of rabbit_boot as evidence we've started,
611-
%% since there's a small window before it is registered.
617+
is_booting() ->
618+
whereis(rabbit_boot) /= undefined.
619+
620+
wait_to_start_booting() ->
612621
case whereis(rabbit_boot) of
613-
undefined -> case HaveSeenRabbitBoot orelse is_running() of
614-
true -> ok;
615-
false -> timer:sleep(100),
616-
await_startup(false)
617-
end;
622+
undefined -> timer:sleep(100),
623+
wait_to_start_booting();
624+
_ -> ok
625+
end.
626+
627+
wait_to_finish_booting() ->
628+
case whereis(rabbit_boot) of
629+
undefined -> true = is_running(),
630+
ok;
618631
_ -> timer:sleep(100),
619-
await_startup(true)
632+
wait_to_finish_booting()
620633
end.
621634

622635
status() ->

test/clustering_management_SUITE.erl

Lines changed: 47 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,8 @@ groups() ->
5252
reset_removes_things,
5353
forget_offline_removes_things,
5454
force_boot,
55-
status_with_alarm
55+
status_with_alarm,
56+
wait_fails_when_cluster_fails
5657
]},
5758
{cluster_size_4, [], [
5859
forget_promotes_offline_slave
@@ -73,8 +74,10 @@ suite() ->
7374
init_per_suite(Config) ->
7475
rabbit_ct_helpers:log_environment(),
7576
Config1 = rabbit_ct_helpers:merge_app_env(
76-
Config,
77-
{rabbit, [{mnesia_table_loading_retry_limit, 1}]}),
77+
Config, {rabbit, [
78+
{mnesia_table_loading_retry_limit, 2},
79+
{mnesia_table_loading_retry_timeout,1000}
80+
]}),
7881
rabbit_ct_helpers:run_setup_steps(Config1).
7982

8083
end_per_suite(Config) ->
@@ -595,8 +598,49 @@ status_with_alarm(Config) ->
595598
ok = alarm_information_on_each_node(R, Rabbit, Hare).
596599

597600

601+
wait_fails_when_cluster_fails(Config) ->
602+
[Rabbit, Hare] = rabbit_ct_broker_helpers:get_node_configs(Config,
603+
nodename),
604+
RabbitConfig = rabbit_ct_broker_helpers:get_node_config(Config,Rabbit),
605+
RabbitPidFile = ?config(pid_file, RabbitConfig),
606+
%% ensure pid file is readable
607+
{ok, _} = file:read_file(RabbitPidFile),
608+
%% ensure wait works on running node
609+
{ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, Rabbit,
610+
["wait", RabbitPidFile]),
611+
%% stop both nodes
612+
ok = rabbit_ct_broker_helpers:stop_node(Config, Rabbit),
613+
ok = rabbit_ct_broker_helpers:stop_node(Config, Hare),
614+
%% starting first node fails - it was not the last node to stop
615+
{error, _} = rabbit_ct_broker_helpers:start_node(Config, Rabbit),
616+
%% start first node in the background
617+
spawn_link(fun() ->
618+
rabbit_ct_broker_helpers:start_node(Config, Rabbit)
619+
end),
620+
Attempts = 10,
621+
Timeout = 500,
622+
wait_for_pid_file_to_contain_running_process_pid(RabbitPidFile, Attempts, Timeout),
623+
{error, _, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, Rabbit,
624+
["wait", RabbitPidFile]).
625+
598626
%% ----------------------------------------------------------------------------
599627
%% Internal utils
628+
%% ----------------------------------------------------------------------------
629+
630+
wait_for_pid_file_to_contain_running_process_pid(_, 0, _) ->
631+
error(timeout_waiting_for_pid_file_to_have_running_pid);
632+
wait_for_pid_file_to_contain_running_process_pid(PidFile, Attempts, Timeout) ->
633+
Pid = pid_from_file(PidFile),
634+
case rabbit_misc:is_os_process_alive(Pid) of
635+
true -> ok;
636+
false ->
637+
ct:sleep(Timeout),
638+
wait_for_pid_file_to_contain_running_process_pid(PidFile, Attempts - 1, Timeout)
639+
end.
640+
641+
pid_from_file(PidFile) ->
642+
{ok, Content} = file:read_file(PidFile),
643+
string:strip(binary_to_list(Content), both, $\n).
600644

601645
cluster_members(Config) ->
602646
rabbit_ct_broker_helpers:get_node_configs(Config, nodename).

0 commit comments

Comments
 (0)