Skip to content

Commit bafb801

Browse files
Merge pull request #1179 from rabbitmq/rabbitmq-server-1178
Re-enable disk_monitor in case of parser failures
2 parents f0da426 + c340c0d commit bafb801

File tree

3 files changed

+68
-13
lines changed

3 files changed

+68
-13
lines changed

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ define PROJECT_ENV
107107
{queue_explicit_gc_run_operation_threshold, 1000},
108108
{lazy_queue_explicit_gc_run_operation_threshold, 1000},
109109
{background_gc_enabled, false},
110-
{background_gc_target_interval, 60000}
110+
{background_gc_target_interval, 60000},
111+
{disk_monitor_failure_retries, 10},
112+
{disk_monitor_failure_retry_interval, 120000}
111113
]
112114
endef
113115

src/rabbit_disk_monitor.erl

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,12 @@
6565
alarmed,
6666
%% is monitoring enabled? false on unsupported
6767
%% platforms
68-
enabled
68+
enabled,
69+
%% number of retries to enable monitoring if it fails
70+
%% on start-up
71+
retries,
72+
%% Interval between retries
73+
interval
6974
}).
7075

7176
%%----------------------------------------------------------------------------
@@ -114,20 +119,17 @@ start_link(Args) ->
114119

115120
init([Limit]) ->
116121
Dir = dir(),
122+
{ok, Retries} = application:get_env(rabbit, disk_monitor_failure_retries),
123+
{ok, Interval} = application:get_env(rabbit, disk_monitor_failure_retry_interval),
117124
State = #state{dir = Dir,
118125
min_interval = ?DEFAULT_MIN_DISK_CHECK_INTERVAL,
119126
max_interval = ?DEFAULT_MAX_DISK_CHECK_INTERVAL,
120127
alarmed = false,
121-
enabled = true},
122-
case {catch get_disk_free(Dir),
123-
vm_memory_monitor:get_total_memory()} of
124-
{N1, N2} when is_integer(N1), is_integer(N2) ->
125-
{ok, start_timer(set_disk_limits(State, Limit))};
126-
Err ->
127-
rabbit_log:info("Disabling disk free space monitoring "
128-
"on unsupported platform:~n~p~n", [Err]),
129-
{ok, State#state{enabled = false}}
130-
end.
128+
enabled = true,
129+
limit = Limit,
130+
retries = Retries,
131+
interval = Interval},
132+
{ok, enable(State)}.
131133

132134
handle_call(get_disk_free_limit, _From, State = #state{limit = Limit}) ->
133135
{reply, Limit, State};
@@ -161,6 +163,8 @@ handle_call(_Request, _From, State) ->
161163
handle_cast(_Request, State) ->
162164
{noreply, State}.
163165

166+
handle_info(try_enable, #state{retries = Retries} = State) ->
167+
{noreply, enable(State#state{retries = Retries - 1})};
164168
handle_info(update, State) ->
165169
{noreply, start_timer(internal_update(State))};
166170

@@ -246,7 +250,7 @@ interpret_limit(Absolute) ->
246250

247251
emit_update_info(StateStr, CurrentFree, Limit) ->
248252
rabbit_log:info(
249-
"Disk free space ~s. Free bytes:~p Limit:~p~n",
253+
"Free disk space is ~s. Free bytes: ~p. Limit: ~p~n",
250254
[StateStr, CurrentFree, Limit]).
251255

252256
start_timer(State) ->
@@ -261,3 +265,20 @@ interval(#state{limit = Limit,
261265
max_interval = MaxInterval}) ->
262266
IdealInterval = 2 * (Actual - Limit) / ?FAST_RATE,
263267
trunc(erlang:max(MinInterval, erlang:min(MaxInterval, IdealInterval))).
268+
269+
%% Attempts to switch free disk space monitoring on.
%%
%% With no retries left the state is returned untouched. Otherwise the free
%% space of `Dir' and the total memory are probed. When both probes yield
%% integers, the disk limits are applied via set_disk_limits/2 and
%% start_timer/1 is called on the result. On any other outcome the failure is
%% logged, a `try_enable' message is scheduled to this process after
%% `Interval' milliseconds, and the state is returned with `enabled = false'.
enable(#state{retries = 0} = State) ->
    State;
enable(#state{dir = Dir, interval = Interval, limit = Limit, retries = Retries}
       = State) ->
    case {catch get_disk_free(Dir),
          vm_memory_monitor:get_total_memory()} of
        {N1, N2} when is_integer(N1), is_integer(N2) ->
            rabbit_log:info("Enabling free disk space monitoring~n", []),
            %% An earlier failed attempt leaves `enabled' set to false; flip
            %% it back so that a successful retry ends up in the same state
            %% as a first-attempt success.
            start_timer(set_disk_limits(State#state{enabled = true}, Limit));
        Err ->
            %% Retries is an integer: it has to be formatted with ~b, since
            %% ~s only accepts iolists, binaries and atoms.
            rabbit_log:info("Free disk space monitor encountered an error "
                            "(e.g. failed to parse output from OS tools): ~p, retries left: ~b~n",
                            [Err, Retries]),
            erlang:send_after(Interval, self(), try_enable),
            State#state{enabled = false}
    end.

test/unit_inbroker_non_parallel_SUITE.erl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ groups() ->
3535
app_management, %% Restart RabbitMQ.
3636
channel_statistics, %% Expect specific statistics.
3737
disk_monitor, %% Replace rabbit_misc module.
38+
disk_monitor_enable,
3839
file_handle_cache, %% Change FHC limit.
3940
head_message_timestamp_statistics, %% Expect specific statistics.
4041
log_management, %% Check log files.
@@ -744,6 +745,37 @@ disk_monitor1(_Config) ->
744745
meck:unload(rabbit_misc),
745746
passed.
746747

748+
%% Runs disk_monitor_enable1/1 on broker node 0 and reports its outcome.
%%
%% The helper returns `skipped' on non-Unix hosts; asserting `passed'
%% unconditionally would turn that into a badmatch failure, so the skip is
%% reported to Common Test as `{skip, Reason}' instead.
disk_monitor_enable(Config) ->
    case rabbit_ct_broker_helpers:rpc(Config, 0,
                                      ?MODULE, disk_monitor_enable1, [Config]) of
        passed ->
            passed;
        skipped ->
            {skip, "disk monitor re-enable test only runs on Unix"}
    end.
751+
752+
%% Broker-side entry point: runs the re-enable scenario on Unix hosts only
%% and returns `skipped' on every other OS family (e.g. Windows).
disk_monitor_enable1(_Config) ->
    {OsFamily, _OsName} = os:type(),
    case OsFamily of
        unix -> disk_monitor_enable1();
        _    -> skipped
    end.
760+
761+
%% Checks that the disk monitor re-enables itself once the OS tool output
%% becomes parseable again.
%%
%% rabbit_misc:os_cmd/1 is first mocked to return "\n", the monitor is
%% restarted with a short retry interval and is expected to report
%% `undefined' free space. The mock is then switched to a df-style listing
%% and, after waiting for retries to fire, the monitor must report the
%% "Available" figure converted from 1024-byte blocks to bytes.
%%
%% The mock and the application environment are restored in an `after'
%% section, so a failed assertion does not leak them into later test cases.
%% Returns `passed'.
disk_monitor_enable1() ->
    RetriesKey = disk_monitor_failure_retries,
    IntervalKey = disk_monitor_failure_retry_interval,
    %% Remember the configured values (falling back to the project defaults)
    %% so that they can be put back afterwards.
    PrevRetries = application:get_env(rabbit, RetriesKey, 10),
    PrevInterval = application:get_env(rabbit, IntervalKey, 120000),
    ok = meck:new(rabbit_misc, [passthrough]),
    try
        ok = meck:expect(rabbit_misc, os_cmd, fun(_) -> "\n" end),
        application:set_env(rabbit, RetriesKey, 20000),
        application:set_env(rabbit, IntervalKey, 100),
        ok = rabbit_sup:stop_child(rabbit_disk_monitor_sup),
        ok = rabbit_sup:start_delayed_restartable_child(rabbit_disk_monitor, [1000]),
        undefined = rabbit_disk_monitor:get_disk_free(),
        Cmd = "Filesystem 1024-blocks Used Available Capacity iused ifree %iused Mounted on\n/dev/disk1 975798272 234783364 740758908 25% 58759839 185189727 24% /\n",
        ok = meck:expect(rabbit_misc, os_cmd, fun(_) -> Cmd end),
        %% Give the monitor time to run several 100 ms retries.
        timer:sleep(1000),
        Bytes = 740758908 * 1024,
        Bytes = rabbit_disk_monitor:get_disk_free(),
        passed
    after
        meck:unload(rabbit_misc),
        application:set_env(rabbit, RetriesKey, PrevRetries),
        application:set_env(rabbit, IntervalKey, PrevInterval)
    end.
778+
747779
%% ---------------------------------------------------------------------------
748780
%% rabbitmqctl helpers.
749781
%% ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)