Skip to content

Commit cb8b0bf

Browse files
committed
rabbit_ct_helpers: Detect crashes in RabbitMQ logs
[Why] Some crashes may go under the radar if a testcase doesn't check everything (which is hard to do anyway). It is useful to be aware of those crashes. [How] This new check searches the RabbitMQ log files for any logged crashes once the test nodes have been stopped, and throws an exception if it finds any. V2: This facility can be disabled by setting `find_crashes` to false in common_test's `Config`, or by setting the `$FIND_CRASHES` environment variable to anything but "1"/"yes"/"true".
1 parent 3783388 commit cb8b0bf

File tree

1 file changed

+107
-1
lines changed

1 file changed

+107
-1
lines changed

deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,31 @@ stop_rabbitmq_nodes(Config) ->
10081008
fun(NodeConfig) ->
10091009
stop_rabbitmq_node(Config, NodeConfig)
10101010
end),
1011+
%% Except if disabled, we search for crashes logged in the test nodes after
1012+
%% they are stopped. If we find some, we log them again in the common_test
1013+
%% logs and throw an exception to make the test fail.
1014+
FindCrashes = case rabbit_ct_helpers:get_config(Config, find_crashes) of
1015+
true ->
1016+
true;
1017+
false ->
1018+
false;
1019+
undefined ->
1020+
case os:getenv("FIND_CRASHES") of
1021+
undefined -> true;
1022+
"1" -> true;
1023+
"yes" -> true;
1024+
"true" -> true;
1025+
_ -> false
1026+
end
1027+
end,
1028+
case FindCrashes of
1029+
true ->
1030+
%% TODO: Make the ignore list configurable.
1031+
IgnoredCrashes = ["** force_vhost_failure"],
1032+
find_crashes_in_logs(NodeConfigs, IgnoredCrashes);
1033+
false ->
1034+
ok
1035+
end,
10111036
proplists:delete(rmq_nodes, Config).
10121037

10131038
stop_rabbitmq_node(Config, NodeConfig) ->
@@ -1029,6 +1054,84 @@ stop_rabbitmq_node(Config, NodeConfig) ->
10291054
end,
10301055
NodeConfig.
10311056

1057+
%% Looks for crash reports in the log files of every node described by
%% `NodeConfigs'. Each report found is printed to the common_test log by
%% the helpers called from here. Reports matching an entry of
%% `IgnoredCrashes' are not counted.
%%
%% The total is asserted to be zero, so the caller gets an assertion
%% failure as soon as at least one non-ignored crash report was found.
find_crashes_in_logs(NodeConfigs, IgnoredCrashes) ->
    ct:pal(
      "Looking up any crash reports in the nodes' log files. If we find "
      "some, they will appear below:"),
    %% One count per node, computed in the order the nodes are listed.
    PerNodeCounts = [count_crashes_in_logs(NodeConfig, IgnoredCrashes)
                     || NodeConfig <- NodeConfigs],
    CrashesCount = lists:sum(PerNodeCounts),
    ct:pal("Found ~b crash report(s)", [CrashesCount]),
    ?assertEqual(0, CrashesCount).
1069+
1070+
%% Returns the number of crash reports found across all the log files of a
%% single node.
%%
%% The files to inspect are read from the `log_locations' entry of
%% `NodeConfig' (presumably a list of file paths -- confirm against the code
%% that fills in the node configuration).
count_crashes_in_logs(NodeConfig, IgnoredCrashes) ->
    LogLocations = ?config(log_locations, NodeConfig),
    PerFileCounts = [count_crashes_in_log(LogLocation, IgnoredCrashes)
                     || LogLocation <- LogLocations],
    lists:sum(PerFileCounts).
1077+
1078+
%% Returns the number of crash reports found in the file at `LogLocation'.
%%
%% This is best-effort: if the file cannot be read for any reason, it is
%% treated as containing no crash report and 0 is returned.
count_crashes_in_log(LogLocation, IgnoredCrashes) ->
    ReadResult = file:read_file(LogLocation),
    case ReadResult of
        {ok, Content} ->
            count_crashes_in_content(Content, IgnoredCrashes);
        _Unreadable ->
            0
    end.
1083+
1084+
%% Returns the number of crash reports found in `Content', the full text of
%% a log file.
%%
%% The text is cut at every beginning of line ("^" in multiline mode) and
%% the resulting pieces are handed to the line-based scanner.
count_crashes_in_content(Content, IgnoredCrashes) ->
    Lines = re:split(Content, "^", [multiline]),
    count_gen_server_terminations(Lines, IgnoredCrashes).
1088+
1089+
%% Counts the gen_server termination reports present in `Lines', starting
%% from a count of zero.
count_gen_server_terminations(Lines, IgnoredCrashes) ->
    count_gen_server_terminations(Lines, 0, IgnoredCrashes).

%% Walks the lines one by one looking for the header of a gen_server
%% termination report ("<Pid> ** Generic server ... terminating").
%%
%% When a header is found, the captured "<Pid> " prefix and the remaining
%% lines are passed to capture_gen_server_termination/5, which collects the
%% rest of the report and resumes the scan. `Count' is returned unchanged
%% once all lines have been consumed.
count_gen_server_terminations([], Count, _IgnoredCrashes) ->
    Count;
count_gen_server_terminations([Line | Rest], Count, IgnoredCrashes) ->
    HeaderRe = "(<[0-9.]+> )[*]{2} Generic server .+ terminating$",
    case re:run(Line, HeaderRe, [{capture, all_but_first, list}]) of
        nomatch ->
            count_gen_server_terminations(Rest, Count, IgnoredCrashes);
        {match, [Prefix]} ->
            %% The header line is the first line of the captured report.
            capture_gen_server_termination(
              Rest, Prefix, [Line], Count, IgnoredCrashes)
    end.
1107+
1108+
%% Collects the lines following a gen_server termination header that belong
%% to the same report, i.e. the lines carrying the same "<Pid> " `Prefix'.
%%
%% `Acc' holds the lines captured so far, most recent first. For each line:
%%   * if it carries the prefix and the text after the prefix is listed in
%%     `IgnoredCrashes', the whole report is dropped without being counted
%%     and the regular scan resumes from the current line;
%%   * if it carries the prefix otherwise, it is added to the report;
%%   * if it does not carry the prefix, the report is complete: it is
%%     logged and counted, and the regular scan resumes from the current
%%     line.
%% Reaching the end of the input also completes the report.
capture_gen_server_termination(
  [Line | Rest] = Lines, Prefix, Acc, Count, IgnoredCrashes) ->
    ReOpts = [{capture, all_but_first, list}],
    %% After the prefix we accept a space-led continuation, a "*"-led line
    %% (e.g. "** Reason ...") or nothing at all.
    Ret = re:run(Line, Prefix ++ "( .*|\\*.*|)$", ReOpts),
    case Ret of
        {match, [Suffix]} ->
            case lists:member(Suffix, IgnoredCrashes) of
                false ->
                    capture_gen_server_termination(
                      Rest, Prefix, [Line | Acc], Count, IgnoredCrashes);
                true ->
                    count_gen_server_terminations(
                      Lines, Count, IgnoredCrashes)
            end;
        nomatch ->
            found_gen_server_termination(
              lists:reverse(Acc), Lines, Count, IgnoredCrashes)
    end;
capture_gen_server_termination(
  [] = Rest, _Prefix, Acc, Count, IgnoredCrashes) ->
    found_gen_server_termination(
      lists:reverse(Acc), Rest, Count, IgnoredCrashes).

%% Logs a complete crash report (`Message', the report lines in file order)
%% to the common_test log, counts it and resumes scanning `Lines'.
%%
%% "~ts" is used because the lines are binaries read from a log file and
%% may contain UTF-8 encoded text.
found_gen_server_termination(Message, Lines, Count, IgnoredCrashes) ->
    ct:pal("gen_server termination:~n~n~ts", [Message]),
    count_gen_server_terminations(Lines, Count + 1, IgnoredCrashes).
1134+
10321135
%% -------------------------------------------------------------------
10331136
%% Helpers for partition simulation
10341137
%% -------------------------------------------------------------------
@@ -1346,6 +1449,8 @@ delete_vhost(Config, Node, VHost) ->
13461449
delete_vhost(Config, Node, VHost, Username) ->
13471450
catch rpc(Config, Node, rabbit_vhost, delete, [VHost, Username]).
13481451

1452+
-define(FORCE_VHOST_FAILURE_REASON, force_vhost_failure).
1453+
13491454
force_vhost_failure(Config, VHost) -> force_vhost_failure(Config, 0, VHost).
13501455

13511456
force_vhost_failure(Config, Node, VHost) ->
@@ -1359,7 +1464,8 @@ force_vhost_failure(Config, Node, VHost, Attempts) ->
13591464
try
13601465
MessageStorePid = get_message_store_pid(Config, Node, VHost),
13611466
rpc(Config, Node,
1362-
erlang, exit, [MessageStorePid, force_vhost_failure]),
1467+
erlang, exit,
1468+
[MessageStorePid, ?FORCE_VHOST_FAILURE_REASON]),
13631469
%% Give it a time to fail
13641470
timer:sleep(300),
13651471
force_vhost_failure(Config, Node, VHost, Attempts - 1)

0 commit comments

Comments
 (0)