Skip to content

Commit 4ebaf7d

Browse files
committed
rabbit_khepri: Fix error handling with net_adm:ping/1
[Why] The initial implementation was a bit too optimistic: it just asserted that `net_adm:ping/1` returned pong. This led to a crash if it was not the case. [How] It is better to handle an error from `net_adm:ping/1` and return something appropriate instead of crashing. The rest of the function already does that. It improves the integration with peer discovery.
1 parent b7604e2 commit 4ebaf7d

File tree

1 file changed

+31
-27
lines changed

1 file changed

+31
-27
lines changed

deps/rabbit/src/rabbit_khepri.erl

Lines changed: 31 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -342,35 +342,39 @@ do_join(RemoteNode) when RemoteNode =/= node() ->
342342
khepri:info(?RA_CLUSTER_NAME),
343343

344344
%% Ensure the remote node is reachable before we add it.
345-
pong = net_adm:ping(RemoteNode),
345+
case net_adm:ping(RemoteNode) of
346+
pong ->
347+
%% We verify the cluster membership before adding `ThisNode' to
348+
%% `RemoteNode''s cluster. We do it mostly to keep the same
349+
%% behavior as what we do with Mnesia. Otherwise, the interest is
350+
%% limited given the check and the actual join are not atomic.
346351

347-
%% We verify the cluster membership before adding `ThisNode' to
348-
%% `RemoteNode''s cluster. We do it mostly to keep the same behavior as
349-
%% what we do with Mnesia. Otherwise, the interest is limited given the
350-
%% check and the actual join are not atomic.
351-
352-
?LOG_DEBUG(
353-
"Adding this node (~p) to Khepri cluster \"~s\" through node ~p",
354-
[ThisNode, ?RA_CLUSTER_NAME, RemoteNode],
355-
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
356-
357-
%% If the remote node to add is running RabbitMQ, we need to put it in
358-
%% maintenance mode at least. We remember that state to revive the node
359-
%% only if it was fully running before this code.
360-
IsRunning = rabbit:is_running(ThisNode),
361-
AlreadyBeingDrained =
362-
rabbit_maintenance:is_being_drained_consistent_read(ThisNode),
363-
NeedToRevive = IsRunning andalso not AlreadyBeingDrained,
364-
maybe_drain_node(IsRunning),
365-
366-
%% Joining a cluster includes a reset of the local Khepri store.
367-
Ret = khepri_cluster:join(?RA_CLUSTER_NAME, RemoteNode),
368-
369-
%% Revive the remote node if it was running and not under maintenance
370-
%% before we changed the cluster membership.
371-
maybe_revive_node(NeedToRevive),
352+
?LOG_DEBUG(
353+
"Adding this node (~p) to Khepri cluster \"~s\" through "
354+
"node ~p",
355+
[ThisNode, ?RA_CLUSTER_NAME, RemoteNode],
356+
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
372357

373-
Ret.
358+
%% If the remote node to add is running RabbitMQ, we need to put it
359+
%% in maintenance mode at least. We remember that state to revive
360+
%% the node only if it was fully running before this code.
361+
IsRunning = rabbit:is_running(ThisNode),
362+
AlreadyBeingDrained =
363+
rabbit_maintenance:is_being_drained_consistent_read(ThisNode),
364+
NeedToRevive = IsRunning andalso not AlreadyBeingDrained,
365+
maybe_drain_node(IsRunning),
366+
367+
%% Joining a cluster includes a reset of the local Khepri store.
368+
Ret = khepri_cluster:join(?RA_CLUSTER_NAME, RemoteNode),
369+
370+
%% Revive the remote node if it was running and not under
371+
%% maintenance before we changed the cluster membership.
372+
maybe_revive_node(NeedToRevive),
373+
374+
Ret;
375+
pang ->
376+
{error, {node_unreachable, RemoteNode}}
377+
end.
374378

375379
maybe_drain_node(true) ->
376380
ok = rabbit_maintenance:drain();

0 commit comments

Comments
 (0)