Skip to content

Commit 51e2837

Browse files
felixhuettnermergify-bot
authored andcommitted
Fix migrate leadership timeout
The code of `migrate_leadership_to_existing_replica` previously assumed that it can control the target node of a failover of mirrored queues. While its sibling function `transfer_leadership` can do that (as it drops all mirrors besides the one on the target node) `migrate_leadership_to_existing_replica` tries to be less intrusive and only tries to migrate the queue away for the current primary. However it then used `wait_for_new_master` to wait for the queue to actually transfer the primary to the specified target node. As the specified target node might not ever become the primary of the queue (as this decission is only made between the remaining mirrors) this can cause the migrate operation to run into a timeout (10 sec per default). As `migrate_leadership_to_existing_replica` is only used during `transfer_leadership_of_classic_mirrored_queues` its only goal is to get the primary away from the current node. Therefor we can just wait for the queue to become active on some other node instead of expecting a specific node to become the primary. (cherry picked from commit fa0a8de)
1 parent d58684c commit 51e2837

File tree

2 files changed

+38
-6
lines changed

2 files changed

+38
-6
lines changed

deps/rabbit/src/rabbit_maintenance.erl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -270,15 +270,15 @@ transfer_leadership_of_classic_mirrored_queues(TransferCandidates) ->
270270
[rabbit_misc:rs(Name), readable_candidate_list(ExistingReplicaNodes)]),
271271
case random_primary_replica_transfer_candidate_node(TransferCandidates, ExistingReplicaNodes) of
272272
{ok, Pick} ->
273-
rabbit_log:debug("Will transfer leadership of local ~s to node ~s",
273+
rabbit_log:debug("Will transfer leadership of local ~s. Planned target node: ~s",
274274
[rabbit_misc:rs(Name), Pick]),
275275
case rabbit_mirror_queue_misc:migrate_leadership_to_existing_replica(Q, Pick) of
276-
{migrated, _} ->
276+
{migrated, NewPrimary} ->
277277
rabbit_log:debug("Successfully transferred leadership of queue ~s to node ~s",
278-
[rabbit_misc:rs(Name), Pick]);
278+
[rabbit_misc:rs(Name), NewPrimary]);
279279
Other ->
280-
rabbit_log:warning("Could not transfer leadership of queue ~s to node ~s: ~p",
281-
[rabbit_misc:rs(Name), Pick, Other])
280+
rabbit_log:warning("Could not transfer leadership of queue ~s: ~p",
281+
[rabbit_misc:rs(Name), Other])
282282
end;
283283
undefined ->
284284
rabbit_log:warning("Could not transfer leadership of queue ~s: no suitable candidates?",

deps/rabbit/src/rabbit_mirror_queue_misc.erl

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,8 @@ transfer_leadership(Q, Destination) ->
602602
%% Moves the primary replica (leader) of a classic mirrored queue to another node
603603
%% which already hosts a replica of this queue. In this case we can stop
604604
%% fewer replicas and reduce the load the operation has on the cluster.
605+
%% Note that there is no guarantee that the queue will actually end up on the
606+
%% destination node. The actual destination node is returned.
605607
migrate_leadership_to_existing_replica(Q, Destination) ->
606608
QName = amqqueue:get_name(Q),
607609
{PreTransferPrimaryNode, PreTransferMirrorNodes, _PreTransferInSyncMirrorNodes} = actual_queue_nodes(Q),
@@ -616,7 +618,7 @@ migrate_leadership_to_existing_replica(Q, Destination) ->
616618
NodesToDropMirrorsOn = [PreTransferPrimaryNode],
617619
drop_mirrors(QName, NodesToDropMirrorsOn),
618620

619-
case wait_for_new_master(QName, Destination) of
621+
case wait_for_different_master(QName, PreTransferPrimaryNode) of
620622
not_migrated ->
621623
{not_migrated, undefined};
622624
{{not_migrated, Destination} = Result, _Q1} ->
@@ -655,6 +657,36 @@ wait_for_new_master(QName, Destination, N) ->
655657
end
656658
end.
657659

660+
-spec wait_for_different_master(rabbit_amqqueue:name(), atom()) -> {{migrated, node()}, amqqueue:amqqueue()} | {{not_migrated, node()}, amqqueue:amqqueue()} | not_migrated.
661+
wait_for_different_master(QName, Source) ->
662+
wait_for_different_master(QName, Source, 100).
663+
664+
wait_for_different_master(QName, _, 0) ->
665+
case rabbit_amqqueue:lookup(QName) of
666+
{error, not_found} -> not_migrated;
667+
{ok, Q} -> {{not_migrated, undefined}, Q}
668+
end;
669+
wait_for_different_master(QName, Source, N) ->
670+
case rabbit_amqqueue:lookup(QName) of
671+
{error, not_found} ->
672+
not_migrated;
673+
{ok, Q} ->
674+
case amqqueue:get_pid(Q) of
675+
none ->
676+
timer:sleep(100),
677+
wait_for_different_master(QName, Source, N - 1);
678+
Pid ->
679+
case node(Pid) of
680+
Source ->
681+
timer:sleep(100),
682+
wait_for_different_master(QName, Source, N - 1);
683+
Destination ->
684+
{{migrated, Destination}, Q}
685+
end
686+
end
687+
end.
688+
689+
658690
%% The arrival of a newly synced mirror may cause the master to die if
659691
%% the policy does not want the master but it has been kept alive
660692
%% because there were no synced mirrors.

0 commit comments

Comments
 (0)