Skip to content

Commit 73f55b2

Browse files
Merge pull request #4486 from rabbitmq/mergify/bp/v3.9.x/pr-4481
Fix migrate leadership timeout (backport #4468) (backport #4481)
2 parents bc7016a + 5f71db9 commit 73f55b2

File tree

2 files changed

+38
-6
lines changed

2 files changed

+38
-6
lines changed

deps/rabbit/src/rabbit_maintenance.erl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -270,15 +270,15 @@ transfer_leadership_of_classic_mirrored_queues(TransferCandidates) ->
270270
[rabbit_misc:rs(Name), readable_candidate_list(ExistingReplicaNodes)]),
271271
case random_primary_replica_transfer_candidate_node(TransferCandidates, ExistingReplicaNodes) of
272272
{ok, Pick} ->
273-
rabbit_log:debug("Will transfer leadership of local ~s to node ~s",
273+
rabbit_log:debug("Will transfer leadership of local ~s. Planned target node: ~s",
274274
[rabbit_misc:rs(Name), Pick]),
275275
case rabbit_mirror_queue_misc:migrate_leadership_to_existing_replica(Q, Pick) of
276-
{migrated, _} ->
276+
{migrated, NewPrimary} ->
277277
rabbit_log:debug("Successfully transferred leadership of queue ~s to node ~s",
278-
[rabbit_misc:rs(Name), Pick]);
278+
[rabbit_misc:rs(Name), NewPrimary]);
279279
Other ->
280-
rabbit_log:warning("Could not transfer leadership of queue ~s to node ~s: ~p",
281-
[rabbit_misc:rs(Name), Pick, Other])
280+
rabbit_log:warning("Could not transfer leadership of queue ~s: ~p",
281+
[rabbit_misc:rs(Name), Other])
282282
end;
283283
undefined ->
284284
rabbit_log:warning("Could not transfer leadership of queue ~s: no suitable candidates?",

deps/rabbit/src/rabbit_mirror_queue_misc.erl

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,8 @@ transfer_leadership(Q, Destination) ->
582582
%% Moves the primary replica (leader) of a classic mirrored queue to another node
583583
%% which already hosts a replica of this queue. In this case we can stop
584584
%% fewer replicas and reduce the load the operation has on the cluster.
585+
%% Note that there is no guarantee that the queue will actually end up on the
586+
%% destination node. The actual destination node is returned.
585587
migrate_leadership_to_existing_replica(Q, Destination) ->
586588
QName = amqqueue:get_name(Q),
587589
{PreTransferPrimaryNode, PreTransferMirrorNodes, _PreTransferInSyncMirrorNodes} = actual_queue_nodes(Q),
@@ -596,7 +598,7 @@ migrate_leadership_to_existing_replica(Q, Destination) ->
596598
NodesToDropMirrorsOn = [PreTransferPrimaryNode],
597599
drop_mirrors(QName, NodesToDropMirrorsOn),
598600

599-
case wait_for_new_master(QName, Destination) of
601+
case wait_for_different_master(QName, PreTransferPrimaryNode) of
600602
not_migrated ->
601603
{not_migrated, undefined};
602604
{{not_migrated, Destination} = Result, _Q1} ->
@@ -635,6 +637,36 @@ wait_for_new_master(QName, Destination, N) ->
635637
end
636638
end.
637639

640+
-spec wait_for_different_master(rabbit_amqqueue:name(), atom()) -> {{migrated, node()}, amqqueue:amqqueue()} | {{not_migrated, node()}, amqqueue:amqqueue()} | not_migrated.
641+
wait_for_different_master(QName, Source) ->
642+
wait_for_different_master(QName, Source, 100).
643+
644+
wait_for_different_master(QName, _, 0) ->
645+
case rabbit_amqqueue:lookup(QName) of
646+
{error, not_found} -> not_migrated;
647+
{ok, Q} -> {{not_migrated, undefined}, Q}
648+
end;
649+
wait_for_different_master(QName, Source, N) ->
650+
case rabbit_amqqueue:lookup(QName) of
651+
{error, not_found} ->
652+
not_migrated;
653+
{ok, Q} ->
654+
case amqqueue:get_pid(Q) of
655+
none ->
656+
timer:sleep(100),
657+
wait_for_different_master(QName, Source, N - 1);
658+
Pid ->
659+
case node(Pid) of
660+
Source ->
661+
timer:sleep(100),
662+
wait_for_different_master(QName, Source, N - 1);
663+
Destination ->
664+
{{migrated, Destination}, Q}
665+
end
666+
end
667+
end.
668+
669+
638670
%% The arrival of a newly synced mirror may cause the master to die if
639671
%% the policy does not want the master but it has been kept alive
640672
%% because there were no synced mirrors.

0 commit comments

Comments
 (0)