Skip to content

Commit cbf6a46

Browse files
committed
Drop source node allocations if finish_resize fails
By the time finish_resize runs on the dest host, the instance host/node values are already pointing at the dest (they are set by resize_instance on the source compute before casting to finish_resize on the dest). If finish_resize fails, the instance is essentially stuck on the dest host so rather than revert the allocations (which will drop the new flavor allocations against the dest host where the instance now lives) we should just drop the old flavor allocations on the source node resource provider, which is what this change does. The functional regression recreate test is updated to show this working. Conflicts: nova/tests/functional/regressions/test_bug_1825537.py nova/tests/functional/test_servers.py NOTE(mriedem): The functional test conflicts are due to not having change If6aa37d9b6b48791e070799ab026c816fda4441c in Rocky. The code fix is also different because we don't have change I0851e2d54a1fdc82fe3291fb7e286e790f121e92 in Rocky and cannot use the _delete_allocation_after_move method as documented inline. This also means we have to deal with migration-based and legacy tracked allocations (where the source and dest node allocations are both on the instance consumer). As such, similar logic to _delete_allocation_after_move is used to try and delete the migration-based allocations first and failing that, attempt to cleanup the source node allocations the legacy way by removing them from the "doubled up" allocations created by the scheduler. Since this is new logic in the backport, a functional test is added to cover the legacy cleanup flow. Change-Id: I52c8d038118c858004e17e71b2fba9e9e2714815 Closes-Bug: #1825537 (cherry picked from commit ea297d6) (cherry picked from commit e6c6178)
1 parent 9a977cb commit cbf6a46

File tree

4 files changed

+82
-18
lines changed

4 files changed

+82
-18
lines changed

nova/compute/manager.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4693,12 +4693,50 @@ def finish_resize(self, context, disk_info, image, instance,
46934693
new host machine.
46944694
46954695
"""
4696+
# _finish_resize sets instance.old_flavor to instance.flavor and
4697+
# changes instance.flavor to instance.new_flavor (if doing a resize
4698+
# rather than a cold migration). We save off the old_flavor here in
4699+
# case we need it for error handling below.
4700+
old_flavor = instance.flavor
46964701
try:
46974702
self._finish_resize_helper(context, disk_info, image, instance,
46984703
migration)
46994704
except Exception:
47004705
with excutils.save_and_reraise_exception():
4701-
self._revert_allocation(context, instance, migration)
4706+
# At this point, resize_instance (which runs on the source) has
4707+
# already updated the instance host/node values to point to
4708+
# this (the dest) compute, so we need to leave the allocations
4709+
# against the dest node resource provider intact and drop the
4710+
# allocations against the source node resource provider. If the
4711+
# user tries to recover the server by hard rebooting it, it
4712+
# will happen on this host so that's where the allocations
4713+
# should go.
4714+
LOG.info('Deleting allocations for old flavor on source node '
4715+
'%s after finish_resize failure. You may be able to '
4716+
'recover the instance by hard rebooting it.',
4717+
migration.source_compute, instance=instance)
4718+
# NOTE(mriedem): We can't use _delete_allocation_after_move
4719+
# because it relies on the resource tracker to look up the
4720+
# node uuid and since we are on the dest host, passing the
4721+
# source nodename won't work since the RT isn't tracking that
4722+
# node here. So we just try to remove the migration-based
4723+
# allocations directly and handle the case they don't exist.
4724+
if not self.reportclient.delete_allocation_for_instance(
4725+
context, migration.uuid):
4726+
# No migration-based allocation. Try to cleanup directly.
4727+
cn = objects.ComputeNode.get_by_host_and_nodename(
4728+
context, migration.source_compute,
4729+
migration.source_node)
4730+
if not scheduler_utils.remove_allocation_from_compute(
4731+
context, instance, cn.uuid, self.reportclient,
4732+
flavor=old_flavor):
4733+
LOG.error('Failed to delete allocations for old '
4734+
'flavor %s against source node %s. The '
4735+
'instance is now on the dest node %s. The '
4736+
'allocations against the source node need '
4737+
'to be manually cleaned up in Placement.',
4738+
old_flavor.flavorid, migration.source_node,
4739+
migration.dest_node, instance=instance)
47024740

47034741
def _finish_resize_helper(self, context, disk_info, image, instance,
47044742
migration):

nova/tests/functional/regressions/test_bug_1825537.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,24 @@ class FinishResizeErrorAllocationCleanupTestCase(
2424

2525
compute_driver = 'fake.FakeFinishMigrationFailDriver'
2626

27+
# ProviderUsageBaseTestCase uses the AllServicesCurrent fixture which
28+
# means we'll use migration-based allocations by default. This flag allows
29+
# us to control the logic in conductor to handle legacy allocations where
30+
# the source (old flavor) and dest (new flavor) node allocations are
31+
# doubled up on the instance.
32+
migration_based_allocations = True
33+
2734
def setUp(self):
2835
super(FinishResizeErrorAllocationCleanupTestCase, self).setUp()
2936
# Get the flavors we're going to use.
3037
flavors = self.api.get_flavors()
3138
self.flavor1 = flavors[0]
3239
self.flavor2 = flavors[1]
3340

41+
self.stub_out('nova.conductor.tasks.migrate.'
42+
'should_do_migration_allocation',
43+
lambda *args, **kwargs: self.migration_based_allocations)
44+
3445
def _resize_and_assert_error(self, server, dest_host):
3546
# Now resize the server and wait for it to go to ERROR status because
3647
# the finish_migration virt driver method in host2 should fail.
@@ -67,16 +78,20 @@ def test_finish_resize_fails_allocation_cleanup(self):
6778
# allocations should still exist with the new flavor.
6879
source_rp_uuid = self._get_provider_uuid_by_host('host1')
6980
dest_rp_uuid = self._get_provider_uuid_by_host('host2')
70-
# FIXME(mriedem): This is bug 1825537 where the allocations are
71-
# reverted when finish_resize fails so the dest node resource provider
72-
# does not have any allocations and the instance allocations are for
73-
# the old flavor on the source node resource provider even though the
74-
# instance is not running on the source host nor pointed at the source
75-
# host in the DB.
76-
# self.assertFlavorMatchesAllocation(
77-
# self.flavor2, server['id'], dest_rp_uuid)
7881
dest_rp_usages = self._get_provider_usages(dest_rp_uuid)
82+
self.assertFlavorMatchesAllocation(self.flavor2, dest_rp_usages)
83+
# And the source node provider should not have any usage.
84+
source_rp_usages = self._get_provider_usages(source_rp_uuid)
7985
no_usage = {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}
80-
self.assertEqual(no_usage, dest_rp_usages)
81-
source_usages = self._get_provider_usages(source_rp_uuid)
82-
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
86+
self.assertEqual(no_usage, source_rp_usages)
87+
88+
89+
class FinishResizeErrorAllocationCleanupLegacyTestCase(
90+
FinishResizeErrorAllocationCleanupTestCase):
91+
"""Variant of FinishResizeErrorAllocationCleanupTestCase which does not
92+
use migration-based allocations, e.g. tests the scenario that there are
93+
older computes in the deployment so the source and dest node allocations
94+
are doubled up on the instance consumer record rather than the migration
95+
record.
96+
"""
97+
migration_based_allocations = False

nova/tests/functional/test_servers.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3088,10 +3088,13 @@ def fake_resize_method(*args, **kwargs):
30883088
# Ensure the allocation records still exist on the host.
30893089
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
30903090
source_usages = self._get_provider_usages(source_rp_uuid)
3091-
# FIXME(mriedem): This is wrong for the _finish_resize case.
3092-
# The new_flavor should have been subtracted from the doubled
3093-
# allocation which just leaves us with the original flavor.
3094-
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
3091+
if failing_method == '_finish_resize':
3092+
# finish_resize will drop the old flavor allocations.
3093+
self.assertFlavorMatchesAllocation(self.flavor2, source_usages)
3094+
else:
3095+
# The new_flavor should have been subtracted from the doubled
3096+
# allocation which just leaves us with the original flavor.
3097+
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
30953098

30963099
def test_resize_to_same_host_prep_resize_fails(self):
30973100
self._test_resize_to_same_host_instance_fails(

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6675,7 +6675,9 @@ def _mock_finish_resize(self):
66756675
test_instance_fault.fake_faults['fake-uuid'][0])
66766676
yield _finish_resize
66776677

6678-
def test_finish_resize_failure(self):
6678+
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6679+
'delete_allocation_for_instance')
6680+
def test_finish_resize_failure(self, mock_del_allocs):
66796681
self.migration.status = 'post-migrating'
66806682

66816683
with self._mock_finish_resize() as _finish_resize:
@@ -6689,10 +6691,14 @@ def test_finish_resize_failure(self):
66896691

66906692
# Assert that we set the migration to an error state
66916693
self.assertEqual("error", self.migration.status)
6694+
mock_del_allocs.assert_called_once_with(
6695+
self.context, self.migration.uuid)
66926696

6697+
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6698+
'delete_allocation_for_instance')
66936699
@mock.patch('nova.compute.manager.ComputeManager.'
66946700
'_notify_about_instance_usage')
6695-
def test_finish_resize_notify_failure(self, notify):
6701+
def test_finish_resize_notify_failure(self, notify, mock_del_allocs):
66966702
self.migration.status = 'post-migrating'
66976703

66986704
with self._mock_finish_resize():
@@ -6706,6 +6712,8 @@ def test_finish_resize_notify_failure(self, notify):
67066712

67076713
# Assert that we did not set the migration to an error state
67086714
self.assertEqual('post-migrating', self.migration.status)
6715+
mock_del_allocs.assert_called_once_with(
6716+
self.context, self.migration.uuid)
67096717

67106718
@contextlib.contextmanager
67116719
def _mock_resize_instance(self):

0 commit comments

Comments
 (0)