Skip to content

Commit 069bda3

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Drop source node allocations if finish_resize fails" into stable/rocky
2 parents 3728321 + cbf6a46 commit 069bda3

File tree

4 files changed

+82
-18
lines changed

4 files changed

+82
-18
lines changed

nova/compute/manager.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4727,12 +4727,50 @@ def finish_resize(self, context, disk_info, image, instance,
47274727
new host machine.
47284728
47294729
"""
4730+
# _finish_resize sets instance.old_flavor to instance.flavor and
4731+
# changes instance.flavor to instance.new_flavor (if doing a resize
4732+
# rather than a cold migration). We save off the old_flavor here in
4733+
# case we need it for error handling below.
4734+
old_flavor = instance.flavor
47304735
try:
47314736
self._finish_resize_helper(context, disk_info, image, instance,
47324737
migration)
47334738
except Exception:
47344739
with excutils.save_and_reraise_exception():
4735-
self._revert_allocation(context, instance, migration)
4740+
# At this point, resize_instance (which runs on the source) has
4741+
# already updated the instance host/node values to point to
4742+
# this (the dest) compute, so we need to leave the allocations
4743+
# against the dest node resource provider intact and drop the
4744+
# allocations against the source node resource provider. If the
4745+
# user tries to recover the server by hard rebooting it, it
4746+
# will happen on this host so that's where the allocations
4747+
# should go.
4748+
LOG.info('Deleting allocations for old flavor on source node '
4749+
'%s after finish_resize failure. You may be able to '
4750+
'recover the instance by hard rebooting it.',
4751+
migration.source_compute, instance=instance)
4752+
# NOTE(mriedem): We can't use _delete_allocation_after_move
4753+
# because it relies on the resource tracker to look up the
4754+
# node uuid and since we are on the dest host, passing the
4755+
# source nodename won't work since the RT isn't tracking that
4756+
# node here. So we just try to remove the migration-based
4757+
# allocations directly and handle the case they don't exist.
4758+
if not self.reportclient.delete_allocation_for_instance(
4759+
context, migration.uuid):
4760+
# No migration-based allocation. Try to cleanup directly.
4761+
cn = objects.ComputeNode.get_by_host_and_nodename(
4762+
context, migration.source_compute,
4763+
migration.source_node)
4764+
if not scheduler_utils.remove_allocation_from_compute(
4765+
context, instance, cn.uuid, self.reportclient,
4766+
flavor=old_flavor):
4767+
LOG.error('Failed to delete allocations for old '
4768+
'flavor %s against source node %s. The '
4769+
'instance is now on the dest node %s. The '
4770+
'allocations against the source node need '
4771+
'to be manually cleaned up in Placement.',
4772+
old_flavor.flavorid, migration.source_node,
4773+
migration.dest_node, instance=instance)
47364774

47374775
def _finish_resize_helper(self, context, disk_info, image, instance,
47384776
migration):

nova/tests/functional/regressions/test_bug_1825537.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,24 @@ class FinishResizeErrorAllocationCleanupTestCase(
2424

2525
compute_driver = 'fake.FakeFinishMigrationFailDriver'
2626

27+
# ProviderUsageBaseTestCase uses the AllServicesCurrent fixture which
28+
# means we'll use migration-based allocations by default. This flag allows
29+
# us to control the logic in conductor to handle legacy allocations where
30+
# the source (old flavor) and dest (new flavor) node allocations are
31+
# doubled up on the instance.
32+
migration_based_allocations = True
33+
2734
def setUp(self):
2835
super(FinishResizeErrorAllocationCleanupTestCase, self).setUp()
2936
# Get the flavors we're going to use.
3037
flavors = self.api.get_flavors()
3138
self.flavor1 = flavors[0]
3239
self.flavor2 = flavors[1]
3340

41+
self.stub_out('nova.conductor.tasks.migrate.'
42+
'should_do_migration_allocation',
43+
lambda *args, **kwargs: self.migration_based_allocations)
44+
3445
def _resize_and_assert_error(self, server, dest_host):
3546
# Now resize the server and wait for it to go to ERROR status because
3647
# the finish_migration virt driver method in host2 should fail.
@@ -67,16 +78,20 @@ def test_finish_resize_fails_allocation_cleanup(self):
6778
# allocations should still exist with the new flavor.
6879
source_rp_uuid = self._get_provider_uuid_by_host('host1')
6980
dest_rp_uuid = self._get_provider_uuid_by_host('host2')
70-
# FIXME(mriedem): This is bug 1825537 where the allocations are
71-
# reverted when finish_resize fails so the dest node resource provider
72-
# does not have any allocations and the instance allocations are for
73-
# the old flavor on the source node resource provider even though the
74-
# instance is not running on the source host nor pointed at the source
75-
# host in the DB.
76-
# self.assertFlavorMatchesAllocation(
77-
# self.flavor2, server['id'], dest_rp_uuid)
7881
dest_rp_usages = self._get_provider_usages(dest_rp_uuid)
82+
self.assertFlavorMatchesAllocation(self.flavor2, dest_rp_usages)
83+
# And the source node provider should not have any usage.
84+
source_rp_usages = self._get_provider_usages(source_rp_uuid)
7985
no_usage = {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}
80-
self.assertEqual(no_usage, dest_rp_usages)
81-
source_usages = self._get_provider_usages(source_rp_uuid)
82-
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
86+
self.assertEqual(no_usage, source_rp_usages)
87+
88+
89+
class FinishResizeErrorAllocationCleanupLegacyTestCase(
90+
FinishResizeErrorAllocationCleanupTestCase):
91+
"""Variant of FinishResizeErrorAllocationCleanupTestCase which does not
92+
use migration-based allocations, e.g. tests the scenario that there are
93+
older computes in the deployment so the source and dest node allocations
94+
are doubled up on the instance consumer record rather than the migration
95+
record.
96+
"""
97+
migration_based_allocations = False

nova/tests/functional/test_servers.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3088,10 +3088,13 @@ def fake_resize_method(*args, **kwargs):
30883088
# Ensure the allocation records still exist on the host.
30893089
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
30903090
source_usages = self._get_provider_usages(source_rp_uuid)
3091-
# FIXME(mriedem): This is wrong for the _finish_resize case.
3092-
# The new_flavor should have been subtracted from the doubled
3093-
# allocation which just leaves us with the original flavor.
3094-
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
3091+
if failing_method == '_finish_resize':
3092+
# finish_resize will drop the old flavor allocations.
3093+
self.assertFlavorMatchesAllocation(self.flavor2, source_usages)
3094+
else:
3095+
# The new_flavor should have been subtracted from the doubled
3096+
# allocation which just leaves us with the original flavor.
3097+
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
30953098

30963099
def test_resize_to_same_host_prep_resize_fails(self):
30973100
self._test_resize_to_same_host_instance_fails(

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6766,7 +6766,9 @@ def _mock_finish_resize(self):
67666766
test_instance_fault.fake_faults['fake-uuid'][0])
67676767
yield _finish_resize
67686768

6769-
def test_finish_resize_failure(self):
6769+
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6770+
'delete_allocation_for_instance')
6771+
def test_finish_resize_failure(self, mock_del_allocs):
67706772
self.migration.status = 'post-migrating'
67716773

67726774
with self._mock_finish_resize() as _finish_resize:
@@ -6780,10 +6782,14 @@ def test_finish_resize_failure(self):
67806782

67816783
# Assert that we set the migration to an error state
67826784
self.assertEqual("error", self.migration.status)
6785+
mock_del_allocs.assert_called_once_with(
6786+
self.context, self.migration.uuid)
67836787

6788+
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6789+
'delete_allocation_for_instance')
67846790
@mock.patch('nova.compute.manager.ComputeManager.'
67856791
'_notify_about_instance_usage')
6786-
def test_finish_resize_notify_failure(self, notify):
6792+
def test_finish_resize_notify_failure(self, notify, mock_del_allocs):
67876793
self.migration.status = 'post-migrating'
67886794

67896795
with self._mock_finish_resize():
@@ -6797,6 +6803,8 @@ def test_finish_resize_notify_failure(self, notify):
67976803

67986804
# Assert that we did not set the migration to an error state
67996805
self.assertEqual('post-migrating', self.migration.status)
6806+
mock_del_allocs.assert_called_once_with(
6807+
self.context, self.migration.uuid)
68006808

68016809
@contextlib.contextmanager
68026810
def _mock_resize_instance(self):

0 commit comments

Comments
 (0)