Commit 37ac54a

Delete allocations even if _confirm_resize raises
When we are confirming a resize, the guest is on the dest host and the
instance host/node values in the database are pointing at the dest host, so
the _confirm_resize method on the source is really best effort. If something
fails, we should not leak allocations in placement for the source compute
node resource provider since the instance is not actually consuming the
source node provider resources.

This change refactors the error handling around the _confirm_resize call so
that the big nesting for _error_out_instance_on_exception is moved to
confirm_resize, and a try/finally is added around _confirm_resize so we can
be sure to try to clean up the allocations even if _confirm_resize fails in
some obscure way. If _confirm_resize does fail, the error is re-raised along
with a logged traceback and a hint about how to correct the instance state
in the DB by hard rebooting the server on the dest host.

Conflicts:
    nova/compute/manager.py
    nova/tests/unit/compute/test_compute_mgr.py

NOTE(mriedem): The manager.py conflict is due to not having change
Ibb8c12fb2799bb5ceb9e3d72a2b86dbb4f14451e in Rocky. In addition, since change
I0851e2d54a1fdc82fe3291fb7e286e790f121e92 is not in Rocky, the
_delete_allocation_after_move signature needs to be handled a bit differently
in this backport. The test_compute_mgr.py conflict and the source_node
addition in setUp are due to the same two changes.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
(cherry picked from commit 03a6d26)
(cherry picked from commit 5f51506)
1 parent 2a25d1e commit 37ac54a
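The refactor hinges on standard try/except/finally ordering: the finally
block runs whether _confirm_resize succeeds or raises, and the except block
only logs before re-raising. Below is a minimal, self-contained sketch of
that pattern in plain Python; the callables are hypothetical stand-ins for
_confirm_resize and _delete_allocation_after_move, not the actual nova
methods.

import logging

LOG = logging.getLogger(__name__)


def confirm_with_guaranteed_cleanup(confirm_source, delete_allocations):
    """Sketch of the new confirm_resize error handling.

    confirm_source and delete_allocations are hypothetical callables
    standing in for _confirm_resize and _delete_allocation_after_move.
    """
    try:
        confirm_source()
    except Exception:
        # Log a traceback and re-raise, similar in spirit to wrapping the
        # handler in excutils.save_and_reraise_exception(logger=LOG).
        LOG.exception('Confirm resize failed on the source host; placement '
                      'allocations will be removed regardless.')
        raise
    finally:
        # Runs on both the success and the failure path, so the source-node
        # allocations are never leaked.
        delete_allocations()


def failing_confirm():
    raise RuntimeError('source host cleanup failed')


try:
    confirm_with_guaranteed_cleanup(
        failing_confirm, lambda: print('allocations deleted'))
except RuntimeError:
    pass  # re-raised as expected; 'allocations deleted' was still printed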

2 files changed: 89 additions & 57 deletions


nova/compute/manager.py

Lines changed: 72 additions & 52 deletions
@@ -3875,7 +3875,31 @@ def do_confirm_resize(context, instance, migration_id):
                           instance=instance)
                 return
 
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                old_instance_type = instance.old_flavor
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration, old_instance_type,
+                        migration.source_node)
 
         do_confirm_resize(context, instance, migration.id)

@@ -3887,63 +3911,59 @@ def _confirm_resize(self, context, instance, migration=None):
             self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
             phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
-
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                  instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            migration.status = 'confirmed'
-            with migration.obj_as_admin():
-                migration.save()
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                              instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            rt = self._get_resource_tracker()
-            rt.drop_move_claim(context, instance, migration.source_node,
-                               old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration,
-                                               old_instance_type,
-                                               migration.source_node)
-            instance.drop_migration_context()
+        migration.status = 'confirmed'
+        with migration.obj_as_admin():
+            migration.save()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        rt = self._get_resource_tracker()
+        rt.drop_move_claim(context, instance, migration.source_node,
+                           old_instance_type, prefix='old_')
+        instance.drop_migration_context()
+
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+            self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+            phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration,
                                       flavor, nodename):

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 17 additions & 5 deletions
@@ -6636,6 +6636,7 @@ def setUp(self):
             dest_node=None,
             dest_host=None,
             source_compute='source_compute',
+            source_node='source_node',
             status='migrating')
         self.migration.save = mock.MagicMock()
         self.useFixture(fixtures.SpawnIsSynchronousFixture())
@@ -6995,6 +6996,8 @@ def do_finish_revert_resize(mock_attachment_complete,
         do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7005,12 +7008,15 @@ def test_confirm_resize_deletes_allocations(self):
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete, mock_get_rt,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration,
                                                 self.instance.old_flavor,
@@ -7066,9 +7072,10 @@ def test_confirm_resize_driver_confirm_migration_fails(
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7082,6 +7089,11 @@ def test_confirm_resize_driver_confirm_migration_fails(
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration,
+            self.instance.old_flavor, self.migration.source_node)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
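One detail worth calling out in the updated test: stacked mock.patch
decorators are applied bottom-up, so the mock for the decorator closest to
the function is passed first, and mocks for decorators added at the top of
the stack arrive last. That is why the two new patches for
Instance.get_by_uuid and Migration.get_by_id show up as the trailing
mock_mig_get and mock_inst_get parameters of do_confirm_resize. A tiny
self-contained illustration follows; the patched os functions are arbitrary
examples, not anything the nova test touches.

from unittest import mock


@mock.patch('os.getcwd')  # top of the stack -> injected last
@mock.patch('os.getpid')  # bottom of the stack -> injected first
def show_patch_order(mock_getpid, mock_getcwd):
    # Both targets are replaced by MagicMock objects for the duration of
    # this call; argument order mirrors the bottom-up decorator order.
    assert isinstance(mock_getpid, mock.MagicMock)
    assert isinstance(mock_getcwd, mock.MagicMock)


show_patch_order()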
