Skip to content

Commit 03a6d26

Browse files
committed
Delete allocations even if _confirm_resize raises
When we are confirming a resize, the guest is on the dest host and the instance host/node values in the database are pointing at the dest host, so the _confirm_resize method on the source is really best effort. If something fails, we should not leak allocations in placement for the source compute node resource provider since the instance is not actually consuming the source node provider resources. This change refactors the error handling around the _confirm_resize call so the big nesting for _error_out_instance_on_exception is moved to confirm_resize and then a try/finally is added around _confirm_resize so we can be sure to try and cleanup the allocations even if _confirm_resize fails in some obscure way. If _confirm_resize does fail, the error gets re-raised along with logging a traceback and hint about how to correct the instance state in the DB by hard rebooting the server on the dest host.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
1 parent 28944d0 commit 03a6d26

File tree

2 files changed

+83
-53
lines changed

2 files changed

+83
-53
lines changed

nova/compute/manager.py

Lines changed: 68 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3973,7 +3973,29 @@ def do_confirm_resize(context, instance, migration_id):
39733973
instance=instance)
39743974
return
39753975

3976-
self._confirm_resize(context, instance, migration=migration)
3976+
with self._error_out_instance_on_exception(context, instance):
3977+
try:
3978+
self._confirm_resize(
3979+
context, instance, migration=migration)
3980+
except Exception:
3981+
# Something failed when cleaning up the source host so
3982+
# log a traceback and leave a hint about hard rebooting
3983+
# the server to correct its state in the DB.
3984+
with excutils.save_and_reraise_exception(logger=LOG):
3985+
LOG.exception(
3986+
'Confirm resize failed on source host %s. '
3987+
'Resource allocations in the placement service '
3988+
'will be removed regardless because the instance '
3989+
'is now on the destination host %s. You can try '
3990+
'hard rebooting the instance to correct its '
3991+
'state.', self.host, migration.dest_compute,
3992+
instance=instance)
3993+
finally:
3994+
# Whether an error occurred or not, at this point the
3995+
# instance is on the dest host so to avoid leaking
3996+
# allocations in placement, delete them here.
3997+
self._delete_allocation_after_move(
3998+
context, instance, migration)
39773999

39784000
do_confirm_resize(context, instance, migration.id)
39794001

@@ -3985,59 +4007,57 @@ def _confirm_resize(self, context, instance, migration=None):
39854007
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
39864008
phase=fields.NotificationPhase.START)
39874009

3988-
with self._error_out_instance_on_exception(context, instance):
3989-
# NOTE(danms): delete stashed migration information
3990-
old_instance_type = instance.old_flavor
3991-
instance.old_flavor = None
3992-
instance.new_flavor = None
3993-
instance.system_metadata.pop('old_vm_state', None)
3994-
instance.save()
3995-
3996-
# NOTE(tr3buchet): tear down networks on source host
3997-
self.network_api.setup_networks_on_host(context, instance,
3998-
migration.source_compute, teardown=True)
4010+
# NOTE(danms): delete stashed migration information
4011+
old_instance_type = instance.old_flavor
4012+
instance.old_flavor = None
4013+
instance.new_flavor = None
4014+
instance.system_metadata.pop('old_vm_state', None)
4015+
instance.save()
39994016

4000-
network_info = self.network_api.get_instance_nw_info(context,
4001-
instance)
4002-
# TODO(mriedem): Get BDMs here and pass them to the driver.
4003-
self.driver.confirm_migration(context, migration, instance,
4004-
network_info)
4017+
# NOTE(tr3buchet): tear down networks on source host
4018+
self.network_api.setup_networks_on_host(context, instance,
4019+
migration.source_compute, teardown=True)
40054020

4006-
migration.status = 'confirmed'
4007-
migration.save()
4021+
network_info = self.network_api.get_instance_nw_info(context,
4022+
instance)
4023+
# TODO(mriedem): Get BDMs here and pass them to the driver.
4024+
self.driver.confirm_migration(context, migration, instance,
4025+
network_info)
40084026

4009-
self.rt.drop_move_claim(context, instance, migration.source_node,
4010-
old_instance_type, prefix='old_')
4011-
self._delete_allocation_after_move(context, instance, migration)
4012-
instance.drop_migration_context()
4027+
migration.status = 'confirmed'
4028+
migration.save()
40134029

4014-
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
4015-
# might have manually powered up the instance to confirm the
4016-
# resize/migrate, so we need to check the current power state
4017-
# on the instance and set the vm_state appropriately. We default
4018-
# to ACTIVE because if the power state is not SHUTDOWN, we
4019-
# assume _sync_instance_power_state will clean it up.
4020-
p_state = instance.power_state
4021-
vm_state = None
4022-
if p_state == power_state.SHUTDOWN:
4023-
vm_state = vm_states.STOPPED
4024-
LOG.debug("Resized/migrated instance is powered off. "
4025-
"Setting vm_state to '%s'.", vm_state,
4026-
instance=instance)
4027-
else:
4028-
vm_state = vm_states.ACTIVE
4030+
self.rt.drop_move_claim(context, instance, migration.source_node,
4031+
old_instance_type, prefix='old_')
4032+
instance.drop_migration_context()
4033+
4034+
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
4035+
# might have manually powered up the instance to confirm the
4036+
# resize/migrate, so we need to check the current power state
4037+
# on the instance and set the vm_state appropriately. We default
4038+
# to ACTIVE because if the power state is not SHUTDOWN, we
4039+
# assume _sync_instance_power_state will clean it up.
4040+
p_state = instance.power_state
4041+
vm_state = None
4042+
if p_state == power_state.SHUTDOWN:
4043+
vm_state = vm_states.STOPPED
4044+
LOG.debug("Resized/migrated instance is powered off. "
4045+
"Setting vm_state to '%s'.", vm_state,
4046+
instance=instance)
4047+
else:
4048+
vm_state = vm_states.ACTIVE
40294049

4030-
instance.vm_state = vm_state
4031-
instance.task_state = None
4032-
instance.save(expected_task_state=[None, task_states.DELETING,
4033-
task_states.SOFT_DELETING])
4050+
instance.vm_state = vm_state
4051+
instance.task_state = None
4052+
instance.save(expected_task_state=[None, task_states.DELETING,
4053+
task_states.SOFT_DELETING])
40344054

4035-
self._notify_about_instance_usage(
4036-
context, instance, "resize.confirm.end",
4037-
network_info=network_info)
4038-
compute_utils.notify_about_instance_action(context, instance,
4039-
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4040-
phase=fields.NotificationPhase.END)
4055+
self._notify_about_instance_usage(
4056+
context, instance, "resize.confirm.end",
4057+
network_info=network_info)
4058+
compute_utils.notify_about_instance_action(context, instance,
4059+
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4060+
phase=fields.NotificationPhase.END)
40414061

40424062
def _delete_allocation_after_move(self, context, instance, migration):
40434063
"""Deletes resource allocations held by the migration record against

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7183,6 +7183,8 @@ def do_finish_revert_resize(mock_attachment_complete,
71837183
do_finish_revert_resize()
71847184

71857185
def test_confirm_resize_deletes_allocations(self):
7186+
@mock.patch('nova.objects.Instance.get_by_uuid')
7187+
@mock.patch('nova.objects.Migration.get_by_id')
71867188
@mock.patch.object(self.migration, 'save')
71877189
@mock.patch.object(self.compute, '_notify_about_instance_usage')
71887190
@mock.patch.object(self.compute, 'network_api')
@@ -7192,13 +7194,16 @@ def test_confirm_resize_deletes_allocations(self):
71927194
@mock.patch.object(self.instance, 'save')
71937195
def do_confirm_resize(mock_save, mock_drop, mock_delete,
71947196
mock_confirm, mock_nwapi, mock_notify,
7195-
mock_mig_save):
7197+
mock_mig_save, mock_mig_get, mock_inst_get):
71967198
self._mock_rt()
71977199
self.instance.migration_context = objects.MigrationContext()
71987200
self.migration.source_compute = self.instance['host']
71997201
self.migration.source_node = self.instance['node']
7200-
self.compute._confirm_resize(self.context, self.instance,
7201-
self.migration)
7202+
self.migration.status = 'confirming'
7203+
mock_mig_get.return_value = self.migration
7204+
mock_inst_get.return_value = self.instance
7205+
self.compute.confirm_resize(self.context, self.instance,
7206+
self.migration)
72027207
mock_delete.assert_called_once_with(self.context, self.instance,
72037208
self.migration)
72047209
mock_save.assert_called_with(expected_task_state=
@@ -7229,9 +7234,10 @@ def test_confirm_resize_driver_confirm_migration_fails(
72297234
with test.nested(
72307235
mock.patch.object(self.compute, 'network_api'),
72317236
mock.patch.object(self.compute.driver, 'confirm_migration',
7232-
side_effect=error)
7237+
side_effect=error),
7238+
mock.patch.object(self.compute, '_delete_allocation_after_move')
72337239
) as (
7234-
network_api, confirm_migration
7240+
network_api, confirm_migration, delete_allocation
72357241
):
72367242
self.assertRaises(exception.HypervisorUnavailable,
72377243
self.compute.confirm_resize,
@@ -7245,6 +7251,10 @@ def test_confirm_resize_driver_confirm_migration_fails(
72457251
self.assertEqual(2, instance_save.call_count)
72467252
# The migration.status should have been saved.
72477253
self.migration.save.assert_called_once_with()
7254+
# Allocations should always be cleaned up even if cleaning up the
7255+
# source host fails.
7256+
delete_allocation.assert_called_once_with(
7257+
self.context, self.instance, self.migration)
72487258
# Assert other mocks we care less about.
72497259
notify_usage.assert_called_once()
72507260
notify_action.assert_called_once()

0 commit comments

Comments
 (0)