Skip to content

Commit 5f51506

Browse files
committed
Delete allocations even if _confirm_resize raises
When we are confirming a resize, the guest is on the dest host and the instance host/node values in the database are pointing at the dest host, so the _confirm_resize method on the source is really best effort. If something fails, we should not leak allocations in placement for the source compute node resource provider since the instance is not actually consuming the source node provider resources. This change refactors the error handling around the _confirm_resize call so the big nesting for _error_out_instance_on_exception is moved to confirm_resize and then a try/finally is added around _confirm_resize so we can be sure to try and cleanup the allocations even if _confirm_resize fails in some obscure way. If _confirm_resize does fail, the error gets re-raised along with logging a traceback and hint about how to correct the instance state in the DB by hard rebooting the server on the dest host. Conflicts: nova/compute/manager.py NOTE(mriedem): The conflict is due to not having change I49034f87e240775cca7a4d769819dd788b86832a in Stein. Change-Id: I29c5f491ec20a71283190a1599e7732541de736f Closes-Bug: #1821594 (cherry picked from commit 03a6d26)
1 parent 972d4e0 commit 5f51506

File tree

2 files changed

+84
-54
lines changed

2 files changed

+84
-54
lines changed

nova/compute/manager.py

Lines changed: 69 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -3979,7 +3979,29 @@ def do_confirm_resize(context, instance, migration_id):
39793979
instance=instance)
39803980
return
39813981

3982-
self._confirm_resize(context, instance, migration=migration)
3982+
with self._error_out_instance_on_exception(context, instance):
3983+
try:
3984+
self._confirm_resize(
3985+
context, instance, migration=migration)
3986+
except Exception:
3987+
# Something failed when cleaning up the source host so
3988+
# log a traceback and leave a hint about hard rebooting
3989+
# the server to correct its state in the DB.
3990+
with excutils.save_and_reraise_exception(logger=LOG):
3991+
LOG.exception(
3992+
'Confirm resize failed on source host %s. '
3993+
'Resource allocations in the placement service '
3994+
'will be removed regardless because the instance '
3995+
'is now on the destination host %s. You can try '
3996+
'hard rebooting the instance to correct its '
3997+
'state.', self.host, migration.dest_compute,
3998+
instance=instance)
3999+
finally:
4000+
# Whether an error occurred or not, at this point the
4001+
# instance is on the dest host so to avoid leaking
4002+
# allocations in placement, delete them here.
4003+
self._delete_allocation_after_move(
4004+
context, instance, migration)
39834005

39844006
do_confirm_resize(context, instance, migration.id)
39854007

@@ -3991,60 +4013,58 @@ def _confirm_resize(self, context, instance, migration=None):
39914013
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
39924014
phase=fields.NotificationPhase.START)
39934015

3994-
with self._error_out_instance_on_exception(context, instance):
3995-
# NOTE(danms): delete stashed migration information
3996-
old_instance_type = instance.old_flavor
3997-
instance.old_flavor = None
3998-
instance.new_flavor = None
3999-
instance.system_metadata.pop('old_vm_state', None)
4000-
instance.save()
4001-
4002-
# NOTE(tr3buchet): tear down networks on source host
4003-
self.network_api.setup_networks_on_host(context, instance,
4004-
migration.source_compute, teardown=True)
4016+
# NOTE(danms): delete stashed migration information
4017+
old_instance_type = instance.old_flavor
4018+
instance.old_flavor = None
4019+
instance.new_flavor = None
4020+
instance.system_metadata.pop('old_vm_state', None)
4021+
instance.save()
40054022

4006-
network_info = self.network_api.get_instance_nw_info(context,
4007-
instance)
4008-
# TODO(mriedem): Get BDMs here and pass them to the driver.
4009-
self.driver.confirm_migration(context, migration, instance,
4010-
network_info)
4023+
# NOTE(tr3buchet): tear down networks on source host
4024+
self.network_api.setup_networks_on_host(context, instance,
4025+
migration.source_compute, teardown=True)
40114026

4012-
migration.status = 'confirmed'
4013-
with migration.obj_as_admin():
4014-
migration.save()
4027+
network_info = self.network_api.get_instance_nw_info(context,
4028+
instance)
4029+
# TODO(mriedem): Get BDMs here and pass them to the driver.
4030+
self.driver.confirm_migration(context, migration, instance,
4031+
network_info)
40154032

4016-
self.rt.drop_move_claim(context, instance, migration.source_node,
4017-
old_instance_type, prefix='old_')
4018-
self._delete_allocation_after_move(context, instance, migration)
4019-
instance.drop_migration_context()
4033+
migration.status = 'confirmed'
4034+
with migration.obj_as_admin():
4035+
migration.save()
40204036

4021-
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
4022-
# might have manually powered up the instance to confirm the
4023-
# resize/migrate, so we need to check the current power state
4024-
# on the instance and set the vm_state appropriately. We default
4025-
# to ACTIVE because if the power state is not SHUTDOWN, we
4026-
# assume _sync_instance_power_state will clean it up.
4027-
p_state = instance.power_state
4028-
vm_state = None
4029-
if p_state == power_state.SHUTDOWN:
4030-
vm_state = vm_states.STOPPED
4031-
LOG.debug("Resized/migrated instance is powered off. "
4032-
"Setting vm_state to '%s'.", vm_state,
4033-
instance=instance)
4034-
else:
4035-
vm_state = vm_states.ACTIVE
4037+
self.rt.drop_move_claim(context, instance, migration.source_node,
4038+
old_instance_type, prefix='old_')
4039+
instance.drop_migration_context()
4040+
4041+
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
4042+
# might have manually powered up the instance to confirm the
4043+
# resize/migrate, so we need to check the current power state
4044+
# on the instance and set the vm_state appropriately. We default
4045+
# to ACTIVE because if the power state is not SHUTDOWN, we
4046+
# assume _sync_instance_power_state will clean it up.
4047+
p_state = instance.power_state
4048+
vm_state = None
4049+
if p_state == power_state.SHUTDOWN:
4050+
vm_state = vm_states.STOPPED
4051+
LOG.debug("Resized/migrated instance is powered off. "
4052+
"Setting vm_state to '%s'.", vm_state,
4053+
instance=instance)
4054+
else:
4055+
vm_state = vm_states.ACTIVE
40364056

4037-
instance.vm_state = vm_state
4038-
instance.task_state = None
4039-
instance.save(expected_task_state=[None, task_states.DELETING,
4040-
task_states.SOFT_DELETING])
4057+
instance.vm_state = vm_state
4058+
instance.task_state = None
4059+
instance.save(expected_task_state=[None, task_states.DELETING,
4060+
task_states.SOFT_DELETING])
40414061

4042-
self._notify_about_instance_usage(
4043-
context, instance, "resize.confirm.end",
4044-
network_info=network_info)
4045-
compute_utils.notify_about_instance_action(context, instance,
4046-
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4047-
phase=fields.NotificationPhase.END)
4062+
self._notify_about_instance_usage(
4063+
context, instance, "resize.confirm.end",
4064+
network_info=network_info)
4065+
compute_utils.notify_about_instance_action(context, instance,
4066+
self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4067+
phase=fields.NotificationPhase.END)
40484068

40494069
def _delete_allocation_after_move(self, context, instance, migration):
40504070
"""Deletes resource allocations held by the migration record against

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7192,6 +7192,8 @@ def do_finish_revert_resize(mock_attachment_complete,
71927192
do_finish_revert_resize()
71937193

71947194
def test_confirm_resize_deletes_allocations(self):
7195+
@mock.patch('nova.objects.Instance.get_by_uuid')
7196+
@mock.patch('nova.objects.Migration.get_by_id')
71957197
@mock.patch.object(self.migration, 'save')
71967198
@mock.patch.object(self.compute, '_notify_about_instance_usage')
71977199
@mock.patch.object(self.compute, 'network_api')
@@ -7201,13 +7203,16 @@ def test_confirm_resize_deletes_allocations(self):
72017203
@mock.patch.object(self.instance, 'save')
72027204
def do_confirm_resize(mock_save, mock_drop, mock_delete,
72037205
mock_confirm, mock_nwapi, mock_notify,
7204-
mock_mig_save):
7206+
mock_mig_save, mock_mig_get, mock_inst_get):
72057207
self._mock_rt()
72067208
self.instance.migration_context = objects.MigrationContext()
72077209
self.migration.source_compute = self.instance['host']
72087210
self.migration.source_node = self.instance['node']
7209-
self.compute._confirm_resize(self.context, self.instance,
7210-
self.migration)
7211+
self.migration.status = 'confirming'
7212+
mock_mig_get.return_value = self.migration
7213+
mock_inst_get.return_value = self.instance
7214+
self.compute.confirm_resize(self.context, self.instance,
7215+
self.migration)
72117216
mock_delete.assert_called_once_with(self.context, self.instance,
72127217
self.migration)
72137218
mock_save.assert_called_with(expected_task_state=
@@ -7238,9 +7243,10 @@ def test_confirm_resize_driver_confirm_migration_fails(
72387243
with test.nested(
72397244
mock.patch.object(self.compute, 'network_api'),
72407245
mock.patch.object(self.compute.driver, 'confirm_migration',
7241-
side_effect=error)
7246+
side_effect=error),
7247+
mock.patch.object(self.compute, '_delete_allocation_after_move')
72427248
) as (
7243-
network_api, confirm_migration
7249+
network_api, confirm_migration, delete_allocation
72447250
):
72457251
self.assertRaises(exception.HypervisorUnavailable,
72467252
self.compute.confirm_resize,
@@ -7254,6 +7260,10 @@ def test_confirm_resize_driver_confirm_migration_fails(
72547260
self.assertEqual(2, instance_save.call_count)
72557261
# The migration.status should have been saved.
72567262
self.migration.save.assert_called_once_with()
7263+
# Allocations should always be cleaned up even if cleaning up the
7264+
# source host fails.
7265+
delete_allocation.assert_called_once_with(
7266+
self.context, self.instance, self.migration)
72577267
# Assert other mocks we care less about.
72587268
notify_usage.assert_called_once()
72597269
notify_action.assert_called_once()

0 commit comments

Comments
 (0)