Skip to content

Commit 11b7bcd

Browse files
committed
Add revert_snapshot_based_resize_at_dest compute method
This adds the revert_snapshot_based_resize_at_dest() compute service method which will be called from conductor during a cross-cell resize revert operation. This runs on the dest host in the target cell and is similar to confirm_snapshot_based_resize_at_source() except it destroys the guest and works on dropping the new_flavor resource usage rather than the old_flavor. It also sends the legacy "compute.instance.exists" notification like a traditional revert_resize(). As part of this cleanup, the dest host volume attachments are deleted. The source host port bindings, which are inactive when this runs, are activated and then the (previously active) dest host port bindings are deleted. This is to make sure the ports are not left unbound after the revert. Part of blueprint cross-cell-resize Change-Id: I200c51acbbda767c2d8ff90103eef54dbf35fb01
1 parent 386aa31 commit 11b7bcd

File tree

5 files changed

+317
-2
lines changed

5 files changed

+317
-2
lines changed

nova/compute/manager.py

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ def update_compute_provider_status(self, context, rp_uuid, enabled):
560560
class ComputeManager(manager.Manager):
561561
"""Manages the running instances from creation to destruction."""
562562

563-
target = messaging.Target(version='5.8')
563+
target = messaging.Target(version='5.9')
564564

565565
def __init__(self, compute_driver=None, *args, **kwargs):
566566
"""Load configuration options and connect to the hypervisor."""
@@ -4579,6 +4579,113 @@ def _delete_volume_attachments(self, ctxt, bdms):
45794579
'Error: %s', bdm.attachment_id, six.text_type(e),
45804580
instance_uuid=bdm.instance_uuid)
45814581

4582+
@wrap_exception()
4583+
@reverts_task_state
4584+
@wrap_instance_event(prefix='compute')
4585+
@errors_out_migration
4586+
@wrap_instance_fault
4587+
def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
4588+
"""Reverts a snapshot-based resize at the destination host.
4589+
4590+
Cleans the guest from the destination compute service host hypervisor
4591+
and related resources (ports, volumes) and frees resource usage from
4592+
the compute service on that host.
4593+
4594+
:param ctxt: nova auth request context targeted at the target cell
4595+
:param instance: Instance object whose vm_state is "resized" and
4596+
task_state is "resize_reverting".
4597+
:param migration: Migration object whose status is "reverting".
4598+
"""
4599+
# A resize revert is essentially a resize back to the old size, so we
4600+
# need to send a usage event here.
4601+
compute_utils.notify_usage_exists(
4602+
self.notifier, ctxt, instance, self.host, current_period=True)
4603+
4604+
@utils.synchronized(instance.uuid)
4605+
def do_revert():
4606+
LOG.info('Reverting resize on destination host.',
4607+
instance=instance)
4608+
with self._error_out_instance_on_exception(ctxt, instance):
4609+
self._revert_snapshot_based_resize_at_dest(
4610+
ctxt, instance, migration)
4611+
do_revert()
4612+
4613+
# Broadcast to all schedulers that the instance is no longer on
4614+
# this host and clear any waiting callback events. This is best effort
4615+
# so if anything fails just log it.
4616+
try:
4617+
self._delete_scheduler_instance_info(ctxt, instance.uuid)
4618+
self.instance_events.clear_events_for_instance(instance)
4619+
except Exception as e:
4620+
LOG.warning('revert_snapshot_based_resize_at_dest failed during '
4621+
'post-processing. Error: %s', e, instance=instance)
4622+
4623+
def _revert_snapshot_based_resize_at_dest(
4624+
self, ctxt, instance, migration):
4625+
"""Private version of revert_snapshot_based_resize_at_dest.
4626+
4627+
This allows the main method to be decorated with error handlers.
4628+
4629+
:param ctxt: nova auth request context targeted at the target cell
4630+
:param instance: Instance object whose vm_state is "resized" and
4631+
task_state is "resize_reverting".
4632+
:param migration: Migration object whose status is "reverting".
4633+
"""
4634+
# Cleanup the guest from the hypervisor including local disks.
4635+
network_info = self.network_api.get_instance_nw_info(ctxt, instance)
4636+
bdms = instance.get_bdms()
4637+
block_device_info = self._get_instance_block_device_info(
4638+
ctxt, instance, bdms=bdms)
4639+
LOG.debug('Destroying guest from destination hypervisor including '
4640+
'disks.', instance=instance)
4641+
self.driver.destroy(
4642+
ctxt, instance, network_info, block_device_info=block_device_info)
4643+
4644+
# Activate source host port bindings. We need to do this before
4645+
# deleting the (active) dest host port bindings in
4646+
# setup_networks_on_host otherwise the ports will be unbound and
4647+
# finish on the source will fail.
4648+
# migrate_instance_start uses migration.dest_compute for the port
4649+
# binding host and since we want to activate the source host port
4650+
# bindings, we need to temporarily mutate the migration object.
4651+
with utils.temporary_mutation(
4652+
migration, dest_compute=migration.source_compute):
4653+
LOG.debug('Activating port bindings for source host %s.',
4654+
migration.source_compute, instance=instance)
4655+
# TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow
4656+
# us to remove this and make setup_networks_on_host do it.
4657+
# TODO(mriedem): Should we try/except/log any errors but continue?
4658+
self.network_api.migrate_instance_start(
4659+
ctxt, instance, migration)
4660+
4661+
# Delete port bindings for the target host. This relies on the
4662+
# instance.host not being the same as the host we pass in, so we
4663+
# have to mutate the instance to effectively trick this code.
4664+
with utils.temporary_mutation(instance, host=migration.source_compute):
4665+
LOG.debug('Deleting port bindings for target host %s.',
4666+
self.host, instance=instance)
4667+
try:
4668+
# Note that deleting the destination host port bindings does
4669+
# not automatically activate the source host port bindings.
4670+
self.network_api.setup_networks_on_host(
4671+
ctxt, instance, host=self.host, teardown=True)
4672+
except exception.PortBindingDeletionFailed as e:
4673+
# Do not let this stop us from cleaning up since the guest
4674+
# is already gone.
4675+
LOG.error('Failed to delete port bindings from target host. '
4676+
'Error: %s', six.text_type(e), instance=instance)
4677+
4678+
# Delete any volume attachments remaining for this target host.
4679+
LOG.debug('Deleting volume attachments for target host.',
4680+
instance=instance)
4681+
self._delete_volume_attachments(ctxt, bdms)
4682+
4683+
# Free up the new_flavor usage from the resource tracker for this host.
4684+
instance.revert_migration_context()
4685+
instance.save(expected_task_state=task_states.RESIZE_REVERTING)
4686+
self.rt.drop_move_claim(ctxt, instance, instance.node,
4687+
instance_type=instance.new_flavor)
4688+
45824689
@wrap_exception()
45834690
@reverts_task_state
45844691
@wrap_instance_event(prefix='compute')

nova/compute/rpcapi.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ class ComputeAPI(object):
374374
* 5.6 - Add prep_snapshot_based_resize_at_source()
375375
* 5.7 - Add finish_snapshot_based_resize_at_dest()
376376
* 5.8 - Add confirm_snapshot_based_resize_at_source()
377+
* 5.9 - Add revert_snapshot_based_resize_at_dest()
377378
'''
378379

379380
VERSION_ALIASES = {
@@ -1133,6 +1134,37 @@ def revert_resize(self, ctxt, instance, migration, host, request_spec):
11331134
server=_compute_host(host, instance), version=version)
11341135
cctxt.cast(ctxt, 'revert_resize', **msg_args)
11351136

1137+
def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
1138+
"""Reverts a snapshot-based resize at the destination host.
1139+
1140+
Cleans the guest from the destination compute service host hypervisor
1141+
and related resources (ports, volumes) and frees resource usage from
1142+
the compute service on that host.
1143+
1144+
This is a synchronous RPC call using the ``long_rpc_timeout``
1145+
configuration option.
1146+
1147+
:param ctxt: nova auth request context targeted at the target cell
1148+
:param instance: Instance object whose vm_state is "resized" and
1149+
task_state is "resize_reverting".
1150+
:param migration: Migration object whose status is "reverting".
1151+
:raises: nova.exception.MigrationError if the destination compute
1152+
service is too old to perform the operation
1153+
:raises: oslo_messaging.exceptions.MessagingTimeout if the RPC call
1154+
times out
1155+
"""
1156+
version = '5.9'
1157+
client = self.router.client(ctxt)
1158+
if not client.can_send_version(version):
1159+
raise exception.MigrationError(reason=_('Compute too old'))
1160+
cctxt = client.prepare(server=migration.dest_compute,
1161+
version=version,
1162+
call_monitor_timeout=CONF.rpc_response_timeout,
1163+
timeout=CONF.long_rpc_timeout)
1164+
return cctxt.call(
1165+
ctxt, 'revert_snapshot_based_resize_at_dest',
1166+
instance=instance, migration=migration)
1167+
11361168
def rollback_live_migration_at_destination(self, ctxt, instance, host,
11371169
destroy_disks,
11381170
migrate_data):

nova/objects/service.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232

3333
# NOTE(danms): This is the global service version counter
34-
SERVICE_VERSION = 45
34+
SERVICE_VERSION = 46
3535

3636

3737
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -171,6 +171,8 @@
171171
{'compute_rpc': '5.7'},
172172
# Version 45: Compute RPC v5.8: confirm_snapshot_based_resize_at_source
173173
{'compute_rpc': '5.8'},
174+
# Version 46: Compute RPC v5.9: revert_snapshot_based_resize_at_dest
175+
{'compute_rpc': '5.9'},
174176
)
175177

176178

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11114,6 +11114,153 @@ def test_delete_volume_attachments_errors(self):
1111411114
self.assertIn('Failed to delete volume attachment with ID %s' %
1111511115
uuids.attachment1, self.stdlog.logger.output)
1111611116

11117+
@mock.patch('nova.compute.utils.notify_usage_exists')
11118+
@mock.patch('nova.objects.Instance.save')
11119+
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
11120+
@mock.patch('nova.compute.manager.InstanceEvents.'
11121+
'clear_events_for_instance')
11122+
def test_revert_snapshot_based_resize_at_dest_error_handling(
11123+
self, mock_clear_events, mock_add_fault, mock_inst_save,
11124+
mock_notify_usage):
11125+
"""Tests error handling in revert_snapshot_based_resize_at_dest when
11126+
a failure occurs.
11127+
"""
11128+
self.instance.task_state = task_states.RESIZE_REVERTING
11129+
error = test.TestingException('oops')
11130+
with mock.patch.object(
11131+
self.compute, '_revert_snapshot_based_resize_at_dest',
11132+
side_effect=error) as mock_revert:
11133+
self.assertRaises(
11134+
test.TestingException,
11135+
self.compute.revert_snapshot_based_resize_at_dest,
11136+
self.context, self.instance, self.migration)
11137+
mock_notify_usage.assert_called_once_with(
11138+
self.compute.notifier, self.context, self.instance,
11139+
self.compute.host, current_period=True)
11140+
mock_revert.assert_called_once_with(
11141+
self.context, self.instance, self.migration)
11142+
mock_inst_save.assert_called()
11143+
# _error_out_instance_on_exception sets the instance to ERROR.
11144+
self.assertEqual(vm_states.ERROR, self.instance.vm_state)
11145+
# reverts_task_state will reset the task_state to None.
11146+
self.assertIsNone(self.instance.task_state)
11147+
# Ensure wrap_instance_fault was called.
11148+
mock_add_fault.assert_called_once_with(
11149+
self.context, self.instance, error, test.MatchType(tuple))
11150+
# errors_out_migration should mark the migration as 'error' status
11151+
self.assertEqual('error', self.migration.status)
11152+
self.migration.save.assert_called_once_with()
11153+
# Assert wrap_exception is called.
11154+
self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS))
11155+
self.assertEqual(
11156+
'compute.%s' % fields.NotificationAction.EXCEPTION,
11157+
fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type'])
11158+
# clear_events_for_instance should not have been called.
11159+
mock_clear_events.assert_not_called()
11160+
11161+
@mock.patch('nova.compute.utils.notify_usage_exists', new=mock.Mock())
11162+
@mock.patch('nova.compute.manager.ComputeManager.'
11163+
'_revert_snapshot_based_resize_at_dest')
11164+
def test_revert_snapshot_based_resize_at_dest_post_error_log(self, revert):
11165+
"""Tests when _revert_snapshot_based_resize_at_dest is OK but
11166+
post-processing cleanup fails and is just logged.
11167+
"""
11168+
# First test _delete_scheduler_instance_info failing.
11169+
with mock.patch.object(
11170+
self.compute, '_delete_scheduler_instance_info',
11171+
side_effect=(
11172+
test.TestingException('scheduler'), None)) as mock_del:
11173+
self.compute.revert_snapshot_based_resize_at_dest(
11174+
self.context, self.instance, self.migration)
11175+
revert.assert_called_once()
11176+
mock_del.assert_called_once_with(self.context, self.instance.uuid)
11177+
self.assertIn('revert_snapshot_based_resize_at_dest failed during '
11178+
'post-processing. Error: scheduler',
11179+
self.stdlog.logger.output)
11180+
revert.reset_mock()
11181+
mock_del.reset_mock()
11182+
11183+
# Now test clear_events_for_instance failing.
11184+
with mock.patch.object(
11185+
self.compute.instance_events, 'clear_events_for_instance',
11186+
side_effect=test.TestingException(
11187+
'events')) as mock_events:
11188+
self.compute.revert_snapshot_based_resize_at_dest(
11189+
self.context, self.instance, self.migration)
11190+
revert.assert_called_once()
11191+
mock_del.assert_called_once_with(self.context, self.instance.uuid)
11192+
mock_events.assert_called_once_with(self.instance)
11193+
self.assertIn('revert_snapshot_based_resize_at_dest failed during '
11194+
'post-processing. Error: events',
11195+
self.stdlog.logger.output)
11196+
# Assert _error_out_instance_on_exception wasn't tripped somehow.
11197+
self.assertNotEqual(vm_states.ERROR, self.instance.vm_state)
11198+
11199+
@mock.patch('nova.objects.Instance.save')
11200+
@mock.patch('nova.objects.Instance.revert_migration_context')
11201+
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
11202+
def test_revert_snapshot_based_resize_at_dest(
11203+
self, mock_get_bdms, mock_revert_mig_ctxt, mock_inst_save):
11204+
"""Happy path test for _revert_snapshot_based_resize_at_dest"""
11205+
# Setup more mocks.
11206+
def stub_migrate_instance_start(ctxt, instance, migration):
11207+
# The migration.dest_compute should have been mutated to point
11208+
# at the source compute.
11209+
self.assertEqual(migration.source_compute, migration.dest_compute)
11210+
11211+
def stub_setup_networks_on_host(ctxt, instance, *args, **kwargs):
11212+
# The instance.host should have been mutated to point at the
11213+
# source compute.
11214+
self.assertEqual(self.migration.source_compute, instance.host)
11215+
# Raise PortBindingDeletionFailed to make sure it's caught and
11216+
# logged but not fatal.
11217+
raise exception.PortBindingDeletionFailed(port_id=uuids.port_id,
11218+
host=self.compute.host)
11219+
11220+
with test.nested(
11221+
mock.patch.object(self.compute, 'network_api'),
11222+
mock.patch.object(self.compute, '_get_instance_block_device_info'),
11223+
mock.patch.object(self.compute.driver, 'destroy'),
11224+
mock.patch.object(self.compute, '_delete_volume_attachments'),
11225+
mock.patch.object(self.compute.rt, 'drop_move_claim')
11226+
) as (
11227+
mock_network_api, mock_get_bdi, mock_destroy,
11228+
mock_delete_attachments, mock_drop_claim
11229+
):
11230+
mock_network_api.migrate_instance_start.side_effect = \
11231+
stub_migrate_instance_start
11232+
mock_network_api.setup_networks_on_host.side_effect = \
11233+
stub_setup_networks_on_host
11234+
# Run the code.
11235+
self.compute._revert_snapshot_based_resize_at_dest(
11236+
self.context, self.instance, self.migration)
11237+
# Assert the calls.
11238+
mock_network_api.get_instance_nw_info.assert_called_once_with(
11239+
self.context, self.instance)
11240+
mock_get_bdi.assert_called_once_with(
11241+
self.context, self.instance, bdms=mock_get_bdms.return_value)
11242+
mock_destroy.assert_called_once_with(
11243+
self.context, self.instance,
11244+
mock_network_api.get_instance_nw_info.return_value,
11245+
block_device_info=mock_get_bdi.return_value)
11246+
mock_network_api.migrate_instance_start.assert_called_once_with(
11247+
self.context, self.instance, self.migration)
11248+
mock_network_api.setup_networks_on_host.assert_called_once_with(
11249+
self.context, self.instance, host=self.compute.host,
11250+
teardown=True)
11251+
# Assert that even though setup_networks_on_host raised
11252+
# PortBindingDeletionFailed it was handled and logged.
11253+
self.assertIn('Failed to delete port bindings from target host.',
11254+
self.stdlog.logger.output)
11255+
mock_delete_attachments.assert_called_once_with(
11256+
self.context, mock_get_bdms.return_value)
11257+
mock_revert_mig_ctxt.assert_called_once_with()
11258+
mock_inst_save.assert_called_once_with(
11259+
expected_task_state=task_states.RESIZE_REVERTING)
11260+
mock_drop_claim.assert_called_once_with(
11261+
self.context, self.instance, self.instance.node,
11262+
instance_type=self.instance.new_flavor)
11263+
1111711264

1111811265
class ComputeManagerInstanceUsageAuditTestCase(test.TestCase):
1111911266
def setUp(self):

nova/tests/unit/compute/test_rpcapi.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,33 @@ def test_confirm_snapshot_based_resize_at_source_old_compute(self, client):
628628
migration=migration_obj.Migration(source_compute='source'))
629629
self.assertIn('Compute too old', six.text_type(ex))
630630

631+
def test_revert_snapshot_based_resize_at_dest(self):
632+
"""Tests happy path for revert_snapshot_based_resize_at_dest."""
633+
self.flags(long_rpc_timeout=1234)
634+
self._test_compute_api(
635+
'revert_snapshot_based_resize_at_dest', 'call',
636+
# compute method kwargs
637+
instance=self.fake_instance_obj,
638+
migration=migration_obj.Migration(dest_compute='dest'),
639+
# client.prepare kwargs
640+
version='5.9', prepare_server='dest',
641+
call_monitor_timeout=60, timeout=1234)
642+
643+
@mock.patch('nova.rpc.ClientRouter.client')
644+
def test_revert_snapshot_based_resize_at_dest_old_compute(self, client):
645+
"""Tests when the dest compute service is too old to call
646+
revert_snapshot_based_resize_at_dest so MigrationError is raised.
647+
"""
648+
client.return_value.can_send_version.return_value = False
649+
rpcapi = compute_rpcapi.ComputeAPI()
650+
ex = self.assertRaises(
651+
exception.MigrationError,
652+
rpcapi.revert_snapshot_based_resize_at_dest,
653+
self.context,
654+
instance=self.fake_instance_obj,
655+
migration=migration_obj.Migration(dest_compute='dest'))
656+
self.assertIn('Compute too old', six.text_type(ex))
657+
631658
def test_reboot_instance(self):
632659
self.maxDiff = None
633660
self._test_compute_api('reboot_instance', 'cast',

0 commit comments

Comments
 (0)