
Commit 9cacaad

Balazs Gibizer authored and committed
cleanup evacuated instances not on hypervisor
When the compute host recovers from a failure, the compute manager destroys instances that were evacuated from the host while it was down. However, these code paths only consider evacuated instances that are still reported by the hypervisor. This means that if the compute host is recovered in a way that the hypervisor lost the definition of the instances (for example, the compute host was redeployed), then the allocations of these instances are never deleted.

This patch makes sure that the instance allocation is cleaned up even if the driver does not report the instance as existing on the hypervisor.

Note that functional test coverage will be added on top, as it needs some refactoring that would make the bugfix non-backportable.

Change-Id: I4bc81b482380c5778781659c4d167a712316dab4
Closes-Bug: #1724172
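The overall shape of the new cleanup flow is sketched below. This is a minimal sketch, not the actual nova code: the names lookup_instance and remove_allocation are hypothetical stand-ins for objects.Instance.get_by_uuid and the per-migration Placement allocation removal performed in the diff that follows.

# Minimal sketch (assumed simplification, not the nova implementation) of the
# cleanup flow this patch introduces: allocations are removed for *every*
# evacuation migration from this host, falling back to a DB lookup when the
# hypervisor no longer reports the instance.

class InstanceNotFound(Exception):
    """Raised by lookup_instance when the instance is already deleted."""


def cleanup_evacuations(evacuations, local_instances,
                        lookup_instance, remove_allocation):
    """Remove allocations for every evacuation from this host.

    :param evacuations: dict mapping instance UUID -> migration record
    :param local_instances: evacuated instances still reported by the
        hypervisor, keyed by UUID
    :param lookup_instance: callable(uuid) -> instance; raises
        InstanceNotFound when the instance is already gone from the DB
    :param remove_allocation: callable(instance, migration) that drops the
        instance's allocation on the source node in Placement
    """
    for instance_uuid, migration in evacuations.items():
        try:
            if instance_uuid in local_instances:
                # Avoid the DB call if the hypervisor still knows the
                # instance.
                instance = local_instances[instance_uuid]
            else:
                instance = lookup_instance(instance_uuid)
        except InstanceNotFound:
            # Already deleted, so its allocations are gone as well.
            continue
        remove_allocation(instance, migration)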
1 parent 37ccd7e

File tree: 2 files changed, +119 −13 lines


nova/compute/manager.py

Lines changed: 32 additions & 12 deletions
@@ -632,6 +632,9 @@ def _destroy_evacuated_instances(self, context):
         compared to the instances for the migration records and those local
         guests are destroyed, along with instance allocation records in
         Placement for this node.
+        Then allocations are removed from Placement for every instance that is
+        evacuated from this host regardless if the instance is reported by the
+        hypervisor or not.
         """
         filters = {
             'source_compute': self.host,
@@ -658,18 +661,14 @@ def _destroy_evacuated_instances(self, context):
         # TODO(mriedem): We could optimize by pre-loading the joined fields
         # we know we'll use, like info_cache and flavor.
         local_instances = self._get_instances_on_driver(context)
-        evacuated = [inst for inst in local_instances
-                     if inst.uuid in evacuations]
+        evacuated_local_instances = {inst.uuid: inst
+                                     for inst in local_instances
+                                     if inst.uuid in evacuations}

-        # NOTE(gibi): We are called from init_host and at this point the
-        # compute_nodes of the resource tracker has not been populated yet so
-        # we cannot rely on the resource tracker here.
-        compute_nodes = {}
-
-        for instance in evacuated:
-            migration = evacuations[instance.uuid]
-            LOG.info('Deleting instance as it has been evacuated from '
-                     'this host', instance=instance)
+        for instance in evacuated_local_instances.values():
+            LOG.info('Destroying instance as it has been evacuated from '
+                     'this host but still exists in the hypervisor',
+                     instance=instance)
             try:
                 network_info = self.network_api.get_instance_nw_info(
                     context, instance)
@@ -689,7 +688,28 @@ def _destroy_evacuated_instances(self, context):
                                     network_info,
                                     bdi, destroy_disks)

-        # delete the allocation of the evacuated instance from this host
+        # NOTE(gibi): We are called from init_host and at this point the
+        # compute_nodes of the resource tracker has not been populated yet so
+        # we cannot rely on the resource tracker here.
+        compute_nodes = {}
+
+        for instance_uuid, migration in evacuations.items():
+            try:
+                if instance_uuid in evacuated_local_instances:
+                    # Avoid the db call if we already have the instance loaded
+                    # above
+                    instance = evacuated_local_instances[instance_uuid]
+                else:
+                    instance = objects.Instance.get_by_uuid(
+                        context, instance_uuid)
+            except exception.InstanceNotFound:
+                # The instance already deleted so we expect that every
+                # allocation of that instance has already been cleaned up
+                continue
+
+            LOG.info('Cleaning up allocations of the instance as it has been '
+                     'evacuated from this host',
+                     instance=instance)
             if migration.source_node not in compute_nodes:
                 try:
                     cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
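At the end of the last hunk, the compute_nodes dict serves as a small per-call cache: because this runs from init_host before the resource tracker is populated, the source node's UUID is looked up at most once per node name and reused for subsequent migrations (the assignment into the cache sits just below the lines shown). A rough sketch of that pattern, with get_node_uuid as a hypothetical stand-in for objects.ComputeNode.get_by_host_and_nodename(...).uuid:

# Sketch of the node-UUID memoization assumed above; get_node_uuid is a
# stand-in for the ComputeNode lookup, not a real nova helper.
def cached_node_uuid(source_node, compute_nodes, get_node_uuid):
    """Return the UUID for source_node, caching lookups in compute_nodes."""
    if source_node not in compute_nodes:
        compute_nodes[source_node] = get_node_uuid(source_node)
    return compute_nodes[source_node]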

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 87 additions & 1 deletion
@@ -3923,7 +3923,8 @@ def fake_get_node(context, host, node):
             # both instance_1 and instance_2 is destroyed in the driver
             destroy.assert_has_calls(
                 [mock.call(self.context, instance_1, None, {}, True),
-                 mock.call(self.context, instance_2, None, {}, True)])
+                 mock.call(self.context, instance_2, None, {}, True)],
+                any_order=True)

             # but only instance_2 is deallocated as the compute node for
             # instance_1 is already deleted
@@ -3932,6 +3933,91 @@ def fake_get_node(context, host, node):

             self.assertEqual(2, get_node.call_count)

+    def test_destroy_evacuated_instances_not_on_hypervisor(self):
+        our_host = self.compute.host
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
+        instance_1.uuid = uuids.instance_1
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
+        instance_1.host = 'not-' + our_host
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_1.deleted = False
+
+        migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_tree_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_intances_on_driver, destroy, migration_list, migration_save,
+              get_node, get_resources, remove_allocation,
+              instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.return_value = instance_1
+            get_node.return_value = our_node
+            get_resources.return_value = mock.sentinel.resources
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            # but our only instance still cleaned up in placement
+            remove_allocation.assert_called_once_with(
+                self.context, instance_1.uuid, uuids.our_node_uuid)
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, instance_1.uuid)
+            get_node.assert_called_once_with(
+                self.context, our_host, 'our-node')
+
+    def test_destroy_evacuated_instances_not_on_hyp_and_instance_deleted(self):
+        migration_1 = objects.Migration(instance_uuid=uuids.instance_1)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_tree_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_instances_on_driver,
+              destroy, migration_list, migration_save, get_resources,
+              remove_allocation, instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.side_effect = exception.InstanceNotFound(
+                instance_id=uuids.instance_1)
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, uuids.instance_1)
+            # nothing to be cleaned as the instance was deleted already
+            self.assertFalse(remove_allocation.called)
+
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_destroy_evacuated_instances')
     @mock.patch('nova.compute.manager.LOG')
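The first test hunk relaxes the existing destroy.assert_has_calls check with any_order=True: the manager now iterates a dict of evacuated instances, so the order in which driver.destroy is invoked for the two instances is no longer guaranteed. A standalone illustration of that mock assertion (not part of the commit):

# Example of assert_has_calls with any_order=True: it only requires that
# both calls happened, in whatever order they were recorded.
from unittest import mock

driver = mock.Mock()
driver.destroy('instance-2')
driver.destroy('instance-1')

# Passes even though the calls were recorded in the opposite order.
driver.destroy.assert_has_calls(
    [mock.call('instance-1'), mock.call('instance-2')], any_order=True)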
