Commit b8f2cd6

Balazs Gibizer authored and committed
cleanup evacuated instances not on hypervisor
When the compute host recovers from a failure the compute manager destroys instances that were evacuated from the host while it was down. However these code paths only consider evacuated instances that are still reported by the hypervisor. This means that if the compute host is recovered in a way that the hypervisor lost the definition of the instances (for example the compute host was redeployed) then the allocation of these instances will not be deleted.

This patch makes sure that the instance allocation is cleaned up even if the driver doesn't return that instance as exists on the hypervisor.

Note that the functional test coverage will be added on top as it needs some refactoring that would make the bugfix non backportable.

Change-Id: I4bc81b482380c5778781659c4d167a712316dab4
Closes-Bug: #1724172
(cherry picked from commit 9cacaad)
1 parent dc7eb62 commit b8f2cd6
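
To make the shape of the fix easier to see, here is a condensed, self-contained sketch of the logic (illustrative stand-in code, not the nova source; the function, its parameters and the callbacks below are invented for the example): the patched path walks every evacuation migration record for this host and removes its Placement allocation, instead of only touching instances the hypervisor still reports.

# Toy model of the fix; see the real diff in nova/compute/manager.py below.
def cleanup_evacuations(evacuations, hypervisor_instances, db_instances,
                        destroy_guest, remove_allocation):
    # evacuations: {instance_uuid: source_node_uuid} for this host
    # hypervisor_instances: uuids the driver still reports locally
    # db_instances: uuids that still exist in the database
    # destroy_guest / remove_allocation: callbacks standing in for the
    # driver destroy and the Placement allocation removal

    # Destroy only the evacuated guests the hypervisor still reports
    # (unchanged behaviour). Before this patch, allocation cleanup was
    # limited to these as well, so a redeployed host leaked allocations.
    for instance_uuid in hypervisor_instances & set(evacuations):
        destroy_guest(instance_uuid)

    # The fix: remove allocations for every evacuation record of this
    # host, whether or not the hypervisor still knows the instance;
    # instances already deleted from the database are skipped.
    for instance_uuid, node_uuid in evacuations.items():
        if instance_uuid not in db_instances:
            continue
        remove_allocation(instance_uuid, node_uuid)

# Example: the hypervisor lost its domains, yet the allocation is removed.
removed = []
cleanup_evacuations(
    evacuations={'inst-1': 'node-uuid-1'},
    hypervisor_instances=set(),
    db_instances={'inst-1'},
    destroy_guest=lambda uuid: None,
    remove_allocation=lambda uuid, node: removed.append((uuid, node)))
assert removed == [('inst-1', 'node-uuid-1')]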

File tree: 2 files changed (+119, -13 lines)

nova/compute/manager.py

Lines changed: 32 additions & 12 deletions
@@ -635,6 +635,9 @@ def _destroy_evacuated_instances(self, context):
         compared to the instances for the migration records and those local
         guests are destroyed, along with instance allocation records in
         Placement for this node.
+        Then allocations are removed from Placement for every instance that is
+        evacuated from this host regardless if the instance is reported by the
+        hypervisor or not.
         """
         filters = {
             'source_compute': self.host,
@@ -661,18 +664,14 @@ def _destroy_evacuated_instances(self, context):
         # TODO(mriedem): We could optimize by pre-loading the joined fields
         # we know we'll use, like info_cache and flavor.
         local_instances = self._get_instances_on_driver(context)
-        evacuated = [inst for inst in local_instances
-                     if inst.uuid in evacuations]
+        evacuated_local_instances = {inst.uuid: inst
+                                     for inst in local_instances
+                                     if inst.uuid in evacuations}

-        # NOTE(gibi): We are called from init_host and at this point the
-        # compute_nodes of the resource tracker has not been populated yet so
-        # we cannot rely on the resource tracker here.
-        compute_nodes = {}
-
-        for instance in evacuated:
-            migration = evacuations[instance.uuid]
-            LOG.info('Deleting instance as it has been evacuated from '
-                     'this host', instance=instance)
+        for instance in evacuated_local_instances.values():
+            LOG.info('Destroying instance as it has been evacuated from '
+                     'this host but still exists in the hypervisor',
+                     instance=instance)
             try:
                 network_info = self.network_api.get_instance_nw_info(
                     context, instance)
@@ -692,7 +691,28 @@ def _destroy_evacuated_instances(self, context):
                                 network_info,
                                 bdi, destroy_disks)

-        # delete the allocation of the evacuated instance from this host
+        # NOTE(gibi): We are called from init_host and at this point the
+        # compute_nodes of the resource tracker has not been populated yet so
+        # we cannot rely on the resource tracker here.
+        compute_nodes = {}
+
+        for instance_uuid, migration in evacuations.items():
+            try:
+                if instance_uuid in evacuated_local_instances:
+                    # Avoid the db call if we already have the instance loaded
+                    # above
+                    instance = evacuated_local_instances[instance_uuid]
+                else:
+                    instance = objects.Instance.get_by_uuid(
+                        context, instance_uuid)
+            except exception.InstanceNotFound:
+                # The instance already deleted so we expect that every
+                # allocation of that instance has already been cleaned up
+                continue
+
+            LOG.info('Cleaning up allocations of the instance as it has been '
+                     'evacuated from this host',
+                     instance=instance)
             if migration.source_node not in compute_nodes:
                 try:
                     cn_uuid = objects.ComputeNode.get_by_host_and_nodename(

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 87 additions & 1 deletion
@@ -3920,7 +3920,8 @@ def fake_get_node(context, host, node):
             # both instance_1 and instance_2 is destroyed in the driver
             destroy.assert_has_calls(
                 [mock.call(self.context, instance_1, None, {}, True),
-                 mock.call(self.context, instance_2, None, {}, True)])
+                 mock.call(self.context, instance_2, None, {}, True)],
+                any_order=True)

             # but only instance_2 is deallocated as the compute node for
             # instance_1 is already deleted
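
The only change to the pre-existing test above is the added any_order=True. Plausibly this is needed because the destroy calls are now driven by iterating a dict of evacuated local instances, so the test should not pin down their relative order (this reasoning is an inference, not stated in the commit). A minimal standalone illustration of the assertion style, using only unittest.mock:

from unittest import mock

# Stand-in mock; the two calls may arrive in either order.
destroy = mock.Mock()
destroy(mock.sentinel.ctxt, mock.sentinel.instance_2, None, {}, True)
destroy(mock.sentinel.ctxt, mock.sentinel.instance_1, None, {}, True)

# any_order=True only requires that both calls happened, not that they
# happened in the listed order; without it this assertion would fail.
destroy.assert_has_calls(
    [mock.call(mock.sentinel.ctxt, mock.sentinel.instance_1, None, {}, True),
     mock.call(mock.sentinel.ctxt, mock.sentinel.instance_2, None, {}, True)],
    any_order=True)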
@@ -3929,6 +3930,91 @@ def fake_get_node(context, host, node):

         self.assertEqual(2, get_node.call_count)

+    def test_destroy_evacuated_instances_not_on_hypervisor(self):
+        our_host = self.compute.host
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
+        instance_1.uuid = uuids.instance_1
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
+        instance_1.host = 'not-' + our_host
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_1.deleted = False
+
+        migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_tree_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_intances_on_driver, destroy, migration_list, migration_save,
+              get_node, get_resources, remove_allocation,
+              instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.return_value = instance_1
+            get_node.return_value = our_node
+            get_resources.return_value = mock.sentinel.resources
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            # but our only instance still cleaned up in placement
+            remove_allocation.assert_called_once_with(
+                self.context, instance_1.uuid, uuids.our_node_uuid)
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, instance_1.uuid)
+            get_node.assert_called_once_with(
+                self.context, our_host, 'our-node')
+
+    def test_destroy_evacuated_instances_not_on_hyp_and_instance_deleted(self):
+        migration_1 = objects.Migration(instance_uuid=uuids.instance_1)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_tree_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_instances_on_driver,
+              destroy, migration_list, migration_save, get_resources,
+              remove_allocation, instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.side_effect = exception.InstanceNotFound(
+                instance_id=uuids.instance_1)
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, uuids.instance_1)
+            # nothing to be cleaned as the instance was deleted already
+            self.assertFalse(remove_allocation.called)
+
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_destroy_evacuated_instances')
     @mock.patch('nova.compute.manager.LOG')
