
Commit e5a34ff

Clean up allocations left by evacuation when deleting service
When a compute node goes down and all instances on it are evacuated, allocation records for those instances are still left on the source compute node until the nova-compute service is started again on that node. However, if the compute node is completely broken, it is not possible to start the service again. In this situation, deleting the nova-compute service for the compute node does not delete its resource provider record, and even if a user tries to delete the resource provider, the request is rejected because allocations are still left on that node.

This change ensures that allocations left behind by successful evacuations are cleared when deleting a nova-compute service, so that no resource provider record is left behind even if the compute node can never be recovered. Migration records are kept in 'done' status to trigger clean-up tasks in case the compute node is recovered later.

Closes-Bug: #1829479
Change-Id: I3ce6f6275bfe09d43718c3a491b3991a804027bd
1 parent 402fe18 commit e5a34ff
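In short, the fix looks up completed evacuation migrations whose source is the node being deleted and removes the corresponding allocations from the source resource provider. A condensed sketch paraphrasing the new helper added in nova/scheduler/client/report.py below; the function name here is hypothetical and this is not the verbatim method body:

    from nova import objects
    from nova.objects import fields

    def drop_stale_evacuation_allocations(reportclient, context, compute_node):
        # Find evacuations that finished with this node as the source.
        filters = {
            'source_compute': compute_node.host,
            'status': ['done'],
            'migration_type': fields.MigrationType.EVACUATION,
        }
        for migration in objects.MigrationList.get_by_filters(context, filters):
            # Remove the allocation the evacuated instance still holds
            # against the dead source node; the migration record itself
            # stays in 'done' status.
            reportclient.remove_provider_tree_from_instance_allocation(
                context, migration.instance_uuid, compute_node.uuid)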

File tree: 5 files changed (+74, -90 lines)


nova/scheduler/client/report.py

Lines changed: 26 additions & 6 deletions
@@ -36,6 +36,7 @@
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils
 
 
@@ -2248,6 +2249,22 @@ def get_allocations_for_provider_tree(self, context, nodename):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}
 
+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
 
@@ -2266,17 +2283,20 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
            # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(
                     context, instance_uuid, force=True)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
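For context, the cleanup above only runs on the cascade path. A minimal illustrative call, assuming a nova RequestContext and a nova.objects.ComputeNode are already at hand; the helper name is hypothetical and this is not the exact service-delete code path in nova-api:

    from nova.scheduler.client import report

    def purge_provider_for_dead_compute(context, compute_node):
        # Illustrative only: delete a dead compute node's resource provider
        # along with its leftover allocations.
        client = report.SchedulerReportClient()
        # cascade=True deletes the allocations of instances on the node and,
        # with this change, allocations left behind by completed evacuations,
        # before removing the node's resource provider(s) from placement.
        client.delete_resource_provider(context, compute_node, cascade=True)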

nova/tests/functional/test_nova_manage.py

Lines changed: 0 additions & 65 deletions
@@ -1849,71 +1849,6 @@ def test_audit_orphaned_allocations_from_confirmed_resize(self):
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)
 
-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should have in theory a single
-        # allocation against the destination resource provider, but evacuated
-        # instances are not having their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-
 
 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""

nova/tests/functional/wsgi/test_services.py

Lines changed: 8 additions & 17 deletions
@@ -120,8 +120,7 @@ def test_evacuate_then_delete_compute_service(self):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')
@@ -152,23 +151,15 @@ def test_evacuate_then_delete_compute_service(self):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]
 
     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in

nova/tests/unit/scheduler/client/test_report.py

Lines changed: 32 additions & 2 deletions
@@ -3184,15 +3184,41 @@ def test_refresh_associations_disabled(self, mock_time):
 
 class TestAllocations(SchedulerReportClientTestCase):
 
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                 cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]
@@ -3208,6 +3234,7 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
@@ -3217,11 +3244,13 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
@@ -3236,6 +3265,7 @@ def test_delete_resource_provider_no_cascade(self, mock_by_host,
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
