
Commit 8ca7476

Zuul authored and openstack-gerrit committed

Merge "Clean up allocations left by evacuation when deleting service" into stable/xena

2 parents: 108778c + 037e588

File tree

5 files changed: +74 -90 lines changed


nova/scheduler/client/report.py

Lines changed: 26 additions & 6 deletions
@@ -36,6 +36,7 @@
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils


@@ -2248,6 +2249,22 @@ def get_allocations_for_provider_tree(self, context, nodename):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}

+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
@@ -2266,17 +2283,20 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(
                     context, instance_uuid, force=True)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
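For readers skimming the diff, the following is a minimal, self-contained sketch (plain Python, not nova code) of the cleanup logic the new helper implements: select finished evacuations whose source is the host being deleted, then drop that node's share of each instance's allocation. The names here are illustrative stand-ins for the nova objects and the placement call.

    # A toy model of the new _remove_allocations_for_evacuated_instances().
    from dataclasses import dataclass


    @dataclass
    class Migration:
        instance_uuid: str
        source_compute: str
        status: str
        migration_type: str


    def remove_evacuation_allocations(migrations, compute_host, compute_uuid,
                                      remove_allocation):
        # Mirror the filters passed to MigrationList.get_by_filters() above:
        # finished ('done') evacuations whose source is the deleted host.
        failed = []
        for m in migrations:
            if (m.source_compute == compute_host and m.status == 'done'
                    and m.migration_type == 'evacuation'):
                # remove_allocation stands in for the real
                # remove_provider_tree_from_instance_allocation() call; it
                # returns False on failure, which the real code only logs.
                if not remove_allocation(m.instance_uuid, compute_uuid):
                    failed.append(m.instance_uuid)
        return failed


    # The finished evacuation from host1 is cleaned up; the reverted one is
    # skipped because its status is not 'done'.
    migs = [Migration('inst-1', 'host1', 'done', 'evacuation'),
            Migration('inst-2', 'host1', 'reverted', 'evacuation')]
    print(remove_evacuation_allocations(
        migs, 'host1', 'rp-uuid-1', lambda inst, rp: True))  # -> []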

nova/tests/functional/test_nova_manage.py

Lines changed: 0 additions & 65 deletions
@@ -1849,71 +1849,6 @@ def test_audit_orphaned_allocations_from_confirmed_resize(self):
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)

-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and then delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # Evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should in theory have a single
-        # allocation against the destination resource provider, but evacuated
-        # instances do not have their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is a duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-

 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""

nova/tests/functional/wsgi/test_services.py

Lines changed: 8 additions & 17 deletions
@@ -120,8 +120,7 @@ def test_evacuate_then_delete_compute_service(self):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')
@@ -152,23 +151,15 @@ def test_evacuate_then_delete_compute_service(self):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]

     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in

nova/tests/unit/scheduler/client/test_report.py

Lines changed: 32 additions & 2 deletions
@@ -3184,15 +3184,41 @@ def test_refresh_associations_disabled(self, mock_time):

 class TestAllocations(SchedulerReportClientTestCase):

+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]
@@ -3208,6 +3234,7 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
@@ -3217,11 +3244,13 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
@@ -3236,6 +3265,7 @@ def test_delete_resource_provider_no_cascade(self, mock_by_host,
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
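The parameter-list changes above follow from how stacked `mock.patch` decorators work: mocks are injected bottom-up, so the new `_remove_allocations_for_evacuated_instances` patch, stacked between `delete` and `delete_allocation_for_instance`, arrives as `mock_evacuate` between `mock_del_alloc` and `mock_delete`. A minimal standalone demonstration (the patched `os` functions are arbitrary examples):

    from unittest import mock


    @mock.patch('os.getcwd')   # outermost decorator -> last mock argument
    @mock.patch('os.listdir')  # middle
    @mock.patch('os.getpid')   # innermost decorator -> first mock argument
    def demo(mock_getpid, mock_listdir, mock_getcwd):
        # The argument names line up with the decorators read bottom-up.
        return mock_getpid, mock_listdir, mock_getcwd


    pid, ls, cwd = demo()  # three distinct MagicMock objects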
releasenotes/notes/… (new release note; exact filename not shown in this view)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
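For operators, the trigger for this cleanup is the ordinary compute service delete, which the functional tests above exercise via `DELETE /os-services/{service_id}`. A hedged sketch of that call; the endpoint, token, and service UUID are placeholders, not values from this change:

    import requests

    NOVA = 'http://nova-api.example:8774/v2.1'  # hypothetical endpoint
    HEADERS = {'X-Auth-Token': '<admin-token>',  # placeholder token
               'X-OpenStack-Nova-API-Version': '2.53'}  # service ids are UUIDs

    # Deleting the down compute service now also removes allocations left by
    # successfully evacuated instances, so its resource provider can go too.
    resp = requests.delete(f'{NOVA}/os-services/<service-uuid>',
                           headers=HEADERS)
    assert resp.status_code == 204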
