
Commit 903ae5e

Prevent deletion of a compute node belonging to another host
There is a race condition in nova-compute with the ironic virt driver as nodes are rebalanced. It can lead to compute nodes being removed from the DB and not repopulated, which ultimately prevents instances from being scheduled to those nodes.

The main race is in update_available_resource in the compute manager. When the list of compute nodes is queried, it may include a compute node that the host does not expect to be managing, i.e. an orphan. Between that query and the deletion of the orphan, the real owner of the compute node takes ownership of it (in the resource tracker). However, the node is still deleted, because the first host is unaware of the ownership change.

This change prevents that by filtering on the host when deleting a compute node. If another compute host has taken ownership of a node, it will have updated the host field, and this prevents the deletion. The first host sees that this has happened via the ComputeHostNotFound exception, and avoids deleting the node's resource provider.

Closes-Bug: #1853009
Related-Bug: #1841481
Change-Id: I260c1fded79a85d4899e94df4d9036a1ee437f02
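To make the race and the fix concrete, here is a minimal standalone sketch (not nova code): the in-memory "table" and exception class stand in for the compute_nodes table and nova's ComputeHostNotFound.

class ComputeHostNotFound(Exception):
    """Stand-in for nova.exception.ComputeHostNotFound."""

# One fake DB row: when host2 queried its node list, it saw node 1 as an
# orphan it should clean up.
nodes = {1: {'host': 'host2', 'deleted': False}}

def compute_node_delete(node_id, host):
    """Soft-delete a node, but only if the caller still owns it."""
    node = nodes.get(node_id)
    # Filtering on host means a stale owner matches nothing.
    if node is None or node['deleted'] or node['host'] != host:
        raise ComputeHostNotFound(host)
    node['deleted'] = True

# Before host2 gets around to the delete, a rebalance makes host1 the
# owner (the resource tracker updates the host field).
nodes[1]['host'] = 'host1'

try:
    compute_node_delete(1, 'host2')  # host2's stale delete attempt
except ComputeHostNotFound:
    print('delete skipped: node 1 now belongs to another host')

assert not nodes[1]['deleted']  # host1's record survives the race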

10 files changed: +92 −23 lines

nova/compute/manager.py

Lines changed: 21 additions & 7 deletions
@@ -8051,13 +8051,27 @@ def update_available_resource(self, context, startup=False):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations and inventory.
-                # TODO(cdent): Move use of reportclient into resource tracker.
-                self.scheduler_client.reportclient.delete_resource_provider(
-                    context, cn, cascade=True)
+                try:
+                    cn.destroy()
+                except exception.ComputeHostNotFound:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    rt.remove_node(cn.hypervisor_hostname)
+                else:
+                    rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    # TODO(cdent): Move use of reportclient into resource
+                    # tracker.
+                    reportclient = self.scheduler_client.reportclient
+                    reportclient.delete_resource_provider(
+                        context, cn, cascade=True)
 
             for nodename in nodenames:
                 self._update_available_resource_for_node(context, nodename)

nova/db/api.py

Lines changed: 3 additions & 2 deletions
@@ -330,15 +330,16 @@ def compute_node_update(context, compute_id, values):
     return IMPL.compute_node_update(context, compute_id, values)
 
 
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a compute node from the database.
 
     :param context: The security context
    :param compute_id: ID of the compute node
+    :param compute_host: Hostname of the compute service
 
     Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
     """
-    return IMPL.compute_node_delete(context, compute_id)
+    return IMPL.compute_node_delete(context, compute_id, compute_host)
 
 
 def compute_node_statistics(context):
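A sketch of calling the API with the new signature; the import paths follow this era of nova but vary across releases, and the IDs here are made up:

from nova import context as nova_context
from nova.db import api as db  # in some releases: from nova import db
from nova import exception

ctxt = nova_context.get_admin_context()
try:
    # Hypothetical IDs: delete compute node 2 only if 'host1' still owns it.
    db.compute_node_delete(ctxt, 2, 'host1')
except exception.ComputeHostNotFound:
    # Another service has taken ownership (or the node is gone); leave it.
    pass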

nova/db/sqlalchemy/api.py

Lines changed: 2 additions & 2 deletions
@@ -758,10 +758,10 @@ def compute_node_update(context, compute_id, values):
 
 
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a ComputeNode record."""
     result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
+        filter_by(id=compute_id, host=compute_host).\
         soft_delete(synchronize_session=False)
 
     if not result:
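Why the extra filter is enough: with host in the filter, a delete attempt from a host that no longer owns the row matches zero rows, and compute_node_delete turns that empty result into ComputeHostNotFound. A standalone SQLAlchemy sketch of the same idea (a plain update() stands in for oslo.db's soft_delete(); none of this is nova code):

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class ComputeNode(Base):
    __tablename__ = 'compute_nodes'
    id = sa.Column(sa.Integer, primary_key=True)
    host = sa.Column(sa.String(255))
    deleted = sa.Column(sa.Integer, default=0)

engine = sa.create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    # host1 has taken ownership of node 1 by the time host2 acts.
    session.add(ComputeNode(id=1, host='host1', deleted=0))
    session.commit()

    # host2 still believes it owns node 1 and tries to soft-delete it.
    matched = session.query(ComputeNode).\
        filter_by(id=1, host='host2').\
        update({'deleted': 1}, synchronize_session=False)
    print(matched)  # 0 -> no rows matched, so nothing was deleted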

nova/objects/compute_node.py

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ def save(self, prune_stats=False):
 
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        db.compute_node_delete(self._context, self.id, self.host)
 
     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
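With destroy() forwarding the object's own host field, a caller holding a stale ComputeNode object gets an exception instead of silently deleting another host's record. A small sketch of the resulting calling pattern (destroy_orphan is a hypothetical helper, not nova code):

from nova import exception

def destroy_orphan(cn):
    """Try to delete an orphan ComputeNode, tolerating a rebalance."""
    try:
        cn.destroy()  # raises if cn.host no longer matches the DB row
    except exception.ComputeHostNotFound:
        return False  # another host took ownership; leave the record alone
    return True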

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 17 additions & 7 deletions
@@ -123,22 +123,32 @@ def test_node_rebalance_deleted_compute_node_race(self):
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is taken ownership of
         # by host1 by the time host2 deletes it.
-        # FIXME(mgoddard): Ideally host2 would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch('nova.compute.manager.ComputeManager.'
                         '_get_compute_nodes_in_db') as mock_get:
             mock_get.return_value = [cn]
             host2.manager.update_available_resource(ctxt)
 
-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but saved by the host check.
         self.assertIn("Deleting orphan compute node %s hypervisor host "
                       "is fake-node, nodes are" % cn.id,
                       self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn("Ignoring failure to delete orphan compute node %s on "
+                      "hypervisor host fake-node" % cn.id,
+                      self.stdlog.logger.output)
+        self._assert_hypervisor_api(nodename, 'host1')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host2. It shouldn't happen anymore,
+        # but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'host1', nodename)
+        cn.destroy()
+        rt2 = host2.manager._get_resource_tracker()
+        rt2.remove_node(cn.hypervisor_hostname)
+        host2.manager.reportclient.delete_resource_provider(
+            ctxt, cn, cascade=True)
 
         # host1[3]: Should recreate compute node and resource provider.
         host1.manager.update_available_resource(ctxt)

nova/tests/unit/compute/test_compute.py

Lines changed: 3 additions & 1 deletion
@@ -207,8 +207,10 @@ def fake_get_compute_nodes_in_db(self, context, **kwargs):
                 context, objects.ComputeNode(), cn)
             for cn in fake_compute_nodes]
 
-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     compute_node_host):
             self.assertEqual(2, compute_node_id)
+            self.assertEqual('fake_phyp1', compute_node_host)
 
         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 26 additions & 0 deletions
@@ -342,6 +342,32 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'delete_resource_provider')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, del_rp_mock,
+            mock_get_rt):
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ComputeHostNotFound(
+            'node1')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, use_slave=True,
+                                             startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, del_rp_mock.call_count)
+        mock_get_rt.return_value.remove_node.assert_called_once_with('node1')
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is

nova/tests/unit/db/test_db_api.py

Lines changed: 10 additions & 2 deletions
@@ -7034,7 +7034,7 @@ def test_compute_node_get_all_deleted_compute_node(self):
 
         # Now delete the newly-created compute node to ensure the related
         # compute node stats are wiped in a cascaded fashion
-        db.compute_node_delete(self.ctxt, node['id'])
+        db.compute_node_delete(self.ctxt, node['id'], node['host'])
 
         # Clean up the service
         db.service_destroy(self.ctxt, service['id'])
@@ -7186,10 +7186,18 @@ def test_compute_node_update(self):
 
     def test_compute_node_delete(self):
         compute_node_id = self.item['id']
-        db.compute_node_delete(self.ctxt, compute_node_id)
+        compute_node_host = self.item['host']
+        db.compute_node_delete(self.ctxt, compute_node_id, compute_node_host)
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)
 
+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        compute_node_host = 'invalid-host'
+        self.assertRaises(exception.ComputeHostNotFound,
+                          db.compute_node_delete,
+                          self.ctxt, compute_node_id, compute_node_host)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)

nova/tests/unit/objects/test_compute_node.py

Lines changed: 2 additions & 1 deletion
@@ -388,8 +388,9 @@ def test_set_id_failure(self, mock_get, db_mock):
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
+        compute.host = 'fake'
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, 'fake')
 
     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):
releasenotes/notes/… (new file)

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.
