Commit cbbca58

markgoddard authored and melwitt committed
Prevent deletion of a compute node belonging to another host
There is a race condition in nova-compute with the ironic virt driver as
nodes get rebalanced. It can lead to compute nodes being removed in the
DB and not repopulated. Ultimately this prevents these nodes from being
scheduled to.

The main race condition involved is in update_available_resource in the
compute manager. When the list of compute nodes is queried, there is a
compute node belonging to the host that it does not expect to be
managing, i.e. it is an orphan. Between that time and deleting the
orphan, the real owner of the compute node takes ownership of it (in
the resource tracker). However, the node is still deleted, as the first
host is unaware of the ownership change.

This change prevents this from occurring by filtering on the host when
deleting a compute node. If another compute host has taken ownership of
a node, it will have updated the host field, and this will prevent the
deletion from occurring. The first host sees this has happened via the
ComputeHostNotFound exception, and avoids deleting its resource
provider.

Co-Authored-By: melanie witt <[email protected]>

Conflicts:
    nova/db/sqlalchemy/api.py

NOTE(melwitt): The conflict is because change
I9f414cf831316b624132d9e06192f1ecbbd3dd78 (db: Copy docs from
'nova.db.*' to 'nova.db.sqlalchemy.*') is not in Wallaby.

NOTE(melwitt): Differences from the cherry picked change, which call
nova.db.sqlalchemy.api directly instead of nova.db.api, are due to the
alembic migration in Xena which looks to have made the nova.db.api
interface obsolete.

Closes-Bug: #1853009
Related-Bug: #1841481

Change-Id: I260c1fded79a85d4899e94df4d9036a1ee437f02
(cherry picked from commit a8492e8)
1 parent 0fc104e commit cbbca58
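
The crux of the fix is that deleting a compute node becomes a compare-and-delete keyed on the host column, rather than an unconditional delete by id. A minimal standalone sketch of that guard (plain sqlite3, not Nova code; the table layout and host names are invented for illustration):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute(
        "CREATE TABLE compute_nodes (id INTEGER PRIMARY KEY, host TEXT)")
    conn.execute("INSERT INTO compute_nodes VALUES (1, 'host_a')")

    # The rebalance race: host_b takes ownership after host_a has already
    # decided the node is an orphan.
    conn.execute("UPDATE compute_nodes SET host = 'host_b' WHERE id = 1")

    # host_a's delete is now constrained to rows it still owns, so it
    # matches nothing instead of destroying host_b's record.
    cur = conn.execute(
        "DELETE FROM compute_nodes WHERE id = 1 AND host = 'host_a'")
    assert cur.rowcount == 0  # the analogue of ConstraintNotMet below

Nova performs a soft delete rather than a hard DELETE, but the race guard is the same: the extra WHERE clause on host turns the operation into a no-op when ownership has changed underneath the caller.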

File tree

9 files changed: +127 −27 lines changed

nova/compute/manager.py

Lines changed: 23 additions & 10 deletions
@@ -9996,17 +9996,30 @@ def update_available_resource(self, context, startup=False):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations.
                 try:
-                    self.reportclient.delete_resource_provider(context, cn,
-                                                               cascade=True)
-                except keystone_exception.ClientException as e:
-                    LOG.error(
-                        "Failed to delete compute node resource provider "
-                        "for compute node %s: %s", cn.uuid, str(e))
+                    cn.destroy()
+                except exception.ObjectActionError:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
+                            "for compute node %s: %s", cn.uuid, str(e))

             for nodename in nodenames:
                 self._update_available_resource_for_node(context, nodename,
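
For readers skimming the hunk above, here is the new control flow distilled into a small runnable toy (every class below is a stand-in, not Nova's API):

    class ObjectActionError(Exception):
        """Stand-in for nova.exception.ObjectActionError."""

    class Node:
        hypervisor_hostname = 'fake-node'
        uuid = 'fake-uuid'

        def destroy(self):
            # Simulate losing the rebalance race to another host.
            raise ObjectActionError()

    class Tracker:
        def remove_node(self, name):
            print('forgot %s locally' % name)

    class ReportClient:
        def invalidate_resource_provider(self, uuid):
            print('invalidated provider cache for %s' % uuid)

        def delete_resource_provider(self, context, node, cascade=False):
            print('deleted provider for %s' % node.uuid)

    def reap_orphan(context, cn, rt, reportclient):
        try:
            cn.destroy()
        except ObjectActionError:
            # Another host owns the node now: forget it locally but leave
            # its resource provider alone.
            rt.remove_node(cn.hypervisor_hostname)
            reportclient.invalidate_resource_provider(cn.uuid)
        else:
            # Genuine orphan: clean up placement as before.
            rt.remove_node(cn.hypervisor_hostname)
            reportclient.delete_resource_provider(context, cn, cascade=True)

    reap_orphan(None, Node(), Tracker(), ReportClient())

The try/except/else split is the key design choice: remove_node runs in both branches, but delete_resource_provider (which cascades to allocations) only runs when this host really owned the node.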

nova/db/sqlalchemy/api.py

Lines changed: 15 additions & 5 deletions
@@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):


 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, constraint=None):
     """Delete a ComputeNode record."""
-    result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
-        soft_delete(synchronize_session=False)
+    query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
+
+    if constraint is not None:
+        query = constraint.apply(models.ComputeNode, query)
+
+    result = query.soft_delete(synchronize_session=False)

     if not result:
-        raise exception.ComputeHostNotFound(host=compute_id)
+        # The soft_delete could fail for one of two reasons:
+        # 1) The compute node no longer exists
+        # 2) The constraint, if specified, was not met
+        # Try to read the compute node and let it raise ComputeHostNotFound if
+        # 1) happened.
+        compute_node_get(context, compute_id)
+        # Else, raise ConstraintNotMet if 2) happened.
+        raise exception.ConstraintNotMet()


 @pick_context_manager_reader
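
The constraint object passed in here comes from Nova's generic db constraint helpers; in simplified form, applying it just appends a WHERE clause to the query before the soft delete. A sketch under that assumption (the helper classes are simplified stand-ins and the model is illustrative; assumes SQLAlchemy 1.4+):

    import sqlalchemy as sa
    from sqlalchemy import orm

    Base = orm.declarative_base()

    class ComputeNode(Base):
        __tablename__ = 'compute_nodes'
        id = sa.Column(sa.Integer, primary_key=True)
        host = sa.Column(sa.String)

    class EqualityCondition:
        def __init__(self, values):
            self.values = values

        def clauses(self, field):
            # Matches rows whose column equals any of the given values.
            return field.in_(self.values)

    class Constraint:
        def __init__(self, conditions):
            self.conditions = conditions

        def apply(self, model, query):
            for key, condition in self.conditions.items():
                query = query.filter(condition.clauses(getattr(model, key)))
            return query

    engine = sa.create_engine('sqlite://')
    Base.metadata.create_all(engine)
    with orm.Session(engine) as session:
        session.add(ComputeNode(id=1, host='host_b'))
        session.commit()
        query = session.query(ComputeNode).filter_by(id=1)
        constraint = Constraint({'host': EqualityCondition(['host_a'])})
        query = constraint.apply(ComputeNode, query)
        print(query.count())  # 0: host_b owns the node, so nothing matches

The zero-row result is also why the function re-reads the node afterwards: a miss can mean either "node gone" (ComputeHostNotFound) or "constraint filtered it out" (ConstraintNotMet), and only a second read can tell the two apart.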

nova/objects/compute_node.py

Lines changed: 14 additions & 1 deletion
@@ -360,7 +360,20 @@ def save(self, prune_stats=False):

     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        if self.obj_attr_is_set('host') and self.host:
+            # NOTE(melwitt): If our host is set, avoid a race between
+            # nova-computes during ironic driver node rebalances which can
+            # change node ownership.
+            constraint = db.constraint(host=db.equal_any(self.host))
+        else:
+            constraint = None
+
+        try:
+            sa_api.compute_node_delete(
+                self._context, self.id, constraint=constraint)
+        except exception.ConstraintNotMet:
+            raise exception.ObjectActionError(action='destroy',
+                                              reason='host changed')

     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
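
The object layer's job here is narrow: build the host constraint only when ownership is known, and translate the DB-level ConstraintNotMet into the ObjectActionError that the compute manager catches above. A compressed sketch of that contract (stand-in exceptions, and a plain callable in place of the DB API):

    class ConstraintNotMet(Exception):
        """Stand-in for nova.exception.ConstraintNotMet."""

    class ObjectActionError(Exception):
        """Stand-in for nova.exception.ObjectActionError."""
        def __init__(self, action, reason):
            super().__init__('%s failed: %s' % (action, reason))

    def destroy(node_id, host, db_delete):
        # Only constrain on host when it is set, mirroring the
        # obj_attr_is_set() check in the hunk above.
        constraint = {'host': host} if host else None
        try:
            db_delete(node_id, constraint=constraint)
        except ConstraintNotMet:
            raise ObjectActionError(action='destroy', reason='host changed')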

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 18 additions & 7 deletions
@@ -133,24 +133,35 @@ def test_node_rebalance_deleted_compute_node_race(self):
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is newly owned by
         # host_b by the time host_a attempts to delete it.
-        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
         ) as mock_get:
             mock_get.return_value = [cn]
             host_a.manager.update_available_resource(self.ctxt)

-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but was saved by the host
+        # check
         self.assertIn(
             'Deleting orphan compute node %s hypervisor host '
             'is fake-node, nodes are' % cn.id,
             self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn(
+            'Ignoring failure to delete orphan compute node %s on '
+            'hypervisor host fake-node' % cn.id,
+            self.stdlog.logger.output)
+        self._assert_hypervisor_api(self.nodename, 'host_b')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(self.nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host_a. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            self.ctxt, 'host_b', self.nodename)
+        cn.destroy()
+        host_a.manager.rt.remove_node(cn.hypervisor_hostname)
+        host_a.manager.reportclient.delete_resource_provider(
+            self.ctxt, cn, cascade=True)

         # host_b[3]: Should recreate compute node and resource provider.
         # FIXME(mgoddard): Resource provider not recreated here, due to

nova/tests/unit/compute/test_compute.py

Lines changed: 3 additions & 2 deletions
@@ -200,13 +200,14 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs):
                 context, objects.ComputeNode(), cn)
             for cn in fake_compute_nodes]

-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     constraint=None):
             self.assertEqual(2, compute_node_id)

         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
             fake_get_compute_nodes_in_db)
-        self.stub_out('nova.db.api.compute_node_delete',
+        self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
                       fake_compute_node_delete)

         self.compute.update_available_resource(

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 27 additions & 0 deletions
@@ -409,6 +409,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is

nova/tests/unit/db/test_db_api.py

Lines changed: 7 additions & 0 deletions
@@ -5545,6 +5545,13 @@ def test_compute_node_delete(self):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)

+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        constraint = db.constraint(host=db.equal_any('different-host'))
+        self.assertRaises(exception.ConstraintNotMet,
+                          sqlalchemy_api.compute_node_delete,
+                          self.ctxt, compute_node_id, constraint=constraint)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)

nova/tests/unit/objects/test_compute_node.py

Lines changed: 13 additions & 2 deletions
@@ -24,6 +24,7 @@

 from nova import conf
 from nova.db import api as db
+from nova.db.sqlalchemy import api as sa_api
 from nova import exception
 from nova import objects
 from nova.objects import base
@@ -410,12 +411,22 @@ def test_set_id_failure(self, mock_get, db_mock):
         self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
                           compute, 'id', 124)

-    @mock.patch.object(db, 'compute_node_delete')
+    @mock.patch.object(sa_api, 'compute_node_delete')
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, constraint=None)
+
+    def test_destroy_host_constraint(self):
+        # Create compute node with host='fake'
+        compute = fake_compute_with_resources.obj_clone()
+        compute._context = self.context
+        compute.host = 'fake'
+        compute.create()
+        # Simulate a compute node ownership change due to a node rebalance
+        compute.host = 'different'
+        self.assertRaises(exception.ObjectActionError, compute.destroy)

     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):
Lines changed: 7 additions & 0 deletions (new release note)
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.
