Skip to content

Commit c434707

Browse files
committed
Clear rebalanced compute nodes from resource tracker
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed from the DB and not repopulated. Ultimately this prevents instances from being scheduled to these nodes. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache. This change fixes the issue by clearing from the resource tracker cache any nodes for which a compute node entry does not exist in the DB. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated. Change-Id: I39241223b447fcc671161c370dbf16e1773b684a Partial-Bug: #1853009
1 parent e9e6479 commit c434707

File tree

5 files changed

+63
-10
lines changed

5 files changed

+63
-10
lines changed

nova/compute/manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8033,13 +8033,16 @@ def update_available_resource(self, context, startup=False):
80338033
compute_nodes_in_db = self._get_compute_nodes_in_db(context,
80348034
use_slave=True,
80358035
startup=startup)
8036+
8037+
rt = self._get_resource_tracker()
8038+
rt.clean_compute_node_cache(compute_nodes_in_db)
8039+
80368040
try:
80378041
nodenames = set(self.driver.get_available_nodes())
80388042
except exception.VirtDriverNotReady:
80398043
LOG.warning("Virt driver is not ready.")
80408044
return
80418045

8042-
rt = self._get_resource_tracker()
80438046
# Delete orphan compute node not reported by driver but still in db
80448047
for cn in compute_nodes_in_db:
80458048
if cn.hypervisor_hostname not in nodenames:

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1547,3 +1547,20 @@ def build_failed(self, nodename):
15471547
def build_succeeded(self, nodename):
15481548
"""Resets the failed_builds stats for the given node."""
15491549
self.stats[nodename].build_succeeded()
1550+
1551+
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
1552+
def clean_compute_node_cache(self, compute_nodes_in_db):
1553+
"""Clean the compute node cache of any nodes that no longer exist.
1554+
1555+
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
1556+
"""
1557+
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
1558+
for cn in compute_nodes_in_db}
1559+
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
1560+
1561+
for stale_cn in stale_cns:
1562+
# NOTE(mgoddard): we have found a node in the cache that has no
1563+
# compute node in the DB. This could be due to a node rebalance
1564+
# where another compute service took ownership of the node. Clean
1565+
# up the cache.
1566+
self.remove_node(stale_cn)

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,22 +141,35 @@ def test_node_rebalance_deleted_compute_node_race(self):
141141
self.assertEqual(0, len(rps), rps)
142142

143143
# host1[3]: Should recreate compute node and resource provider.
144-
# FIXME(mgoddard): Compute node not recreated here, because it is
145-
# already in RT.compute_nodes. See
146-
# https://bugs.launchpad.net/nova/+bug/1853009.
147144
host1.manager.update_available_resource(ctxt)
148145

149-
# Verify that the node was not recreated.
150-
hypervisors = self.api.api_get(
151-
'/os-hypervisors/detail').body['hypervisors']
152-
self.assertEqual(0, len(hypervisors), hypervisors)
146+
# Verify that the node was recreated.
147+
self._assert_hypervisor_api(nodename, 'host1')
153148

154149
rt = host1.manager._get_resource_tracker()
155150

156-
# But the compute node exists in the RT.
151+
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
152+
# node is not cached in the RT.
153+
self.assertNotIn(nodename, rt.compute_nodes)
154+
155+
# And for the same reason, the provider is not recreated.
156+
rps = self._get_all_providers()
157+
self.assertEqual(0, len(rps), rps)
158+
159+
# host1[1]: Should add compute node to RT cache and recreate resource
160+
# provider.
161+
# FIXME(mgoddard): Resource provider not recreated here, because it
162+
# exists in the provider tree. See
163+
# https://bugs.launchpad.net/nova/+bug/1841481.
164+
host1.manager.update_available_resource(ctxt)
165+
166+
# Verify that the node still exists.
167+
self._assert_hypervisor_api(nodename, 'host1')
168+
169+
# And it is now in the RT cache.
157170
self.assertIn(nodename, rt.compute_nodes)
158171

159-
# The RP exists in Rocky, due to the lack of a provider tree cache.
172+
# The RP now exists.
160173
rps = self._get_all_providers()
161174
self.assertEqual(1, len(rps), rps)
162175
self.assertEqual(nodename, rps[0]['name'])

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
309309
[mock.call(self.context, node) for node in avail_nodes_l])
310310

311311
# First node in set should have been removed from DB
312+
# Last node in set should have been added to DB.
312313
for db_node in db_nodes:
313314
if db_node.hypervisor_hostname == 'node1':
314315
db_node.destroy.assert_called_once_with()
@@ -318,6 +319,8 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
318319
'node1')
319320
else:
320321
self.assertFalse(db_node.destroy.called)
322+
(mock_get_rt.return_value.clean_compute_node_cache.
323+
assert_called_once_with(db_nodes))
321324

322325
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
323326
'delete_resource_provider')

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3310,3 +3310,20 @@ def test_ram_allocation_ratio_none_negative(self):
33103310
def test_disk_allocation_ratio_none_negative(self):
33113311
self.assertRaises(ValueError,
33123312
CONF.set_default, 'disk_allocation_ratio', -1.0)
3313+
3314+
3315+
class TestCleanComputeNodeCache(BaseTestCase):
3316+
3317+
def setUp(self):
3318+
super(TestCleanComputeNodeCache, self).setUp()
3319+
self._setup_rt()
3320+
self.context = context.RequestContext(mock.sentinel.user_id,
3321+
mock.sentinel.project_id)
3322+
3323+
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
3324+
def test_clean_compute_node_cache(self, mock_remove):
3325+
invalid_nodename = "invalid-node"
3326+
self.rt.compute_nodes[_NODENAME] = _COMPUTE_NODE_FIXTURES[0]
3327+
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
3328+
self.rt.clean_compute_node_cache([_COMPUTE_NODE_FIXTURES[0]])
3329+
mock_remove.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)