Skip to content

Commit 32676a9

Browse files
stephenfinlyarwood
authored andcommitted
Clear rebalanced compute nodes from resource tracker
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache. This change fixes the issue by clearing nodes from the resource tracker cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated. Change-Id: I39241223b447fcc671161c370dbf16e1773b684a Partial-Bug: #1853009
1 parent 59d9871 commit 32676a9

File tree

5 files changed

+66
-18
lines changed

5 files changed

+66
-18
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9996,6 +9996,8 @@ def update_available_resource(self, context, startup=False):
99969996
use_slave=True,
99979997
startup=startup)
99989998

9999+
self.rt.clean_compute_node_cache(compute_nodes_in_db)
10000+
999910001
# Delete orphan compute node not reported by driver but still in db
1000010002
for cn in compute_nodes_in_db:
1000110003
if cn.hypervisor_hostname not in nodenames:

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1945,3 +1945,20 @@ def finish_evacuation(self, instance, node, migration):
19451945
if migration:
19461946
migration.status = 'done'
19471947
migration.save()
1948+
1949+
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
1950+
def clean_compute_node_cache(self, compute_nodes_in_db):
1951+
"""Clean the compute node cache of any nodes that no longer exist.
1952+
1953+
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
1954+
"""
1955+
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
1956+
for cn in compute_nodes_in_db}
1957+
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
1958+
1959+
for stale_cn in stale_cns:
1960+
# NOTE(mgoddard): we have found a node in the cache that has no
1961+
# compute node in the DB. This could be due to a node rebalance
1962+
# where another compute service took ownership of the node. Clean
1963+
# up the cache.
1964+
self.remove_node(stale_cn)

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
8282
# Now run the update_available_resource periodic to register fake-node
8383
# and have it managed by host_b. This will also detect the "host_b"
8484
# node as orphaned and delete it along with its resource provider.
85-
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
86-
self.ctxt, 'host_b', 'host_b',
87-
)
8885

8986
# host_b[1]: Finds no compute record in RT. Tries to create one
9087
# (_init_compute_node).
@@ -93,10 +90,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
9390
# update for this node. See
9491
# https://bugs.launchpad.net/nova/+bug/1853159.
9592
host_b.manager.update_available_resource(self.ctxt)
96-
self.assertIn(
97-
'Deleting orphan compute node %s hypervisor host '
98-
'is host_b, nodes are' % cn_host_b_node.id,
99-
self.stdlog.logger.output)
10093
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
10194
# There should only be one resource provider (fake-node).
10295
original_rps = self._get_all_providers()
@@ -160,21 +153,17 @@ def test_node_rebalance_deleted_compute_node_race(self):
160153
self.assertEqual(0, len(rps), rps)
161154

162155
# host_b[3]: Should recreate compute node and resource provider.
163-
# FIXME(mgoddard): Compute node not recreated here, because it is
164-
# already in RT.compute_nodes. See
165-
# https://bugs.launchpad.net/nova/+bug/1853009.
166156
# FIXME(mgoddard): Resource provider not recreated here, because it
167157
# exists in the provider tree. See
168158
# https://bugs.launchpad.net/nova/+bug/1841481.
169159
host_b.manager.update_available_resource(self.ctxt)
170160

171-
# Verify that the node was not recreated.
172-
hypervisors = self.api.api_get(
173-
'/os-hypervisors/detail').body['hypervisors']
174-
self.assertEqual(0, len(hypervisors), hypervisors)
161+
# Verify that the node was recreated.
162+
self._assert_hypervisor_api(self.nodename, 'host_b')
175163

176-
# But the compute node exists in the RT.
177-
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
164+
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
165+
# node is not cached in the RT.
166+
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
178167

179168
# There is no RP.
180169
rps = self._get_all_providers()
@@ -184,6 +173,27 @@ def test_node_rebalance_deleted_compute_node_race(self):
184173
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
185174
self.nodename))
186175

176+
# host_b[1]: Should add compute node to RT cache and recreate resource
177+
# provider.
178+
# FIXME(mgoddard): Resource provider not recreated here, because it
179+
# exists in the provider tree. See
180+
# https://bugs.launchpad.net/nova/+bug/1841481.
181+
host_b.manager.update_available_resource(self.ctxt)
182+
183+
# Verify that the node still exists.
184+
self._assert_hypervisor_api(self.nodename, 'host_b')
185+
186+
# And it is now in the RT cache.
187+
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188+
189+
# There is still no RP.
190+
rps = self._get_all_providers()
191+
self.assertEqual(0, len(rps), rps)
192+
193+
# But the RP it exists in the provider tree.
194+
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195+
self.nodename))
196+
187197
# This fails due to the lack of a resource provider.
188198
self.assertIn(
189199
'Skipping removal of allocations for deleted instances',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,18 +373,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
373373
)
374374

375375
# First node in set should have been removed from DB
376+
# Last node in set should have been added to DB.
376377
for db_node in db_nodes:
377378
if db_node.hypervisor_hostname == 'node1':
378379
db_node.destroy.assert_called_once_with()
379380
rc_mock.delete_resource_provider.assert_called_once_with(
380381
self.context, db_node, cascade=True)
381-
mock_rt.remove_node.assert_called_once_with(
382-
'node1')
382+
mock_rt.remove_node.assert_called_once_with('node1')
383383
mock_log.error.assert_called_once_with(
384384
"Failed to delete compute node resource provider for "
385385
"compute node %s: %s", db_node.uuid, mock.ANY)
386386
else:
387387
self.assertFalse(db_node.destroy.called)
388+
self.assertEqual(1, mock_rt.remove_node.call_count)
389+
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
388390

389391
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
390392
'delete_resource_provider')

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4177,3 +4177,20 @@ def test__get_providers_to_update_not_in_tree(self, mock_log):
41774177
mock_log.warning.assert_called_once_with(*expected_log_call)
41784178
self.assertIn(uuids.unknown, self.rt.absent_providers)
41794179
self.assertEqual(result, [])
4180+
4181+
4182+
class TestCleanComputeNodeCache(BaseTestCase):
4183+
4184+
def setUp(self):
4185+
super(TestCleanComputeNodeCache, self).setUp()
4186+
self._setup_rt()
4187+
self.context = context.RequestContext(
4188+
mock.sentinel.user_id, mock.sentinel.project_id)
4189+
4190+
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
4191+
def test_clean_compute_node_cache(self, mock_remove):
4192+
invalid_nodename = "invalid-node"
4193+
self.rt.compute_nodes[_NODENAME] = self.compute
4194+
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4195+
self.rt.clean_compute_node_cache([self.compute])
4196+
mock_remove.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)