Skip to content

Commit f950ced

Browse files
stephenfinmelwitt
authored andcommitted
Clear rebalanced compute nodes from resource tracker
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache. This change fixes the issue by clearing nodes from the resource tracker cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated. Change-Id: I39241223b447fcc671161c370dbf16e1773b684a Partial-Bug: #1853009 (cherry picked from commit 32676a9)
1 parent c260e75 commit f950ced

File tree

5 files changed

+66
-18
lines changed

5 files changed

+66
-18
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9986,6 +9986,8 @@ def update_available_resource(self, context, startup=False):
99869986
use_slave=True,
99879987
startup=startup)
99889988

9989+
self.rt.clean_compute_node_cache(compute_nodes_in_db)
9990+
99899991
# Delete orphan compute node not reported by driver but still in db
99909992
for cn in compute_nodes_in_db:
99919993
if cn.hypervisor_hostname not in nodenames:

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1946,3 +1946,20 @@ def finish_evacuation(self, instance, node, migration):
19461946
if migration:
19471947
migration.status = 'done'
19481948
migration.save()
1949+
1950+
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
1951+
def clean_compute_node_cache(self, compute_nodes_in_db):
1952+
"""Clean the compute node cache of any nodes that no longer exist.
1953+
1954+
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
1955+
"""
1956+
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
1957+
for cn in compute_nodes_in_db}
1958+
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
1959+
1960+
for stale_cn in stale_cns:
1961+
# NOTE(mgoddard): we have found a node in the cache that has no
1962+
# compute node in the DB. This could be due to a node rebalance
1963+
# where another compute service took ownership of the node. Clean
1964+
# up the cache.
1965+
self.remove_node(stale_cn)

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
8282
# Now run the update_available_resource periodic to register fake-node
8383
# and have it managed by host_b. This will also detect the "host_b"
8484
# node as orphaned and delete it along with its resource provider.
85-
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
86-
self.ctxt, 'host_b', 'host_b',
87-
)
8885

8986
# host_b[1]: Finds no compute record in RT. Tries to create one
9087
# (_init_compute_node).
@@ -93,10 +90,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
9390
# update for this node. See
9491
# https://bugs.launchpad.net/nova/+bug/1853159.
9592
host_b.manager.update_available_resource(self.ctxt)
96-
self.assertIn(
97-
'Deleting orphan compute node %s hypervisor host '
98-
'is host_b, nodes are' % cn_host_b_node.id,
99-
self.stdlog.logger.output)
10093
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
10194
# There should only be one resource provider (fake-node).
10295
original_rps = self._get_all_providers()
@@ -160,21 +153,17 @@ def test_node_rebalance_deleted_compute_node_race(self):
160153
self.assertEqual(0, len(rps), rps)
161154

162155
# host_b[3]: Should recreate compute node and resource provider.
163-
# FIXME(mgoddard): Compute node not recreated here, because it is
164-
# already in RT.compute_nodes. See
165-
# https://bugs.launchpad.net/nova/+bug/1853009.
166156
# FIXME(mgoddard): Resource provider not recreated here, because it
167157
# exists in the provider tree. See
168158
# https://bugs.launchpad.net/nova/+bug/1841481.
169159
host_b.manager.update_available_resource(self.ctxt)
170160

171-
# Verify that the node was not recreated.
172-
hypervisors = self.api.api_get(
173-
'/os-hypervisors/detail').body['hypervisors']
174-
self.assertEqual(0, len(hypervisors), hypervisors)
161+
# Verify that the node was recreated.
162+
self._assert_hypervisor_api(self.nodename, 'host_b')
175163

176-
# But the compute node exists in the RT.
177-
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
164+
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
165+
# node is not cached in the RT.
166+
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
178167

179168
# There is no RP.
180169
rps = self._get_all_providers()
@@ -184,6 +173,27 @@ def test_node_rebalance_deleted_compute_node_race(self):
184173
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
185174
self.nodename))
186175

176+
# host_b[1]: Should add compute node to RT cache and recreate resource
177+
# provider.
178+
# FIXME(mgoddard): Resource provider not recreated here, because it
179+
# exists in the provider tree. See
180+
# https://bugs.launchpad.net/nova/+bug/1841481.
181+
host_b.manager.update_available_resource(self.ctxt)
182+
183+
# Verify that the node still exists.
184+
self._assert_hypervisor_api(self.nodename, 'host_b')
185+
186+
# And it is now in the RT cache.
187+
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188+
189+
# There is still no RP.
190+
rps = self._get_all_providers()
191+
self.assertEqual(0, len(rps), rps)
192+
193+
# But the RP it exists in the provider tree.
194+
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195+
self.nodename))
196+
187197
# This fails due to the lack of a resource provider.
188198
self.assertIn(
189199
'Skipping removal of allocations for deleted instances',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,18 +374,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
374374
)
375375

376376
# First node in set should have been removed from DB
377+
# Last node in set should have been added to DB.
377378
for db_node in db_nodes:
378379
if db_node.hypervisor_hostname == 'node1':
379380
db_node.destroy.assert_called_once_with()
380381
rc_mock.delete_resource_provider.assert_called_once_with(
381382
self.context, db_node, cascade=True)
382-
mock_rt.remove_node.assert_called_once_with(
383-
'node1')
383+
mock_rt.remove_node.assert_called_once_with('node1')
384384
mock_log.error.assert_called_once_with(
385385
"Failed to delete compute node resource provider for "
386386
"compute node %s: %s", db_node.uuid, mock.ANY)
387387
else:
388388
self.assertFalse(db_node.destroy.called)
389+
self.assertEqual(1, mock_rt.remove_node.call_count)
390+
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
389391

390392
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
391393
'delete_resource_provider')

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4178,3 +4178,20 @@ def test__get_providers_to_update_not_in_tree(self, mock_log):
41784178
mock_log.warning.assert_called_once_with(*expected_log_call)
41794179
self.assertIn(uuids.unknown, self.rt.absent_providers)
41804180
self.assertEqual(result, [])
4181+
4182+
4183+
class TestCleanComputeNodeCache(BaseTestCase):
4184+
4185+
def setUp(self):
4186+
super(TestCleanComputeNodeCache, self).setUp()
4187+
self._setup_rt()
4188+
self.context = context.RequestContext(
4189+
mock.sentinel.user_id, mock.sentinel.project_id)
4190+
4191+
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
4192+
def test_clean_compute_node_cache(self, mock_remove):
4193+
invalid_nodename = "invalid-node"
4194+
self.rt.compute_nodes[_NODENAME] = self.compute
4195+
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4196+
self.rt.clean_compute_node_cache([self.compute])
4197+
mock_remove.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)