Skip to content

Commit 0fc104e

Browse files
markgoddard authored and melwitt committed
Invalidate provider tree when compute node disappears
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to.

The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the resource provider for that node might also be deleted. The compute host that owns the node might not recreate the resource provider if it exists in the provider tree cache.

This change fixes the issue by clearing resource providers from the provider tree cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the resource providers are not found in the cache and get recreated in placement.

Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e
Related-Bug: #1853009
Related-Bug: #1841481
(cherry picked from commit 2bb4527)
1 parent f950ced commit 0fc104e

File tree

4 files changed

+25
-20
lines changed

4 files changed

+25
-20
lines changed

nova/compute/resource_tracker.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1963,3 +1963,4 @@ def clean_compute_node_cache(self, compute_nodes_in_db):
19631963
# where another compute service took ownership of the node. Clean
19641964
# up the cache.
19651965
self.remove_node(stale_cn)
1966+
self.reportclient.invalidate_resource_provider(stale_cn)

nova/scheduler/client/report.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -677,11 +677,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None):
677677
if resp:
678678
LOG.info("Deleted resource provider %s", rp_uuid)
679679
# clean the caches
680-
try:
681-
self._provider_tree.remove(rp_uuid)
682-
except ValueError:
683-
pass
684-
self._association_refresh_time.pop(rp_uuid, None)
680+
self.invalidate_resource_provider(rp_uuid)
685681
return
686682

687683
msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2266,6 +2262,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
22662262
# left a no-op for backward compatibility.
22672263
pass
22682264

2265+
def invalidate_resource_provider(self, name_or_uuid):
2266+
"""Invalidate the cache for a resource provider.
2267+
2268+
:param name_or_uuid: Name or UUID of the resource provider to look up.
2269+
"""
2270+
try:
2271+
self._provider_tree.remove(name_or_uuid)
2272+
except ValueError:
2273+
pass
2274+
self._association_refresh_time.pop(name_or_uuid, None)
2275+
22692276
def get_provider_by_name(self, context, name):
22702277
"""Queries the placement API for resource provider information matching
22712278
a supplied name.

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 6 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -153,9 +153,8 @@ def test_node_rebalance_deleted_compute_node_race(self):
153153
self.assertEqual(0, len(rps), rps)
154154

155155
# host_b[3]: Should recreate compute node and resource provider.
156-
# FIXME(mgoddard): Resource provider not recreated here, because it
157-
# exists in the provider tree. See
158-
# https://bugs.launchpad.net/nova/+bug/1841481.
156+
# FIXME(mgoddard): Resource provider not recreated here, due to
157+
# https://bugs.launchpad.net/nova/+bug/1853159.
159158
host_b.manager.update_available_resource(self.ctxt)
160159

161160
# Verify that the node was recreated.
@@ -170,14 +169,11 @@ def test_node_rebalance_deleted_compute_node_race(self):
170169
self.assertEqual(0, len(rps), rps)
171170

172171
# But the RP exists in the provider tree.
173-
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
172+
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
174173
self.nodename))
175174

176175
# host_b[1]: Should add compute node to RT cache and recreate resource
177176
# provider.
178-
# FIXME(mgoddard): Resource provider not recreated here, because it
179-
# exists in the provider tree. See
180-
# https://bugs.launchpad.net/nova/+bug/1841481.
181177
host_b.manager.update_available_resource(self.ctxt)
182178

183179
# Verify that the node still exists.
@@ -186,13 +182,10 @@ def test_node_rebalance_deleted_compute_node_race(self):
186182
# And it is now in the RT cache.
187183
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188184

189-
# There is still no RP.
185+
# The resource provider has now been created.
190186
rps = self._get_all_providers()
191-
self.assertEqual(0, len(rps), rps)
192-
193-
# But the RP it exists in the provider tree.
194-
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195-
self.nodename))
187+
self.assertEqual(1, len(rps), rps)
188+
self.assertEqual(self.nodename, rps[0]['name'])
196189

197190
# This fails due to the lack of a resource provider.
198191
self.assertIn(

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4193,5 +4193,9 @@ def test_clean_compute_node_cache(self, mock_remove):
41934193
invalid_nodename = "invalid-node"
41944194
self.rt.compute_nodes[_NODENAME] = self.compute
41954195
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4196-
self.rt.clean_compute_node_cache([self.compute])
4197-
mock_remove.assert_called_once_with(invalid_nodename)
4196+
with mock.patch.object(
4197+
self.rt.reportclient, "invalidate_resource_provider",
4198+
) as mock_invalidate:
4199+
self.rt.clean_compute_node_cache([self.compute])
4200+
mock_remove.assert_called_once_with(invalid_nodename)
4201+
mock_invalidate.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)