Clear rebalanced compute nodes from resource tracker

markgoddard · markgoddard · commit c43470771bdc · 2019-11-21T09:18:25.000Z
There is a race condition in nova-compute with the ironic virt driver as
nodes get rebalanced. It can lead to compute nodes being removed from the
DB and not repopulated. Ultimately this prevents instances from being
scheduled to these nodes.

The issue being addressed here is that if a compute node is deleted by a host
which thinks it is an orphan, then the compute host that actually owns the node
might not recreate it if the node is already in its resource tracker cache.

This change fixes the issue by removing from the resource tracker cache any
nodes for which no compute node entry exists in the DB. Then, when the
available resource for the node is updated, the compute node object is not
found in the cache and gets recreated.

Change-Id: I39241223b447fcc671161c370dbf16e1773b684a
Partial-Bug: #1853009
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
@@ -8033,13 +8033,16 @@ def update_available_resource(self, context, startup=False):
         compute_nodes_in_db = self._get_compute_nodes_in_db(context,
                                                             use_slave=True,
                                                             startup=startup)
+
+        rt = self._get_resource_tracker()
+        rt.clean_compute_node_cache(compute_nodes_in_db)
+
         try:
             nodenames = set(self.driver.get_available_nodes())
         except exception.VirtDriverNotReady:
             LOG.warning("Virt driver is not ready.")
             return
 
-        rt = self._get_resource_tracker()
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
@@ -1547,3 +1547,20 @@ def build_failed(self, nodename):
     def build_succeeded(self, nodename):
         """Resets the failed_builds stats for the given node."""
         self.stats[nodename].build_succeeded()
+
+    @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
+    def clean_compute_node_cache(self, compute_nodes_in_db):
+        """Clean the compute node cache of any nodes that no longer exist.
+
+        :param compute_nodes_in_db: list of ComputeNode objects from the DB.
+        """
+        compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
+                                         for cn in compute_nodes_in_db}
+        stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
+
+        for stale_cn in stale_cns:
+            # NOTE(mgoddard): we have found a node in the cache that has no
+            # compute node in the DB. This could be due to a node rebalance
+            # where another compute service took ownership of the node. Clean
+            # up the cache.
+            self.remove_node(stale_cn)
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -141,22 +141,35 @@ def test_node_rebalance_deleted_compute_node_race(self):
         self.assertEqual(0, len(rps), rps)
 
         # host1[3]: Should recreate compute node and resource provider.
-        # FIXME(mgoddard): Compute node not recreated here, because it is
-        # already in RT.compute_nodes. See
-        # https://bugs.launchpad.net/nova/+bug/1853009.
         host1.manager.update_available_resource(ctxt)
 
-        # Verify that the node was not recreated.
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        # Verify that the node was recreated.
+        self._assert_hypervisor_api(nodename, 'host1')
 
         rt = host1.manager._get_resource_tracker()
 
-        # But the compute node exists in the RT.
+        # But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
+        # node is not cached in the RT.
+        self.assertNotIn(nodename, rt.compute_nodes)
+
+        # And for the same reason, the provider is not recreated.
+        rps = self._get_all_providers()
+        self.assertEqual(0, len(rps), rps)
+
+        # host1[1]: Should add compute node to RT cache and recreate resource
+        # provider.
+        # NOTE: The provider tree cache issue described in
+        # https://bugs.launchpad.net/nova/+bug/1841481 does not prevent
+        # recreation here; the assertions below expect the RP to exist.
+        host1.manager.update_available_resource(ctxt)
+
+        # Verify that the node still exists.
+        self._assert_hypervisor_api(nodename, 'host1')
+
+        # And it is now in the RT cache.
         self.assertIn(nodename, rt.compute_nodes)
 
-        # The RP exists in Rocky, due to the lack of a provider tree cache.
+        # The RP has been recreated (Rocky has no provider tree cache).
         rps = self._get_all_providers()
         self.assertEqual(1, len(rps), rps)
         self.assertEqual(nodename, rps[0]['name'])
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
@@ -309,6 +309,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
             [mock.call(self.context, node) for node in avail_nodes_l])
 
         # First node in set should have been removed from DB
+        # Last node in set should have been added to DB.
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
@@ -318,6 +319,8 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
                     'node1')
             else:
                 self.assertFalse(db_node.destroy.called)
+        (mock_get_rt.return_value.clean_compute_node_cache.
+         assert_called_once_with(db_nodes))
 
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete_resource_provider')
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
@@ -3310,3 +3310,20 @@ def test_ram_allocation_ratio_none_negative(self):
     def test_disk_allocation_ratio_none_negative(self):
         self.assertRaises(ValueError,
                           CONF.set_default, 'disk_allocation_ratio', -1.0)
+
+
+class TestCleanComputeNodeCache(BaseTestCase):
+
+    def setUp(self):
+        super(TestCleanComputeNodeCache, self).setUp()
+        self._setup_rt()
+        self.context = context.RequestContext(mock.sentinel.user_id,
+                                              mock.sentinel.project_id)
+
+    @mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
+    def test_clean_compute_node_cache(self, mock_remove):
+        invalid_nodename = "invalid-node"
+        self.rt.compute_nodes[_NODENAME] = _COMPUTE_NODE_FIXTURES[0]
+        self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
+        self.rt.clean_compute_node_cache([_COMPUTE_NODE_FIXTURES[0]])
+        mock_remove.assert_called_once_with(invalid_nodename)