Commit e9e6479

Add functional regression test for bug 1853009
Bug 1853009 describes a race condition involving multiple nova-compute
services with ironic. As the compute services start up, the hash ring
rebalances, and the compute services have an inconsistent view of which is
responsible for a compute node.

The sequence of actions here is adapted from a real world log [1], where
multiple nova-compute services were started simultaneously. In some cases
mocks are used to simulate race conditions.

There are three main issues with the behaviour:

* host2 deletes the orphan compute node after host1 has taken ownership of
  it.
* host1 assumes that another compute service will not delete its nodes.
  Once a node is in rt.compute_nodes, it is not removed again unless the
  node is orphaned. This prevents host1 from recreating the compute node.
* host1 assumes that another compute service will not delete its resource
  providers. Once an RP is in the provider tree, it is not removed.

This functional test documents the current behaviour, with the idea that it
can be updated as this behaviour is fixed.

[1] http://paste.openstack.org/show/786272/

Co-Authored-By: Matt Riedemann <[email protected]>
Change-Id: Ice4071722de54e8d20bb8c3795be22f1995940cd
Related-Bug: #1853009
Related-Bug: #1853159
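For context on the rebalancing mentioned above: the ironic driver assigns
each node to a compute service via consistent hashing, so ownership of a
node can legitimately move when services join or leave the ring. The sketch
below is an illustrative toy, not nova's hash ring implementation (the
HashRing class and owner_of method here are hypothetical); it only shows why
two services holding different membership snapshots can briefly disagree
over which of them owns a node.

import bisect
import hashlib


def _hash(key):
    # Position on the ring, derived from a stable hash of the key.
    return int(hashlib.md5(key.encode()).hexdigest(), 16)


class HashRing(object):
    def __init__(self, hosts):
        # Each host occupies one point on the ring.
        self.ring = sorted((_hash(host), host) for host in hosts)

    def owner_of(self, node):
        # A node is owned by the first host at or after its hash,
        # wrapping around the end of the ring.
        keys = [key for key, _host in self.ring]
        index = bisect.bisect(keys, _hash(node)) % len(self.ring)
        return self.ring[index][1]


# During simultaneous startup, each service may build its ring from a
# different membership snapshot, so the services can briefly disagree
# about which of them is responsible for a given node.
print(HashRing(['host1']).owner_of('fake-node'))
print(HashRing(['host1', 'host2']).owner_of('fake-node'))

With a ring like this, bringing host2 up can move fake-node's ownership
without either service failing, which is the window the test exercises.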
1 parent 8c51a3a commit e9e6479

File tree

1 file changed (+162, -0)

@@ -0,0 +1,162 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import mock

from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers


class NodeRebalanceDeletedComputeNodeRaceTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Regression test for bug 1853009 observed in Rocky & later.

    When an ironic node re-balances from one host to another, there can be a
    race where the old host deletes the orphan compute node after the new host
    has taken ownership of it which results in the new host failing to create
    the compute node and resource provider because the ResourceTracker does not
    detect a change.
    """
    # Make sure we're using the fake driver that has predictable uuids
    # for each node.
    compute_driver = 'fake.PredictableNodeUUIDDriver'

    def _assert_hypervisor_api(self, nodename, expected_host):
        # We should have one compute node shown by the API.
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail').body['hypervisors']
        self.assertEqual(1, len(hypervisors), hypervisors)
        hypervisor = hypervisors[0]
        self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
        self.assertEqual(expected_host, hypervisor['service']['host'])

    def _start_compute(self, host):
        host = self.start_service('compute', host)
        # Ironic compute driver has rebalances_nodes = True.
        host.manager.driver.rebalances_nodes = True
        return host

    def setUp(self):
        super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
        nodename = 'fake-node'
        ctxt = context.get_admin_context()

        # Simulate a service running and then stopping.
        # host2 runs, creates fake-node, then is stopped. The fake-node compute
        # node is destroyed. This leaves a soft-deleted node in the DB.
        host2 = self._start_compute('host2')
        host2.manager.driver._set_nodes([nodename])
        host2.manager.update_available_resource(ctxt)
        host2.stop()
        cn = objects.ComputeNode.get_by_host_and_nodename(
            ctxt, 'host2', nodename)
        cn.destroy()

    def test_node_rebalance_deleted_compute_node_race(self):
        nodename = 'fake-node'
        ctxt = context.get_admin_context()

        # First we create a compute service to manage our node.
        # When start_service runs, it will create a host1 ComputeNode. We want
        # to delete that and inject our fake node into the driver which will
        # be re-balanced to another host later.
        host1 = self._start_compute('host1')
        host1.manager.driver._set_nodes([nodename])

        # Run the update_available_resource periodic to register fake-node and
        # have it managed by host1. This will also detect the "host1" node as
        # orphaned and delete it along with its resource provider.

        # host1[1]: Finds no compute record in RT. Tries to create one
        # (_init_compute_node).
        # FIXME(mgoddard): This shows a traceback with SQL rollback due to
        # soft-deleted node. The create seems to succeed but breaks the RT
        # update for this node. See
        # https://bugs.launchpad.net/nova/+bug/1853159.
        host1.manager.update_available_resource(ctxt)
        self._assert_hypervisor_api(nodename, 'host1')
        # There should only be one resource provider (fake-node).
        original_rps = self._get_all_providers()
        self.assertEqual(1, len(original_rps), original_rps)
        self.assertEqual(nodename, original_rps[0]['name'])

        # Simulate a re-balance by starting host2 and make it manage fake-node.
        # At this point both host1 and host2 think they own fake-node.
        host2 = self._start_compute('host2')
        host2.manager.driver._set_nodes([nodename])

        # host2[1]: Finds no compute record in RT, 'moves' existing node from
        # host1.
        host2.manager.update_available_resource(ctxt)
        # Assert that fake-node was re-balanced from host1 to host2.
        self.assertIn('ComputeNode fake-node moving from host1 to host2',
                      self.stdlog.logger.output)
        self._assert_hypervisor_api(nodename, 'host2')

        # host2[2]: Begins periodic update, queries compute nodes for this
        # host, finds the fake-node.
        cn = objects.ComputeNode.get_by_host_and_nodename(
            ctxt, 'host2', nodename)

        # host1[2]: Finds no compute record in RT, 'moves' existing node from
        # host2.
        host1.manager.update_available_resource(ctxt)
        # Assert that fake-node was re-balanced from host2 to host1.
        self.assertIn('ComputeNode fake-node moving from host2 to host1',
                      self.stdlog.logger.output)
        self._assert_hypervisor_api(nodename, 'host1')

        # Complete rebalance, as host2 realises it does not own fake-node.
        host2.manager.driver._set_nodes([])

        # host2[2]: Deletes orphan compute node.
        # Mock out the compute node query to simulate a race condition where
        # the list includes an orphan compute node that is taken ownership of
        # by host1 by the time host2 deletes it.
        # FIXME(mgoddard): Ideally host2 would not delete a node that does not
        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
        with mock.patch('nova.compute.manager.ComputeManager.'
                        '_get_compute_nodes_in_db') as mock_get:
            mock_get.return_value = [cn]
            host2.manager.update_available_resource(ctxt)

        # Verify that the node was deleted.
        self.assertIn("Deleting orphan compute node %s hypervisor host "
                      "is fake-node, nodes are" % cn.id,
                      self.stdlog.logger.output)
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail').body['hypervisors']
        self.assertEqual(0, len(hypervisors), hypervisors)
        rps = self._get_all_providers()
        self.assertEqual(0, len(rps), rps)

        # host1[3]: Should recreate compute node and resource provider.
        # FIXME(mgoddard): Compute node not recreated here, because it is
        # already in RT.compute_nodes. See
        # https://bugs.launchpad.net/nova/+bug/1853009.
        host1.manager.update_available_resource(ctxt)

        # Verify that the node was not recreated.
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail').body['hypervisors']
        self.assertEqual(0, len(hypervisors), hypervisors)

        rt = host1.manager._get_resource_tracker()

        # But the compute node exists in the RT.
        self.assertIn(nodename, rt.compute_nodes)

        # The RP exists in Rocky, due to the lack of a provider tree cache.
        rps = self._get_all_providers()
        self.assertEqual(1, len(rps), rps)
        self.assertEqual(nodename, rps[0]['name'])
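The final FIXME above is worth unpacking. The commit message notes that once
a node is in rt.compute_nodes it is not removed again unless it is orphaned,
which is why host1 cannot recreate the record host2 deleted. The sketch
below is a simplified paraphrase of that behaviour under stated assumptions;
it mirrors the names used in the test, but it is an illustrative reduction,
not nova's actual ResourceTracker code.

class ResourceTracker(object):
    """Illustrative reduction of the caching behaviour under test."""

    def __init__(self):
        # In-memory cache, nodename -> ComputeNode. An entry is only
        # dropped when the node is orphaned (gone from the driver).
        self.compute_nodes = {}

    def _init_compute_node(self, context, resources):
        nodename = resources['hypervisor_hostname']
        if nodename in self.compute_nodes:
            # Cache hit: assume the DB record still exists and return
            # early. If another host has deleted the record (as host2
            # does in the test), it is never recreated -- bug 1853009.
            return
        # Cache miss: fetch the record from the DB, creating it if
        # necessary, and add it to the cache.
        ...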
