Skip to content

Commit c260e75

Browse files
markgoddard authored and melwitt committed
Add functional regression test for bug 1853009
Bug 1853009 describes a race condition involving multiple nova-compute services with ironic. As the compute services start up, the hash ring rebalances, and the compute services have an inconsistent view of which is responsible for a compute node. The sequence of actions here is adapted from a real world log [1], where multiple nova-compute services were started simultaneously. In some cases mocks are used to simulate race conditions. There are three main issues with the behaviour: * host2 deletes the orphan node compute node after host1 has taken ownership of it. * host1 assumes that another compute service will not delete its nodes. Once a node is in rt.compute_nodes, it is not removed again unless the node is orphaned. This prevents host1 from recreating the compute node. * host1 assumes that another compute service will not delete its resource providers. Once an RP is in the provider tree, it is not removed. This functional test documents the current behaviour, with the idea that it can be updated as this behaviour is fixed. [1] http://paste.openstack.org/show/786272/ Co-Authored-By: Matt Riedemann <[email protected]> Change-Id: Ice4071722de54e8d20bb8c3795be22f1995940cd Related-Bug: #1853009 Related-Bug: #1853159 (cherry picked from commit 59d9871)
1 parent e39bbdc commit c260e75

File tree

1 file changed

+190
-0
lines changed

1 file changed

+190
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
2+
# not use this file except in compliance with the License. You may obtain
3+
# a copy of the License at
4+
#
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
#
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
9+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
10+
# License for the specific language governing permissions and limitations
11+
# under the License.
12+
13+
import mock
14+
15+
from nova import context
16+
from nova import objects
17+
from nova.tests.functional import integrated_helpers
18+
19+
20+
class NodeRebalanceDeletedComputeNodeRaceTestCase(
    integrated_helpers.ProviderUsageBaseTestCase,
):
    """Regression test for bug 1853009 observed in Rocky & later.

    When an ironic node re-balances from one host to another, there can be a
    race where the old host deletes the orphan compute node after the new host
    has taken ownership of it which results in the new host failing to create
    the compute node and resource provider because the ResourceTracker does not
    detect a change.
    """
    # Make sure we're using the fake driver that has predictable uuids
    # for each node.
    compute_driver = 'fake.PredictableNodeUUIDDriver'

    def _assert_hypervisor_api(self, nodename, expected_host):
        """Assert exactly one hypervisor is exposed by the compute API.

        :param nodename: Expected ``hypervisor_hostname`` of the single
            hypervisor returned by /os-hypervisors/detail.
        :param expected_host: Expected compute service host that currently
            owns that hypervisor.
        """
        # We should have one compute node shown by the API.
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail'
        ).body['hypervisors']
        self.assertEqual(1, len(hypervisors), hypervisors)
        hypervisor = hypervisors[0]
        self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
        self.assertEqual(expected_host, hypervisor['service']['host'])

    def _start_compute(self, host):
        """Start a nova-compute service that behaves like the ironic driver.

        :param host: Name of the compute service host to start.
        :returns: The started compute service.
        """
        host = self.start_service('compute', host)
        # Ironic compute driver has rebalances_nodes = True.
        host.manager.driver.rebalances_nodes = True
        return host

    def setUp(self):
        """Set up an admin context and the shared fake node name."""
        super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()

        self.nodename = 'fake-node'
        self.ctxt = context.get_admin_context()

    def test_node_rebalance_deleted_compute_node_race(self):
        """Replay the rebalance race from bug 1853009.

        The sequence is adapted from a real-world log where multiple
        nova-compute services started simultaneously; mocks are used to
        force the race windows. The host_x[N] comments below number each
        host's update_available_resource periodic runs.
        """
        # Simulate a service running and then stopping. host_a runs, creates
        # fake-node, then is stopped. The fake-node compute node is destroyed.
        # This leaves a soft-deleted node in the DB.
        host_a = self._start_compute('host_a')
        host_a.manager.driver._set_nodes([self.nodename])
        host_a.manager.update_available_resource(self.ctxt)
        host_a.stop()
        cn = objects.ComputeNode.get_by_host_and_nodename(
            self.ctxt, 'host_a', self.nodename,
        )
        cn.destroy()

        self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt)))

        # Now we create a new compute service to manage our node.
        host_b = self._start_compute('host_b')
        host_b.manager.driver._set_nodes([self.nodename])

        # When start_service runs, it will create a host_b ComputeNode. We want
        # to delete that and inject our fake node into the driver which will
        # be re-balanced to another host later. First assert this actually
        # exists.
        self._assert_hypervisor_api('host_b', expected_host='host_b')

        # Now run the update_available_resource periodic to register fake-node
        # and have it managed by host_b. This will also detect the "host_b"
        # node as orphaned and delete it along with its resource provider.
        cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
            self.ctxt, 'host_b', 'host_b',
        )

        # host_b[1]: Finds no compute record in RT. Tries to create one
        # (_init_compute_node).
        # FIXME(mgoddard): This shows a traceback with SQL rollback due to
        # soft-deleted node. The create seems to succeed but breaks the RT
        # update for this node. See
        # https://bugs.launchpad.net/nova/+bug/1853159.
        host_b.manager.update_available_resource(self.ctxt)
        self.assertIn(
            'Deleting orphan compute node %s hypervisor host '
            'is host_b, nodes are' % cn_host_b_node.id,
            self.stdlog.logger.output)
        self._assert_hypervisor_api(self.nodename, expected_host='host_b')
        # There should only be one resource provider (fake-node).
        original_rps = self._get_all_providers()
        self.assertEqual(1, len(original_rps), original_rps)
        self.assertEqual(self.nodename, original_rps[0]['name'])

        # Simulate a re-balance by restarting host_a and make it manage
        # fake-node. At this point both host_b and host_a think they own
        # fake-node.
        host_a = self._start_compute('host_a')
        host_a.manager.driver._set_nodes([self.nodename])

        # host_a[1]: Finds no compute record in RT, 'moves' existing node from
        # host_b
        host_a.manager.update_available_resource(self.ctxt)
        # Assert that fake-node was re-balanced from host_b to host_a.
        self.assertIn(
            'ComputeNode fake-node moving from host_b to host_a',
            self.stdlog.logger.output)
        self._assert_hypervisor_api(self.nodename, expected_host='host_a')

        # host_a[2]: Begins periodic update, queries compute nodes for this
        # host, finds the fake-node.
        cn = objects.ComputeNode.get_by_host_and_nodename(
            self.ctxt, 'host_a', self.nodename,
        )

        # host_b[2]: Finds no compute record in RT, 'moves' existing node from
        # host_a
        host_b.manager.update_available_resource(self.ctxt)
        # Assert that fake-node was re-balanced from host_a to host_b.
        self.assertIn(
            'ComputeNode fake-node moving from host_a to host_b',
            self.stdlog.logger.output)
        self._assert_hypervisor_api(self.nodename, expected_host='host_b')

        # Complete rebalance, as host_a realises it does not own fake-node.
        host_a.manager.driver._set_nodes([])

        # host_a[2]: Deletes orphan compute node.
        # Mock out the compute node query to simulate a race condition where
        # the list includes an orphan compute node that is newly owned by
        # host_b by the time host_a attempts to delete it.
        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
        with mock.patch(
            'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
        ) as mock_get:
            mock_get.return_value = [cn]
            host_a.manager.update_available_resource(self.ctxt)

        # Verify that the node was deleted.
        self.assertIn(
            'Deleting orphan compute node %s hypervisor host '
            'is fake-node, nodes are' % cn.id,
            self.stdlog.logger.output)
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail').body['hypervisors']
        self.assertEqual(0, len(hypervisors), hypervisors)
        rps = self._get_all_providers()
        self.assertEqual(0, len(rps), rps)

        # host_b[3]: Should recreate compute node and resource provider.
        # FIXME(mgoddard): Compute node not recreated here, because it is
        # already in RT.compute_nodes. See
        # https://bugs.launchpad.net/nova/+bug/1853009.
        # FIXME(mgoddard): Resource provider not recreated here, because it
        # exists in the provider tree. See
        # https://bugs.launchpad.net/nova/+bug/1841481.
        host_b.manager.update_available_resource(self.ctxt)

        # Verify that the node was not recreated.
        hypervisors = self.api.api_get(
            '/os-hypervisors/detail').body['hypervisors']
        self.assertEqual(0, len(hypervisors), hypervisors)

        # But the compute node exists in the RT.
        self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)

        # There is no RP.
        rps = self._get_all_providers()
        self.assertEqual(0, len(rps), rps)

        # But the RP exists in the provider tree.
        self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
            self.nodename))

        # This fails due to the lack of a resource provider.
        self.assertIn(
            'Skipping removal of allocations for deleted instances',
            self.stdlog.logger.output)

0 commit comments

Comments
 (0)