Commit 22eb001

Merge pull request #2 from stackhpc/bug-1839560
Fix for bugs 1839560 and 1839674
2 parents 826e11d + 0fbfc26 commit 22eb001

6 files changed: +228 -36 lines changed

nova/compute/resource_tracker.py

Lines changed: 5 additions & 1 deletion

@@ -580,8 +580,12 @@ def _init_compute_node(self, context, resources):
         cn = objects.ComputeNode(context)
         cn.host = self.host
         self._copy_resources(cn, resources)
-        self.compute_nodes[nodename] = cn
         cn.create()
+        # Only map the ComputeNode into compute_nodes if create() was OK
+        # because if create() fails, on the next run through here nodename
+        # would be in compute_nodes and we won't try to create again (because
+        # of the logic above).
+        self.compute_nodes[nodename] = cn
         LOG.info('Compute node record created for '
                  '%(host)s:%(node)s with uuid: %(uuid)s',
                  {'host': self.host, 'node': nodename, 'uuid': cn.uuid})
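
Note: the reordering above matters because earlier in _init_compute_node the tracker returns early for any nodename already present in self.compute_nodes. A minimal, self-contained sketch of that behaviour (the FakeTracker/flaky_create names are made up for illustration, not nova code) shows why caching the record only after a successful create() lets the next periodic run retry:

class FakeTracker(object):
    def __init__(self, create):
        self.compute_nodes = {}   # nodename -> record, like the real tracker
        self._create = create     # injected create step that may raise

    def init_compute_node(self, nodename):
        if nodename in self.compute_nodes:
            return                         # already created on a prior run
        record = {'name': nodename}
        self._create(record)               # may fail, e.g. DB error
        # Cache the record only once create() has succeeded, mirroring the
        # reordering in the hunk above.
        self.compute_nodes[nodename] = record


outcomes = iter([RuntimeError('DB unavailable'), None])


def flaky_create(record):
    outcome = next(outcomes)
    if outcome is not None:
        raise outcome


rt = FakeTracker(flaky_create)
try:
    rt.init_compute_node('node1')          # first periodic run: create fails
except RuntimeError:
    pass
assert rt.compute_nodes == {}              # nothing cached, retry is possible
rt.init_compute_node('node1')              # second run: create succeeds
assert 'node1' in rt.compute_nodes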

nova/db/sqlalchemy/api.py

Lines changed: 45 additions & 1 deletion

@@ -30,6 +30,7 @@
 from oslo_db.sqlalchemy import update_match
 from oslo_db.sqlalchemy import utils as sqlalchemyutils
 from oslo_log import log as logging
+from oslo_utils import excutils
 from oslo_utils import importutils
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
@@ -693,11 +694,54 @@ def compute_node_create(context, values):

     compute_node_ref = models.ComputeNode()
     compute_node_ref.update(values)
-    compute_node_ref.save(context.session)
+    try:
+        compute_node_ref.save(context.session)
+    except db_exc.DBDuplicateEntry:
+        with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
+            # Check to see if we have a (soft) deleted ComputeNode with the
+            # same UUID and if so just update it and mark as no longer (soft)
+            # deleted. See bug 1839560 for details.
+            if 'uuid' in values:
+                # Get a fresh context for a new DB session and allow it to
+                # get a deleted record.
+                ctxt = nova.context.get_admin_context(read_deleted='yes')
+                compute_node_ref = _compute_node_get_and_update_deleted(
+                    ctxt, values)
+                # If we didn't get anything back we failed to find the node
+                # by uuid and update it so re-raise the DBDuplicateEntry.
+                if compute_node_ref:
+                    err_ctx.reraise = False

     return compute_node_ref


+@pick_context_manager_writer
+def _compute_node_get_and_update_deleted(context, values):
+    """Find a ComputeNode by uuid, update and un-delete it.
+
+    This is a special case from the ``compute_node_create`` method which
+    needs to be separate to get a new Session.
+
+    This method will update the ComputeNode, if found, to have deleted=0 and
+    deleted_at=None values.
+
+    :param context: request auth context which should be able to read deleted
+        records
+    :param values: values used to update the ComputeNode record - must include
+        uuid
+    :return: updated ComputeNode sqlalchemy model object if successfully found
+        and updated, None otherwise
+    """
+    cn = model_query(
+        context, models.ComputeNode).filter_by(uuid=values['uuid']).first()
+    if cn:
+        # Update with the provided values but un-soft-delete.
+        update_values = copy.deepcopy(values)
+        update_values['deleted'] = 0
+        update_values['deleted_at'] = None
+        return compute_node_update(context, cn.id, update_values)
+
+
 @oslo_db_api.wrap_db_retry(max_retries=5, retry_on_deadlock=True)
 @pick_context_manager_writer
 def compute_node_update(context, compute_id, values):
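
Note: the recovery path above hinges on oslo.utils' save_and_reraise_exception context manager: the caught exception is re-raised when the block exits unless the handler flips reraise to False. A rough standalone sketch of that pattern (DummyError and recover() are made-up illustrations, not nova code):

from oslo_utils import excutils


class DummyError(Exception):
    pass


def recover(can_recover):
    """Stand-in recovery step: returns a record on success, None on failure."""
    return {'recovered': True} if can_recover else None


def create(can_recover):
    try:
        raise DummyError('duplicate entry')
    except DummyError:
        with excutils.save_and_reraise_exception() as err_ctx:
            result = recover(can_recover)
            if result:
                # Only suppress the re-raise when recovery actually worked.
                err_ctx.reraise = False
                return result


print(create(can_recover=True))    # {'recovered': True}
try:
    create(can_recover=False)
except DummyError as err:
    print('re-raised: %s' % err)   # recovery failed, original error kept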
New file (functional regression test for bug 1839560)

Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log as logging
+
+from nova import context
+from nova.db import api as db_api
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import integrated_helpers
+from nova import utils
+from nova.virt import fake as fake_virt
+
+LOG = logging.getLogger(__name__)
+
+
+class PeriodicNodeRecreateTestCase(test.TestCase,
+                                   integrated_helpers.InstanceHelperMixin):
+    """Regression test for bug 1839560 introduced in Rocky.
+
+    When an ironic node is undergoing maintenance the driver will not report
+    it as an available node to the ComputeManager.update_available_resource
+    periodic task. The ComputeManager will then (soft) delete a ComputeNode
+    record for that no-longer-available node. If/when the ironic node is
+    available again and the driver reports it, the ResourceTracker will attempt
+    to create a ComputeNode record for the ironic node.
+
+    The regression with change Ia69fabce8e7fd7de101e291fe133c6f5f5f7056a is
+    that the ironic node uuid is used as the ComputeNode.uuid and there is
+    a unique constraint on the ComputeNode.uuid value in the database. So
+    trying to create a ComputeNode with the same uuid (after the ironic node
+    comes back from being unavailable) fails with a DuplicateEntry error since
+    there is a (soft) deleted version of the ComputeNode with the same uuid
+    in the database.
+    """
+    def setUp(self):
+        super(PeriodicNodeRecreateTestCase, self).setUp()
+        # We need the PlacementFixture for the compute nodes to report in but
+        # otherwise don't care about placement for this test.
+        self.useFixture(nova_fixtures.PlacementFixture())
+        # Start up the API so we can query the os-hypervisors API.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        # Make sure we're using the fake driver that has predictable uuids
+        # for each node.
+        self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
+
+    def test_update_available_resource_node_recreate(self):
+        # First we create a compute service to manage a couple of fake nodes.
+        # When start_service runs, it will create the node1 and node2
+        # ComputeNodes.
+        fake_virt.set_nodes(['node1', 'node2'])
+        self.addCleanup(fake_virt.restore_nodes)
+        compute = self.start_service('compute', 'node1')
+        # Now we should have two compute nodes, make sure the hypervisors API
+        # shows them.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        self.assertEqual({'node1', 'node2'},
+                         set([hyp['hypervisor_hostname']
+                              for hyp in hypervisors]))
+        # Now stub the driver to only report node1. This is making it look like
+        # node2 is no longer available when update_available_resource runs.
+        compute.manager.driver._nodes = ['node1']
+        ctxt = context.get_admin_context()
+        compute.manager.update_available_resource(ctxt)
+        # node2 should have been deleted, check the logs and API.
+        log = self.stdlog.logger.output
+        self.assertIn('Deleting orphan compute node', log)
+        self.assertIn('hypervisor host is node2', log)
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        self.assertEqual('node1', hypervisors[0]['hypervisor_hostname'])
+        # But the node2 ComputeNode is still in the database with deleted!=0.
+        with utils.temporary_mutation(ctxt, read_deleted='yes'):
+            cn = objects.ComputeNode.get_by_host_and_nodename(
+                ctxt, 'node1', 'node2')
+            self.assertTrue(cn.deleted)
+        # Now stub the driver again to report node2 as being back and run
+        # the periodic task.
+        compute.manager.driver._nodes = ['node1', 'node2']
+        LOG.info('Running update_available_resource which should bring back '
+                 'node2.')
+        compute.manager.update_available_resource(ctxt)
+        # The DBDuplicateEntry error should have been handled and resulted in
+        # updating the (soft) deleted record to no longer be deleted.
+        log = self.stdlog.logger.output
+        self.assertNotIn('DBDuplicateEntry', log)
+        # Should have two reported hypervisors again.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        # Now that the node2 record was un-soft-deleted, archiving should not
+        # remove any compute_nodes.
+        LOG.info('Archiving the database.')
+        archived = db_api.archive_deleted_rows(1000)[0]
+        self.assertNotIn('compute_nodes', archived)
+        cn2 = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'node1', 'node2')
+        self.assertFalse(cn2.deleted)
+        self.assertIsNone(cn2.deleted_at)
+        # The node2 id and uuid should not have changed in the DB.
+        self.assertEqual(cn.id, cn2.id)
+        self.assertEqual(cn.uuid, cn2.uuid)
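
Note: the test above reads the soft-deleted node2 record by temporarily flipping the context's read_deleted flag with utils.temporary_mutation. A generic sketch of that kind of attribute-swap context manager (just the idea, not nova's actual implementation):

import contextlib


@contextlib.contextmanager
def temporary_mutation(obj, **kwargs):
    """Temporarily set attributes on obj, restoring the originals on exit."""
    originals = {attr: getattr(obj, attr) for attr in kwargs}
    for attr, value in kwargs.items():
        setattr(obj, attr, value)
    try:
        yield obj
    finally:
        for attr, value in originals.items():
            setattr(obj, attr, value)


class Context(object):
    read_deleted = 'no'


ctxt = Context()
with temporary_mutation(ctxt, read_deleted='yes'):
    assert ctxt.read_deleted == 'yes'   # soft-deleted rows visible here
assert ctxt.read_deleted == 'no'        # restored after the block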

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 39 additions & 34 deletions

@@ -1074,56 +1074,45 @@ def fake_get_all(_ctx, nodename):
         self.assertEqual(_HOSTNAME, self.rt.compute_nodes[_NODENAME].host)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_on_empty(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_on_empty_rebalance(self, update_mock,
                                                      get_mock,
                                                      create_mock,
-                                                     pci_tracker_mock,
                                                      get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_too_many(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = ["fake_node_1", "fake_node_2"]
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)

-    def _test_compute_node_created(self, update_mock, get_mock,
-                                   create_mock, pci_tracker_mock,
+    def _test_compute_node_created(self, update_mock, get_mock, create_mock,
                                    get_by_hypervisor_mock,
                                    rebalances_nodes=False):
         self.flags(cpu_allocation_ratio=1.0, ram_allocation_ratio=1.0,
@@ -1152,12 +1141,12 @@ def _test_compute_node_created(self, update_mock, get_mock,
             'current_workload': 0,
             'vcpus': 4,
             'running_vms': 0,
-            'pci_passthrough_devices': '[]'
+            'pci_passthrough_devices': '[]',
+            'uuid': uuids.compute_node_uuid
         }
         # The expected compute represents the initial values used
         # when creating a compute node.
         expected_compute = objects.ComputeNode(
-            id=42,
             host_ip=resources['host_ip'],
             vcpus=resources['vcpus'],
             memory_mb=resources['memory_mb'],
@@ -1177,20 +1166,11 @@ def _test_compute_node_created(self, update_mock, get_mock,
             cpu_allocation_ratio=1.0,
             disk_allocation_ratio=1.0,
             stats={'failed_builds': 0},
-            pci_device_pools=objects.PciDevicePoolList(objects=[]),
             uuid=uuids.compute_node_uuid
         )

-        def set_cn_id():
-            # The PCI tracker needs the compute node's ID when starting up, so
-            # make sure that we set the ID value so we don't get a Cannot load
-            # 'id' in base class error
-            cn = self.rt.compute_nodes[_NODENAME]
-            cn.id = 42  # Has to be a number, not a mock
-            cn.uuid = uuids.compute_node_uuid
-
-        create_mock.side_effect = set_cn_id
-        self.rt._init_compute_node(mock.sentinel.ctx, resources)
+        with mock.patch.object(self.rt, '_setup_pci_tracker') as setup_pci:
+            self.rt._init_compute_node(mock.sentinel.ctx, resources)

         cn = self.rt.compute_nodes[_NODENAME]
         get_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
@@ -1202,22 +1182,47 @@ def set_cn_id():
         get_by_hypervisor_mock.assert_not_called()
         create_mock.assert_called_once_with()
         self.assertTrue(obj_base.obj_equal_prims(expected_compute, cn))
-        pci_tracker_mock.assert_called_once_with(mock.sentinel.ctx,
-                                                 42)
+        setup_pci.assert_called_once_with(mock.sentinel.ctx, cn, resources)
         self.assertFalse(update_mock.called)

+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                '_setup_pci_tracker')
+    @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
+                side_effect=exc.ComputeHostNotFound(host=_HOSTNAME))
+    @mock.patch('nova.objects.ComputeNode.create',
+                side_effect=(test.TestingException, None))
+    def test_compute_node_create_fail_retry_works(self, mock_create, mock_get,
+                                                  mock_setup_pci):
+        """Tests that _init_compute_node will not save the ComputeNode object
+        in the compute_nodes dict if create() fails.
+        """
+        self._setup_rt()
+        self.assertEqual({}, self.rt.compute_nodes)
+        ctxt = context.get_context()
+        # The first ComputeNode.create fails so rt.compute_nodes should
+        # remain empty.
+        resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
+        resources['uuid'] = uuids.cn_uuid  # for the LOG.info message
+        self.assertRaises(test.TestingException,
+                          self.rt._init_compute_node, ctxt, resources)
+        self.assertEqual({}, self.rt.compute_nodes)
+        # Second create works so compute_nodes should have a mapping.
+        self.rt._init_compute_node(ctxt, resources)
+        self.assertIn(_NODENAME, self.rt.compute_nodes)
+        mock_get.assert_has_calls([mock.call(
+            ctxt, _HOSTNAME, _NODENAME)] * 2)
+        self.assertEqual(2, mock_create.call_count)
+        mock_setup_pci.assert_called_once_with(
+            ctxt, test.MatchType(objects.ComputeNode), resources)
+
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_node_removed(self, update_mock, get_mock,
-                          create_mock, pci_tracker_mock,
-                          get_by_hypervisor_mock):
+                          create_mock, get_by_hypervisor_mock):
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)
         self.rt.old_resources[_NODENAME] = mock.sentinel.foo
         self.assertIn(_NODENAME, self.rt.compute_nodes)
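
Note: the new test_compute_node_create_fail_retry_works test leans on unittest.mock side_effect sequencing: when side_effect is an iterable, each call consumes the next item, and an item that is an exception class or instance is raised instead of returned. A tiny standalone demonstration:

from unittest import mock


class TestingException(Exception):
    pass


# First call raises, second call returns None, just like the patched
# ComputeNode.create in the test above.
create = mock.Mock(side_effect=(TestingException, None))

try:
    create()                 # first call raises TestingException
except TestingException:
    print('first create() failed as programmed')

print(create())              # second call returns None
print(create.call_count)     # 2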

nova/tests/unit/db/test_db_api.py

Lines changed: 12 additions & 0 deletions

@@ -6882,6 +6882,18 @@ def test_compute_node_create(self):
         new_stats = jsonutils.loads(self.item['stats'])
         self.assertEqual(self.stats, new_stats)

+    def test_compute_node_create_duplicate_host_hypervisor_hostname(self):
+        """Tests to make sure that DBDuplicateEntry is raised when trying to
+        create a duplicate ComputeNode with the same host and
+        hypervisor_hostname values but different uuid values. This makes
+        sure that when _compute_node_get_and_update_deleted returns None
+        the DBDuplicateEntry is re-raised.
+        """
+        other_node = dict(self.compute_node_dict)
+        other_node['uuid'] = uuidutils.generate_uuid()
+        self.assertRaises(db_exc.DBDuplicateEntry,
+                          db.compute_node_create, self.ctxt, other_node)
+
     def test_compute_node_get_all(self):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(1, len(nodes))
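
Note: this test works because the compute_nodes table enforces uniqueness on columns other than uuid (the test name suggests host plus hypervisor_hostname), so a second row with a fresh uuid still collides. A rough SQLAlchemy sketch of that behaviour against an in-memory SQLite table (the constraint columns here are an assumption for illustration, not copied from nova's schema):

import sqlalchemy as sa
from sqlalchemy import exc as sa_exc

# Toy table with a unique constraint in the same spirit as compute_nodes;
# a second insert with a different uuid still violates it.
engine = sa.create_engine('sqlite://')
metadata = sa.MetaData()
nodes = sa.Table(
    'nodes', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('uuid', sa.String(36)),
    sa.Column('host', sa.String(255)),
    sa.Column('hypervisor_hostname', sa.String(255)),
    sa.Column('deleted', sa.Integer, default=0),
    sa.UniqueConstraint('host', 'hypervisor_hostname', 'deleted'),
)
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(nodes.insert().values(
        uuid='uuid-1', host='host1', hypervisor_hostname='node1', deleted=0))
    try:
        conn.execute(nodes.insert().values(
            uuid='uuid-2', host='host1', hypervisor_hostname='node1',
            deleted=0))
    except sa_exc.IntegrityError as err:
        print('duplicate rejected: %s' % err.orig)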

nova/virt/fake.py

Lines changed: 14 additions & 0 deletions

@@ -27,6 +27,7 @@
 import contextlib
 import copy
 import time
+import uuid

 from oslo_log import log as logging
 from oslo_serialization import jsonutils
@@ -674,6 +675,19 @@ class MediumFakeDriver(FakeDriver):
     local_gb = 1028


+class PredictableNodeUUIDDriver(SmallFakeDriver):
+    """SmallFakeDriver variant that reports a predictable node uuid in
+    get_available_resource, like IronicDriver.
+    """
+    def get_available_resource(self, nodename):
+        resources = super(
+            PredictableNodeUUIDDriver, self).get_available_resource(nodename)
+        # This is used in ComputeNode.update_from_virt_driver which is called
+        # from the ResourceTracker when creating a ComputeNode.
+        resources['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, nodename)
+        return resources
+
+
 class FakeRescheduleDriver(FakeDriver):
     """FakeDriver derivative that triggers a reschedule on the first spawn
     attempt. This is expected to only be used in tests that have more than
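
Note: PredictableNodeUUIDDriver works because uuid.uuid5 is a pure function of its namespace and name, so the same nodename always maps to the same node uuid across service restarts, which is exactly what re-triggers the duplicate-uuid path this commit fixes. For example:

import uuid

# uuid5 is deterministic: the same namespace + name always yields the same
# UUID, unlike uuid4 which is random on every call.
first = uuid.uuid5(uuid.NAMESPACE_DNS, 'node2')
second = uuid.uuid5(uuid.NAMESPACE_DNS, 'node2')
assert first == second
print(first)   # the same value on every run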
