Commit 55a6149

Merge pull request #4 from stackhpc/fix-ironic-rebalance-race-rocky
Fix ironic rebalance race rocky
2 parents: 32e73f5 + 6a9790b

Note: large commits have some content hidden by default, so only a subset of the 45 changed files is shown below.

45 files changed: +419 additions, -230 deletions

.travis.yml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+---
+language: python
+python: "2.7"
+
+# Run jobs in VMs - sudo is required by ansible tests.
+sudo: required
+
+# Install ansible
+addons:
+  apt:
+    packages:
+      - gcc
+      - python-apt
+      - python-virtualenv
+      - realpath
+
+# Create a build matrix for the different test jobs.
+env:
+  matrix:
+    # Run python style checks.
+    - TOX_ENV=pep8
+    # Build documentation.
+    - TOX_ENV=docs
+    # Run python2.7 unit tests.
+    - TOX_ENV=py27
+    # Run functional tests.
+    - TOX_ENV=functional
+
+install:
+  # Install tox in a virtualenv to ensure we have an up to date version.
+  - pip install -U pip
+  - pip install tox
+
+script:
+  # Run the tox environment.
+  - tox -e ${TOX_ENV}
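
The same tox environments can be exercised locally. A minimal sketch (assuming tox and the apt packages listed above are installed; this is not part of the change itself):

    # Run the four Travis jobs sequentially on a developer machine.
    import subprocess

    for tox_env in ("pep8", "docs", "py27", "functional"):
        # Each CI job runs exactly one environment via TOX_ENV.
        subprocess.check_call(["tox", "-e", tox_env])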

doc/api_samples/os-hypervisors/v2.53/hypervisors-list-resp.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "hypervisors": [
         {
-            "hypervisor_hostname": "fake-mini",
+            "hypervisor_hostname": "host2",
             "id": "1bb62a04-c576-402c-8147-9e89757a09e3",
             "state": "up",
             "status": "enabled"

nova/compute/manager.py

Lines changed: 25 additions & 8 deletions
@@ -8033,13 +8033,16 @@ def update_available_resource(self, context, startup=False):
         compute_nodes_in_db = self._get_compute_nodes_in_db(context,
                                                             use_slave=True,
                                                             startup=startup)
+
+        rt = self._get_resource_tracker()
+        rt.clean_compute_node_cache(compute_nodes_in_db)
+
         try:
             nodenames = set(self.driver.get_available_nodes())
         except exception.VirtDriverNotReady:
             LOG.warning("Virt driver is not ready.")
             return
 
-        rt = self._get_resource_tracker()
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
@@ -8048,13 +8051,27 @@ def update_available_resource(self, context, startup=False):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations and inventory.
-                # TODO(cdent): Move use of reportclient into resource tracker.
-                self.scheduler_client.reportclient.delete_resource_provider(
-                    context, cn, cascade=True)
+                try:
+                    cn.destroy()
+                except exception.ComputeHostNotFound:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    rt.remove_node(cn.hypervisor_hostname)
+                else:
+                    rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    # TODO(cdent): Move use of reportclient into resource
+                    # tracker.
+                    reportclient = self.scheduler_client.reportclient
+                    reportclient.delete_resource_provider(
+                        context, cn, cascade=True)
 
         for nodename in nodenames:
             self._update_available_resource_for_node(context, nodename)
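
Conceptually, this change makes the orphan-node cleanup tolerant of a concurrent ironic node rebalance. A minimal standalone sketch of the pattern (the helper function below is illustrative, not the real ComputeManager method):

    import logging

    from nova import exception

    LOG = logging.getLogger(__name__)


    def delete_orphan_compute_node(context, cn, rt, reportclient):
        """Sketch: cn is a ComputeNode record, rt the resource tracker and
        reportclient the placement report client, as in the real code."""
        try:
            # With the host-filtered delete added in the DB layer below, this
            # raises ComputeHostNotFound if another host now owns the node.
            cn.destroy()
        except exception.ComputeHostNotFound:
            # The node was rebalanced to another compute service; drop it
            # from the local cache but leave its placement record alone.
            LOG.info("Ignoring failure to delete orphan compute node %s", cn.id)
            rt.remove_node(cn.hypervisor_hostname)
        else:
            # We really owned the node: clean the cache and delete the
            # resource provider (and its allocations) from placement.
            rt.remove_node(cn.hypervisor_hostname)
            reportclient.delete_resource_provider(context, cn, cascade=True)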

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
@@ -1547,3 +1547,20 @@ def build_failed(self, nodename):
     def build_succeeded(self, nodename):
         """Resets the failed_builds stats for the given node."""
         self.stats[nodename].build_succeeded()
+
+    @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
+    def clean_compute_node_cache(self, compute_nodes_in_db):
+        """Clean the compute node cache of any nodes that no longer exist.
+
+        :param compute_nodes_in_db: list of ComputeNode objects from the DB.
+        """
+        compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
+                                         for cn in compute_nodes_in_db}
+        stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
+
+        for stale_cn in stale_cns:
+            # NOTE(mgoddard): we have found a node in the cache that has no
+            # compute node in the DB. This could be due to a node rebalance
+            # where another compute service took ownership of the node. Clean
+            # up the cache.
+            self.remove_node(stale_cn)
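
The cache cleaning boils down to a set difference between the locally tracked node names and the nodes the DB still reports for this host. A small illustration with plain data structures (names are made up, not the real ResourceTracker state):

    # 'node2' is still in the local cache but no longer in the DB for this
    # host (e.g. it was rebalanced away), so it gets evicted.
    compute_nodes = {"node1": object(), "node2": object()}  # stand-in for rt.compute_nodes
    db_nodenames = {"node1"}  # hypervisor_hostname of the remaining DB rows

    for stale in set(compute_nodes) - db_nodenames:
        compute_nodes.pop(stale)

    assert set(compute_nodes) == {"node1"}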

nova/db/api.py

Lines changed: 3 additions & 2 deletions
@@ -330,15 +330,16 @@ def compute_node_update(context, compute_id, values):
     return IMPL.compute_node_update(context, compute_id, values)
 
 
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a compute node from the database.
 
     :param context: The security context
     :param compute_id: ID of the compute node
+    :param compute_host: Hostname of the compute service
 
     Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
     """
-    return IMPL.compute_node_delete(context, compute_id)
+    return IMPL.compute_node_delete(context, compute_id, compute_host)
 
 
 def compute_node_statistics(context):

nova/db/sqlalchemy/api.py

Lines changed: 17 additions & 4 deletions
@@ -685,16 +685,29 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):
 
 
 @pick_context_manager_writer
-def compute_node_create(context, values):
+def _compute_node_create(context, values):
     """Creates a new ComputeNode and populates the capacity fields
     with the most recent data.
     """
     convert_objects_related_datetimes(values)
 
     compute_node_ref = models.ComputeNode()
     compute_node_ref.update(values)
+    compute_node_ref.save(context.session)
+    return compute_node_ref
+
+
+# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
+# so that we get a separate transaction in the exception handler. This avoids
+# an error message about inactive DB sessions during a transaction rollback.
+# See https://bugs.launchpad.net/nova/+bug/1853159.
+def compute_node_create(context, values):
+    """Creates a new ComputeNode and populates the capacity fields
+    with the most recent data. Will restore a soft deleted compute node if a
+    UUID has been explicitly requested.
+    """
     try:
-        compute_node_ref.save(context.session)
+        compute_node_ref = _compute_node_create(context, values)
     except db_exc.DBDuplicateEntry:
         with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
             # Check to see if we have a (soft) deleted ComputeNode with the
@@ -758,10 +771,10 @@ def compute_node_update(context, compute_id, values):
 
 
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a ComputeNode record."""
     result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
+        filter_by(id=compute_id, host=compute_host).\
        soft_delete(synchronize_session=False)
 
     if not result:
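
The extra host filter means a compute service can only soft-delete compute node rows it still owns; if the node was rebalanced to another host since it was read, the delete matches nothing and ComputeHostNotFound is raised. A hedged caller-side sketch (the ID and host names are illustrative, and a configured database/context is assumed):

    from nova import context as nova_context
    from nova import exception
    from nova.db import api as db_api

    ctxt = nova_context.get_admin_context()
    try:
        # host1 read this node earlier, but a rebalance moved it to host2,
        # so the row no longer matches host='host1'.
        db_api.compute_node_delete(ctxt, compute_id=42, compute_host='host1')
    except exception.ComputeHostNotFound:
        # Expected: the node now belongs to another compute service, so
        # host1 must not delete it (or its placement resource provider).
        pass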

nova/objects/compute_node.py

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ def save(self, prune_stats=False):
 
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        db.compute_node_delete(self._context, self.id, self.host)
 
     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that

nova/test.py

Lines changed: 51 additions & 15 deletions
@@ -419,30 +419,42 @@ def flags(self, **kw):
 
     def start_service(self, name, host=None, **kwargs):
         cell = None
+        # if the host is None then the CONF.host remains defaulted to
+        # 'fake-mini' (originally done in ConfFixture)
+        if host is not None:
+            # Make sure that CONF.host is relevant to the right hostname
+            self.useFixture(nova_fixtures.ConfPatcher(host=host))
+
         if name == 'compute' and self.USES_DB:
             # NOTE(danms): We need to create the HostMapping first, because
             # otherwise we'll fail to update the scheduler while running
             # the compute node startup routines below.
             ctxt = context.get_context()
             cell_name = kwargs.pop('cell', CELL1_NAME) or CELL1_NAME
             cell = self.cell_mappings[cell_name]
-            hm = objects.HostMapping(context=ctxt,
-                                     host=host or name,
-                                     cell_mapping=cell)
-            hm.create()
-            self.host_mappings[hm.host] = hm
-            if host is not None:
-                # Make sure that CONF.host is relevant to the right hostname
-                self.useFixture(nova_fixtures.ConfPatcher(host=host))
+            if (host or name) not in self.host_mappings:
+                # NOTE(gibi): If the HostMapping does not exists then this is
+                # the first start of the service so we create the mapping.
+                hm = objects.HostMapping(context=ctxt,
+                                         host=host or name,
+                                         cell_mapping=cell)
+                hm.create()
+                self.host_mappings[hm.host] = hm
         svc = self.useFixture(
             nova_fixtures.ServiceFixture(name, host, cell=cell, **kwargs))
 
         return svc.service
 
-    def restart_compute_service(self, compute):
-        """Restart a compute service in a realistic way.
+    def restart_compute_service(self, compute, keep_hypervisor_state=True):
+        """Stops the service and starts a new one to have realistic restart
 
         :param:compute: the nova-compute service to be restarted
+        :param:keep_hypervisor_state: If true then already defined instances
+                                      will survive the compute service restart.
+                                      If false then the new service will see
+                                      an empty hypervisor
+        :returns: a new compute service instance serving the same host and
+                  and node
         """
 
         # NOTE(gibi): The service interface cannot be used to simulate a real
@@ -452,12 +464,36 @@ def restart_compute_service(self, compute):
         # a stop start. The service.kill() call cannot help as it deletes
         # the service from the DB which is unrealistic and causes that some
         # operation that refers to the killed host (e.g. evacuate) fails.
-        # So this helper method tries to simulate a better compute service
-        # restart by cleaning up some of the internal state of the compute
-        # manager.
+        # So this helper method will stop the original service and then starts
+        # a brand new compute service for the same host and node. This way
+        # a new ComputeManager instance will be created and initialized during
+        # the service startup.
         compute.stop()
-        compute.manager._resource_tracker = None
-        compute.start()
+
+        # this service was running previously so we have to make sure that
+        # we restart it in the same cell
+        cell_name = self.host_mappings[compute.host].cell_mapping.name
+
+        if keep_hypervisor_state:
+            # NOTE(gibi): FakeDriver does not provide a meaningful way to
+            # define some servers that exists already on the hypervisor when
+            # the driver is (re)created during the service startup. This means
+            # that we cannot simulate that the definition of a server
+            # survives a nova-compute service restart on the hypervisor.
+            # Instead here we save the FakeDriver instance that knows about
+            # the defined servers and inject that driver into the new Manager
+            # class during the startup of the compute service.
+            old_driver = compute.manager.driver
+            with mock.patch(
+                    'nova.virt.driver.load_compute_driver') as load_driver:
+                load_driver.return_value = old_driver
+                new_compute = self.start_service(
+                    'compute', host=compute.host, cell=cell_name)
+        else:
+            new_compute = self.start_service(
+                'compute', host=compute.host, cell=cell_name)
+
+        return new_compute
 
     def assertJsonEqual(self, expected, observed, message=''):
         """Asserts that 2 complex data structures are json equivalent.

nova/tests/functional/api_sample_tests/api_samples/os-hypervisors/v2.53/hypervisors-list-resp.json.tpl

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "hypervisors": [
         {
-            "hypervisor_hostname": "fake-mini",
+            "hypervisor_hostname": "host2",
             "id": "%(hypervisor_id)s",
             "state": "up",
             "status": "enabled"

nova/tests/functional/api_sample_tests/test_hypervisors.py

Lines changed: 0 additions & 5 deletions
@@ -18,7 +18,6 @@
 from nova.cells import utils as cells_utils
 from nova import objects
 from nova.tests.functional.api_sample_tests import api_sample_base
-from nova.virt import fake
 
 
 class HypervisorsSampleJsonTests(api_sample_base.ApiSampleTestBaseV21):
@@ -157,8 +156,6 @@ def setUp(self):
         # Start a new compute service to fake a record with hypervisor id=2
         # for pagination test.
         host = 'host1'
-        fake.set_nodes([host])
-        self.addCleanup(fake.restore_nodes)
         self.start_service('compute', host=host)
 
     def test_hypervisors_list(self):
@@ -205,8 +202,6 @@ def test_hypervisors_list(self):
     def test_hypervisors_detail(self):
         # Start another compute service to get a 2nd compute for paging tests.
         host = 'host2'
-        fake.set_nodes([host])
-        self.addCleanup(fake.restore_nodes)
         service_2 = self.start_service('compute', host=host).service_ref
         compute_node_2 = service_2.compute_node
         marker = self.compute_node_1.uuid

nova/tests/functional/compute/test_live_migration.py

Lines changed: 0 additions & 3 deletions
@@ -20,7 +20,6 @@
 from nova.tests.functional import integrated_helpers
 from nova.tests.unit import fake_notifier
 from nova.tests import uuidsentinel as uuids
-from nova.virt import fake
 
 
 class FakeCinderError(object):
@@ -53,8 +52,6 @@ def setUp(self):
         # Start a second compte node (the first one was started for us by
         # _IntegratedTestBase. set_nodes() is needed to avoid duplicate
         # nodenames. See comments in test_bug_1702454.py.
-        fake.set_nodes(['host2'])
-        self.addCleanup(fake.restore_nodes)
         self.compute2 = self.start_service('compute', host='host2')
 
         # To get the old Cinder flow we need to hack the service version, otherwise

nova/tests/functional/integrated_helpers.py

Lines changed: 0 additions & 3 deletions
@@ -35,7 +35,6 @@
 import nova.tests.unit.image.fake
 from nova.tests.unit import policy_fixture
 from nova.tests import uuidsentinel as uuids
-from nova.virt import fake
 
 
 CONF = nova.conf.CONF
@@ -396,8 +395,6 @@ def _start_compute(self, host, cell_name=None):
                          compute service (defaults to cell1)
         :return: the nova compute service object
         """
-        fake.set_nodes([host])
-        self.addCleanup(fake.restore_nodes)
         compute = self.start_service('compute', host=host, cell=cell_name)
         self.computes[host] = compute
         return compute

nova/tests/functional/notification_sample_tests/test_instance.py

Lines changed: 0 additions & 3 deletions
@@ -22,7 +22,6 @@
 from nova.tests.functional.notification_sample_tests \
     import notification_sample_base
 from nova.tests.unit import fake_notifier
-from nova.virt import fake
 
 COMPUTE_VERSION_OLD_ATTACH_FLOW = \
     compute_api.CINDER_V3_ATTACH_MIN_COMPUTE_VERSION - 1
@@ -48,8 +47,6 @@ def test_multiple_compute_actions(self):
         self._wait_for_notification('instance.create.end')
         self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
         # server will boot on the 'compute' host
-        fake.set_nodes(['host2'])
-        self.addCleanup(fake.restore_nodes)
         self.compute2 = self.start_service('compute', host='host2')
 
         actions = [

nova/tests/functional/regressions/test_bug_1669054.py

Lines changed: 0 additions & 3 deletions
@@ -13,7 +13,6 @@
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
-from nova.virt import fake
 
 
 class ResizeEvacuateTestCase(integrated_helpers._IntegratedTestBase,
@@ -47,8 +46,6 @@ def test_resize_then_evacuate(self):
         self._wait_for_state_change(self.api, server, 'ACTIVE')
 
         # Start up another compute service so we can resize.
-        fake.set_nodes(['host2'])
-        self.addCleanup(fake.restore_nodes)
         host2 = self.start_service('compute', host='host2')
 
         # Now resize the server to move it to host2.
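
All of the functional test changes above follow the same pattern: with start_service() now patching CONF.host (see the nova/test.py change), the fake virt driver can report a node name matching the requested host, so the fake.set_nodes()/restore_nodes() dance is no longer needed. A hedged before/after fragment from inside such a test:

    # Before: tests had to pin the fake driver's node name and undo it later.
    #     fake.set_nodes(['host2'])
    #     self.addCleanup(fake.restore_nodes)
    #     self.compute2 = self.start_service('compute', host='host2')

    # After: passing the host is enough; no explicit node setup or cleanup.
    self.compute2 = self.start_service('compute', host='host2')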
