Skip to content

Commit e40b516

Browse files
authored
Merge pull request #57 from stackhpc/bugfix/yoga/ironic-race
Ironic nodes with instance reserved in placement
2 parents 3cedfd1 + c77acb4 commit e40b516

File tree

4 files changed

+89
-11
lines changed

4 files changed

+89
-11
lines changed

nova/conf/workarounds.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,21 @@
416416
help="""
417417
When this is enabled, it will skip version-checking of hypervisors
418418
during live migration.
419+
"""),
420+
cfg.BoolOpt(
421+
'skip_reserve_in_use_ironic_nodes',
422+
default=False,
423+
help="""
424+
This may be useful if you use the Ironic driver, but don't have
425+
automatic cleaning enabled in Ironic. Nova, by default, will mark
426+
Ironic nodes as reserved as soon as they are in use. When you free
427+
the Ironic node (by deleting the nova instance) it takes a while
428+
for Nova to un-reserve that Ironic node in placement. Usually this
429+
is a good idea, because it avoids placement providing an Ironic
430+
as a valid candidate when it is still being cleaned.
431+
Howerver, if you don't use automatic cleaning, it can cause an
432+
extra delay before and Ironic node is available for building a
433+
new Nova instance.
419434
"""),
420435
]
421436

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,
931931

932932
self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
933933

934+
expected = {
935+
'CUSTOM_IRON_NFV': {
936+
'total': 1,
937+
'reserved': 1,
938+
'min_unit': 1,
939+
'max_unit': 1,
940+
'step_size': 1,
941+
'allocation_ratio': 1.0,
942+
},
943+
}
944+
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
945+
mock_nr.assert_called_once_with(mock_nfc.return_value)
946+
mock_res_used.assert_called_once_with(mock_nfc.return_value)
947+
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
948+
result = self.ptree.data(mock.sentinel.nodename).inventory
949+
self.assertEqual(expected, result)
950+
951+
@mock.patch.object(ironic_driver.IronicDriver,
952+
'_node_resources_used', return_value=True)
953+
@mock.patch.object(ironic_driver.IronicDriver,
954+
'_node_resources_unavailable', return_value=False)
955+
@mock.patch.object(ironic_driver.IronicDriver, '_node_resource')
956+
@mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache')
957+
def test_update_provider_tree_with_rc_occupied_workaround(self,
958+
mock_nfc, mock_nr, mock_res_unavail, mock_res_used):
959+
"""Ensure that when a node is used, we report the inventory matching
960+
the consumed resources.
961+
"""
962+
self.flags(skip_reserve_in_use_ironic_nodes=True,
963+
group="workarounds")
964+
mock_nr.return_value = {
965+
'vcpus': 24,
966+
'vcpus_used': 24,
967+
'memory_mb': 1024,
968+
'memory_mb_used': 1024,
969+
'local_gb': 100,
970+
'local_gb_used': 100,
971+
'resource_class': 'iron-nfv',
972+
}
973+
974+
self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
975+
934976
expected = {
935977
'CUSTOM_IRON_NFV': {
936978
'total': 1,
@@ -944,7 +986,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,
944986
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
945987
mock_nr.assert_called_once_with(mock_nfc.return_value)
946988
mock_res_used.assert_called_once_with(mock_nfc.return_value)
947-
self.assertFalse(mock_res_unavail.called)
989+
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
948990
result = self.ptree.data(mock.sentinel.nodename).inventory
949991
self.assertEqual(expected, result)
950992

@@ -1015,7 +1057,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr,
10151057
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
10161058
mock_nr.assert_called_once_with(mock_nfc.return_value)
10171059
mock_res_used.assert_called_once_with(mock_nfc.return_value)
1018-
self.assertFalse(mock_res_unavail.called)
1060+
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
10191061
result = self.ptree.data(mock.sentinel.nodename).traits
10201062
self.assertEqual(set(), result)
10211063

@@ -1047,7 +1089,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr,
10471089
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
10481090
mock_nr.assert_called_once_with(mock_nfc.return_value)
10491091
mock_res_used.assert_called_once_with(mock_nfc.return_value)
1050-
self.assertFalse(mock_res_unavail.called)
1092+
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
10511093
result = self.ptree.data(mock.sentinel.nodename).traits
10521094
self.assertEqual(set(traits), result)
10531095

nova/virt/ironic/driver.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -886,15 +886,25 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
886886
"""
887887
# nodename is the ironic node's UUID.
888888
node = self._node_from_cache(nodename)
889+
889890
reserved = False
890-
if (not self._node_resources_used(node) and
891-
self._node_resources_unavailable(node)):
892-
LOG.debug('Node %(node)s is not ready for a deployment, '
893-
'reporting resources as reserved for it. Node\'s '
894-
'provision state is %(prov)s, power state is '
895-
'%(power)s and maintenance is %(maint)s.',
896-
{'node': node.uuid, 'prov': node.provision_state,
897-
'power': node.power_state, 'maint': node.maintenance})
891+
if self._node_resources_unavailable(node):
892+
# Operators might mark a node as in maintainance,
893+
# even when an instance is on the node,
894+
# either way lets mark this as reserved
895+
reserved = True
896+
897+
if (self._node_resources_used(node) and
898+
not CONF.workarounds.skip_reserve_in_use_ironic_nodes):
899+
# Make resources as reserved once we have
900+
# and instance here.
901+
# When the allocation is deleted, most likely
902+
# automatic clean will start, so we keep the node
903+
# reserved until it becomes available again.
904+
# In the case without automatic clean, once
905+
# the allocation is removed in placement it
906+
# also stays as reserved until we notice on
907+
# the next periodic its actually available.
898908
reserved = True
899909

900910
info = self._node_resource(node)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
fixes:
3+
- |
4+
Fixed when placement returns ironic nodes that have just started automatic
5+
cleaning as possible valid candidates. This is done by marking all ironic
6+
nodes with an instance on them as reserved, such that nova only makes them
7+
available once we have double checked Ironic reports the node as available.
8+
If you don't have automatic cleaning on, this might mean it takes longer
9+
than normal for Ironic nodes to become available for new instances.
10+
If you want the old behaviour use the following workaround config:
11+
`[workarounds]skip_reserve_in_use_ironic_nodes=true`

0 commit comments

Comments
 (0)