
Commit c77acb4

JohnGarbutt authored and jovial committed
Ironic nodes with instance reserved in placement
Currently, when you delete an ironic instance, we trigger an undeploy in Ironic and release our allocation in placement. We do this well before the ironic node is actually available. We have attempted to fix this by marking unavailable nodes as reserved in placement. This works great until you try to re-image lots of nodes: it turns out that ironic nodes which are waiting for their automatic clean to finish are returned as valid allocation candidates for quite some time. Only eventually do we mark them as reserved.

This patch takes a strange approach: if we mark all nodes as reserved as soon as the instance lands, we close the race. That is, when the allocation is removed, the node stays unavailable until the next placement update notices that the node has become available, which may or may not have been after automatic cleaning. The trade-off is that when you don't have automatic cleaning, we wait a bit longer to notice the node is available again.

Note that this is also useful when a broken Ironic node is marked as in maintenance while it is in use by a nova instance. In a similar way, we mark the node as reserved immediately, rather than first waiting for the instance to be deleted before reserving the resources in Placement.

Closes-Bug: #1974070
Change-Id: Iab92124b5776a799c7f90d07281d28fcf191c8fe
(cherry picked from commit 3c022e9)
(cherry picked from commit c9de185)
(cherry picked from commit 9e486282eaf32443604d34a169272731d3ca9463)
1 parent 612225d

File tree

4 files changed: +89 -11 lines changed

nova/conf/workarounds.py

Lines changed: 15 additions & 0 deletions
@@ -416,6 +416,21 @@
 help="""
 When this is enabled, it will skip version-checking of hypervisors
 during live migration.
+"""),
+    cfg.BoolOpt(
+        'skip_reserve_in_use_ironic_nodes',
+        default=False,
+        help="""
+This may be useful if you use the Ironic driver, but don't have
+automatic cleaning enabled in Ironic. Nova, by default, will mark
+Ironic nodes as reserved as soon as they are in use. When you free
+the Ironic node (by deleting the nova instance) it takes a while
+for Nova to un-reserve that Ironic node in placement. Usually this
+is a good idea, because it avoids placement providing an Ironic
+node as a valid candidate when it is still being cleaned.
+However, if you don't use automatic cleaning, it can cause an
+extra delay before an Ironic node is available for building a
+new Nova instance.
 """),
 ]
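For reference, here is a minimal, self-contained sketch (assuming only oslo.config is available; this is not Nova code) of how a boolean workaround option like the one added above is registered and then consulted:

```python
from oslo_config import cfg

CONF = cfg.CONF

# Register a [workarounds] group and the new boolean option, mirroring
# the BoolOpt added in nova/conf/workarounds.py (default stays False).
CONF.register_group(cfg.OptGroup(name='workarounds'))
CONF.register_opt(
    cfg.BoolOpt('skip_reserve_in_use_ironic_nodes',
                default=False,
                help='Do not reserve in-use Ironic nodes in placement.'),
    group='workarounds')

# Consumers read it through the group namespace, as the driver change does.
if not CONF.workarounds.skip_reserve_in_use_ironic_nodes:
    print('in-use Ironic nodes will be reported as reserved')
```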

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 45 additions & 3 deletions
@@ -931,6 +931,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,

         self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)

+        expected = {
+            'CUSTOM_IRON_NFV': {
+                'total': 1,
+                'reserved': 1,
+                'min_unit': 1,
+                'max_unit': 1,
+                'step_size': 1,
+                'allocation_ratio': 1.0,
+            },
+        }
+        mock_nfc.assert_called_once_with(mock.sentinel.nodename)
+        mock_nr.assert_called_once_with(mock_nfc.return_value)
+        mock_res_used.assert_called_once_with(mock_nfc.return_value)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
+        result = self.ptree.data(mock.sentinel.nodename).inventory
+        self.assertEqual(expected, result)
+
+    @mock.patch.object(ironic_driver.IronicDriver,
+                       '_node_resources_used', return_value=True)
+    @mock.patch.object(ironic_driver.IronicDriver,
+                       '_node_resources_unavailable', return_value=False)
+    @mock.patch.object(ironic_driver.IronicDriver, '_node_resource')
+    @mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache')
+    def test_update_provider_tree_with_rc_occupied_workaround(self,
+            mock_nfc, mock_nr, mock_res_unavail, mock_res_used):
+        """Ensure that when a node is used, we report the inventory matching
+        the consumed resources.
+        """
+        self.flags(skip_reserve_in_use_ironic_nodes=True,
+                   group="workarounds")
+        mock_nr.return_value = {
+            'vcpus': 24,
+            'vcpus_used': 24,
+            'memory_mb': 1024,
+            'memory_mb_used': 1024,
+            'local_gb': 100,
+            'local_gb_used': 100,
+            'resource_class': 'iron-nfv',
+        }
+
+        self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
+
         expected = {
             'CUSTOM_IRON_NFV': {
                 'total': 1,
@@ -944,7 +986,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).inventory
         self.assertEqual(expected, result)

@@ -1015,7 +1057,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).traits
         self.assertEqual(set(), result)

@@ -1047,7 +1089,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).traits
         self.assertEqual(set(traits), result)

nova/virt/ironic/driver.py

Lines changed: 18 additions & 8 deletions
@@ -886,15 +886,25 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
         """
         # nodename is the ironic node's UUID.
         node = self._node_from_cache(nodename)
+
         reserved = False
-        if (not self._node_resources_used(node) and
-                self._node_resources_unavailable(node)):
-            LOG.debug('Node %(node)s is not ready for a deployment, '
-                      'reporting resources as reserved for it. Node\'s '
-                      'provision state is %(prov)s, power state is '
-                      '%(power)s and maintenance is %(maint)s.',
-                      {'node': node.uuid, 'prov': node.provision_state,
-                       'power': node.power_state, 'maint': node.maintenance})
+        if self._node_resources_unavailable(node):
+            # Operators might mark a node as in maintenance,
+            # even when an instance is on the node;
+            # either way, mark this as reserved.
+            reserved = True
+
+        if (self._node_resources_used(node) and
+                not CONF.workarounds.skip_reserve_in_use_ironic_nodes):
+            # Mark resources as reserved once we have
+            # an instance here.
+            # When the allocation is deleted, most likely
+            # automatic clean will start, so we keep the node
+            # reserved until it becomes available again.
+            # In the case without automatic clean, once
+            # the allocation is removed in placement it
+            # also stays as reserved until we notice on
+            # the next periodic that it is actually available.
             reserved = True

         info = self._node_resource(node)
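To make the effect concrete, here is a small illustrative sketch (a hypothetical helper, not Nova code) of the single-unit inventory record a node ends up with once `reserved` is set, matching the expected inventory in the test above:

```python
def build_ironic_inventory(resource_class, reserved):
    """Hypothetical helper: the inventory an Ironic node reports.

    In Nova this is assembled inside IronicDriver.update_provider_tree();
    this standalone version only illustrates the shape of the record.
    """
    return {
        resource_class: {
            'total': 1,
            # Placement will not offer a provider whose inventory is
            # fully reserved as an allocation candidate.
            'reserved': 1 if reserved else 0,
            'min_unit': 1,
            'max_unit': 1,
            'step_size': 1,
            'allocation_ratio': 1.0,
        },
    }


# An in-use node is now reported as reserved unless the
# [workarounds]skip_reserve_in_use_ironic_nodes option is enabled.
print(build_ironic_inventory('CUSTOM_IRON_NFV', reserved=True))
```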
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixed when placement returns ironic nodes that have just started automatic
+    cleaning as possible valid candidates. This is done by marking all ironic
+    nodes with an instance on them as reserved, such that nova only makes them
+    available once we have double checked Ironic reports the node as available.
+    If you don't have automatic cleaning on, this might mean it takes longer
+    than normal for Ironic nodes to become available for new instances.
+    If you want the old behaviour use the following workaround config:
+    `[workarounds]skip_reserve_in_use_ironic_nodes=true`
