Commit 3c022e9

Ironic nodes with instance reserved in placement
Currently, when you delete an ironic instance, we trigger an undeploy in ironic and release our allocation in placement. We do this well before the ironic node is actually available. We have attempted to fix this by marking unavailable nodes as reserved in placement. This works great until you try to re-image lots of nodes: it turns out ironic nodes that are waiting for their automatic clean to finish are returned as valid allocation candidates for quite some time, and only eventually do we mark them as reserved.

This patch takes a strange approach: if we mark all nodes as reserved as soon as an instance lands on them, we close the race. That is, when the allocation is removed the node is still unavailable until the next update of placement notices that the node has become available, which may or may not be after automatic cleaning. The trade-off is that when you don't have automatic cleaning, we wait a bit longer to notice the node is available again.

Note, this is also useful when a broken Ironic node is marked as in-maintenance while it is in use by a nova instance. In a similar way, we mark the node as reserved immediately, rather than first waiting for the instance to be deleted before reserving the resources in placement.

Closes-Bug: #1974070
Change-Id: Iab92124b5776a799c7f90d07281d28fcf191c8fe
1 parent 2eb358c commit 3c022e9
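
To make the approach concrete, here is a minimal illustrative sketch of the decision this patch makes when reporting an Ironic node to placement. It is not the driver code itself (see nova/virt/ironic/driver.py below); node_in_use and node_unavailable are hypothetical stand-ins for the driver's _node_resources_used and _node_resources_unavailable checks.

    def should_reserve(node_in_use, node_unavailable, skip_reserve_in_use):
        # Unavailable nodes (in maintenance, still cleaning, etc.) are always
        # reported as reserved.
        reserved = node_unavailable
        # Nodes that currently host an instance are also reported as
        # reserved, unless the operator opts out via the
        # [workarounds]skip_reserve_in_use_ironic_nodes option.
        if node_in_use and not skip_reserve_in_use:
            reserved = True
        return reserved

Because the node keeps being reported as reserved until a later resource update sees it as available again, placement stops offering nodes that are still undeploying or cleaning as allocation candidates.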

4 files changed (+89, -11 lines)


nova/conf/workarounds.py

Lines changed: 15 additions & 0 deletions
@@ -416,6 +416,21 @@
         help="""
 When this is enabled, it will skip version-checking of hypervisors
 during live migration.
+"""),
+    cfg.BoolOpt(
+        'skip_reserve_in_use_ironic_nodes',
+        default=False,
+        help="""
+This may be useful if you use the Ironic driver, but don't have
+automatic cleaning enabled in Ironic. Nova, by default, will mark
+Ironic nodes as reserved as soon as they are in use. When you free
+the Ironic node (by deleting the nova instance) it takes a while
+for Nova to un-reserve that Ironic node in placement. Usually this
+is a good idea, because it avoids placement providing an Ironic
+node as a valid candidate when it is still being cleaned.
+However, if you don't use automatic cleaning, it can cause an
+extra delay before an Ironic node is available for building a
+new Nova instance.
 """),
 ]
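
For operators who rely on manual cleaning and prefer the old behaviour, the option above can be turned on in nova.conf. A sketch of the relevant stanza, assuming the usual configuration file layout:

    [workarounds]
    skip_reserve_in_use_ironic_nodes = true

With this set, in-use nodes are not reported as reserved, so freed nodes become schedulable sooner, at the cost of re-opening the race described in the commit message when automatic cleaning is enabled.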

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 45 additions & 3 deletions
@@ -932,6 +932,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,

         self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)

+        expected = {
+            'CUSTOM_IRON_NFV': {
+                'total': 1,
+                'reserved': 1,
+                'min_unit': 1,
+                'max_unit': 1,
+                'step_size': 1,
+                'allocation_ratio': 1.0,
+            },
+        }
+        mock_nfc.assert_called_once_with(mock.sentinel.nodename)
+        mock_nr.assert_called_once_with(mock_nfc.return_value)
+        mock_res_used.assert_called_once_with(mock_nfc.return_value)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
+        result = self.ptree.data(mock.sentinel.nodename).inventory
+        self.assertEqual(expected, result)
+
+    @mock.patch.object(ironic_driver.IronicDriver,
+                       '_node_resources_used', return_value=True)
+    @mock.patch.object(ironic_driver.IronicDriver,
+                       '_node_resources_unavailable', return_value=False)
+    @mock.patch.object(ironic_driver.IronicDriver, '_node_resource')
+    @mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache')
+    def test_update_provider_tree_with_rc_occupied_workaround(self,
+            mock_nfc, mock_nr, mock_res_unavail, mock_res_used):
+        """Ensure that when a node is used, we report the inventory matching
+        the consumed resources.
+        """
+        self.flags(skip_reserve_in_use_ironic_nodes=True,
+                   group="workarounds")
+        mock_nr.return_value = {
+            'vcpus': 24,
+            'vcpus_used': 24,
+            'memory_mb': 1024,
+            'memory_mb_used': 1024,
+            'local_gb': 100,
+            'local_gb_used': 100,
+            'resource_class': 'iron-nfv',
+        }
+
+        self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
+
         expected = {
             'CUSTOM_IRON_NFV': {
                 'total': 1,
@@ -945,7 +987,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).inventory
         self.assertEqual(expected, result)

@@ -1016,7 +1058,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).traits
         self.assertEqual(set(), result)

@@ -1048,7 +1090,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr,
         mock_nfc.assert_called_once_with(mock.sentinel.nodename)
         mock_nr.assert_called_once_with(mock_nfc.return_value)
         mock_res_used.assert_called_once_with(mock_nfc.return_value)
-        self.assertFalse(mock_res_unavail.called)
+        mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
         result = self.ptree.data(mock.sentinel.nodename).traits
         self.assertEqual(set(traits), result)

nova/virt/ironic/driver.py

Lines changed: 18 additions & 8 deletions
@@ -874,15 +874,25 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
         """
         # nodename is the ironic node's UUID.
         node = self._node_from_cache(nodename)
+
         reserved = False
-        if (not self._node_resources_used(node) and
-                self._node_resources_unavailable(node)):
-            LOG.debug('Node %(node)s is not ready for a deployment, '
-                      'reporting resources as reserved for it. Node\'s '
-                      'provision state is %(prov)s, power state is '
-                      '%(power)s and maintenance is %(maint)s.',
-                      {'node': node.uuid, 'prov': node.provision_state,
-                       'power': node.power_state, 'maint': node.maintenance})
+        if self._node_resources_unavailable(node):
+            # Operators might mark a node as in maintenance,
+            # even when an instance is on the node;
+            # either way, mark it as reserved.
+            reserved = True
+
+        if (self._node_resources_used(node) and
+                not CONF.workarounds.skip_reserve_in_use_ironic_nodes):
+            # Mark resources as reserved once we have
+            # an instance here.
+            # When the allocation is deleted, most likely
+            # automatic clean will start, so we keep the node
+            # reserved until it becomes available again.
+            # In the case without automatic clean, once
+            # the allocation is removed in placement it
+            # also stays reserved until we notice on
+            # the next periodic that it's actually available.
             reserved = True

         info = self._node_resource(node)
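
For context on what the reserved flag ultimately does, here is a rough sketch of the inventory a fully reserved Ironic node ends up reporting to placement. The values mirror the expected dictionary in the unit test above; the code that actually builds the inventory lives elsewhere in the driver and is not part of this hunk.

    # With reserved equal to total, placement sees no usable capacity on the
    # node and will not return it as an allocation candidate.
    inventory = {
        'CUSTOM_IRON_NFV': {
            'total': 1,
            'reserved': 1,
            'min_unit': 1,
            'max_unit': 1,
            'step_size': 1,
            'allocation_ratio': 1.0,
        },
    }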

releasenotes/notes/… (new release note)

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixed a bug where placement could return ironic nodes that have just
+    started automatic cleaning as valid candidates. This is done by marking
+    all ironic nodes with an instance on them as reserved, such that nova
+    only makes them available once we have double checked that Ironic
+    reports the node as available. If you don't have automatic cleaning on,
+    it might take longer than normal for Ironic nodes to become available
+    for new instances. If you want the old behaviour, use the workaround
+    config `[workarounds]skip_reserve_in_use_ironic_nodes=true`.
