
Commit 69b0d31

Zuul authored and openstack-gerrit committed
Merge "Add a WA flag waiting for vif-plugged event during reboot"
2 parents 7aa3a0f + 68c970e

File tree: 5 files changed, +148 −3 lines

.zuul.yaml

Lines changed: 6 additions & 0 deletions

```diff
@@ -245,6 +245,12 @@
         # reduce the number of placement calls in steady state. Added in
         # Stein.
         resource_provider_association_refresh: 0
+      workarounds:
+        # This workaround is an improvement on hard reboot that cannot be
+        # turned on unconditionally. But we know that ml2/ovs sends plug
+        # time events so we can enable this in this ovs job for vnic_type
+        # normal.
+        wait_for_vif_plugged_event_during_hard_reboot: normal
     $NOVA_CONF:
       quota:
         # Added in Train.
```
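Outside of CI, the same workaround is set in the ``[workarounds]`` section of ``nova.conf`` on the compute host. A minimal sketch (the option takes a comma separated list of vnic types; ``normal`` is shown because it is the value known to be safe with ml2/ovs):

```ini
[workarounds]
# Wait for network-vif-plugged events during hard reboot, but only for
# ports with vnic_type "normal".
wait_for_vif_plugged_event_during_hard_reboot = normal
```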

nova/conf/workarounds.py

Lines changed: 59 additions & 0 deletions

```diff
@@ -299,6 +299,65 @@
 patched however in some cases this is not possible. This workaround allows the
 emulation of an apic to be disabled per host however it is not recommended to
 use outside of a CI or developer cloud.
+"""),
+    cfg.ListOpt('wait_for_vif_plugged_event_during_hard_reboot',
+        item_type=cfg.types.String(
+            choices=[
+                "normal",
+                "direct",
+                "macvtap",
+                "baremetal",
+                "direct-physical",
+                "virtio-forwarder",
+                "smart-nic",
+                "vdpa",
+                "accelerator-direct",
+                "accelerator-direct-physical",
+            ]),
+        default=[],
+        help="""
+The libvirt virt driver implements power on and hard reboot by tearing down
+every vif of the instance being rebooted and then plugging them again. By
+default nova does not wait for the network-vif-plugged event from neutron
+before it lets the instance run. This can cause the instance to request its
+IP via DHCP before the neutron backend has a chance to set up the networking
+backend after the vif plug.
+
+This flag defines which vifs nova expects network-vif-plugged events from
+during hard reboot. The possible values are neutron port vnic types:
+
+* normal
+* direct
+* macvtap
+* baremetal
+* direct-physical
+* virtio-forwarder
+* smart-nic
+* vdpa
+* accelerator-direct
+* accelerator-direct-physical
+
+Adding a ``vnic_type`` to this configuration makes Nova wait for a
+network-vif-plugged event for each of the instance's vifs having the specific
+``vnic_type`` before unpausing the instance, similarly to how new instance
+creation works.
+
+Please note that not all neutron networking backends send plug time events
+for every ``vnic_type``; therefore this config is empty by default.
+
+The ml2/ovs and the networking-odl backends are known to send plug time events
+for ports with ``normal`` ``vnic_type`` so it is safe to add ``normal`` to this
+config if you are using only those backends in the compute host.
+
+The neutron in-tree SRIOV backend does not reliably send the
+network-vif-plugged event during plug time for ports with ``direct``
+vnic_type and never sends that event for ports with ``direct-physical``
+vnic_type during plug time. For other ``vnic_type`` and backend pairs, please
+consult the developers of the backend.
+
+Related options:
+
+* :oslo.config:option:`DEFAULT.vif_plugging_timeout`
 """),
 ]
```
nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 42 additions & 1 deletion

```diff
@@ -16221,7 +16221,48 @@ def test_hard_reboot(self, mock_get_mdev, mock_destroy, mock_get_disk_info,
             accel_info=accel_info)
         mock_create_guest_with_network.assert_called_once_with(self.context,
             dummyxml, instance, network_info, block_device_info,
-            vifs_already_plugged=True)
+            vifs_already_plugged=True, external_events=[])
+
+    @mock.patch('oslo_utils.fileutils.ensure_tree', new=mock.Mock())
+    @mock.patch('nova.virt.libvirt.LibvirtDriver.get_info')
+    @mock.patch('nova.virt.libvirt.LibvirtDriver._create_guest_with_network')
+    @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml')
+    @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy', new=mock.Mock())
+    @mock.patch(
+        'nova.virt.libvirt.LibvirtDriver._get_all_assigned_mediated_devices',
+        new=mock.Mock(return_value={}))
+    def test_hard_reboot_wait_for_plug(
+        self, mock_get_guest_xml, mock_create_guest_with_network, mock_get_info
+    ):
+        self.flags(
+            group="workarounds",
+            wait_for_vif_plugged_event_during_hard_reboot=["normal"])
+        self.context.auth_token = None
+        instance = objects.Instance(**self.test_instance)
+        network_info = _fake_network_info(self, num_networks=4)
+        network_info[0]["vnic_type"] = "normal"
+        network_info[1]["vnic_type"] = "direct"
+        network_info[2]["vnic_type"] = "normal"
+        network_info[3]["vnic_type"] = "direct-physical"
+        block_device_info = None
+        return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN),
+                         hardware.InstanceInfo(state=power_state.RUNNING)]
+        mock_get_info.side_effect = return_values
+        mock_get_guest_xml.return_value = mock.sentinel.xml
+
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        drvr._hard_reboot(
+            self.context, instance, network_info, block_device_info)
+
+        mock_create_guest_with_network.assert_called_once_with(
+            self.context, mock.sentinel.xml, instance, network_info,
+            block_device_info,
+            vifs_already_plugged=False,
+            external_events=[
+                ('network-vif-plugged', uuids.vif1),
+                ('network-vif-plugged', uuids.vif3),
+            ]
+        )
 
     @mock.patch('oslo_utils.fileutils.ensure_tree')
     @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
```
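The assertion expects events only for ``uuids.vif1`` and ``uuids.vif3`` because the first and third fake vifs are the only ones with ``vnic_type`` ``normal``. A standalone sketch of the same selection, assuming plain dicts in place of Nova's network model:

```python
fake_vifs = [
    {'id': 'vif1', 'vnic_type': 'normal'},
    {'id': 'vif2', 'vnic_type': 'direct'},
    {'id': 'vif3', 'vnic_type': 'normal'},
    {'id': 'vif4', 'vnic_type': 'direct-physical'},
]

# Only vifs whose vnic_type is in the configured list produce wait events.
expected = [('network-vif-plugged', vif['id'])
            for vif in fake_vifs if vif['vnic_type'] in ['normal']]
assert expected == [('network-vif-plugged', 'vif1'),
                    ('network-vif-plugged', 'vif3')]
```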

nova/virt/libvirt/driver.py

Lines changed: 23 additions & 2 deletions

```diff
@@ -3818,11 +3818,32 @@ def _hard_reboot(self, context, instance, network_info,
         # on which vif type we're using and we are working with a stale network
         # info cache here, so won't rely on waiting for neutron plug events.
         # vifs_already_plugged=True means "do not wait for neutron plug events"
+        external_events = []
+        vifs_already_plugged = True
+        event_expected_for_vnic_types = (
+            CONF.workarounds.wait_for_vif_plugged_event_during_hard_reboot)
+        if event_expected_for_vnic_types:
+            # NOTE(gibi): We unplugged every vif during destroy above and we
+            # will replug them with _create_guest_with_network. As the
+            # workaround config has some vnic_types configured we expect
+            # vif-plugged events for every vif with those vnic_types.
+            # TODO(gibi): only wait for events if we know that the networking
+            # backend sends plug time events. For that we need to finish
+            # https://bugs.launchpad.net/neutron/+bug/1821058 first in Neutron
+            # then create a driver -> plug-time event mapping in nova.
+            external_events = [
+                ('network-vif-plugged', vif['id'])
+                for vif in network_info
+                if vif['vnic_type'] in event_expected_for_vnic_types
+            ]
+            vifs_already_plugged = False
+
         # NOTE(efried): The instance should already have a vtpm_secret_uuid
         # registered if appropriate.
         self._create_guest_with_network(
             context, xml, instance, network_info, block_device_info,
-            vifs_already_plugged=True)
+            vifs_already_plugged=vifs_already_plugged,
+            external_events=external_events)
 
         def _wait_for_reboot():
             """Called at an interval until the VM is running again."""
@@ -7216,7 +7237,7 @@ def _create_guest_with_network(
         power_on: bool = True,
         vifs_already_plugged: bool = False,
         post_xml_callback: ty.Callable = None,
-        external_events: ty.Optional[ty.List[str]] = None,
+        external_events: ty.Optional[ty.List[ty.Tuple[str, str]]] = None,
         cleanup_instance_dir: bool = False,
         cleanup_instance_disks: bool = False,
     ) -> libvirt_guest.Guest:
```
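A condensed sketch of the control flow added to ``_hard_reboot``, with plain dicts standing in for Nova's VIF objects; note how the empty default list preserves the old "do not wait" behaviour:

```python
def reboot_wait_plan(network_info, wanted_vnic_types):
    """Mirror the (vifs_already_plugged, external_events) decision above."""
    if not wanted_vnic_types:
        # Workaround disabled: keep the pre-existing no-wait behaviour.
        return True, []
    events = [('network-vif-plugged', vif['id'])
              for vif in network_info
              if vif['vnic_type'] in wanted_vnic_types]
    return False, events


vifs = [{'id': 'a', 'vnic_type': 'normal'},
        {'id': 'b', 'vnic_type': 'direct-physical'}]

assert reboot_wait_plan(vifs, []) == (True, [])
assert reboot_wait_plan(vifs, ['normal']) == (
    False, [('network-vif-plugged', 'a')])
```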
Lines changed: 18 additions & 0 deletions

```diff
@@ -0,0 +1,18 @@
+---
+issues:
+  - |
+    The libvirt virt driver in Nova implements power on and hard reboot by
+    destroying the domain first and unplugging the vifs, then recreating the
+    domain and replugging the vifs. However, nova does not wait for the
+    network-vif-plugged event before unpausing the domain. This can cause
+    the domain to start running and requesting its IP via DHCP before the
+    networking backend has finished plugging the vifs. The config option
+    [workarounds]wait_for_vif_plugged_event_during_hard_reboot has been added,
+    defaulting to an empty list, which can be used to ensure that the libvirt
+    driver waits for the network-vif-plugged event for vifs with a specific
+    ``vnic_type`` before it unpauses the domain during hard reboot. This
+    should only be used if the deployment uses a networking backend that sends
+    such an event for the given ``vnic_type`` at vif plug time. The ml2/ovs
+    and the networking-odl Neutron backends are known to send plug time events
+    for ports with ``normal`` ``vnic_type``. For more information see
+    https://bugs.launchpad.net/nova/+bug/1946729
```
