Skip to content

Commit 31adec5

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "libvirt: Rework 'EBUSY' (SIGKILL) error handling code path" into stable/stein
2 parents 73011cf + 118cf0c commit 31adec5

File tree

2 files changed

+67
-13
lines changed

2 files changed

+67
-13
lines changed

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14823,7 +14823,7 @@ def test_private_destroy_processes_refused_to_die_still_raises(self):
1482314823
instance)
1482414824

1482514825
def test_private_destroy_ebusy_timeout(self):
14826-
# Tests that _destroy will retry 3 times to destroy the guest when an
14826+
# Tests that _destroy will retry 6 times to destroy the guest when an
1482714827
# EBUSY is raised, but eventually times out and raises the libvirtError
1482814828
ex = fakelibvirt.make_libvirtError(
1482914829
fakelibvirt.libvirtError,
@@ -14843,7 +14843,7 @@ def test_private_destroy_ebusy_timeout(self):
1484314843
self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
1484414844
instance)
1484514845

14846-
self.assertEqual(3, mock_guest.poweroff.call_count)
14846+
self.assertEqual(6, mock_guest.poweroff.call_count)
1484714847

1484814848
def test_private_destroy_ebusy_multiple_attempt_ok(self):
1484914849
# Tests that the _destroy attempt loop is broken when EBUSY is no
@@ -14869,6 +14869,37 @@ def test_private_destroy_ebusy_multiple_attempt_ok(self):
1486914869

1487014870
self.assertEqual(2, mock_guest.poweroff.call_count)
1487114871

14872+
@mock.patch.object(libvirt_driver.LOG, 'warning')
14873+
@mock.patch.object(fakelibvirt.Connection, 'getLibVersion',
14874+
return_value=versionutils.convert_version_to_int(
14875+
libvirt_driver.MIN_LIBVIRT_BETTER_SIGKILL_HANDLING))
14876+
def test_min_libvirt_better_sigkill_handling_warning(self,
14877+
mock_warning,
14878+
mock_get_libversion):
14879+
ex = fakelibvirt.make_libvirtError(
14880+
fakelibvirt.libvirtError,
14881+
("Failed to terminate process 26425 with SIGKILL: "
14882+
"Device or resource busy"),
14883+
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
14884+
int1=errno.EBUSY)
14885+
14886+
mock_guest = mock.Mock(libvirt_guest.Guest, id=1)
14887+
mock_guest.poweroff = mock.Mock(side_effect=ex)
14888+
14889+
instance = objects.Instance(**self.test_instance)
14890+
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
14891+
14892+
with mock.patch.object(drvr._host, 'get_guest',
14893+
return_value=mock_guest):
14894+
raised = self.assertRaises(fakelibvirt.libvirtError,
14895+
drvr._destroy,
14896+
instance)
14897+
self.assertEqual(fakelibvirt.VIR_ERR_SYSTEM_ERROR,
14898+
raised.get_error_code())
14899+
14900+
mock_warning.assert_called_once()
14901+
mock_guest.poweroff.assert_called_once()
14902+
1487214903
@mock.patch.object(fakelibvirt.libvirtError, 'get_error_code')
1487314904
@mock.patch.object(host.Host, '_get_domain',
1487414905
side_effect=exception.InstanceNotFound(

nova/virt/libvirt/driver.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def repr_method(self):
284284
MIN_LIBVIRT_NATIVE_TLS_VERSION = (4, 4, 0)
285285
MIN_QEMU_NATIVE_TLS_VERSION = (2, 11, 0)
286286

287+
# If the host has this libvirt version, then we skip the retry loop of
288+
# instance destroy() call, as libvirt itself increased the wait time
289+
# before the SIGKILL signal takes effect.
290+
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING = (4, 7, 0)
291+
287292
VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
288293

289294

@@ -953,18 +958,36 @@ def _destroy(self, instance, attempt=1):
953958
# steal time from the cloud host. ie 15 wallclock
954959
# seconds may have passed, but the VM might have only
955960
# have a few seconds of scheduled run time.
956-
LOG.warning('Error from libvirt during destroy. '
957-
'Code=%(errcode)s Error=%(e)s; '
958-
'attempt %(attempt)d of 3',
959-
{'errcode': errcode, 'e': e,
960-
'attempt': attempt},
961-
instance=instance)
961+
#
962+
# TODO(kchamart): Once MIN_LIBVIRT_VERSION
963+
# reaches v4.7.0, (a) rewrite the above note,
964+
# and (b) remove the following code that retries
965+
# _destroy() API call (which gives SIGKILL 30
966+
# seconds to take effect) -- because from v4.7.0
967+
# onwards, libvirt _automatically_ increases the
968+
# timeout to 30 seconds. This was added in the
969+
# following libvirt commits:
970+
#
971+
# - 9a4e4b942 (process: wait longer 5->30s on
972+
# hard shutdown)
973+
#
974+
# - be2ca0444 (process: wait longer on kill
975+
# per assigned Hostdev)
962976
with excutils.save_and_reraise_exception() as ctxt:
963-
# Try up to 3 times before giving up.
964-
if attempt < 3:
965-
ctxt.reraise = False
966-
self._destroy(instance, attempt + 1)
967-
return
977+
if not self._host.has_min_version(
978+
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING):
979+
LOG.warning('Error from libvirt during '
980+
'destroy. Code=%(errcode)s '
981+
'Error=%(e)s; attempt '
982+
'%(attempt)d of 6 ',
983+
{'errcode': errcode, 'e': e,
984+
'attempt': attempt},
985+
instance=instance)
986+
# Try up to 6 times before giving up.
987+
if attempt < 6:
988+
ctxt.reraise = False
989+
self._destroy(instance, attempt + 1)
990+
return
968991

969992
if not is_okay:
970993
with excutils.save_and_reraise_exception():

0 commit comments

Comments
 (0)