Skip to content

Commit 118cf0c

Browse files
committed
libvirt: Rework 'EBUSY' (SIGKILL) error handling code path
Change ID I128bf6b939 (libvirt: handle code=38 + sigkill (ebusy) in _destroy()) handled the case where a QEMU process "refuses to die" within a given timeout period set by libvirt. Originally, libvirt sent SIGTERM (allowing the process to clean-up resources), then waited 10 seconds, if the guest didn't go away. Then it sent, the more lethal, SIGKILL and waited another 5 seconds for it to take effect. From libvirt v4.7.0 onwards, libvirt increased[1][2] the time it waits for a guest hard shutdown to complete. It now waits for 30 seconds for SIGKILL to work (instead of 5). Also, additional wait time is added if there are assigned PCI devices, as some of those tend to slow things down. In this change: - Increment the counter to retry the _destroy() call from 3 to 6, thus increasing the total time from 15 to 30 seconds, before SIGKILL takes effect. And it matches the (more graceful) behaviour of libvirt v4.7.0. This also gives breathing room for Nova instances running in environments with large compute nodes with high instance creation or delete churn, where the current timout may not be sufficient. - Retry the _destroy() API call _only_ if MIN_LIBVIRT_VERSION is lower than 4.7.0. [1] https://libvirt.org/git/?p=libvirt.git;a=commitdiff;h=9a4e4b9 (process: wait longer 5->30s on hard shutdown) [2] https://libvirt.org/git/?p=libvirt.git;a=commit;h=be2ca04 ("process: wait longer on kill per assigned Hostdev") Related-bug: #1353939 Change-Id: If2035cac931c42c440d61ba97ebc7e9e92141a28 Signed-off-by: Kashyap Chamarthy <[email protected]> (cherry picked from commit 10d50ca)
1 parent acd2daa commit 118cf0c

File tree

2 files changed

+67
-13
lines changed

2 files changed

+67
-13
lines changed

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14823,7 +14823,7 @@ def test_private_destroy_processes_refused_to_die_still_raises(self):
1482314823
instance)
1482414824

1482514825
def test_private_destroy_ebusy_timeout(self):
14826-
# Tests that _destroy will retry 3 times to destroy the guest when an
14826+
# Tests that _destroy will retry 6 times to destroy the guest when an
1482714827
# EBUSY is raised, but eventually times out and raises the libvirtError
1482814828
ex = fakelibvirt.make_libvirtError(
1482914829
fakelibvirt.libvirtError,
@@ -14843,7 +14843,7 @@ def test_private_destroy_ebusy_timeout(self):
1484314843
self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
1484414844
instance)
1484514845

14846-
self.assertEqual(3, mock_guest.poweroff.call_count)
14846+
self.assertEqual(6, mock_guest.poweroff.call_count)
1484714847

1484814848
def test_private_destroy_ebusy_multiple_attempt_ok(self):
1484914849
# Tests that the _destroy attempt loop is broken when EBUSY is no
@@ -14869,6 +14869,37 @@ def test_private_destroy_ebusy_multiple_attempt_ok(self):
1486914869

1487014870
self.assertEqual(2, mock_guest.poweroff.call_count)
1487114871

14872+
@mock.patch.object(libvirt_driver.LOG, 'warning')
14873+
@mock.patch.object(fakelibvirt.Connection, 'getLibVersion',
14874+
return_value=versionutils.convert_version_to_int(
14875+
libvirt_driver.MIN_LIBVIRT_BETTER_SIGKILL_HANDLING))
14876+
def test_min_libvirt_better_sigkill_handling_warning(self,
14877+
mock_warning,
14878+
mock_get_libversion):
14879+
ex = fakelibvirt.make_libvirtError(
14880+
fakelibvirt.libvirtError,
14881+
("Failed to terminate process 26425 with SIGKILL: "
14882+
"Device or resource busy"),
14883+
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
14884+
int1=errno.EBUSY)
14885+
14886+
mock_guest = mock.Mock(libvirt_guest.Guest, id=1)
14887+
mock_guest.poweroff = mock.Mock(side_effect=ex)
14888+
14889+
instance = objects.Instance(**self.test_instance)
14890+
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
14891+
14892+
with mock.patch.object(drvr._host, 'get_guest',
14893+
return_value=mock_guest):
14894+
raised = self.assertRaises(fakelibvirt.libvirtError,
14895+
drvr._destroy,
14896+
instance)
14897+
self.assertEqual(fakelibvirt.VIR_ERR_SYSTEM_ERROR,
14898+
raised.get_error_code())
14899+
14900+
mock_warning.assert_called_once()
14901+
mock_guest.poweroff.assert_called_once()
14902+
1487214903
@mock.patch.object(fakelibvirt.libvirtError, 'get_error_code')
1487314904
@mock.patch.object(host.Host, '_get_domain',
1487414905
side_effect=exception.InstanceNotFound(

nova/virt/libvirt/driver.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def repr_method(self):
284284
MIN_LIBVIRT_NATIVE_TLS_VERSION = (4, 4, 0)
285285
MIN_QEMU_NATIVE_TLS_VERSION = (2, 11, 0)
286286

287+
# If the host has this libvirt version, then we skip the retry loop of
288+
# instance destroy() call, as libvirt itself increased the wait time
289+
# before the SIGKILL signal takes effect.
290+
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING = (4, 7, 0)
291+
287292
VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
288293

289294

@@ -953,18 +958,36 @@ def _destroy(self, instance, attempt=1):
953958
# steal time from the cloud host. ie 15 wallclock
954959
# seconds may have passed, but the VM might have only
955960
# have a few seconds of scheduled run time.
956-
LOG.warning('Error from libvirt during destroy. '
957-
'Code=%(errcode)s Error=%(e)s; '
958-
'attempt %(attempt)d of 3',
959-
{'errcode': errcode, 'e': e,
960-
'attempt': attempt},
961-
instance=instance)
961+
#
962+
# TODO(kchamart): Once MIN_LIBVIRT_VERSION
963+
# reaches v4.7.0, (a) rewrite the above note,
964+
# and (b) remove the following code that retries
965+
# _destroy() API call (which gives SIGKILL 30
966+
# seconds to take effect) -- because from v4.7.0
967+
# onwards, libvirt _automatically_ increases the
968+
# timeout to 30 seconds. This was added in the
969+
# following libvirt commits:
970+
#
971+
# - 9a4e4b942 (process: wait longer 5->30s on
972+
# hard shutdown)
973+
#
974+
# - be2ca0444 (process: wait longer on kill
975+
# per assigned Hostdev)
962976
with excutils.save_and_reraise_exception() as ctxt:
963-
# Try up to 3 times before giving up.
964-
if attempt < 3:
965-
ctxt.reraise = False
966-
self._destroy(instance, attempt + 1)
967-
return
977+
if not self._host.has_min_version(
978+
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING):
979+
LOG.warning('Error from libvirt during '
980+
'destroy. Code=%(errcode)s '
981+
'Error=%(e)s; attempt '
982+
'%(attempt)d of 6 ',
983+
{'errcode': errcode, 'e': e,
984+
'attempt': attempt},
985+
instance=instance)
986+
# Try up to 6 times before giving up.
987+
if attempt < 6:
988+
ctxt.reraise = False
989+
self._destroy(instance, attempt + 1)
990+
return
968991

969992
if not is_okay:
970993
with excutils.save_and_reraise_exception():

0 commit comments

Comments
 (0)