libvirt: Rework 'EBUSY' (SIGKILL) error handling code path

kashyapc · kashyapc · commit 118cf0c5920b · 2019-06-25T15:51:09.000Z
Change ID I128bf6b939 (libvirt: handle code=38 + sigkill (ebusy) in _destroy()) handled the case where a QEMU process "refuses to die" within a given timeout period set by libvirt. Originally, libvirt sent SIGTERM (allowing the process to clean-up resources), then waited 10 seconds, if the guest didn't go away. Then it sent, the more lethal, SIGKILL and waited another 5 seconds for it to take effect. From libvirt v4.7.0 onwards, libvirt increased[1][2] the time it waits for a guest hard shutdown to complete. It now waits for 30 seconds for SIGKILL to work (instead of 5). Also, additional wait time is added if there are assigned PCI devices, as some of those tend to slow things down. In this change: - Increment the counter to retry the _destroy() call from 3 to 6, thus increasing the total time from 15 to 30 seconds, before SIGKILL takes effect. And it matches the (more graceful) behaviour of libvirt v4.7.0. This also gives breathing room for Nova instances running in environments with large compute nodes with high instance creation or delete churn, where the current timout may not be sufficient. - Retry the _destroy() API call _only_ if MIN_LIBVIRT_VERSION is lower than 4.7.0. [1] https://libvirt.org/git/?p=libvirt.git;a=commitdiff;h=9a4e4b9 (process: wait longer 5->30s on hard shutdown) [2] https://libvirt.org/git/?p=libvirt.git;a=commit;h=be2ca04 ("process: wait longer on kill per assigned Hostdev") Related-bug: #1353939 Change-Id: If2035cac931c42c440d61ba97ebc7e9e92141a28 Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com> (cherry picked from commit 10d50ca)
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -14823,7 +14823,7 @@ def test_private_destroy_processes_refused_to_die_still_raises(self):
                               instance)
 
     def test_private_destroy_ebusy_timeout(self):
-        # Tests that _destroy will retry 3 times to destroy the guest when an
+        # Tests that _destroy will retry 6 times to destroy the guest when an
         # EBUSY is raised, but eventually times out and raises the libvirtError
         ex = fakelibvirt.make_libvirtError(
                 fakelibvirt.libvirtError,
@@ -14843,7 +14843,7 @@ def test_private_destroy_ebusy_timeout(self):
             self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
                               instance)
 
-        self.assertEqual(3, mock_guest.poweroff.call_count)
+        self.assertEqual(6, mock_guest.poweroff.call_count)
 
     def test_private_destroy_ebusy_multiple_attempt_ok(self):
         # Tests that the _destroy attempt loop is broken when EBUSY is no
@@ -14869,6 +14869,37 @@ def test_private_destroy_ebusy_multiple_attempt_ok(self):
 
         self.assertEqual(2, mock_guest.poweroff.call_count)
 
+    @mock.patch.object(libvirt_driver.LOG, 'warning')
+    @mock.patch.object(fakelibvirt.Connection, 'getLibVersion',
+                       return_value=versionutils.convert_version_to_int(
+                           libvirt_driver.MIN_LIBVIRT_BETTER_SIGKILL_HANDLING))
+    def test_min_libvirt_better_sigkill_handling_warning(self,
+                                                 mock_warning,
+                                                 mock_get_libversion):
+        ex = fakelibvirt.make_libvirtError(
+                fakelibvirt.libvirtError,
+                ("Failed to terminate process 26425 with SIGKILL: "
+                                     "Device or resource busy"),
+                error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
+                int1=errno.EBUSY)
+
+        mock_guest = mock.Mock(libvirt_guest.Guest, id=1)
+        mock_guest.poweroff = mock.Mock(side_effect=ex)
+
+        instance = objects.Instance(**self.test_instance)
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+
+        with mock.patch.object(drvr._host, 'get_guest',
+                               return_value=mock_guest):
+            raised = self.assertRaises(fakelibvirt.libvirtError,
+                                       drvr._destroy,
+                                       instance)
+            self.assertEqual(fakelibvirt.VIR_ERR_SYSTEM_ERROR,
+                             raised.get_error_code())
+
+        mock_warning.assert_called_once()
+        mock_guest.poweroff.assert_called_once()
+
     @mock.patch.object(fakelibvirt.libvirtError, 'get_error_code')
     @mock.patch.object(host.Host, '_get_domain',
                        side_effect=exception.InstanceNotFound(
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
@@ -284,6 +284,11 @@ def repr_method(self):
 MIN_LIBVIRT_NATIVE_TLS_VERSION = (4, 4, 0)
 MIN_QEMU_NATIVE_TLS_VERSION = (2, 11, 0)
 
+# If the host has this libvirt version, then we skip the retry loop of
+# instance destroy() call, as libvirt itself increased the wait time
+# before the SIGKILL signal takes effect.
+MIN_LIBVIRT_BETTER_SIGKILL_HANDLING = (4, 7, 0)
+
 VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
 
 
@@ -953,18 +958,36 @@ def _destroy(self, instance, attempt=1):
                         # steal time from the cloud host. ie 15 wallclock
                         # seconds may have passed, but the VM might have only
                         # have a few seconds of scheduled run time.
-                        LOG.warning('Error from libvirt during destroy. '
-                                    'Code=%(errcode)s Error=%(e)s; '
-                                    'attempt %(attempt)d of 3',
-                                    {'errcode': errcode, 'e': e,
-                                     'attempt': attempt},
-                                    instance=instance)
+                        #
+                        # TODO(kchamart): Once MIN_LIBVIRT_VERSION
+                        # reaches v4.7.0, (a) rewrite the above note,
+                        # and (b) remove the following code that retries
+                        # _destroy() API call (which gives SIGKILL 30
+                        # seconds to take effect) -- because from v4.7.0
+                        # onwards, libvirt _automatically_ increases the
+                        # timeout to 30 seconds.  This was added in the
+                        # following libvirt commits:
+                        #
+                        #   - 9a4e4b942 (process: wait longer 5->30s on
+                        #     hard shutdown)
+                        #
+                        #   - be2ca0444 (process: wait longer on kill
+                        #     per assigned Hostdev)
                         with excutils.save_and_reraise_exception() as ctxt:
-                            # Try up to 3 times before giving up.
-                            if attempt < 3:
-                                ctxt.reraise = False
-                                self._destroy(instance, attempt + 1)
-                                return
+                            if not self._host.has_min_version(
+                                    MIN_LIBVIRT_BETTER_SIGKILL_HANDLING):
+                                LOG.warning('Error from libvirt during '
+                                            'destroy. Code=%(errcode)s '
+                                            'Error=%(e)s; attempt '
+                                            '%(attempt)d of 6 ',
+                                            {'errcode': errcode, 'e': e,
+                                             'attempt': attempt},
+                                            instance=instance)
+                                # Try up to 6 times before giving up.
+                                if attempt < 6:
+                                    ctxt.reraise = False
+                                    self._destroy(instance, attempt + 1)
+                                    return
 
                 if not is_okay:
                     with excutils.save_and_reraise_exception():