Retry in CellDatabases fixture when global DB state changes

melwitt · gibizer · commit 875668827896 · 2022-05-06T09:14:30.000Z
There is a NOTE in the CellDatabases code about an unlikely but possible race that can occur between taking the writer lock to set the last DB context manager and taking the reader lock to call target_cell(). When the race is detected, a RuntimeError is raised. We can handle the race by retrying setting the last DB context manager when the race is detected, as described in the NOTE. Closes-Bug: #1959677 Change-Id: I5c0607ce5910dce581ab9360cc7fc69ba9673f35 (cherry picked from commit 1c8122a)
diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
@@ -22,6 +22,7 @@
 import functools
 import logging as std_logging
 import os
+import time
 import warnings
 
 import eventlet
@@ -451,6 +452,13 @@ def _wrap_target_cell(self, context, cell_mapping):
         #     yield to do the actual work. We can do schedulable things
         #     here and not exclude other threads from making progress.
         #     If an exception is raised, we capture that and save it.
+        #     Note that it is possible that another thread has changed the
+        #     global state (step #2) after we released the writer lock but
+        #     before we acquired the reader lock. If this happens, we will
+        #     detect the global state change and retry step #2 a limited number
+        #     of times. If we happen to race repeatedly with another thread and
+        #     exceed our retry limit, we will give up and raise a RuntimeError,
+        #     which will fail the test.
         #  4. If we changed state in #2, we need to change it back. So we grab
         #     a writer lock again and do that.
         #  5. Finally, if an exception was raised in #3 while state was
@@ -469,29 +477,47 @@ def _wrap_target_cell(self, context, cell_mapping):
 
         raised_exc = None
 
-        with self._cell_lock.write_lock():
-            if cell_mapping is not None:
-                # This assumes the next local DB access is the same cell that
-                # was targeted last time.
-                self._last_ctxt_mgr = desired
+        def set_last_ctxt_mgr():
+            with self._cell_lock.write_lock():
+                if cell_mapping is not None:
+                    # This assumes the next local DB access is the same cell
+                    # that was targeted last time.
+                    self._last_ctxt_mgr = desired
 
-        with self._cell_lock.read_lock():
-            if self._last_ctxt_mgr != desired:
-                # NOTE(danms): This is unlikely to happen, but it's possible
-                # another waiting writer changed the state between us letting
-                # it go and re-acquiring as a reader. If lockutils supported
-                # upgrading and downgrading locks, this wouldn't be a problem.
-                # Regardless, assert that it is still as we left it here
-                # so we don't hit the wrong cell. If this becomes a problem,
-                # we just need to retry the write section above until we land
-                # here with the cell we want.
-                raise RuntimeError('Global DB state changed underneath us')
+        # Set last context manager to the desired cell's context manager.
+        set_last_ctxt_mgr()
 
+        # Retry setting the last context manager if we detect that a writer
+        # changed global DB state before we take the read lock.
+        for retry_time in range(0, 3):
             try:
-                with self._real_target_cell(context, cell_mapping) as ccontext:
-                    yield ccontext
-            except Exception as exc:
-                raised_exc = exc
+                with self._cell_lock.read_lock():
+                    if self._last_ctxt_mgr != desired:
+                        # NOTE(danms): This is unlikely to happen, but it's
+                        # possible another waiting writer changed the state
+                        # between us letting it go and re-acquiring as a
+                        # reader. If lockutils supported upgrading and
+                        # downgrading locks, this wouldn't be a problem.
+                        # Regardless, assert that it is still as we left it
+                        # here so we don't hit the wrong cell. If this becomes
+                        # a problem, we just need to retry the write section
+                        # above until we land here with the cell we want.
+                        raise RuntimeError(
+                            'Global DB state changed underneath us')
+                    try:
+                        with self._real_target_cell(
+                            context, cell_mapping
+                        ) as ccontext:
+                            yield ccontext
+                    except Exception as exc:
+                        raised_exc = exc
+                    # Leave the retry loop after calling target_cell
+                    break
+            except RuntimeError:
+                # Give other threads a chance to make progress, increasing the
+                # wait time between attempts.
+                time.sleep(retry_time)
+                set_last_ctxt_mgr()
 
         with self._cell_lock.write_lock():
             # Once we have returned from the context, we need