fix(replays): Use sigmoid_delay for GCS retries (#78703)

cmanallen · web-flow · commit c80cbd786fc7 · 2024-10-07T14:59:24.000-05:00
Fixes intermittent deploy failures when our consumer blows up thanks to
Google.
diff --git a/src/sentry/filestore/gcs.py b/src/sentry/filestore/gcs.py
@@ -23,11 +23,11 @@
 
 from sentry.net.http import TimeoutAdapter
 from sentry.utils import metrics
-from sentry.utils.retries import ConditionalRetryPolicy, exponential_delay
+from sentry.utils.retries import ConditionalRetryPolicy, sigmoid_delay
 
 # how many times do we want to try if stuff goes wrong
 GCS_RETRIES = 5
-REPLAY_GCS_RETRIES = GCS_RETRIES + 2
+REPLAY_GCS_RETRIES = 125
 
 
 # Which errors are eligible for retry.
@@ -405,6 +405,7 @@ def should_retry(attempt: int, e: Exception) -> bool:
             """Retry gateway timeout exceptions up to the limit."""
             return attempt <= REPLAY_GCS_RETRIES and isinstance(e, GCS_RETRYABLE_ERRORS)
 
-        # Retry cadence: 0.025, 0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2 => ~6.5 seconds
-        policy = ConditionalRetryPolicy(should_retry, exponential_delay(0.05))
+        # Retry cadence: After a brief period of fast retries the function will retry once
+        # per second for two minutes.
+        policy = ConditionalRetryPolicy(should_retry, sigmoid_delay())
         policy(callable)
diff --git a/src/sentry/utils/retries.py b/src/sentry/utils/retries.py
@@ -1,6 +1,7 @@
 import functools
 import itertools
 import logging
+import math
 import random
 import time
 from abc import ABC, abstractmethod
@@ -66,6 +67,49 @@ def delay(attempt: int) -> float:
     return delay
 
 
+def sigmoid_delay(offset: int = -5, midpoint: int = 0, step: int = 1) -> Callable[[int], float]:
+    """
+    Returns an S-Curve function.
+
+    A sigmoid is the intersection of these two behaviors:
+        `while(true): retry() # immediate retry`
+    and
+        `while(true): sleep(1); retry() # static-wait then retry`
+
+    The intersection of these two worlds is an exponential function which
+    gradually ramps the program up to (or down to) a stable state (the s-curve).
+    The sharpness of the curve is controlled with step. A step of 0 flattens the
+    curve. A step of infinity turns the curve into a step change (a vertical
+    line).
+
+    The sigmoid is more difficult to intuit than a simple exponential delay but it
+    allows you to cap the maximum amount of time you're willing to wait between
+    retries. The cap is _always_ 1 second regardless of the value of the other
+    arguments. If you want to wait longer than one second multiply the result of
+    the function by something!
+
+    Consider this program (using the default offset of -5):
+        [sigmoid_delay()(i) for i in range(10)]
+    which evaluates to approximately:
+        [0.006, 0.017, 0.0474, 0.119, 0.268, 0.5, 0.731, 0.880, 0.952, 0.982]
+
+    You walk the same curve with:
+        [sigmoid_delay(offset=0)(i) for i in range(10)]
+    except the window has changed and now starts at the midpoint:
+        [0.5, 0.731, 0.880, 0.952, 0.982, ...]
+
+    Now you see further along the curve. This explains the utility of the `offset`
+    parameter. The offset allows you to slide along the window. A smaller offset
+    gives you faster retries. A larger offset gives you slower retries. An offset
+    pushed too far past the midpoint reduces this function to a static wait.
+    """
+
+    def delay(attempt: int) -> float:
+        return 1 / (1 + math.exp(-step * ((attempt + offset) - midpoint)))
+
+    return delay
+
+
 class ConditionalRetryPolicy(RetryPolicy):
     """
     A basic policy that can be used to retry a callable based on the result
diff --git a/tests/sentry/replays/unit/test_retry.py b/tests/sentry/replays/unit/test_retry.py
@@ -0,0 +1,10 @@
+from sentry.utils.retries import sigmoid_delay
+
+
+def test_sigmoid_delay():
+    results = [sigmoid_delay()(i) for i in range(125)]
+    assert results[0] == 0.0066928509242848554
+    assert results[5] == 0.5
+    assert results[10] == 0.9933071490757153
+    assert results[124] == 1
+    assert sum(results) == 119.4960857616948  # Max two minute sleep.