Skip to content

Commit c80cbd7

Browse files
authored
fix(replays): Use sigmoid_delay for GCS retries (#78703)
Fixes intermittent deploy failures when our consumer blows up thanks to Google.
1 parent 020407e commit c80cbd7

File tree

3 files changed

+59
-4
lines changed

3 files changed

+59
-4
lines changed

src/sentry/filestore/gcs.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323

2424
from sentry.net.http import TimeoutAdapter
2525
from sentry.utils import metrics
26-
from sentry.utils.retries import ConditionalRetryPolicy, exponential_delay
26+
from sentry.utils.retries import ConditionalRetryPolicy, sigmoid_delay
2727

2828
# how many times do we want to try if stuff goes wrong
2929
GCS_RETRIES = 5
30-
REPLAY_GCS_RETRIES = GCS_RETRIES + 2
30+
REPLAY_GCS_RETRIES = 125
3131

3232

3333
# Which errors are eligible for retry.
@@ -405,6 +405,7 @@ def should_retry(attempt: int, e: Exception) -> bool:
405405
"""Retry gateway timeout exceptions up to the limit."""
406406
return attempt <= REPLAY_GCS_RETRIES and isinstance(e, GCS_RETRYABLE_ERRORS)
407407

408-
# Retry cadence: 0.025, 0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2 => ~6.5 seconds
409-
policy = ConditionalRetryPolicy(should_retry, exponential_delay(0.05))
408+
# Retry cadence: After a brief period of fast retries the function will retry once
409+
# per second for two minutes.
410+
policy = ConditionalRetryPolicy(should_retry, sigmoid_delay())
410411
policy(callable)

src/sentry/utils/retries.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import functools
22
import itertools
33
import logging
4+
import math
45
import random
56
import time
67
from abc import ABC, abstractmethod
@@ -66,6 +67,49 @@ def delay(attempt: int) -> float:
6667
return delay
6768

6869

70+
def sigmoid_delay(offset: int = -5, midpoint: int = 0, step: int = 1) -> Callable[[int], float]:
    """
    Returns an S-curve (logistic) delay function.

    A sigmoid blends these two retry behaviors:
        `while(true): retry()  # immediate retry`
    and
        `while(true): sleep(1); retry()  # static-wait then retry`

    The returned function computes
    `1 / (1 + exp(-step * ((attempt + offset) - midpoint)))`, which gradually
    ramps the program up to (or, with a negative step, down to) a stable state
    (the s-curve). The sharpness of the curve is controlled with `step`. A step
    of 0 flattens the curve to a constant 0.5. A very large step turns the curve
    into a step change (a vertical line) at the midpoint.

    The sigmoid is more difficult to intuit than a simple exponential delay but
    it allows you to cap the maximum amount of time you're willing to wait
    between retries. The result never exceeds 1 regardless of the value of the
    other arguments. If you want to wait longer than one second multiply the
    result of the function by something!

    Consider this program, which uses the default offset of -5:
        [sigmoid_delay()(i) for i in range(10)]
    it is equivalent to (rounded):
        [0.007, 0.018, 0.047, 0.119, 0.269, 0.5, 0.731, 0.881, 0.953, 0.982]

    With an offset of 0 the window starts at the midpoint instead:
        [sigmoid_delay(offset=0)(i) for i in range(5)]
    which yields:
        [0.5, 0.731, 0.881, 0.953, 0.982]

    Now you see further along the curve. This explains the utility of the
    `offset` parameter. The offset allows you to slide along the window. A
    smaller offset gives you faster retries. A larger offset gives you slower
    retries. An offset pushed too far past the midpoint reduces this function
    to a static wait.

    :param offset: Value added to the attempt number before evaluating the curve.
    :param midpoint: Position on the curve at which the delay equals 0.5.
    :param step: Steepness of the curve.
    :return: A function mapping an attempt number to a delay in the range [0, 1].
    """

    def delay(attempt: int) -> float:
        exponent = -step * ((attempt + offset) - midpoint)
        try:
            return 1 / (1 + math.exp(exponent))
        except OverflowError:
            # math.exp raises once the exponent is too large for a double
            # (roughly > 709). That only happens far below the midpoint, where
            # the sigmoid's limit is 0, so return the limit instead of crashing
            # the retry loop.
            return 0.0

    return delay
111+
112+
69113
class ConditionalRetryPolicy(RetryPolicy):
70114
"""
71115
A basic policy that can be used to retry a callable based on the result
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from sentry.utils.retries import sigmoid_delay
2+
3+
4+
def test_sigmoid_delay():
    """Pin the default retry schedule produced by ``sigmoid_delay``."""
    import math

    results = [sigmoid_delay()(i) for i in range(125)]

    # These two points are exactly representable, so exact comparison is safe:
    # at attempt 5 the default offset (-5) lands on the midpoint, and by
    # attempt 124 the curve has saturated to 1.0 in double precision.
    assert results[5] == 0.5
    assert results[124] == 1

    # The remaining values depend on the platform's exp() implementation and on
    # float summation, so compare with a tolerance rather than bit-for-bit.
    assert math.isclose(results[0], 0.0066928509242848554, rel_tol=1e-12)
    assert math.isclose(results[10], 0.9933071490757153, rel_tol=1e-12)
    assert math.isclose(sum(results), 119.4960857616948, rel_tol=1e-12)  # Max two minute sleep.

0 commit comments

Comments
 (0)