Skip to content

Commit afd84b4

Browse files
Increase retry intervals
With retries in a 5-second interval and our current peak of ~424 worker threads, we end up with ~85 requests/second even when fully uniformly distributed, which is unsustainable. (Note that new work is being added at roughly 8 requests per second from rustc jobs completing, so we may not be able to drain the queue at all, since our processing takes ~150ms at p99.) Spreading retries across eight minutes brings us to ~1 rps of retry load. In practice it'll be higher, since any workers *not* seeing errors will return to the previous compute, but that brings us to roughly 10 rps, which we should hopefully largely be able to handle, given that our p50 is only 60ms.
1 parent 489e8cb commit afd84b4

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

src/agent/api.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::time::Duration;
2+
13
use crate::agent::Capabilities;
24
use crate::crates::Crate;
35
use crate::experiments::Experiment;
@@ -62,8 +64,6 @@ impl ResponseExt for ::reqwest::blocking::Response {
6264
}
6365
}
6466

65-
const RETRY_AFTER: u64 = 5;
66-
6767
pub struct AgentApi {
6868
url: String,
6969
token: String,
@@ -88,6 +88,7 @@ impl AgentApi {
8888
}
8989

9090
fn retry<T, F: Fn(&Self) -> Fallible<T>>(&self, f: F) -> Fallible<T> {
91+
let mut retry_interval = 16u64;
9192
loop {
9293
match f(self) {
9394
Ok(res) => return Ok(res),
@@ -104,10 +105,19 @@ impl AgentApi {
104105
};
105106

106107
if retry {
107-
warn!("connection to the server failed. retrying in a few seconds...");
108-
::std::thread::sleep(::std::time::Duration::from_millis(
109-
rand::thread_rng().gen_range(0..(RETRY_AFTER * 1000)),
110-
));
108+
let sleep_for = Duration::from_millis(
109+
rand::thread_rng().gen_range(500..(retry_interval * 1000)),
110+
);
111+
warn!(
112+
"connection to the server failed. retrying in {:?}...",
113+
sleep_for
114+
);
115+
::std::thread::sleep(sleep_for);
116+
retry_interval *= 2;
117+
if retry_interval >= 8 * 60 {
118+
retry_interval = 8 * 60;
119+
}
120+
111121
continue;
112122
}
113123

@@ -141,7 +151,7 @@ impl AgentApi {
141151
// healthy.
142152
crate::agent::set_healthy();
143153

144-
::std::thread::sleep(::std::time::Duration::from_secs(120));
154+
::std::thread::sleep(Duration::from_secs(120));
145155
})
146156
}
147157

0 commit comments

Comments
 (0)