Skip to content

Commit c235e67

Browse files
committed
Auto merge of #666 - Mark-Simulacrum:improve-backoff, r=Mark-Simulacrum
Improve backoff & metrics. See individual commits for details.
2 parents 489e8cb + 0c25039 commit c235e67

File tree

2 files changed

+23
-13
lines changed

2 files changed

+23
-13
lines changed

src/agent/api.rs

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
use std::time::Duration;
2+
13
use crate::agent::Capabilities;
24
use crate::crates::Crate;
35
use crate::experiments::Experiment;
@@ -62,8 +64,6 @@ impl ResponseExt for ::reqwest::blocking::Response {
6264
}
6365
}
6466

65-
const RETRY_AFTER: u64 = 5;
66-
6767
pub struct AgentApi {
6868
url: String,
6969
token: String,
@@ -88,6 +88,7 @@ impl AgentApi {
8888
}
8989

9090
fn retry<T, F: Fn(&Self) -> Fallible<T>>(&self, f: F) -> Fallible<T> {
91+
let mut retry_interval = 16u64;
9192
loop {
9293
match f(self) {
9394
Ok(res) => return Ok(res),
@@ -104,10 +105,19 @@ impl AgentApi {
104105
};
105106

106107
if retry {
107-
warn!("connection to the server failed. retrying in a few seconds...");
108-
::std::thread::sleep(::std::time::Duration::from_millis(
109-
rand::thread_rng().gen_range(0..(RETRY_AFTER * 1000)),
110-
));
108+
let sleep_for = Duration::from_millis(
109+
rand::thread_rng().gen_range(500..(retry_interval * 1000)),
110+
);
111+
warn!(
112+
"connection to the server failed. retrying in {:?}...",
113+
sleep_for
114+
);
115+
::std::thread::sleep(sleep_for);
116+
retry_interval *= 2;
117+
if retry_interval >= 8 * 60 {
118+
retry_interval = 8 * 60;
119+
}
120+
111121
continue;
112122
}
113123

@@ -141,7 +151,7 @@ impl AgentApi {
141151
// healthy.
142152
crate::agent::set_healthy();
143153

144-
::std::thread::sleep(::std::time::Duration::from_secs(120));
154+
::std::thread::sleep(Duration::from_secs(120));
145155
})
146156
}
147157

src/server/routes/agent.rs

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -195,12 +195,6 @@ impl RecordProgressThread {
195195
let start = std::time::Instant::now();
196196

197197
if let Some(ex) = Experiment::get(&db, &result.experiment_name).unwrap() {
198-
metrics.record_completed_jobs(
199-
&worker_name,
200-
&ex.name,
201-
result.data.results.len() as i64,
202-
);
203-
204198
let db = DatabaseDB::new(&db);
205199
if let Err(e) = db.store(&ex, &result.data, EncodingType::Plain) {
206200
// Failing to record a result is basically fine -- this
@@ -209,6 +203,12 @@ impl RecordProgressThread {
209203
crate::utils::report_failure(&e);
210204
}
211205

206+
metrics.record_completed_jobs(
207+
&worker_name,
208+
&ex.name,
209+
result.data.results.len() as i64,
210+
);
211+
212212
if let Err(e) = db.clear_stale_records() {
213213
// Not a hard failure. We can continue even if we failed
214214
// to clear records from already completed runs...

0 commit comments

Comments
 (0)