
Commit 14c2bdf

v3: checkpoint and reliability improvements (#1198)
- only checkpoint retries with delays greater than threshold
- rename checkpoint threshold env var
- log task monitor ignores
- crash runs with unbounded attempts
- fix retry check in shared queue consumer
- add missing stop for env var sync spinner
- prod entry point refactor
- missing awaits
- more verbose prod flush and exit logs
- reduce checkpoint support logs
- heartbeat while checkpointing between retries
- dynamic coordinator config
- measure lazy attempt creation time in prod
- simplify delay threshold
- heartbeat clarifications
- crash run if it doesn't reach checkpointable state
- require dynamic config threshold
- fix retry prep, await previous worker kill
- unify wait mechanics
- fix prod worker without tasks error
- ensure worker is ready to be checkpointed for dependency waits
- improve worker attempt creation logging
- prevent crashes caused by failed socket schema parsing
- fix dynamic imports in v3 catalog
- clarify attempt retry mechanics
- move backoff helper to core-apps
- remove core-apps barrel file
- add backoff execute with callback
- deprecate non-lazy attempt messages
- update socket.io-client to v4.7.5
- fix socket.io types for emits with timeout
- retry all the things
- remove todo
- fix retry restores
- improve index failure logs
- retry incomplete dependency waits
- fix checkpoint in-progress detection
- prevent losing messages during reconnect
- checkpoint when greater or equal to threshold
- improve handling of duration wait edge cases
- add ready for lazy attempt replay
- retry attempt completion
- allow failing runs with unfriendly run id
- fix min max jitter
- cancel checkpoints on run failure
- improve attempt creation errors
- prevent crashing run on failed cleanup
- handle at-least-once execute lazy attempt delivery
- log exit code on prepare for retry
- fix timeout promise
- mark some things
- chaos monkey superpowers
- refactor checkpointer
- set chaos monkey defaults
- less chaos
- fix backoff
- handle uncaught entry point exceptions
- only replay rpcs on true reconnects
- allow resume unless final run status
- add changeset
- small fixes
1 parent 7976d92 commit 14c2bdf
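Several items in the commit message above ("move backoff helper to core-apps", "add backoff execute with callback", "fix backoff", "fix min max jitter") revolve around retrying with exponential backoff and clamped jitter. The sketch below illustrates that pattern only; the names, signature, and defaults are assumptions, not the actual core-apps helper.

type BackoffOptions = {
  maxRetries?: number;
  minDelayMs?: number;
  maxDelayMs?: number;
  factor?: number;
};

// Retry an async operation with exponential backoff and jitter clamped to [minDelayMs, maxDelayMs].
async function executeWithBackoff<T>(
  operation: () => Promise<T>,
  { maxRetries = 7, minDelayMs = 100, maxDelayMs = 30_000, factor = 2 }: BackoffOptions = {}
): Promise<T> {
  let lastError: unknown;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error;
      if (attempt === maxRetries) break;

      // Grow the delay exponentially, then apply jitter while keeping it inside the min/max bounds.
      const exponential = Math.min(maxDelayMs, minDelayMs * factor ** attempt);
      const jittered = Math.max(minDelayMs, Math.random() * exponential);

      await new Promise((resolve) => setTimeout(resolve, jittered));
    }
  }

  throw lastError;
}

Clamping the jittered delay between the minimum and maximum is presumably what the "fix min max jitter" item refers to.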

36 files changed: +2213 −1246 lines changed

.changeset/mighty-eggs-grab.md

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
---
"@trigger.dev/core-apps": patch
"trigger.dev": patch
"@trigger.dev/core": patch
---

Tasks should now be much more robust and resilient to reconnects during crucial operations and other failure scenarios.

Task runs now have to signal checkpointable state prior to ALL checkpoints. This ensures flushing always happens.

All important socket.io RPCs will now be retried with backoff. Actions relying on checkpoints will be replayed if we haven't been checkpointed and restored as expected, e.g. after reconnect.

Other changes:

- Fix retry check in shared queue
- Fix env var sync spinner
- Heartbeat between retries
- Fix retry prep
- Fix prod worker no tasks detection
- Fail runs above `MAX_TASK_RUN_ATTEMPTS`
- Additional debug logs in all places
- Prevent crashes due to failed socket schema parsing
- Remove core-apps barrel
- Upgrade socket.io-client to fix an ACK memleak
- Additional index failure logs
- Prevent message loss during reconnect
- Prevent burst of heartbeats on reconnect
- Prevent crash on failed cleanup
- Handle at-least-once lazy execute message delivery
- Handle uncaught entry point exceptions

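To illustrate the "retried with backoff" behavior described in the changeset above, an acknowledged socket.io RPC could be wrapped roughly as sketched below. The event name, payload, URL, and retry policy are assumptions made for the example; `timeout(...).emitWithAck(...)` is the socket.io-client API available from v4.6, which the upgrade to 4.7.5 covers.

import { io, type Socket } from "socket.io-client";

// Hypothetical wrapper: retry an acknowledged emit until the server ACKs or retries are exhausted.
async function emitWithRetry<T>(
  socket: Socket,
  event: string,
  payload: unknown,
  maxRetries = 5
): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      // Rejects if no acknowledgement arrives within 5 seconds.
      return await socket.timeout(5_000).emitWithAck(event, payload);
    } catch (error) {
      if (attempt >= maxRetries) throw error;
      // Exponential backoff with full jitter, capped at 10 seconds.
      const delayMs = Math.random() * Math.min(10_000, 250 * 2 ** attempt);
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
}

// Example usage (top-level await assumes ESM); the event and run id are invented for illustration.
const socket = io("http://localhost:3030");
await emitWithRetry(socket, "READY_FOR_CHECKPOINT", { runId: "run_123" });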
.env.example

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ COORDINATOR_SECRET=coordinator-secret # generate the actual secret with `openssl
# OBJECT_STORE_BASE_URL="https://{bucket}.{accountId}.r2.cloudflarestorage.com"
# OBJECT_STORE_ACCESS_KEY_ID=
# OBJECT_STORE_SECRET_ACCESS_KEY=
-# RUNTIME_WAIT_THRESHOLD_IN_MS=10000
+# CHECKPOINT_THRESHOLD_IN_MS=10000

# These control the server-side internal telemetry
# INTERNAL_OTEL_TRACE_EXPORTER_URL=<URL to send traces to>
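The rename reflects what this value now gates: per the commit message, retries and waits are only checkpointed when their delay is greater than or equal to the threshold. A minimal sketch of that decision, with assumed names and the documented 10s default:

// Illustrative only: the variable and function names are assumptions; the default mirrors the example value above.
const checkpointThresholdMs = Number(process.env.CHECKPOINT_THRESHOLD_IN_MS ?? 10_000);

// Short delays are cheaper to wait out in memory than to checkpoint, restore, and replay.
function shouldCheckpoint(delayMs: number): boolean {
  return delayMs >= checkpointThresholdMs;
}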

apps/coordinator/package.json

Lines changed: 1 addition & 2 deletions
@@ -21,8 +21,7 @@
    "execa": "^8.0.1",
    "nanoid": "^5.0.6",
    "prom-client": "^15.1.0",
-   "socket.io": "4.7.4",
-   "socket.io-client": "4.7.4"
+   "socket.io": "4.7.4"
  },
  "devDependencies": {
    "@types/node": "^18",

apps/coordinator/src/chaosMonkey.ts

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import type { Execa$ } from "execa";
import { setTimeout as timeout } from "node:timers/promises";

class ChaosMonkeyError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "ChaosMonkeyError";
  }
}

export class ChaosMonkey {
  private chaosEventRate = 0.2;
  private delayInSeconds = 45;

  constructor(private enabled = false) {
    if (this.enabled) {
      console.log("🍌 Chaos monkey enabled");
    }
  }

  static Error = ChaosMonkeyError;

  enable() {
    this.enabled = true;
    console.log("🍌 Chaos monkey enabled");
  }

  disable() {
    this.enabled = false;
    console.log("🍌 Chaos monkey disabled");
  }

  async call({
    $,
    throwErrors = true,
    addDelays = true,
  }: {
    $?: Execa$<string>;
    throwErrors?: boolean;
    addDelays?: boolean;
  } = {}) {
    if (!this.enabled) {
      return;
    }

    const random = Math.random();

    if (random > this.chaosEventRate) {
      // Don't interfere with normal operation
      return;
    }

    const chaosEvents: Array<() => Promise<any>> = [];

    if (addDelays) {
      chaosEvents.push(async () => {
        console.log("🍌 Chaos monkey: Add delay");

        if ($) {
          await $`sleep ${this.delayInSeconds}`;
        } else {
          await timeout(this.delayInSeconds * 1000);
        }
      });
    }

    if (throwErrors) {
      chaosEvents.push(async () => {
        console.log("🍌 Chaos monkey: Throw error");

        if ($) {
          await $`false`;
        } else {
          throw new ChaosMonkey.Error("🍌 Chaos monkey: Throw error");
        }
      });
    }

    if (chaosEvents.length === 0) {
      console.error("🍌 Chaos monkey: No events selected");
      return;
    }

    const randomIndex = Math.floor(Math.random() * chaosEvents.length);

    const chaosEvent = chaosEvents[randomIndex];

    if (!chaosEvent) {
      console.error("🍌 Chaos monkey: No event found");
      return;
    }

    await chaosEvent();
  }
}
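For context, a call site for this class might look roughly like the sketch below. The surrounding function and the CHAOS_MONKEY_ENABLED flag are assumptions made for illustration; only ChaosMonkey itself comes from the file above.

import { ChaosMonkey } from "./chaosMonkey";

// Hypothetical integration: enable chaos via an env flag and inject a chaos event before risky work.
const chaosMonkey = new ChaosMonkey(process.env.CHAOS_MONKEY_ENABLED === "1");

async function createTaskRunAttempt() {
  try {
    // Without an Execa $ instance, delays fall back to timers and errors are thrown as ChaosMonkey.Error.
    await chaosMonkey.call({ addDelays: true, throwErrors: true });
  } catch (error) {
    if (error instanceof ChaosMonkey.Error) {
      console.error("🍌 Chaos event fired; treating the attempt as a transient failure");
      return;
    }
    throw error;
  }

  // ...the real attempt creation would go here...
}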
