Skip to content

Commit 68d3242

Browse files
authored
v3: checkpoint failover and misc fixes (#1157)
* configurable checkpoint registry namespace
* add missing task create await
* remove unused messages
* changeset
* update self-hosting docs
* capture and display stderr for failed deploys
* add missing lockfile changes
* stderr changeset
* fix cli stderr message
* update error logs label
1 parent 36ac79a commit 68d3242

File tree

19 files changed

+217
-105
lines changed

19 files changed

+217
-105
lines changed

.changeset/rude-toys-compare.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@trigger.dev/core-apps": patch
3+
"trigger.dev": patch
4+
"@trigger.dev/core": patch
5+
---
6+
7+
Capture and display stderr on index failures

.changeset/slow-sloths-retire.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@trigger.dev/core-apps": patch
3+
"@trigger.dev/core": patch
4+
---
5+
6+
- Fix uncaught provider exception
7+
- Remove unused provider messages

apps/coordinator/src/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ const SIMULATE_CHECKPOINT_FAILURE_SECONDS = parseInt(
4343
);
4444

4545
const REGISTRY_HOST = process.env.REGISTRY_HOST || "localhost:5000";
46+
const REGISTRY_NAMESPACE = process.env.REGISTRY_NAMESPACE || "trigger";
4647
const CHECKPOINT_PATH = process.env.CHECKPOINT_PATH || "/checkpoints";
4748
const REGISTRY_TLS_VERIFY = process.env.REGISTRY_TLS_VERIFY === "false" ? "false" : "true";
4849

@@ -179,7 +180,7 @@ class Checkpointer {
179180
}
180181

181182
#getImageRef(projectRef: string, deploymentVersion: string, shortCode: string) {
182-
return `${REGISTRY_HOST}/trigger/${projectRef}:${deploymentVersion}.prod-${shortCode}`;
183+
return `${REGISTRY_HOST}/${REGISTRY_NAMESPACE}/${projectRef}:${deploymentVersion}.prod-${shortCode}`;
183184
}
184185

185186
#getExportLocation(projectRef: string, deploymentVersion: string, shortCode: string) {

apps/docker-provider/src/index.ts

Lines changed: 17 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -85,37 +85,23 @@ class DockerTaskOperations implements TaskOperations {
8585
port: COORDINATOR_PORT,
8686
});
8787

88-
try {
89-
logger.debug(
90-
await execa("docker", [
91-
"run",
92-
"--network=host",
93-
"--rm",
94-
`--env=INDEX_TASKS=true`,
95-
`--env=TRIGGER_SECRET_KEY=${opts.apiKey}`,
96-
`--env=TRIGGER_API_URL=${opts.apiUrl}`,
97-
`--env=TRIGGER_ENV_ID=${opts.envId}`,
98-
`--env=OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT}`,
99-
`--env=POD_NAME=${containerName}`,
100-
`--env=COORDINATOR_HOST=${COORDINATOR_HOST}`,
101-
`--env=COORDINATOR_PORT=${COORDINATOR_PORT}`,
102-
`--name=${containerName}`,
103-
`${opts.imageRef}`,
104-
])
105-
);
106-
} catch (error: any) {
107-
if (!isExecaChildProcess(error)) {
108-
throw error;
109-
}
110-
111-
logger.error("Index failed:", {
112-
opts,
113-
exitCode: error.exitCode,
114-
escapedCommand: error.escapedCommand,
115-
stdout: error.stdout,
116-
stderr: error.stderr,
117-
});
118-
}
88+
logger.debug(
89+
await execa("docker", [
90+
"run",
91+
"--network=host",
92+
"--rm",
93+
`--env=INDEX_TASKS=true`,
94+
`--env=TRIGGER_SECRET_KEY=${opts.apiKey}`,
95+
`--env=TRIGGER_API_URL=${opts.apiUrl}`,
96+
`--env=TRIGGER_ENV_ID=${opts.envId}`,
97+
`--env=OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT}`,
98+
`--env=POD_NAME=${containerName}`,
99+
`--env=COORDINATOR_HOST=${COORDINATOR_HOST}`,
100+
`--env=COORDINATOR_PORT=${COORDINATOR_PORT}`,
101+
`--name=${containerName}`,
102+
`${opts.imageRef}`,
103+
])
104+
);
119105
}
120106

121107
async create(opts: TaskOperationsCreateOptions) {

apps/webapp/app/components/runs/v3/DeploymentError.tsx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,17 @@ export function DeploymentError({ errorData }: DeploymentErrorProps) {
2020
maxLines={20}
2121
/>
2222
)}
23+
{errorData.stderr && (
24+
<>
25+
<DeploymentErrorHeader title="Error logs:" />
26+
<CodeBlock
27+
showCopyButton={false}
28+
showLineNumbers={false}
29+
code={errorData.stderr}
30+
maxLines={20}
31+
/>
32+
</>
33+
)}
2334
</div>
2435
);
2536
}

apps/webapp/app/presenters/v3/DeploymentPresenter.server.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export type ErrorData = {
1717
name: string;
1818
message: string;
1919
stack?: string;
20+
stderr?: string;
2021
};
2122

2223
export class DeploymentPresenter {
@@ -177,17 +178,20 @@ export class DeploymentPresenter {
177178
name: parsedErrorData.data.name,
178179
message: parsedErrorData.data.message,
179180
stack: createTaskMetadataFailedErrorStack(parsedError.data),
181+
stderr: parsedErrorData.data.stderr,
180182
};
181183
} else {
182184
return {
183185
name: parsedErrorData.data.name,
184186
message: parsedErrorData.data.message,
187+
stderr: parsedErrorData.data.stderr,
185188
};
186189
}
187190
} else {
188191
return {
189192
name: parsedErrorData.data.name,
190193
message: parsedErrorData.data.message,
194+
stderr: parsedErrorData.data.stderr,
191195
};
192196
}
193197
}
@@ -196,6 +200,7 @@ export class DeploymentPresenter {
196200
name: parsedErrorData.data.name,
197201
message: parsedErrorData.data.message,
198202
stack: parsedErrorData.data.stack,
203+
stderr: parsedErrorData.data.stderr,
199204
};
200205
}
201206
}
Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,63 @@
11
import { PerformDeploymentAlertsService } from "./alerts/performDeploymentAlerts.server";
22
import { BaseService } from "./baseService.server";
3+
import { logger } from "~/services/logger.server";
4+
import { WorkerDeploymentStatus } from "@trigger.dev/database";
5+
6+
const FINAL_DEPLOYMENT_STATUSES: WorkerDeploymentStatus[] = [
7+
"CANCELED",
8+
"DEPLOYED",
9+
"FAILED",
10+
"TIMED_OUT",
11+
];
312

413
export class DeploymentIndexFailed extends BaseService {
514
public async call(
615
maybeFriendlyId: string,
7-
error: { name: string; message: string; stack?: string }
16+
error: {
17+
name: string;
18+
message: string;
19+
stack?: string;
20+
stderr?: string;
21+
}
822
) {
923
const isFriendlyId = maybeFriendlyId.startsWith("deployment_");
1024

11-
const deployment = await this._prisma.workerDeployment.update({
25+
const deployment = await this._prisma.workerDeployment.findUnique({
1226
where: isFriendlyId
1327
? {
1428
friendlyId: maybeFriendlyId,
1529
}
1630
: {
1731
id: maybeFriendlyId,
1832
},
33+
});
34+
35+
if (!deployment) {
36+
logger.error("Worker deployment not found", { maybeFriendlyId });
37+
return;
38+
}
39+
40+
if (FINAL_DEPLOYMENT_STATUSES.includes(deployment.status)) {
41+
logger.error("Worker deployment already in final state", {
42+
id: deployment.id,
43+
status: deployment.status,
44+
});
45+
return;
46+
}
47+
48+
const failedDeployment = await this._prisma.workerDeployment.update({
49+
where: {
50+
id: deployment.id,
51+
},
1952
data: {
2053
status: "FAILED",
2154
failedAt: new Date(),
2255
errorData: error,
2356
},
2457
});
2558

26-
await PerformDeploymentAlertsService.enqueue(deployment.id, this._prisma);
59+
await PerformDeploymentAlertsService.enqueue(failedDeployment.id, this._prisma);
2760

28-
return deployment;
61+
return failedDeployment;
2962
}
3063
}

docs/v3/open-source-self-hosting.mdx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,8 @@ scp -3 root@<webapp_machine>:docker/.env root@<worker_machine>:docker/.env
206206
Checkpointing allows you to save the state of a running container to disk and restore it later. This can be useful for
207207
long-running tasks that need to be paused and resumed without losing state. Think fan-out and fan-in, or long waits in email campaigns.
208208

209+
The checkpoints will be pushed to the same registry as the deployed images. Please see the [Registry setup](#registry-setup) section for more information.
210+
209211
### Requirements
210212

211213
- Debian, **NOT** a derivative like Ubuntu
@@ -225,7 +227,7 @@ sudo apt-get install criu
225227
2. Tweak the config so we can successfully checkpoint our workloads
226228

227229
```bash
228-
mkdir /etc/criu
230+
mkdir -p /etc/criu
229231

230232
cat << EOF >/etc/criu/runc.conf
231233
tcp-close

packages/cli-v3/src/commands/deploy.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,10 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) {
507507

508508
await preExitTasks();
509509

510+
if (finishedDeployment.errorData.stderr) {
511+
log.error(`stderr:\n${finishedDeployment.errorData.stderr}`);
512+
}
513+
510514
throw new SkipLoggingError(
511515
`Deployment encountered an error: ${finishedDeployment.errorData.name}`
512516
);

packages/cli-v3/src/workers/prod/backgroundWorker.ts

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ export class ProdBackgroundWorker {
7878
private _onClose: Evt<void> = new Evt();
7979

8080
public tasks: Array<TaskMetadataWithFilePath> = [];
81+
public stderr: Array<string> = [];
8182

8283
_taskRunProcess: TaskRunProcess | undefined;
8384
private _taskRunProcessesBeingKilled: Map<number, TaskRunProcess> = new Map();
@@ -161,6 +162,23 @@ export class ProdBackgroundWorker {
161162
reject(new Error("Worker timed out"));
162163
}, 10_000);
163164

165+
child.stdout?.on("data", (data) => {
166+
console.log(data.toString());
167+
});
168+
169+
child.stderr?.on("data", (data) => {
170+
console.error(data.toString());
171+
this.stderr.push(data.toString());
172+
});
173+
174+
child.on("exit", (code) => {
175+
if (!resolved) {
176+
clearTimeout(timeout);
177+
resolved = true;
178+
reject(new Error(`Worker exited with code ${code}`));
179+
}
180+
});
181+
164182
new ZodIpcConnection({
165183
listenSchema: ProdChildToWorkerMessages,
166184
emitSchema: ProdWorkerToChildMessages,
@@ -192,22 +210,6 @@ export class ProdBackgroundWorker {
192210
},
193211
},
194212
});
195-
196-
child.stdout?.on("data", (data) => {
197-
console.log(data.toString());
198-
});
199-
200-
child.stderr?.on("data", (data) => {
201-
console.error(data.toString());
202-
});
203-
204-
child.on("exit", (code) => {
205-
if (!resolved) {
206-
clearTimeout(timeout);
207-
resolved = true;
208-
reject(new Error(`Worker exited with code ${code}`));
209-
}
210-
});
211213
});
212214

213215
this._initialized = true;

packages/cli-v3/src/workers/prod/entry-point.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,8 @@ class ProdWorker {
634634
process.exit(1);
635635
}
636636
} catch (e) {
637+
const stderr = this.#backgroundWorker.stderr.join("\n");
638+
637639
if (e instanceof TaskMetadataParseError) {
638640
logger.error("tasks metadata parse error", {
639641
zodIssues: e.zodIssues,
@@ -647,13 +649,15 @@ class ProdWorker {
647649
name: "TaskMetadataParseError",
648650
message: "There was an error parsing the task metadata",
649651
stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
652+
stderr,
650653
},
651654
});
652655
} else if (e instanceof UncaughtExceptionError) {
653656
const error = {
654657
name: e.originalError.name,
655658
message: e.originalError.message,
656659
stack: e.originalError.stack,
660+
stderr,
657661
};
658662

659663
logger.error("uncaught exception", { originalError: error });
@@ -668,6 +672,7 @@ class ProdWorker {
668672
name: e.name,
669673
message: e.message,
670674
stack: e.stack,
675+
stderr,
671676
};
672677

673678
logger.error("error", { error });
@@ -686,6 +691,7 @@ class ProdWorker {
686691
error: {
687692
name: "Error",
688693
message: e,
694+
stderr,
689695
},
690696
});
691697
} else {
@@ -697,6 +703,7 @@ class ProdWorker {
697703
error: {
698704
name: "Error",
699705
message: "Unknown error",
706+
stderr,
700707
},
701708
});
702709
}

0 commit comments

Comments
 (0)