Skip to content

Commit 90593ad

Browse files
authored
Fix several restore and resume bugs (#1418)
* try to correct resume messages with missing checkpoint
* prevent creating checkpoints for outdated task waits
* prevent creating checkpoints for outdated batch waits
* use heartbeats to check for and clean up any leftover containers
* lint
* improve exec logging
* improve resume attempt logs
* fix for resuming parents of canceled child runs
* separate SIGTERM from maybe OOM errors
* pretty errors can have magic dashboard links
* prevent uncancellable checkpoints
* simplify task run error code enum export
* grab the last, not the first child run
* Revert "prevent creating checkpoints for outdated batch waits"
  This reverts commit f2b5c2a.
* Revert "grab the last, not the first child run"
  This reverts commit 89ec5c8.
* Revert "prevent creating checkpoints for outdated task waits"
  This reverts commit 11066b4.
* more logs for resume message handling
* add magic error link comment
* add changeset
1 parent 235ab90 commit 90593ad

File tree

12 files changed

+416
-119
lines changed

12 files changed

+416
-119
lines changed

.changeset/many-plants-destroy.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@trigger.dev/core": patch
3+
---
4+
5+
SIGTERM detection and prettier errors

apps/coordinator/src/checkpointer.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,10 @@ export class Checkpointer {
436436
this.#logger.error("Error during cleanup", { ...metadata, error });
437437
}
438438

439-
this.#abortControllers.delete(runId);
439+
// Ensure only the current controller is removed
440+
if (this.#abortControllers.get(runId) === controller) {
441+
this.#abortControllers.delete(runId);
442+
}
440443
controller.signal.removeEventListener("abort", onAbort);
441444
};
442445

apps/coordinator/src/exec.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,18 @@ export class Exec {
6464
command,
6565
argsRaw: args,
6666
argsTrimmed,
67-
...output,
67+
globalOpts: {
68+
trimArgs: this.trimArgs,
69+
neverThrow: this.neverThrow,
70+
hasAbortSignal: !!this.abortSignal,
71+
},
72+
localOpts: opts,
73+
stdout: output.stdout,
74+
stderr: output.stderr,
75+
pid: result.pid,
76+
exitCode: result.exitCode,
77+
aborted: result.aborted,
78+
killed: result.killed,
6879
};
6980

7081
if (this.logOutput) {

apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
import { CheckIcon, ClockIcon, CloudArrowDownIcon, QueueListIcon } from "@heroicons/react/20/solid";
1+
import {
2+
CheckIcon,
3+
ClockIcon,
4+
CloudArrowDownIcon,
5+
EnvelopeIcon,
6+
QueueListIcon,
7+
} from "@heroicons/react/20/solid";
28
import { Link } from "@remix-run/react";
39
import { LoaderFunctionArgs } from "@remix-run/server-runtime";
410
import {
@@ -13,6 +19,7 @@ import { typedjson, useTypedFetcher } from "remix-typedjson";
1319
import { ExitIcon } from "~/assets/icons/ExitIcon";
1420
import { CodeBlock } from "~/components/code/CodeBlock";
1521
import { EnvironmentLabel } from "~/components/environments/EnvironmentLabel";
22+
import { Feedback } from "~/components/Feedback";
1623
import { Button, LinkButton } from "~/components/primitives/Buttons";
1724
import { Callout } from "~/components/primitives/Callout";
1825
import { DateTime, DateTimeAccurate } from "~/components/primitives/DateTime";
@@ -963,11 +970,26 @@ function RunError({ error }: { error: TaskRunError }) {
963970
<div className="flex flex-col gap-2 rounded-sm border border-rose-500/50 px-3 pb-3 pt-2">
964971
<Header3 className="text-rose-500">{name}</Header3>
965972
{enhancedError.message && <Callout variant="error">{enhancedError.message}</Callout>}
966-
{enhancedError.link && (
967-
<Callout variant="docs" to={enhancedError.link.href}>
968-
{enhancedError.link.name}
969-
</Callout>
970-
)}
973+
{enhancedError.link &&
974+
(enhancedError.link.magic === "CONTACT_FORM" ? (
975+
<Feedback
976+
button={
977+
<Button
978+
variant="tertiary/medium"
979+
LeadingIcon={EnvelopeIcon}
980+
leadingIconClassName="text-blue-400"
981+
fullWidth
982+
textAlignLeft
983+
>
984+
{enhancedError.link.name}
985+
</Button>
986+
}
987+
/>
988+
) : (
989+
<Callout variant="docs" to={enhancedError.link.href}>
990+
{enhancedError.link.name}
991+
</Callout>
992+
))}
971993
{enhancedError.stackTrace && (
972994
<CodeBlock
973995
showCopyButton={false}

apps/webapp/app/v3/environmentVariables/environmentVariablesRepository.server.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ export class EnvironmentVariablesRepository implements Repository {
136136

137137
try {
138138
for (const variable of values) {
139-
const result = await $transaction(this.prismaClient, async (tx) => {
139+
const result = await $transaction(this.prismaClient, async (tx) => {
140140
const environmentVariable = await tx.environmentVariable.upsert({
141141
where: {
142142
projectId_key: {

apps/webapp/app/v3/handleSocketIo.server.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,10 @@ function createCoordinatorNamespace(io: Server) {
195195
const service = new CreateTaskRunAttemptService();
196196
const { attempt } = await service.call(message.runId, environment, false);
197197

198-
const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true);
198+
const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt({
199+
id: attempt.id,
200+
setToExecuting: true,
201+
});
199202

200203
if (!payload) {
201204
logger.error("Failed to retrieve payload after attempt creation", message);

0 commit comments

Comments
 (0)