Skip to content

Commit 11df7b8

Browse files
committed
better errors for crashes
1 parent c4af185 commit 11df7b8

File tree

4 files changed

+24
-4
lines changed

4 files changed

+24
-4
lines changed

apps/kubernetes-provider/src/taskMonitor.ts

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,14 @@ import { SimpleLogger } from "@trigger.dev/core/v3/apps";
33
import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core/v3/apps";
44
import { setTimeout } from "timers/promises";
55
import PQueue from "p-queue";
6-
import type { Prettify } from "@trigger.dev/core/v3";
6+
import { TaskRunErrorCodes, type Prettify, type TaskRunInternalError } from "@trigger.dev/core/v3";
77

88
type FailureDetails = Prettify<{
99
exitCode: number;
1010
reason: string;
1111
logs: string;
1212
overrideCompletion: boolean;
13+
errorCode: TaskRunInternalError["code"];
1314
}>;
1415

1516
type IndexFailureHandler = (deploymentId: string, details: FailureDetails) => Promise<any>;
@@ -160,18 +161,23 @@ export class TaskMonitor {
160161
let reason = rawReason || "Unknown error";
161162
let logs = rawLogs || "";
162163
let overrideCompletion = false;
164+
let errorCode: TaskRunInternalError["code"] = TaskRunErrorCodes.POD_UNKNOWN_ERROR;
163165

164166
switch (rawReason) {
165167
case "Error":
166168
reason = "Unknown error.";
169+
errorCode = TaskRunErrorCodes.POD_UNKNOWN_ERROR;
167170
break;
168171
case "Evicted":
169172
if (message.startsWith("Pod ephemeral local storage usage")) {
170173
reason = "Storage limit exceeded.";
174+
errorCode = TaskRunErrorCodes.DISK_SPACE_EXCEEDED;
171175
} else if (message) {
172176
reason = `Evicted: ${message}`;
177+
errorCode = TaskRunErrorCodes.POD_EVICTED;
173178
} else {
174179
reason = "Evicted for unknown reason.";
180+
errorCode = TaskRunErrorCodes.POD_EVICTED;
175181
}
176182

177183
if (logs.startsWith("failed to try resolving symlinks")) {
@@ -183,6 +189,7 @@ export class TaskMonitor {
183189
reason = `${
184190
exitCode === EXIT_CODE_CHILD_NONZERO ? "Child process" : "Parent process"
185191
} ran out of memory! Try choosing a machine preset with more memory for this task.`;
192+
errorCode = TaskRunErrorCodes.TASK_PROCESS_OOM_KILLED;
186193
break;
187194
default:
188195
break;
@@ -193,6 +200,7 @@ export class TaskMonitor {
193200
reason,
194201
logs,
195202
overrideCompletion,
203+
errorCode,
196204
} satisfies FailureDetails;
197205

198206
const app = pod.metadata?.labels?.app;

apps/webapp/app/v3/services/crashTaskRun.server.ts

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ import { BaseService } from "./baseService.server";
55
import { logger } from "~/services/logger.server";
66
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
77
import { CRASHABLE_ATTEMPT_STATUSES, isCrashableRunStatus } from "../taskStatus";
8-
import { sanitizeError } from "@trigger.dev/core/v3";
8+
import { sanitizeError, TaskRunInternalError } from "@trigger.dev/core/v3";
99
import { FinalizeTaskRunService } from "./finalizeTaskRun.server";
1010

1111
export type CrashTaskRunServiceOptions = {
@@ -15,6 +15,7 @@ export type CrashTaskRunServiceOptions = {
1515
crashAttempts?: boolean;
1616
crashedAt?: Date;
1717
overrideCompletion?: boolean;
18+
errorCode?: TaskRunInternalError["code"];
1819
};
1920

2021
export class CrashTaskRunService extends BaseService {
@@ -26,6 +27,8 @@ export class CrashTaskRunService extends BaseService {
2627
...options,
2728
};
2829

30+
logger.debug("CrashTaskRunService.call", { runId, opts });
31+
2932
const taskRun = await this._prisma.taskRun.findFirst({
3033
where: {
3134
id: runId,
@@ -71,7 +74,7 @@ export class CrashTaskRunService extends BaseService {
7174
attemptStatus: "FAILED",
7275
error: {
7376
type: "INTERNAL_ERROR",
74-
code: "TASK_RUN_CRASHED",
77+
code: opts.errorCode ?? "TASK_RUN_CRASHED",
7578
message: opts.reason,
7679
stackTrace: opts.logs,
7780
},
@@ -129,6 +132,7 @@ export class CrashTaskRunService extends BaseService {
129132
error: {
130133
reason: string;
131134
logs?: string;
135+
code?: TaskRunInternalError["code"];
132136
}
133137
) {
134138
return await this.traceWithEnv("failAttempt()", environment, async (span) => {
@@ -146,7 +150,7 @@ export class CrashTaskRunService extends BaseService {
146150
completedAt: failedAt,
147151
error: sanitizeError({
148152
type: "INTERNAL_ERROR",
149-
code: "TASK_RUN_CRASHED",
153+
code: error.code ?? "TASK_RUN_CRASHED",
150154
message: error.reason,
151155
stackTrace: error.logs,
152156
}),

packages/core/src/v3/schemas/common.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,9 @@ export const TaskRunErrorCodes = {
9595
GRACEFUL_EXIT_TIMEOUT: "GRACEFUL_EXIT_TIMEOUT",
9696
TASK_RUN_CRASHED: "TASK_RUN_CRASHED",
9797
MAX_DURATION_EXCEEDED: "MAX_DURATION_EXCEEDED",
98+
DISK_SPACE_EXCEEDED: "DISK_SPACE_EXCEEDED",
99+
POD_EVICTED: "POD_EVICTED",
100+
POD_UNKNOWN_ERROR: "POD_UNKNOWN_ERROR",
98101
} as const;
99102

100103
export const TaskRunInternalError = z.object({
@@ -118,6 +121,9 @@ export const TaskRunInternalError = z.object({
118121
"TASK_RUN_HEARTBEAT_TIMEOUT",
119122
"TASK_RUN_CRASHED",
120123
"MAX_DURATION_EXCEEDED",
124+
"DISK_SPACE_EXCEEDED",
125+
"POD_EVICTED",
126+
"POD_UNKNOWN_ERROR",
121127
]),
122128
message: z.string().optional(),
123129
stackTrace: z.string().optional(),

packages/core/src/v3/schemas/messages.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import {
55
TaskRunExecution,
66
TaskRunExecutionResult,
77
TaskRunFailedExecutionResult,
8+
TaskRunInternalError,
89
} from "./common.js";
910
import { TaskResource } from "./resources.js";
1011
import {
@@ -252,6 +253,7 @@ export const ProviderToPlatformMessages = {
252253
message: z.string().optional(),
253254
logs: z.string().optional(),
254255
overrideCompletion: z.boolean().optional(),
256+
errorCode: TaskRunInternalError.shape.code.optional(),
255257
}),
256258
},
257259
INDEXING_FAILED: {

0 commit comments

Comments
 (0)