Skip to content

Commit 0f6e24d

Browse files
committed
Fix OOM detection, again
1 parent 9d20ef7 commit 0f6e24d

File tree

8 files changed

+67
-40
lines changed

8 files changed

+67
-40
lines changed

apps/kubernetes-provider/src/index.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -529,27 +529,27 @@ provider.listen();
529529

530530
const taskMonitor = new TaskMonitor({
531531
runtimeEnv: RUNTIME_ENV,
532-
onIndexFailure: async (deploymentId, failureInfo) => {
533-
logger.log("Indexing failed", { deploymentId, failureInfo });
532+
onIndexFailure: async (deploymentId, details) => {
533+
logger.log("Indexing failed", { deploymentId, details });
534534

535535
try {
536536
provider.platformSocket.send("INDEXING_FAILED", {
537537
deploymentId,
538538
error: {
539-
name: `Crashed with exit code ${failureInfo.exitCode}`,
540-
message: failureInfo.reason,
541-
stack: failureInfo.logs,
539+
name: `Crashed with exit code ${details.exitCode}`,
540+
message: details.reason,
541+
stack: details.logs,
542542
},
543543
});
544544
} catch (error) {
545545
logger.error(error);
546546
}
547547
},
548-
onRunFailure: async (runId, failureInfo) => {
549-
logger.log("Run failed:", { runId, failureInfo });
548+
onRunFailure: async (runId, details) => {
549+
logger.log("Run failed:", { runId, details });
550550

551551
try {
552-
provider.platformSocket.send("WORKER_CRASHED", { runId, ...failureInfo });
552+
provider.platformSocket.send("WORKER_CRASHED", { runId, ...details });
553553
} catch (error) {
554554
logger.error(error);
555555
}

apps/kubernetes-provider/src/taskMonitor.ts

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,20 @@
11
import * as k8s from "@kubernetes/client-node";
22
import { SimpleLogger } from "@trigger.dev/core-apps";
3+
import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core-apps/process";
34
import { setTimeout } from "timers/promises";
45
import PQueue from "p-queue";
6+
import type { Prettify } from "@trigger.dev/core/v3";
57

6-
type IndexFailureHandler = (
7-
deploymentId: string,
8-
failureInfo: {
9-
exitCode: number;
10-
reason: string;
11-
logs: string;
12-
}
13-
) => Promise<any>;
14-
15-
type RunFailureHandler = (
16-
runId: string,
17-
failureInfo: {
18-
exitCode: number;
19-
reason: string;
20-
logs: string;
21-
}
22-
) => Promise<any>;
8+
type FailureDetails = Prettify<{
9+
exitCode: number;
10+
reason: string;
11+
logs: string;
12+
overrideCompletion: boolean;
13+
}>;
14+
15+
type IndexFailureHandler = (deploymentId: string, details: FailureDetails) => Promise<any>;
16+
17+
type RunFailureHandler = (runId: string, details: FailureDetails) => Promise<any>;
2318

2419
type TaskMonitorOptions = {
2520
runtimeEnv: "local" | "kubernetes";
@@ -144,8 +139,7 @@ export class TaskMonitor {
144139
const containerState = this.#getContainerStateSummary(containerStatus.state);
145140
const exitCode = containerState.exitCode ?? -1;
146141

147-
// We use this special exit code to signal any errors were already handled elsewhere
148-
if (exitCode === 111) {
142+
if (exitCode === EXIT_CODE_ALREADY_HANDLED) {
149143
return;
150144
}
151145

@@ -162,6 +156,7 @@ export class TaskMonitor {
162156

163157
let reason = rawReason || "Unknown error";
164158
let logs = rawLogs || "";
159+
let overrideCompletion = false;
165160

166161
switch (rawReason) {
167162
case "Error":
@@ -181,8 +176,10 @@ export class TaskMonitor {
181176
}
182177
break;
183178
case "OOMKilled":
184-
reason =
185-
"Process ran out of memory! Try choosing a machine preset with more memory for this task.";
179+
overrideCompletion = true;
180+
reason = `${
181+
exitCode === EXIT_CODE_CHILD_NONZERO ? "Child process" : "Parent process"
182+
} ran out of memory! Try choosing a machine preset with more memory for this task.`;
186183
break;
187184
default:
188185
break;
@@ -192,7 +189,8 @@ export class TaskMonitor {
192189
exitCode,
193190
reason,
194191
logs,
195-
};
192+
overrideCompletion,
193+
} satisfies FailureDetails;
196194

197195
const app = pod.metadata?.labels?.app;
198196

apps/webapp/app/v3/services/crashTaskRun.server.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export type CrashTaskRunServiceOptions = {
1313
logs?: string;
1414
crashAttempts?: boolean;
1515
crashedAt?: Date;
16+
overrideCompletion?: boolean;
1617
};
1718

1819
export class CrashTaskRunService extends BaseService {
@@ -36,7 +37,7 @@ export class CrashTaskRunService extends BaseService {
3637
}
3738

3839
// Make sure the task run is in a crashable state
39-
if (!isCrashableRunStatus(taskRun.status)) {
40+
if (!opts.overrideCompletion && !isCrashableRunStatus(taskRun.status)) {
4041
logger.error("Task run is not in a crashable state", { runId, status: taskRun.status });
4142
return;
4243
}

apps/webapp/app/v3/services/deploymentIndexFailed.server.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ export class DeploymentIndexFailed extends BaseService {
1818
message: string;
1919
stack?: string;
2020
stderr?: string;
21-
}
21+
},
22+
overrideCompletion = false
2223
) {
2324
const isFriendlyId = maybeFriendlyId.startsWith("deployment_");
2425

@@ -38,6 +39,15 @@ export class DeploymentIndexFailed extends BaseService {
3839
}
3940

4041
if (FINAL_DEPLOYMENT_STATUSES.includes(deployment.status)) {
42+
if (overrideCompletion) {
43+
logger.error("No support for overriding final deployment statuses just yet", {
44+
id: deployment.id,
45+
status: deployment.status,
46+
previousError: deployment.errorData,
47+
incomingError: error,
48+
});
49+
}
50+
4151
logger.error("Worker deployment already in final state", {
4252
id: deployment.id,
4353
status: deployment.status,

packages/cli-v3/src/workers/prod/entry-point.ts

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ import {
55
PreStopCauses,
66
ProdWorkerToCoordinatorMessages,
77
TaskResource,
8+
TaskRunErrorCodes,
89
TaskRunFailedExecutionResult,
910
WaitReason,
1011
} from "@trigger.dev/core/v3";
1112
import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
1213
import { HttpReply, getRandomPortNumber } from "@trigger.dev/core-apps/http";
1314
import { SimpleLogger } from "@trigger.dev/core-apps/logger";
15+
import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core-apps/process";
1416
import { readFile } from "node:fs/promises";
1517
import { createServer } from "node:http";
1618
import { ProdBackgroundWorker } from "./backgroundWorker";
@@ -97,12 +99,12 @@ class ProdWorker {
9799
logger.log("Unhandled signal", { signal });
98100
}
99101

100-
async #exitGracefully(gracefulExitTimeoutElapsed = false) {
102+
async #exitGracefully(gracefulExitTimeoutElapsed = false, exitCode = 0) {
101103
await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
102104

103105
if (!gracefulExitTimeoutElapsed) {
104106
// TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies
105-
process.exit(0);
107+
process.exit(exitCode);
106108
}
107109
}
108110

@@ -315,7 +317,11 @@ class ProdWorker {
315317
}
316318
}
317319

318-
async #prepareForRetry(willCheckpointAndRestore: boolean, shouldExit: boolean) {
320+
async #prepareForRetry(
321+
willCheckpointAndRestore: boolean,
322+
shouldExit: boolean,
323+
exitCode?: number
324+
) {
319325
logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit });
320326

321327
// Graceful shutdown on final attempt
@@ -324,7 +330,7 @@ class ProdWorker {
324330
logger.log("WARNING: Will checkpoint but also requested exit. This won't end well.");
325331
}
326332

327-
await this.#exitGracefully();
333+
await this.#exitGracefully(false, exitCode);
328334
return;
329335
}
330336

@@ -516,7 +522,14 @@ class ProdWorker {
516522

517523
logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
518524

519-
this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
525+
const exitCode =
526+
!completion.ok &&
527+
completion.error.type === "INTERNAL_ERROR" &&
528+
completion.error.code === TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE
529+
? EXIT_CODE_CHILD_NONZERO
530+
: 0;
531+
532+
this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
520533
} catch (error) {
521534
const completion: TaskRunFailedExecutionResult = {
522535
ok: false,
@@ -709,8 +722,8 @@ class ProdWorker {
709722
}
710723

711724
await setTimeout(200);
712-
// Use exit code 111 so we can ignore those failures in the task monitor
713-
process.exit(111);
725+
726+
process.exit(EXIT_CODE_ALREADY_HANDLED);
714727
}
715728
}
716729

packages/core-apps/src/process.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
export const EXIT_CODE_ALREADY_HANDLED = 111;
2+
export const EXIT_CODE_CHILD_NONZERO = 112;

packages/core-apps/src/provider.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { getRandomPortNumber, HttpReply, getTextBody } from "./http";
1414
import { SimpleLogger } from "./logger";
1515
import { isExecaChildProcess } from "./checkpoints";
1616
import { setTimeout } from "node:timers/promises";
17+
import { EXIT_CODE_ALREADY_HANDLED } from "./process";
1718

1819
const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
1920
const MACHINE_NAME = process.env.MACHINE_NAME || "local";
@@ -198,7 +199,7 @@ export class ProviderShell implements Provider {
198199
stderr: error.stderr,
199200
});
200201

201-
if (error.exitCode === 111) {
202+
if (error.exitCode === EXIT_CODE_ALREADY_HANDLED) {
202203
logger.error("Index failure already reported by the worker", {
203204
socketMessage: message,
204205
});

packages/core/src/v3/schemas/messages.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ export const ProviderToPlatformMessages = {
334334
exitCode: z.number().optional(),
335335
message: z.string().optional(),
336336
logs: z.string().optional(),
337+
overrideCompletion: z.boolean().optional(),
337338
}),
338339
},
339340
INDEXING_FAILED: {
@@ -346,6 +347,7 @@ export const ProviderToPlatformMessages = {
346347
stack: z.string().optional(),
347348
stderr: z.string().optional(),
348349
}),
350+
overrideCompletion: z.string().optional(),
349351
}),
350352
},
351353
};

0 commit comments

Comments
 (0)