Skip to content

Commit c8dec74

Browse files
committed
Fix for OOM retrying
1 parent c46d560 commit c8dec74

File tree

2 files changed

+26
-23
lines changed

2 files changed

+26
-23
lines changed

internal-packages/run-engine/src/engine/retrying.ts

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
TaskRunExecutionRetry,
77
taskRunErrorEnhancer,
88
sanitizeError,
9+
calculateNextRetryDelay,
910
} from "@trigger.dev/core/v3";
1011
import { PrismaClientOrTransaction, TaskRunStatus } from "@trigger.dev/database";
1112
import { MAX_TASK_RUN_ATTEMPTS } from "./consts";
@@ -47,26 +48,33 @@ export async function retryOutcomeFromCompletion(
4748

4849
const sanitizedError = sanitizeError(error);
4950

50-
// No retry settings
51-
if (!retrySettings) {
52-
return { outcome: "fail_run", sanitizedError };
53-
}
54-
5551
// OOM error (retry on a larger machine or fail)
5652
if (isOOMRunError(error)) {
57-
const newMachine = await retryOOMOnMachine(prisma, runId);
58-
if (!newMachine) {
53+
const oomResult = await retryOOMOnMachine(prisma, runId);
54+
if (!oomResult) {
55+
return { outcome: "fail_run", sanitizedError, wasOOMError: true };
56+
}
57+
58+
const delay = calculateNextRetryDelay(oomResult.retrySettings, attemptNumber ?? 1);
59+
60+
if (!delay) {
61+
//no more retries left
5962
return { outcome: "fail_run", sanitizedError, wasOOMError: true };
6063
}
6164

6265
return {
6366
outcome: "retry",
6467
method: "queue",
65-
settings: retrySettings,
66-
machine: newMachine,
68+
machine: oomResult.machine,
69+
settings: { timestamp: Date.now() + delay, delay },
6770
};
6871
}
6972

73+
// No retry settings
74+
if (!retrySettings) {
75+
return { outcome: "fail_run", sanitizedError };
76+
}
77+
7078
// Not a retriable error: fail
7179
const retriableError = shouldRetryError(taskRunErrorEnhancer(error));
7280
if (!retriableError) {
@@ -112,7 +120,7 @@ export async function retryOutcomeFromCompletion(
112120
async function retryOOMOnMachine(
113121
prisma: PrismaClientOrTransaction,
114122
runId: string
115-
): Promise<string | undefined> {
123+
): Promise<{ machine: string; retrySettings: RetryOptions } | undefined> {
116124
try {
117125
const run = await prisma.taskRun.findFirst({
118126
where: {
@@ -153,7 +161,7 @@ async function retryOOMOnMachine(
153161
return;
154162
}
155163

156-
return retryMachine;
164+
return { machine: retryMachine, retrySettings: parsedRetryConfig.data };
157165
} catch (error) {
158166
console.error("[FailedTaskRunRetryHelper] Failed to get execution retry", {
159167
runId,

internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { trace } from "@opentelemetry/api";
88
import { expect } from "vitest";
99
import { EventBusEventArgs } from "../eventBus.js";
1010
import { RunEngine } from "../index.js";
11+
import { setTimeout } from "node:timers/promises";
1112

1213
describe("RunEngine attempt failures", () => {
1314
containerTest(
@@ -479,10 +480,6 @@ describe("RunEngine attempt failures", () => {
479480
ok: false,
480481
id: dequeued[0].run.id,
481482
error,
482-
retry: {
483-
timestamp: Date.now(),
484-
delay: 0,
485-
},
486483
},
487484
});
488485

@@ -603,10 +600,6 @@ describe("RunEngine attempt failures", () => {
603600
ok: false,
604601
id: dequeued[0].run.id,
605602
error,
606-
retry: {
607-
timestamp: Date.now(),
608-
delay: 0,
609-
},
610603
},
611604
});
612605

@@ -714,6 +707,8 @@ describe("RunEngine attempt failures", () => {
714707

715708
//create background worker
716709
await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, {
710+
maxTimeoutInMs: 10,
711+
maxAttempts: 10,
717712
outOfMemory: {
718713
machine: "small-2x",
719714
},
@@ -768,10 +763,6 @@ describe("RunEngine attempt failures", () => {
768763
ok: false,
769764
id: dequeued[0].run.id,
770765
error,
771-
retry: {
772-
timestamp: Date.now(),
773-
delay: 0,
774-
},
775766
},
776767
});
777768

@@ -787,12 +778,16 @@ describe("RunEngine attempt failures", () => {
787778
expect(executionData.run.attemptNumber).toBe(1);
788779
expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE");
789780

781+
//wait for 1s
782+
await setTimeout(1_000);
783+
790784
//dequeue again
791785
const dequeued2 = await engine.dequeueFromMasterQueue({
792786
consumerId: "test_12345",
793787
masterQueue: run.masterQueue,
794788
maxRunCount: 10,
795789
});
790+
expect(dequeued2.length).toBe(1);
796791

797792
//create second attempt
798793
const attemptResult2 = await engine.startRunAttempt({

0 commit comments

Comments
 (0)