Skip to content

Commit 79f2c8b

Browse files
committed
only replay rpcs on true reconnects
1 parent 7d2e5a4 commit 79f2c8b

File tree

2 files changed

+140
-116
lines changed

2 files changed

+140
-116
lines changed

apps/coordinator/src/chaosMonkey.ts

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ class ChaosMonkeyError extends Error {
1010

1111
export class ChaosMonkey {
1212
private chaosEventRate = 0.2;
13+
private delayInSeconds = 45;
1314

1415
constructor(private enabled = false) {
1516
if (this.enabled) {
@@ -56,9 +57,9 @@ export class ChaosMonkey {
5657
console.log("🍌 Chaos monkey: Add delay");
5758

5859
if ($) {
59-
await $`sleep 300`;
60+
await $`sleep ${this.delayInSeconds}`;
6061
} else {
61-
await timeout(300_000);
62+
await timeout(this.delayInSeconds * 1000);
6263
}
6364
});
6465
}

packages/cli-v3/src/workers/prod/entry-point.ts

Lines changed: 137 additions & 114 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ class ProdWorker {
6161

6262
private nextResumeAfter?: WaitReason;
6363
private waitForPostStart = false;
64+
private connectionCount = 0;
6465

6566
private waitForTaskReplay:
6667
| {
@@ -160,6 +161,7 @@ class ProdWorker {
160161
this.waitForPostStart = false;
161162

162163
this.#coordinatorSocket.close();
164+
this.connectionCount = 0;
163165

164166
let coordinatorHost = COORDINATOR_HOST;
165167

@@ -187,6 +189,7 @@ class ProdWorker {
187189
}
188190
}
189191

192+
// MARK: TASK WAIT
190193
async #waitForTaskHandler(message: OnWaitForTaskMessage, replayIdempotencyKey?: string) {
191194
const waitForTask = await defaultBackoff.execute(async ({ retry }) => {
192195
logger.log("Wait for task with backoff", { retry });
@@ -223,6 +226,7 @@ class ProdWorker {
223226
await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
224227

225228
if (willCheckpointAndRestore) {
229+
// TODO: where's the timeout?
226230
// We need to replay this on next connection if we don't receive RESUME_AFTER_DEPENDENCY within a reasonable time
227231
if (!this.waitForTaskReplay) {
228232
this.waitForTaskReplay = {
@@ -246,6 +250,7 @@ class ProdWorker {
246250
}
247251
}
248252

253+
// MARK: BATCH WAIT
249254
async #waitForBatchHandler(message: OnWaitForBatchMessage, replayIdempotencyKey?: string) {
250255
const waitForBatch = await defaultBackoff.execute(async ({ retry }) => {
251256
logger.log("Wait for batch with backoff", { retry });
@@ -306,6 +311,7 @@ class ProdWorker {
306311
}
307312
}
308313

314+
// MARK: WORKER CREATION
309315
#createBackgroundWorker() {
310316
const backgroundWorker = new ProdBackgroundWorker("worker.js", {
311317
projectConfig: __PROJECT_CONFIG__,
@@ -520,6 +526,7 @@ class ProdWorker {
520526
await this.#prepareForCheckpoint();
521527
}
522528

529+
// MARK: RETRY PREP
523530
async #prepareForRetry(
524531
willCheckpointAndRestore: boolean,
525532
shouldExit: boolean,
@@ -553,6 +560,7 @@ class ProdWorker {
553560
await this.#prepareForCheckpoint(false);
554561
}
555562

563+
// MARK: CHECKPOINT PREP
556564
async #prepareForCheckpoint(flush = true) {
557565
if (flush) {
558566
// Flush before checkpointing so we don't flush the same spans again after restore
@@ -664,6 +672,7 @@ class ProdWorker {
664672
});
665673
}
666674

675+
// MARK: ATTEMPT COMPLETION
667676
async #submitAttemptCompletion(
668677
execution: ProdTaskRunExecution,
669678
completion: TaskRunExecutionResult,
@@ -742,6 +751,7 @@ class ProdWorker {
742751
return headers;
743752
}
744753

754+
// MARK: COORDINATOR SOCKET
745755
#createCoordinatorSocket(host: string) {
746756
const extraHeaders = this.#returnValidatedExtraHeaders({
747757
"x-machine-name": MACHINE_NAME,
@@ -943,8 +953,12 @@ class ProdWorker {
943953
await this.#readyForLazyAttempt();
944954
},
945955
},
956+
// MARK: ON CONNECTION
946957
onConnection: async (socket, handler, sender, logger) => {
947-
logger.log("connected to coordinator", { status: this.#status });
958+
logger.log("connected to coordinator", {
959+
status: this.#status,
960+
connectionCount: ++this.connectionCount,
961+
});
948962

949963
// We need to send our current state to the coordinator
950964
socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId });
@@ -1101,153 +1115,161 @@ class ProdWorker {
11011115
} catch (error) {
11021116
logger.error("connection handler error", { error });
11031117
} finally {
1104-
const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
1105-
const cancellationDelay = 20_000;
1118+
if (this.connectionCount === 1) {
1119+
// Skip replays if this is the first connection, including post start
1120+
return;
1121+
}
11061122

1107-
if (this.waitForTaskReplay) {
1108-
logger.log("replaying wait for task", { ...this.waitForTaskReplay });
1123+
// This is a reconnect, so handle replays
1124+
this.#handleReplays();
1125+
}
1126+
},
1127+
onError: async (socket, err, logger) => {
1128+
logger.error("onError", {
1129+
error: {
1130+
name: err.name,
1131+
message: err.message,
1132+
},
1133+
});
1134+
},
1135+
});
11091136

1110-
const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
1137+
return coordinatorConnection;
1138+
}
11111139

1112-
// Give the platform some time to send RESUME_AFTER_DEPENDENCY
1113-
await timeout(cancellationDelay);
1140+
// MARK: REPLAYS
1141+
async #handleReplays() {
1142+
const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
1143+
const replayCancellationDelay = 20_000;
11141144

1115-
if (!this.waitForTaskReplay) {
1116-
logger.error("wait for task replay cancelled, discarding", {
1117-
originalMessage: { idempotencyKey, message, attempt },
1118-
});
1145+
if (this.waitForTaskReplay) {
1146+
logger.log("replaying wait for task", { ...this.waitForTaskReplay });
11191147

1120-
return;
1121-
}
1148+
const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
11221149

1123-
if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
1124-
logger.error("wait for task replay idempotency key mismatch, discarding", {
1125-
originalMessage: { idempotencyKey, message, attempt },
1126-
newMessage: this.waitForTaskReplay,
1127-
});
1150+
// Give the platform some time to send RESUME_AFTER_DEPENDENCY
1151+
await timeout(replayCancellationDelay);
11281152

1129-
return;
1130-
}
1153+
if (!this.waitForTaskReplay) {
1154+
logger.error("wait for task replay cancelled, discarding", {
1155+
originalMessage: { idempotencyKey, message, attempt },
1156+
});
11311157

1132-
try {
1133-
await backoff.wait(attempt + 1);
1158+
return;
1159+
}
11341160

1135-
await this.#waitForTaskHandler(message);
1136-
} catch (error) {
1137-
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1138-
logger.error("wait for task replay retry limit exceeded", { error });
1139-
} else {
1140-
logger.error("wait for task replay error", { error });
1141-
}
1142-
}
1161+
if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
1162+
logger.error("wait for task replay idempotency key mismatch, discarding", {
1163+
originalMessage: { idempotencyKey, message, attempt },
1164+
newMessage: this.waitForTaskReplay,
1165+
});
11431166

1144-
return;
1145-
}
1167+
return;
1168+
}
11461169

1147-
if (this.waitForBatchReplay) {
1148-
logger.log("replaying wait for batch", {
1149-
...this.waitForBatchReplay,
1150-
cancellationDelay,
1151-
});
1170+
try {
1171+
await backoff.wait(attempt + 1);
11521172

1153-
const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
1173+
await this.#waitForTaskHandler(message);
1174+
} catch (error) {
1175+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1176+
logger.error("wait for task replay retry limit exceeded", { error });
1177+
} else {
1178+
logger.error("wait for task replay error", { error });
1179+
}
1180+
}
11541181

1155-
// Give the platform some time to send RESUME_AFTER_DEPENDENCY
1156-
await timeout(cancellationDelay);
1182+
return;
1183+
}
11571184

1158-
if (!this.waitForBatchReplay) {
1159-
logger.error("wait for batch replay cancelled, discarding", {
1160-
originalMessage: { idempotencyKey, message, attempt },
1161-
});
1185+
if (this.waitForBatchReplay) {
1186+
logger.log("replaying wait for batch", {
1187+
...this.waitForBatchReplay,
1188+
cancellationDelay: replayCancellationDelay,
1189+
});
11621190

1163-
return;
1164-
}
1191+
const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
11651192

1166-
if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
1167-
logger.error("wait for batch replay idempotency key mismatch, discarding", {
1168-
originalMessage: { idempotencyKey, message, attempt },
1169-
newMessage: this.waitForBatchReplay,
1170-
});
1193+
// Give the platform some time to send RESUME_AFTER_DEPENDENCY
1194+
await timeout(replayCancellationDelay);
11711195

1172-
return;
1173-
}
1196+
if (!this.waitForBatchReplay) {
1197+
logger.error("wait for batch replay cancelled, discarding", {
1198+
originalMessage: { idempotencyKey, message, attempt },
1199+
});
11741200

1175-
try {
1176-
await backoff.wait(attempt + 1);
1201+
return;
1202+
}
11771203

1178-
await this.#waitForBatchHandler(message);
1179-
} catch (error) {
1180-
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1181-
logger.error("wait for batch replay retry limit exceeded", { error });
1182-
} else {
1183-
logger.error("wait for batch replay error", { error });
1184-
}
1185-
}
1204+
if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
1205+
logger.error("wait for batch replay idempotency key mismatch, discarding", {
1206+
originalMessage: { idempotencyKey, message, attempt },
1207+
newMessage: this.waitForBatchReplay,
1208+
});
11861209

1187-
return;
1188-
}
1210+
return;
1211+
}
11891212

1190-
if (this.submitAttemptCompletionReplay) {
1191-
logger.log("replaying attempt completion", {
1192-
...this.submitAttemptCompletionReplay,
1193-
cancellationDelay,
1194-
});
1213+
try {
1214+
await backoff.wait(attempt + 1);
11951215

1196-
const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
1216+
await this.#waitForBatchHandler(message);
1217+
} catch (error) {
1218+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1219+
logger.error("wait for batch replay retry limit exceeded", { error });
1220+
} else {
1221+
logger.error("wait for batch replay error", { error });
1222+
}
1223+
}
11971224

1198-
// Give the platform some time to send READY_FOR_RETRY
1199-
await timeout(cancellationDelay);
1225+
return;
1226+
}
12001227

1201-
if (!this.submitAttemptCompletionReplay) {
1202-
logger.error("attempt completion replay cancelled, discarding", {
1203-
originalMessage: { idempotencyKey, message, attempt },
1204-
});
1228+
if (this.submitAttemptCompletionReplay) {
1229+
logger.log("replaying attempt completion", {
1230+
...this.submitAttemptCompletionReplay,
1231+
cancellationDelay: replayCancellationDelay,
1232+
});
12051233

1206-
return;
1207-
}
1234+
const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
12081235

1209-
if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
1210-
logger.error("attempt completion replay idempotency key mismatch, discarding", {
1211-
originalMessage: { idempotencyKey, message, attempt },
1212-
newMessage: this.submitAttemptCompletionReplay,
1213-
});
1236+
// Give the platform some time to send READY_FOR_RETRY
1237+
await timeout(replayCancellationDelay);
12141238

1215-
return;
1216-
}
1239+
if (!this.submitAttemptCompletionReplay) {
1240+
logger.error("attempt completion replay cancelled, discarding", {
1241+
originalMessage: { idempotencyKey, message, attempt },
1242+
});
12171243

1218-
try {
1219-
await backoff.wait(attempt + 1);
1244+
return;
1245+
}
12201246

1221-
await this.#submitAttemptCompletion(
1222-
message.execution,
1223-
message.completion,
1224-
idempotencyKey
1225-
);
1226-
} catch (error) {
1227-
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1228-
logger.error("attempt completion replay retry limit exceeded", { error });
1229-
} else {
1230-
logger.error("attempt completion replay error", { error });
1231-
}
1232-
}
1247+
if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
1248+
logger.error("attempt completion replay idempotency key mismatch, discarding", {
1249+
originalMessage: { idempotencyKey, message, attempt },
1250+
newMessage: this.submitAttemptCompletionReplay,
1251+
});
12331252

1234-
return;
1235-
}
1253+
return;
1254+
}
1255+
1256+
try {
1257+
await backoff.wait(attempt + 1);
1258+
1259+
await this.#submitAttemptCompletion(message.execution, message.completion, idempotencyKey);
1260+
} catch (error) {
1261+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
1262+
logger.error("attempt completion replay retry limit exceeded", { error });
1263+
} else {
1264+
logger.error("attempt completion replay error", { error });
12361265
}
1237-
},
1238-
onError: async (socket, err, logger) => {
1239-
logger.error("onError", {
1240-
error: {
1241-
name: err.name,
1242-
message: err.message,
1243-
},
1244-
});
1245-
},
1246-
});
1266+
}
12471267

1248-
return coordinatorConnection;
1268+
return;
1269+
}
12491270
}
12501271

1272+
// MARK: HTTP SERVER
12511273
#createHttpServer() {
12521274
const httpServer = createServer(async (req, res) => {
12531275
logger.log(`[${req.method}]`, req.url);
@@ -1273,6 +1295,7 @@ class ProdWorker {
12731295

12741296
case "/close": {
12751297
this.#coordinatorSocket.close();
1298+
this.connectionCount = 0;
12761299

12771300
return reply.text("Disconnected from coordinator");
12781301
}

0 commit comments

Comments
 (0)