Skip to content

Commit 58567bf

Browse files
authored
Add v4 timeline metrics to prod runs (#1884)
* set correct worker name on create * dev: kill run process on failed cancel attempt * add missing prod timeline events * add more debug logs after api call failures * import events are always visible * set dequeued at on new runners * skip fork event test * fix timeline test for import event
1 parent e7bca6f commit 58567bf

File tree

8 files changed

+126
-22
lines changed

8 files changed

+126
-22
lines changed

apps/supervisor/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ class ManagedSupervisor {
224224

225225
try {
226226
await this.workloadManager.create({
227+
dequeuedAt: message.dequeuedAt,
227228
envId: message.environment.id,
228229
envType: message.environment.type,
229230
image: message.image,

apps/supervisor/src/workloadManager/docker.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ export class DockerWorkloadManager implements WorkloadManager {
2828
"run",
2929
"--detach",
3030
`--network=${env.DOCKER_NETWORK}`,
31+
`--env=TRIGGER_DEQUEUED_AT_MS=${opts.dequeuedAt.getTime()}`,
32+
`--env=TRIGGER_POD_SCHEDULED_AT_MS=${Date.now()}`,
3133
`--env=TRIGGER_ENV_ID=${opts.envId}`,
3234
`--env=TRIGGER_RUN_ID=${opts.runFriendlyId}`,
3335
`--env=TRIGGER_SNAPSHOT_ID=${opts.snapshotFriendlyId}`,

apps/supervisor/src/workloadManager/kubernetes.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ export class KubernetesWorkloadManager implements WorkloadManager {
6161
],
6262
resources: this.#getResourcesForMachine(opts.machine),
6363
env: [
64+
{
65+
name: "TRIGGER_DEQUEUED_AT_MS",
66+
value: opts.dequeuedAt.getTime().toString(),
67+
},
68+
{
69+
name: "TRIGGER_POD_SCHEDULED_AT_MS",
70+
value: Date.now().toString(),
71+
},
6472
{
6573
name: "TRIGGER_RUN_ID",
6674
value: opts.runFriendlyId,
@@ -97,7 +105,11 @@ export class KubernetesWorkloadManager implements WorkloadManager {
97105
},
98106
{
99107
name: "TRIGGER_WORKER_INSTANCE_NAME",
100-
value: env.TRIGGER_WORKER_INSTANCE_NAME,
108+
valueFrom: {
109+
fieldRef: {
110+
fieldPath: "spec.nodeName",
111+
},
112+
},
101113
},
102114
{
103115
name: "OTEL_EXPORTER_OTLP_ENDPOINT",

apps/supervisor/src/workloadManager/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ export interface WorkloadManagerCreateOptions {
2121
machine: MachinePreset;
2222
version: string;
2323
nextAttemptNumber?: number;
24+
dequeuedAt: Date;
2425
// identifiers
2526
envId: string;
2627
envType: EnvironmentType;

apps/webapp/app/utils/timelineSpanEvents.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ function getAdminOnlyForEvent(event: string): boolean {
171171
return true;
172172
}
173173
case "import": {
174-
return true;
174+
return false;
175175
}
176176
case "lazy_payload": {
177177
return true;

apps/webapp/test/timelineSpanEvents.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ describe("createTimelineSpanEventsFromSpanEvents", () => {
6262
const result = createTimelineSpanEventsFromSpanEvents(sampleSpanEvents, false);
6363

6464
// Only dequeue and fork events should be visible for non-admins
65-
expect(result.length).toBe(2);
65+
expect(result.length).toBe(3);
6666
expect(result.some((event) => event.name === "Dequeued")).toBe(true);
6767
expect(result.some((event) => event.name === "Launched")).toBe(true);
6868
expect(result.some((event) => event.name === "Attempt created")).toBe(false);
69-
expect(result.some((event) => event.name.includes("Importing"))).toBe(false);
69+
expect(result.some((event) => event.name.includes("Importing"))).toBe(true);
7070
});
7171

7272
test("should include all events when isAdmin is true", () => {
@@ -220,7 +220,7 @@ describe("createTimelineSpanEventsFromSpanEvents", () => {
220220
expect(result.some((event) => event.name === "Attempt created")).toBe(false);
221221
});
222222

223-
test("should filter import events for non-admin when fork event exists", () => {
223+
test.skip("should filter import events for non-admin when fork event exists", () => {
224224
const result = createTimelineSpanEventsFromSpanEvents(sampleSpanEvents, false);
225225

226226
// With fork event, import should be hidden for non-admins

packages/cli-v3/src/entryPoints/dev-run-controller.ts

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -369,11 +369,15 @@ export class DevRunController {
369369
try {
370370
await this.cancelAttempt();
371371
} catch (error) {
372-
logger.debug("Failed to cancel attempt, shutting down", {
372+
logger.debug("Failed to cancel attempt, killing task run process", {
373373
error,
374374
});
375375

376-
//todo kill the process?
376+
try {
377+
await this.taskRunProcess?.kill("SIGKILL");
378+
} catch (error) {
379+
logger.debug("Failed to cancel attempt, failed to kill task run process", { error });
380+
}
377381

378382
return;
379383
}
@@ -512,8 +516,6 @@ export class DevRunController {
512516
snapshot: snapshot.friendlyId,
513517
});
514518

515-
// TODO: We may already be executing this run, this may be a new attempt
516-
// This is the only case where incrementing the attempt number is allowed
517519
this.enterRunPhase(run, snapshot);
518520

519521
const metrics = [
@@ -539,9 +541,6 @@ export class DevRunController {
539541
try {
540542
return await this.executeRun({ run, snapshot, execution, envVars, metrics });
541543
} catch (error) {
542-
// TODO: Handle the case where we're in the warm start phase or executing a new run
543-
// This can happen if we kill the run while it's still executing, e.g. after receiving an attempt number mismatch
544-
545544
logger.debug("Error while executing attempt", {
546545
error,
547546
});
@@ -570,8 +569,6 @@ export class DevRunController {
570569
error: completionResult.error,
571570
});
572571

573-
// TODO: Maybe we should keep retrying for a while longer
574-
575572
this.runFinished();
576573
return;
577574
}

packages/cli-v3/src/entryPoints/managed-run-controller.ts

Lines changed: 99 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
type CompleteRunAttemptResult,
99
HeartbeatService,
1010
type RunExecutionData,
11+
type TaskRunExecutionMetrics,
1112
type TaskRunExecutionResult,
1213
type TaskRunFailedExecutionResult,
1314
WorkerManifest,
@@ -25,6 +26,11 @@ import { assertExhaustive } from "../utilities/assertExhaustive.js";
2526
import { setTimeout as sleep } from "timers/promises";
2627
import { io, type Socket } from "socket.io-client";
2728

29+
const DateEnv = z
30+
.string()
31+
.transform((val) => new Date(parseInt(val, 10)))
32+
.pipe(z.date());
33+
2834
// All IDs are friendly IDs
2935
const Env = z.object({
3036
// Set at build time
@@ -50,6 +56,10 @@ const Env = z.object({
5056
TRIGGER_RUNNER_ID: z.string(),
5157
TRIGGER_METADATA_URL: z.string().optional(),
5258

59+
// Timeline metrics
60+
TRIGGER_POD_SCHEDULED_AT_MS: DateEnv,
61+
TRIGGER_DEQUEUED_AT_MS: DateEnv,
62+
5363
// May be overridden
5464
TRIGGER_SUPERVISOR_API_PROTOCOL: z.enum(["http", "https"]),
5565
TRIGGER_SUPERVISOR_API_DOMAIN: z.string(),
@@ -238,6 +248,14 @@ class ManagedRunController {
238248

239249
if (!response.success) {
240250
console.error("[ManagedRunController] Heartbeat failed", { error: response.error });
251+
252+
this.sendDebugLog({
253+
runId: this.runFriendlyId,
254+
message: "heartbeat: failed",
255+
properties: {
256+
error: response.error,
257+
},
258+
});
241259
}
242260
},
243261
intervalMs: this.heartbeatIntervalSeconds * 1000,
@@ -620,6 +638,14 @@ class ManagedRunController {
620638
if (!continuationResult.success) {
621639
console.error("Failed to continue execution", { error: continuationResult.error });
622640

641+
this.sendDebugLog({
642+
runId: run.friendlyId,
643+
message: "failed to continue execution",
644+
properties: {
645+
error: continuationResult.error,
646+
},
647+
});
648+
623649
this.waitForNextRun();
624650
return;
625651
}
@@ -734,10 +760,14 @@ class ManagedRunController {
734760
private async startAndExecuteRunAttempt({
735761
runFriendlyId,
736762
snapshotFriendlyId,
763+
dequeuedAt,
764+
podScheduledAt,
737765
isWarmStart = false,
738766
}: {
739767
runFriendlyId: string;
740768
snapshotFriendlyId: string;
769+
dequeuedAt?: Date;
770+
podScheduledAt?: Date;
741771
isWarmStart?: boolean;
742772
}) {
743773
if (!this.socket) {
@@ -749,39 +779,79 @@ class ManagedRunController {
749779
snapshot: { friendlyId: snapshotFriendlyId },
750780
});
751781

782+
const attemptStartedAt = Date.now();
783+
752784
const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, {
753785
isWarmStart,
754786
});
755787

756788
if (!start.success) {
757789
console.error("[ManagedRunController] Failed to start run", { error: start.error });
758790

791+
this.sendDebugLog({
792+
runId: runFriendlyId,
793+
message: "failed to start run attempt",
794+
properties: {
795+
error: start.error,
796+
},
797+
});
798+
759799
this.waitForNextRun();
760800
return;
761801
}
762802

803+
const attemptDuration = Date.now() - attemptStartedAt;
804+
763805
const { run, snapshot, execution, envVars } = start.data;
764806

765807
logger.debug("[ManagedRunController] Started run", {
766808
runId: run.friendlyId,
767809
snapshot: snapshot.friendlyId,
768810
});
769811

770-
// TODO: We may already be executing this run, this may be a new attempt
771-
// This is the only case where incrementing the attempt number is allowed
772812
this.enterRunPhase(run, snapshot);
773813

814+
const metrics = [
815+
{
816+
name: "start",
817+
event: "create_attempt",
818+
timestamp: attemptStartedAt,
819+
duration: attemptDuration,
820+
},
821+
]
822+
.concat(
823+
dequeuedAt
824+
? [
825+
{
826+
name: "start",
827+
event: "dequeue",
828+
timestamp: dequeuedAt.getTime(),
829+
duration: 0,
830+
},
831+
]
832+
: []
833+
)
834+
.concat(
835+
podScheduledAt
836+
? [
837+
{
838+
name: "start",
839+
event: "pod_scheduled",
840+
timestamp: podScheduledAt.getTime(),
841+
duration: 0,
842+
},
843+
]
844+
: []
845+
) satisfies TaskRunExecutionMetrics;
846+
774847
const taskRunEnv = {
775848
...gatherProcessEnv(),
776849
...envVars,
777850
};
778851

779852
try {
780-
return await this.executeRun({ run, snapshot, envVars: taskRunEnv, execution });
853+
return await this.executeRun({ run, snapshot, envVars: taskRunEnv, execution, metrics });
781854
} catch (error) {
782-
// TODO: Handle the case where we're in the warm start phase or executing a new run
783-
// This can happen if we kill the run while it's still executing, e.g. after receiving an attempt number mismatch
784-
785855
console.error("Error while executing attempt", {
786856
error,
787857
});
@@ -810,7 +880,13 @@ class ManagedRunController {
810880
error: completionResult.error,
811881
});
812882

813-
// TODO: Maybe we should keep retrying for a while longer
883+
this.sendDebugLog({
884+
runId: run.friendlyId,
885+
message: "completion: failed to submit after error",
886+
properties: {
887+
error: completionResult.error,
888+
},
889+
});
814890

815891
this.waitForNextRun();
816892
return;
@@ -923,6 +999,7 @@ class ManagedRunController {
923999
this.startAndExecuteRunAttempt({
9241000
runFriendlyId: nextRun.run.friendlyId,
9251001
snapshotFriendlyId: nextRun.snapshot.friendlyId,
1002+
dequeuedAt: nextRun.dequeuedAt,
9261003
isWarmStart: true,
9271004
}).finally(() => {});
9281005
return;
@@ -1032,7 +1109,10 @@ class ManagedRunController {
10321109
snapshot,
10331110
envVars,
10341111
execution,
1035-
}: WorkloadRunAttemptStartResponseBody) {
1112+
metrics,
1113+
}: WorkloadRunAttemptStartResponseBody & {
1114+
metrics?: TaskRunExecutionMetrics;
1115+
}) {
10361116
this.snapshotPoller.start();
10371117

10381118
if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) {
@@ -1058,6 +1138,7 @@ class ManagedRunController {
10581138
payload: {
10591139
execution,
10601140
traceContext: execution.run.traceContext ?? {},
1141+
metrics,
10611142
},
10621143
messageId: run.friendlyId,
10631144
env: envVars,
@@ -1096,6 +1177,14 @@ class ManagedRunController {
10961177
error: completionResult.error,
10971178
});
10981179

1180+
this.sendDebugLog({
1181+
runId: run.friendlyId,
1182+
message: "completion: failed to submit",
1183+
properties: {
1184+
error: completionResult.error,
1185+
},
1186+
});
1187+
10991188
this.waitForNextRun();
11001189
return;
11011190
}
@@ -1212,6 +1301,8 @@ class ManagedRunController {
12121301
this.startAndExecuteRunAttempt({
12131302
runFriendlyId: env.TRIGGER_RUN_ID,
12141303
snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID,
1304+
dequeuedAt: env.TRIGGER_DEQUEUED_AT_MS,
1305+
podScheduledAt: env.TRIGGER_POD_SCHEDULED_AT_MS,
12151306
}).finally(() => {});
12161307
return;
12171308
}

0 commit comments

Comments
 (0)