Skip to content

Commit 5fa2acc

Browse files
committed
[server] Make sure to fail a WorkspaceInstance on any error
1 parent 0d936f1 commit 5fa2acc

File tree

2 files changed

+32
-32
lines changed

2 files changed

+32
-32
lines changed

components/server/src/prometheus-metrics.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ export type FailedInstanceStartReason =
200200
| "clusterSelectionFailed"
201201
| "startOnClusterFailed"
202202
| "imageBuildFailed"
203+
| "imageBuildFailedUser"
203204
| "resourceExhausted"
204205
| "workspaceClusterMaintenance"
205206
| "other";

components/server/src/workspace/workspace-starter.ts

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,7 @@ export class WorkspaceStarter {
557557
additionalAuth,
558558
forceRebuild,
559559
forceRebuild,
560+
abortSignal,
560561
region,
561562
);
562563

@@ -577,23 +578,23 @@ export class WorkspaceStarter {
577578
startRequest.setSpec(spec);
578579
startRequest.setServicePrefix(workspace.id);
579580

580-
if (instance.status.phase === "pending") {
581-
// due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
582-
const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
583-
if (workspaceAlreadyExists) {
584-
log.debug(
585-
{ instanceId: instance.id, workspaceId: instance.workspaceId },
586-
"workspace already exists, not starting again",
587-
{ phase: instance.status.phase },
588-
);
589-
return;
590-
}
591-
}
592-
593581
// choose a cluster and start the instance
594582
let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
595583
let retries = 0;
596584
try {
585+
if (instance.status.phase === "pending") {
586+
// due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
587+
const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
588+
if (workspaceAlreadyExists) {
589+
log.debug(
590+
{ instanceId: instance.id, workspaceId: instance.workspaceId },
591+
"workspace already exists, not starting again",
592+
{ phase: instance.status.phase },
593+
);
594+
return;
595+
}
596+
}
597+
597598
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
598599
if (abortSignal.aborted) {
599600
return;
@@ -657,6 +658,12 @@ export class WorkspaceStarter {
657658
});
658659
}
659660
} catch (err) {
661+
if (!(err instanceof StartInstanceError)) {
662+
// fallback in case we did not already handle this error
663+
await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
664+
err = new StartInstanceError("other", err); // don't throw because there's nobody catching it. We just want to log/trace it.
665+
}
666+
660667
this.logAndTraceStartWorkspaceError({ span }, logCtx, err);
661668
} finally {
662669
if (abortSignal.aborted) {
@@ -809,8 +816,9 @@ export class WorkspaceStarter {
809816
// We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
810817
// We have to set that status ourselves.
811818
instance.status.phase = "stopped";
812-
instance.stoppingTime = new Date().toISOString();
813-
instance.stoppedTime = new Date().toISOString();
819+
const now = new Date().toISOString();
820+
instance.stoppingTime = now;
821+
instance.stoppedTime = now;
814822

815823
instance.status.conditions.failed = err.toString();
816824
instance.status.message = `Workspace cannot be started: ${err}`;
@@ -1199,6 +1207,7 @@ export class WorkspaceStarter {
11991207
additionalAuth: Map<string, string>,
12001208
ignoreBaseImageresolvedAndRebuildBase: boolean = false,
12011209
forceRebuild: boolean = false,
1210+
abortSignal: RedlockAbortSignal,
12021211
region?: WorkspaceRegion,
12031212
): Promise<WorkspaceInstance> {
12041213
const span = TraceContext.startSpan("buildWorkspaceImage", ctx);
@@ -1300,6 +1309,7 @@ export class WorkspaceStarter {
13001309
additionalAuth,
13011310
true,
13021311
forceRebuild,
1312+
abortSignal,
13031313
region,
13041314
);
13051315
} else {
@@ -1336,24 +1346,8 @@ export class WorkspaceStarter {
13361346
}
13371347

13381348
// This instance's image build "failed" as well, so mark it as such.
1339-
const now = new Date().toISOString();
1340-
instance = await this.workspaceDb.trace({ span }).updateInstancePartial(instance.id, {
1341-
status: { ...instance.status, phase: "stopped", conditions: { failed: message }, message },
1342-
stoppedTime: now,
1343-
stoppingTime: now,
1344-
});
1345-
1346-
// Mark the PrebuildWorkspace as failed
1347-
await this.failPrebuildWorkspace({ span }, err, workspace);
1349+
await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
13481350

1349-
// Publish updated workspace instance
1350-
await this.publisher.publishInstanceUpdate({
1351-
workspaceID: workspace.ownerId,
1352-
instanceID: instance.id,
1353-
ownerID: workspace.ownerId,
1354-
});
1355-
1356-
TraceContext.setError({ span }, err);
13571351
const looksLikeUserError = (msg: string): boolean => {
13581352
return msg.startsWith("build failed:") || msg.includes("headless task failed:");
13591353
};
@@ -1363,6 +1357,8 @@ export class WorkspaceStarter {
13631357
`workspace image build failed: ${message}`,
13641358
{ looksLikeUserError: true },
13651359
);
1360+
err = new StartInstanceError("imageBuildFailedUser", err);
1361+
// Don't report this as "failed" to our metrics as it would trigger an alert
13661362
} else {
13671363
log.error(
13681364
{ instanceId: instance.id, userId: user.id, workspaceId: workspace.id },
@@ -1956,6 +1952,9 @@ export class WorkspaceStarter {
19561952
await client.describeWorkspace(ctx, req);
19571953
return true;
19581954
} catch (err) {
1955+
if (isClusterMaintenanceError(err)) {
1956+
throw err;
1957+
}
19591958
return false;
19601959
}
19611960
}

0 commit comments

Comments (0)