@@ -557,6 +557,7 @@ export class WorkspaceStarter {
                additionalAuth,
                forceRebuild,
                forceRebuild,
+                abortSignal,
                region,
            );

@@ -577,23 +578,23 @@ export class WorkspaceStarter {
        startRequest.setSpec(spec);
        startRequest.setServicePrefix(workspace.id);

-        if (instance.status.phase === "pending") {
-            // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
-            const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
-            if (workspaceAlreadyExists) {
-                log.debug(
-                    { instanceId: instance.id, workspaceId: instance.workspaceId },
-                    "workspace already exists, not starting again",
-                    { phase: instance.status.phase },
-                );
-                return;
-            }
-        }
-
        // choose a cluster and start the instance
        let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
        let retries = 0;
        try {
+            if (instance.status.phase === "pending") {
+                // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
+                const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
+                if (workspaceAlreadyExists) {
+                    log.debug(
+                        { instanceId: instance.id, workspaceId: instance.workspaceId },
+                        "workspace already exists, not starting again",
+                        { phase: instance.status.phase },
+                    );
+                    return;
+                }
+            }
+
            for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
                if (abortSignal.aborted) {
                    return;
@@ -657,6 +658,12 @@ export class WorkspaceStarter {
                });
            }
        } catch (err) {
+            if (!(err instanceof StartInstanceError)) {
+                // fallback in case we did not already handle this error
+                await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
+                err = new StartInstanceError("other", err); // don't throw because there's nobody catching it. We just want to log/trace it.
+            }
+
            this.logAndTraceStartWorkspaceError({ span }, logCtx, err);
        } finally {
            if (abortSignal.aborted) {
@@ -809,8 +816,9 @@ export class WorkspaceStarter {
        // We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
        // We have to set that status ourselves.
        instance.status.phase = "stopped";
-        instance.stoppingTime = new Date().toISOString();
-        instance.stoppedTime = new Date().toISOString();
+        const now = new Date().toISOString();
+        instance.stoppingTime = now;
+        instance.stoppedTime = now;

        instance.status.conditions.failed = err.toString();
        instance.status.message = `Workspace cannot be started: ${err}`;
@@ -1199,6 +1207,7 @@ export class WorkspaceStarter {
        additionalAuth: Map<string, string>,
        ignoreBaseImageresolvedAndRebuildBase: boolean = false,
        forceRebuild: boolean = false,
+        abortSignal: RedlockAbortSignal,
        region?: WorkspaceRegion,
    ): Promise<WorkspaceInstance> {
        const span = TraceContext.startSpan("buildWorkspaceImage", ctx);
@@ -1300,6 +1309,7 @@ export class WorkspaceStarter {
                    additionalAuth,
                    true,
                    forceRebuild,
+                    abortSignal,
                    region,
                );
            } else {
@@ -1336,24 +1346,8 @@ export class WorkspaceStarter {
        }

        // This instance's image build "failed" as well, so mark it as such.
-        const now = new Date().toISOString();
-        instance = await this.workspaceDb.trace({ span }).updateInstancePartial(instance.id, {
-            status: { ...instance.status, phase: "stopped", conditions: { failed: message }, message },
-            stoppedTime: now,
-            stoppingTime: now,
-        });
-
-        // Mark the PrebuildWorkspace as failed
-        await this.failPrebuildWorkspace({ span }, err, workspace);
+        await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);

-        // Publish updated workspace instance
-        await this.publisher.publishInstanceUpdate({
-            workspaceID: workspace.ownerId,
-            instanceID: instance.id,
-            ownerID: workspace.ownerId,
-        });
-
-        TraceContext.setError({ span }, err);
        const looksLikeUserError = (msg: string): boolean => {
            return msg.startsWith("build failed:") || msg.includes("headless task failed:");
        };
@@ -1363,6 +1357,8 @@ export class WorkspaceStarter {
                `workspace image build failed: ${message}`,
                { looksLikeUserError: true },
            );
+            err = new StartInstanceError("imageBuildFailedUser", err);
+            // Don't report this as "failed" to our metrics as it would trigger an alert
        } else {
            log.error(
                { instanceId: instance.id, userId: user.id, workspaceId: workspace.id },
@@ -1956,6 +1952,9 @@ export class WorkspaceStarter {
            await client.describeWorkspace(ctx, req);
            return true;
        } catch (err) {
+            if (isClusterMaintenanceError(err)) {
+                throw err;
+            }
            return false;
        }
    }