8
8
type CompleteRunAttemptResult ,
9
9
HeartbeatService ,
10
10
type RunExecutionData ,
11
+ type TaskRunExecutionMetrics ,
11
12
type TaskRunExecutionResult ,
12
13
type TaskRunFailedExecutionResult ,
13
14
WorkerManifest ,
@@ -50,6 +51,9 @@ const Env = z.object({
50
51
TRIGGER_RUNNER_ID : z . string ( ) ,
51
52
TRIGGER_METADATA_URL : z . string ( ) . optional ( ) ,
52
53
54
+ // Timeline metrics
55
+ TRIGGER_POD_SCHEDULED_AT_MS : z . coerce . date ( ) ,
56
+
53
57
// May be overridden
54
58
TRIGGER_SUPERVISOR_API_PROTOCOL : z . enum ( [ "http" , "https" ] ) ,
55
59
TRIGGER_SUPERVISOR_API_DOMAIN : z . string ( ) ,
@@ -734,10 +738,14 @@ class ManagedRunController {
734
738
private async startAndExecuteRunAttempt ( {
735
739
runFriendlyId,
736
740
snapshotFriendlyId,
741
+ dequeuedAt,
742
+ podScheduledAt,
737
743
isWarmStart = false ,
738
744
} : {
739
745
runFriendlyId : string ;
740
746
snapshotFriendlyId : string ;
747
+ dequeuedAt ?: Date ;
748
+ podScheduledAt ?: Date ;
741
749
isWarmStart ?: boolean ;
742
750
} ) {
743
751
if ( ! this . socket ) {
@@ -749,6 +757,8 @@ class ManagedRunController {
749
757
snapshot : { friendlyId : snapshotFriendlyId } ,
750
758
} ) ;
751
759
760
+ const attemptStartedAt = Date . now ( ) ;
761
+
752
762
const start = await this . httpClient . startRunAttempt ( runFriendlyId , snapshotFriendlyId , {
753
763
isWarmStart,
754
764
} ) ;
@@ -760,28 +770,58 @@ class ManagedRunController {
760
770
return ;
761
771
}
762
772
773
+ const attemptDuration = Date . now ( ) - attemptStartedAt ;
774
+
763
775
const { run, snapshot, execution, envVars } = start . data ;
764
776
765
777
logger . debug ( "[ManagedRunController] Started run" , {
766
778
runId : run . friendlyId ,
767
779
snapshot : snapshot . friendlyId ,
768
780
} ) ;
769
781
770
- // TODO: We may already be executing this run, this may be a new attempt
771
- // This is the only case where incrementing the attempt number is allowed
772
782
this . enterRunPhase ( run , snapshot ) ;
773
783
784
+ const metrics = [
785
+ {
786
+ name : "start" ,
787
+ event : "create_attempt" ,
788
+ timestamp : attemptStartedAt ,
789
+ duration : attemptDuration ,
790
+ } ,
791
+ ]
792
+ . concat (
793
+ dequeuedAt
794
+ ? [
795
+ {
796
+ name : "start" ,
797
+ event : "dequeue" ,
798
+ timestamp : dequeuedAt . getTime ( ) ,
799
+ duration : 0 ,
800
+ } ,
801
+ ]
802
+ : [ ]
803
+ )
804
+ . concat (
805
+ podScheduledAt
806
+ ? [
807
+ {
808
+ name : "start" ,
809
+ event : "pod_scheduled" ,
810
+ timestamp : podScheduledAt . getTime ( ) ,
811
+ duration : 0 ,
812
+ } ,
813
+ ]
814
+ : [ ]
815
+ ) satisfies TaskRunExecutionMetrics ;
816
+
774
817
const taskRunEnv = {
775
818
...gatherProcessEnv ( ) ,
776
819
...envVars ,
777
820
} ;
778
821
779
822
try {
780
- return await this . executeRun ( { run, snapshot, envVars : taskRunEnv , execution } ) ;
823
+ return await this . executeRun ( { run, snapshot, envVars : taskRunEnv , execution, metrics } ) ;
781
824
} catch ( error ) {
782
- // TODO: Handle the case where we're in the warm start phase or executing a new run
783
- // This can happen if we kill the run while it's still executing, e.g. after receiving an attempt number mismatch
784
-
785
825
console . error ( "Error while executing attempt" , {
786
826
error,
787
827
} ) ;
@@ -810,8 +850,6 @@ class ManagedRunController {
810
850
error : completionResult . error ,
811
851
} ) ;
812
852
813
- // TODO: Maybe we should keep retrying for a while longer
814
-
815
853
this . waitForNextRun ( ) ;
816
854
return ;
817
855
}
@@ -923,6 +961,7 @@ class ManagedRunController {
923
961
this . startAndExecuteRunAttempt ( {
924
962
runFriendlyId : nextRun . run . friendlyId ,
925
963
snapshotFriendlyId : nextRun . snapshot . friendlyId ,
964
+ dequeuedAt : nextRun . dequeuedAt ,
926
965
isWarmStart : true ,
927
966
} ) . finally ( ( ) => { } ) ;
928
967
return ;
@@ -1032,7 +1071,10 @@ class ManagedRunController {
1032
1071
snapshot,
1033
1072
envVars,
1034
1073
execution,
1035
- } : WorkloadRunAttemptStartResponseBody ) {
1074
+ metrics,
1075
+ } : WorkloadRunAttemptStartResponseBody & {
1076
+ metrics ?: TaskRunExecutionMetrics ;
1077
+ } ) {
1036
1078
this . snapshotPoller . start ( ) ;
1037
1079
1038
1080
if ( ! this . taskRunProcess || ! this . taskRunProcess . isPreparedForNextRun ) {
@@ -1058,6 +1100,7 @@ class ManagedRunController {
1058
1100
payload : {
1059
1101
execution,
1060
1102
traceContext : execution . run . traceContext ?? { } ,
1103
+ metrics,
1061
1104
} ,
1062
1105
messageId : run . friendlyId ,
1063
1106
env : envVars ,
@@ -1212,6 +1255,8 @@ class ManagedRunController {
1212
1255
this . startAndExecuteRunAttempt ( {
1213
1256
runFriendlyId : env . TRIGGER_RUN_ID ,
1214
1257
snapshotFriendlyId : env . TRIGGER_SNAPSHOT_ID ,
1258
+ dequeuedAt : new Date ( ) ,
1259
+ podScheduledAt : env . TRIGGER_POD_SCHEDULED_AT_MS ,
1215
1260
} ) . finally ( ( ) => { } ) ;
1216
1261
return ;
1217
1262
}
0 commit comments