1
- import type { ClickHouse , TaskRunV1 , RawTaskRunPayloadV1 } from "@internal/clickhouse" ;
1
+ import type { ClickHouse , RawTaskRunPayloadV1 , TaskRunV1 } from "@internal/clickhouse" ;
2
2
import { RedisOptions } from "@internal/redis" ;
3
3
import { LogicalReplicationClient , Transaction , type PgoutputMessage } from "@internal/replication" ;
4
4
import { Logger } from "@trigger.dev/core/logger" ;
5
5
import { tryCatch } from "@trigger.dev/core/utils" ;
6
- import { TaskRunError } from "@trigger.dev/core/v3/schemas" ;
7
6
import { parsePacket } from "@trigger.dev/core/v3/utils/ioSerialization" ;
8
7
import { TaskRun } from "@trigger.dev/database" ;
9
8
import { nanoid } from "nanoid" ;
@@ -46,6 +45,8 @@ export class RunsReplicationService {
46
45
private _lastReplicationLagMs : number | null = null ;
47
46
private _transactionCounter ?: Counter ;
48
47
private _insertStrategy : "streaming" | "batching" ;
48
+ private _isShuttingDown = false ;
49
+ private _isShutDownComplete = false ;
49
50
50
51
constructor ( private readonly options : RunsReplicationServiceOptions ) {
51
52
this . logger = new Logger ( "RunsReplicationService" , "debug" ) ;
@@ -62,7 +63,7 @@ export class RunsReplicationService {
62
63
table : "TaskRun" ,
63
64
redisOptions : options . redisOptions ,
64
65
autoAcknowledge : false ,
65
- publicationActions : [ "insert" , "update" ] ,
66
+ publicationActions : [ "insert" , "update" , "delete" ] ,
66
67
logger : new Logger ( "RunsReplicationService" , "debug" ) ,
67
68
leaderLockTimeoutMs : options . leaderLockTimeoutMs ?? 30_000 ,
68
69
leaderLockExtendIntervalMs : options . leaderLockExtendIntervalMs ?? 10_000 ,
@@ -84,6 +85,9 @@ export class RunsReplicationService {
84
85
} ) ;
85
86
86
87
this . _replicationClient . events . on ( "heartbeat" , async ( { lsn, shouldRespond } ) => {
88
+ if ( this . _isShuttingDown ) return ;
89
+ if ( this . _isShutDownComplete ) return ;
90
+
87
91
if ( shouldRespond ) {
88
92
await this . _replicationClient . acknowledge ( lsn ) ;
89
93
}
@@ -130,6 +134,11 @@ export class RunsReplicationService {
130
134
}
131
135
}
132
136
137
+ public shutdown ( ) {
138
+ this . logger . info ( "Initiating shutdown of runs replication service" ) ;
139
+ this . _isShuttingDown = true ;
140
+ }
141
+
133
142
async start ( insertStrategy ?: "streaming" | "batching" ) {
134
143
this . _insertStrategy = insertStrategy ?? this . _insertStrategy ;
135
144
@@ -201,11 +210,27 @@ export class RunsReplicationService {
201
210
}
202
211
203
212
async #handleTransaction( transaction : Transaction < TaskRun > ) {
213
+ if ( this . _isShutDownComplete ) return ;
214
+
215
+ let alreadyAcknowledged = false ;
216
+
217
+ if ( this . _isShuttingDown ) {
218
+ // We need to immediately acknowledge the transaction
219
+ // And then try and handle this transaction
220
+ if ( transaction . commitEndLsn ) {
221
+ await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
222
+ alreadyAcknowledged = true ;
223
+ }
224
+
225
+ await this . _replicationClient . stop ( ) ;
226
+ this . _isShutDownComplete = true ;
227
+ }
228
+
204
229
this . _lastReplicationLagMs = transaction . replicationLagMs ;
205
230
206
231
// If there are no events, do nothing
207
232
if ( transaction . events . length === 0 ) {
208
- if ( transaction . commitEndLsn ) {
233
+ if ( transaction . commitEndLsn && ! alreadyAcknowledged ) {
209
234
await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
210
235
}
211
236
@@ -222,6 +247,7 @@ export class RunsReplicationService {
222
247
223
248
this . logger . debug ( "Handling transaction" , {
224
249
transaction,
250
+ alreadyAcknowledged,
225
251
} ) ;
226
252
227
253
// If there are events, we need to handle them
@@ -230,13 +256,19 @@ export class RunsReplicationService {
230
256
this . _transactionCounter ?. inc ( ) ;
231
257
232
258
if ( this . _insertStrategy === "streaming" ) {
233
- await this . _concurrentFlushScheduler . addToBatch (
234
- transaction . events . map ( ( event ) => ( {
235
- _version,
236
- run : event . data ,
237
- event : event . tag ,
238
- } ) )
239
- ) ;
259
+ this . _concurrentFlushScheduler
260
+ . addToBatch (
261
+ transaction . events . map ( ( event ) => ( {
262
+ _version,
263
+ run : event . data ,
264
+ event : event . tag ,
265
+ } ) )
266
+ )
267
+ . catch ( ( error ) => {
268
+ this . logger . error ( "Error adding to batch" , {
269
+ error,
270
+ } ) ;
271
+ } ) ;
240
272
} else {
241
273
const [ flushError ] = await tryCatch (
242
274
this . #flushBatch(
@@ -256,7 +288,9 @@ export class RunsReplicationService {
256
288
}
257
289
}
258
290
259
- await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
291
+ if ( ! alreadyAcknowledged ) {
292
+ await this . _replicationClient . acknowledge ( transaction . commitEndLsn ) ;
293
+ }
260
294
}
261
295
262
296
async #flushBatch( flushId : string , batch : Array < TaskRunInsert > ) {
@@ -497,7 +531,6 @@ export class ConcurrentFlushScheduler<T> {
497
531
private readonly MAX_CONCURRENCY : number ;
498
532
private readonly concurrencyLimiter : ReturnType < typeof pLimit > ;
499
533
private flushTimer : NodeJS . Timeout | null ;
500
- private isShuttingDown ;
501
534
private failedBatchCount ;
502
535
private metricsRegister ?: MetricsRegister ;
503
536
private logger : Logger ;
@@ -510,7 +543,6 @@ export class ConcurrentFlushScheduler<T> {
510
543
this . MAX_CONCURRENCY = config . maxConcurrency || 1 ;
511
544
this . concurrencyLimiter = pLimit ( this . MAX_CONCURRENCY ) ;
512
545
this . flushTimer = null ;
513
- this . isShuttingDown = false ;
514
546
this . failedBatchCount = 0 ;
515
547
516
548
this . logger . info ( "Initializing ConcurrentFlushScheduler" , {
@@ -520,7 +552,6 @@ export class ConcurrentFlushScheduler<T> {
520
552
} ) ;
521
553
522
554
this . startFlushTimer ( ) ;
523
- this . setupShutdownHandlers ( ) ;
524
555
525
556
if ( ! process . env . VITEST && config . metricsRegister ) {
526
557
this . metricsRegister = config . metricsRegister ;
@@ -592,27 +623,6 @@ export class ConcurrentFlushScheduler<T> {
592
623
this . logger . debug ( "Started flush timer" , { interval : this . FLUSH_INTERVAL } ) ;
593
624
}
594
625
595
- private setupShutdownHandlers ( ) {
596
- process . on ( "SIGTERM" , this . shutdown . bind ( this ) ) ;
597
- process . on ( "SIGINT" , this . shutdown . bind ( this ) ) ;
598
- this . logger . debug ( "Shutdown handlers configured" ) ;
599
- }
600
-
601
- private async shutdown ( ) : Promise < void > {
602
- if ( this . isShuttingDown ) return ;
603
- this . isShuttingDown = true ;
604
- this . logger . info ( "Initiating shutdown of dynamic flush scheduler" , {
605
- remainingItems : this . currentBatch . length ,
606
- } ) ;
607
-
608
- await this . checkAndFlush ( ) ;
609
- this . clearTimer ( ) ;
610
-
611
- this . logger . info ( "Dynamic flush scheduler shutdown complete" , {
612
- totalFailedBatches : this . failedBatchCount ,
613
- } ) ;
614
- }
615
-
616
626
private clearTimer ( ) : void {
617
627
if ( this . flushTimer ) {
618
628
clearInterval ( this . flushTimer ) ;
0 commit comments