Merge remote-tracking branch 'origin/main' into fix/dev-engine-url

nicktrn · nicktrn · commit b43bab01e891 · 2025-04-18T10:54:19.000+01:00
diff --git a/apps/webapp/app/runEngine/services/batchTrigger.server.ts b/apps/webapp/app/runEngine/services/batchTrigger.server.ts
@@ -571,17 +571,6 @@ export class RunEngineBatchTriggerService extends WithRunEngine {
 
     //triggered all the runs
     if (updatedBatch.runIds.length === updatedBatch.runCount) {
-      //unblock the parent run from the batch
-      //this prevents the parent continuing before all the runs are created
-      if (parentRunId && resumeParentOnCompletion) {
-        await this._engine.unblockRunForCreatedBatch({
-          runId: RunId.fromFriendlyId(parentRunId),
-          batchId: batch.id,
-          environmentId: environment.id,
-          projectId: environment.projectId,
-        });
-      }
-
       //if all the runs were idempotent, it's possible the batch is already completed
       await this._engine.tryCompleteBatch({ batchId: batch.id });
     }
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
@@ -290,6 +290,7 @@ export class RunEngine {
 
     this.batchSystem = new BatchSystem({
       resources,
+      waitpointSystem: this.waitpointSystem,
     });
 
     this.runAttemptSystem = new RunAttemptSystem({
@@ -905,43 +906,6 @@ export class RunEngine {
     }
   }
 
-  /**
-   * This is called when all the runs for a batch have been created.
-   * This does NOT mean that all the runs for the batch are completed.
-   */
-  async unblockRunForCreatedBatch({
-    runId,
-    batchId,
-    tx,
-  }: {
-    runId: string;
-    batchId: string;
-    environmentId: string;
-    projectId: string;
-    tx?: PrismaClientOrTransaction;
-  }): Promise<void> {
-    const prisma = tx ?? this.prisma;
-
-    const waitpoint = await prisma.waitpoint.findFirst({
-      where: {
-        completedByBatchId: batchId,
-      },
-    });
-
-    if (!waitpoint) {
-      this.logger.error("RunEngine.unblockRunForBatch(): Waitpoint not found", {
-        runId,
-        batchId,
-      });
-      throw new ServiceValidationError("Waitpoint not found for batch", 404);
-    }
-
-    await this.completeWaitpoint({
-      id: waitpoint.id,
-      output: { value: "Batch waitpoint completed", isError: false },
-    });
-  }
-
   async tryCompleteBatch({ batchId }: { batchId: string }): Promise<void> {
     return this.batchSystem.scheduleCompleteBatch({ batchId });
   }
diff --git a/internal-packages/run-engine/src/engine/systems/batchSystem.ts b/internal-packages/run-engine/src/engine/systems/batchSystem.ts
@@ -1,16 +1,20 @@
 import { startSpan } from "@internal/tracing";
 import { isFinalRunStatus } from "../statuses.js";
 import { SystemResources } from "./systems.js";
+import { WaitpointSystem } from "./waitpointSystem.js";
 
 export type BatchSystemOptions = {
   resources: SystemResources;
+  waitpointSystem: WaitpointSystem;
 };
 
 export class BatchSystem {
   private readonly $: SystemResources;
+  private readonly waitpointSystem: WaitpointSystem;
 
   constructor(private readonly options: BatchSystemOptions) {
     this.$ = options.resources;
+    this.waitpointSystem = options.waitpointSystem;
   }
 
   public async scheduleCompleteBatch({ batchId }: { batchId: string }): Promise<void> {
@@ -19,8 +23,8 @@ export class BatchSystem {
       id: `tryCompleteBatch:${batchId}`,
       job: "tryCompleteBatch",
       payload: { batchId: batchId },
-      //2s in the future
-      availableAt: new Date(Date.now() + 2_000),
+      //200ms in the future
+      availableAt: new Date(Date.now() + 200),
     });
   }
 
@@ -75,6 +79,28 @@ export class BatchSystem {
             status: "COMPLETED",
           },
         });
+
+        //get waitpoint (if there is one)
+        const waitpoint = await this.$.prisma.waitpoint.findFirst({
+          where: {
+            completedByBatchId: batchId,
+          },
+        });
+
+        if (!waitpoint) {
+          this.$.logger.debug(
+            "RunEngine.unblockRunForBatch(): Waitpoint not found. This is ok, because only batchTriggerAndWait has waitpoints",
+            {
+              batchId,
+            }
+          );
+          return;
+        }
+
+        await this.waitpointSystem.completeWaitpoint({
+          id: waitpoint.id,
+          output: { value: "Batch waitpoint completed", isError: false },
+        });
       } else {
         this.$.logger.debug("#tryCompleteBatch: Not all runs are completed", { batchId });
       }
diff --git a/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts
@@ -191,13 +191,6 @@ describe("RunEngine batchTriggerAndWait", () => {
       expect(batchWaitpoint?.waitpoint.type).toBe("BATCH");
       expect(batchWaitpoint?.waitpoint.completedByBatchId).toBe(batch.id);
 
-      await engine.unblockRunForCreatedBatch({
-        runId: parentRun.id,
-        batchId: batch.id,
-        environmentId: authenticatedEnvironment.id,
-        projectId: authenticatedEnvironment.projectId,
-      });
-
       //dequeue and start the 1st child
       const dequeuedChild = await engine.dequeueFromMasterQueue({
         consumerId: "test_12345",
@@ -303,7 +296,7 @@ describe("RunEngine batchTriggerAndWait", () => {
       expect(child2WaitpointAfter?.status).toBe("COMPLETED");
       expect(child2WaitpointAfter?.output).toBe('{"baz":"qux"}');
 
-      await setTimeout(500);
+      await setTimeout(1_000);
 
       const runWaitpointsAfterSecondChild = await prisma.taskRunWaitpoint.findMany({
         where: {
@@ -497,13 +490,6 @@ describe("RunEngine batchTriggerAndWait", () => {
         expect(parentAfterBatchChild.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
         expect(parentAfterBatchChild.batch?.id).toBe(batch.id);
 
-        await engine.unblockRunForCreatedBatch({
-          runId: parentRun.id,
-          batchId: batch.id,
-          environmentId: authenticatedEnvironment.id,
-          projectId: authenticatedEnvironment.projectId,
-        });
-
         //dequeue and start the batch child
         const dequeuedBatchChild = await engine.dequeueFromMasterQueue({
           consumerId: "test_12345",
diff --git a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts
@@ -1166,13 +1166,6 @@ describe("RunEngine checkpoints", () => {
       expect(batchWaitpoint?.waitpoint.type).toBe("BATCH");
       expect(batchWaitpoint?.waitpoint.completedByBatchId).toBe(batch.id);
 
-      await engine.unblockRunForCreatedBatch({
-        runId: parentRun.id,
-        batchId: batch.id,
-        environmentId: authenticatedEnvironment.id,
-        projectId: authenticatedEnvironment.projectId,
-      });
-
       // Create a checkpoint
       const checkpointResult = await engine.createCheckpoint({
         runId: parentRun.id,
diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts
@@ -49,7 +49,7 @@ export type RunQueueOptions = {
   keys: RunQueueKeyProducer;
   queueSelectionStrategy: RunQueueSelectionStrategy;
   verbose?: boolean;
-  logger: Logger;
+  logger?: Logger;
   retryOptions?: RetryOptions;
 };
 
@@ -88,7 +88,7 @@ export class RunQueue {
         });
       },
     });
-    this.logger = options.logger;
+    this.logger = options.logger ?? new Logger("RunQueue", "warn");
 
     this.keys = options.keys;
     this.queueSelectionStrategy = options.queueSelectionStrategy;
@@ -404,11 +404,17 @@ export class RunQueue {
           tenantQueues[env.envId] = [...env.queues]; // Create a copy of the queues array
         }
 
+        // Track if we successfully dequeued any message in a complete cycle
+        let successfulDequeueInCycle = false;
+
         // Continue until we've hit max count or all tenants have empty queue lists
         while (
           messages.length < maxCount &&
           Object.values(tenantQueues).some((queues) => queues.length > 0)
         ) {
+          // Reset the success flag at the start of each cycle
+          successfulDequeueInCycle = false;
+
           for (const env of envQueues) {
             attemptedEnvs++;
 
@@ -428,6 +434,7 @@ export class RunQueue {
 
             if (message) {
               messages.push(message);
+              successfulDequeueInCycle = true;
               // Re-add this queue at the end, since it might have more messages
               tenantQueues[env.envId].push(queue);
             }
@@ -438,6 +445,14 @@ export class RunQueue {
               break;
             }
           }
+
+          // If we completed a full cycle through all tenants with no successful dequeues,
+          // exit early as we're likely hitting concurrency limits or have no ready messages
+          if (!successfulDequeueInCycle) {
+            // IMPORTANT: Keep this log message as it's used in tests
+            this.logger.log("No successful dequeues in a full cycle, exiting...");
+            break;
+          }
         }
 
         span.setAttributes({
diff --git a/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts b/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts
@@ -1,6 +1,5 @@
 import { redisTest } from "@internal/testcontainers";
 import { trace } from "@internal/tracing";
-import { Logger } from "@trigger.dev/core/logger";
 import { describe } from "node:test";
 import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js";
 import { RunQueue } from "../index.js";
@@ -12,7 +11,6 @@ const testOptions = {
   tracer: trace.getTracer("rq"),
   workers: 1,
   defaultEnvConcurrency: 25,
-  logger: new Logger("RunQueue", "warn"),
   retryOptions: {
     maxAttempts: 5,
     factor: 1.1,
@@ -264,4 +262,95 @@ describe("RunQueue.dequeueMessageFromMasterQueue", () => {
       }
     }
   );
+
+  redisTest(
+    "should exit early when no messages can be dequeued in a full cycle",
+    async ({ redisContainer }) => {
+      const mockLogger = {
+        log: vi.fn(),
+        error: vi.fn(),
+        warn: vi.fn(),
+        debug: vi.fn(),
+        name: "test-logger",
+        level: "debug",
+        filteredKeys: [],
+        additionalFields: {},
+        setLevel: vi.fn(),
+        setFilteredKeys: vi.fn(),
+        setAdditionalFields: vi.fn(),
+        child: vi.fn(),
+      };
+
+      const queue = new RunQueue({
+        ...testOptions,
+        queueSelectionStrategy: new FairQueueSelectionStrategy({
+          redis: {
+            keyPrefix: "runqueue:test:",
+            host: redisContainer.getHost(),
+            port: redisContainer.getPort(),
+          },
+          keys: testOptions.keys,
+        }),
+        redis: {
+          keyPrefix: "runqueue:test:",
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+        },
+        // @ts-expect-error mockLogger is a plain object of vi.fn() stubs, not a real Logger instance
+        logger: mockLogger,
+      });
+
+      try {
+        const envMasterQueue = `env:${authenticatedEnvDev.id}`;
+        const queueCount = 10; // Reduced for simplicity
+
+        // First, create all queues and enqueue initial messages
+        for (let i = 0; i < queueCount; i++) {
+          const queueName = `${messageDev.queue}_${i}`;
+          // Set each queue's concurrency limit to 0 (this guarantees dequeue will fail)
+          await queue.updateQueueConcurrencyLimits(authenticatedEnvDev, queueName, 0);
+
+          // Enqueue a message to each queue
+          await queue.enqueueMessage({
+            env: authenticatedEnvDev,
+            message: { ...messageDev, runId: `r${4321 + i}`, queue: queueName },
+            masterQueues: ["main", envMasterQueue],
+          });
+        }
+
+        // Try to dequeue messages - this should exit early due to concurrency limits
+        const startTime = Date.now();
+        const dequeued = await queue.dequeueMessageFromMasterQueue(
+          "test_12345",
+          envMasterQueue,
+          queueCount
+        );
+        const endTime = Date.now();
+
+        // Verify no messages were dequeued
+        expect(dequeued.length).toBe(0);
+
+        // Verify the operation completed quickly (under 1000ms)
+        const duration = endTime - startTime;
+        expect(duration).toBeLessThan(1000);
+
+        // Verify the early-exit message was logged and that at most two log calls were made in total
+        expect(mockLogger.log).toHaveBeenCalledWith(
+          expect.stringContaining("No successful dequeues in a full cycle, exiting")
+        );
+        expect(mockLogger.log.mock.calls.length).toBeLessThanOrEqual(2);
+
+        // Verify all messages are still in queues
+        let totalRemaining = 0;
+        for (let i = 0; i < queueCount; i++) {
+          const queueName = `${messageDev.queue}_${i}`;
+          const length = await queue.lengthOfQueue(authenticatedEnvDev, queueName);
+          totalRemaining += length;
+        }
+        expect(totalRemaining).toBe(queueCount);
+      } finally {
+        await queue.quit();
+      }
+    }
+  );
 });