Free up concurrency when using triggerAndWait. Improved errors on runs (#1272)

matt-aitken · web-flow · commit f7d32b83b115 · 2024-08-25T20:15:45.000+01:00
* Add an error to the final attempt if there isn’t one

* Improved the checkpointResumer test task

* Using triggerAndWait or batchTriggerAndWait frees up concurrency

Normally it frees up just env and org concurrency. If it’s a recursive task then it will free up the run concurrency too (e.g. a task calling itself).

* Removed the filepath and export name from attempt spans

* Run inspector: only show an error if the run is in a finished state
diff --git a/.changeset/fast-ladybugs-eat.md b/.changeset/fast-ladybugs-eat.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/core": patch
+---
+
+Removed the folder/filepath from Attempt spans
diff --git a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts
@@ -135,20 +135,24 @@ export class SpanPresenter extends BasePresenter {
       return;
     }
 
-    const finishedAttempt = await this._replica.taskRunAttempt.findFirst({
-      select: {
-        output: true,
-        outputType: true,
-        error: true,
-      },
-      where: {
-        status: { in: FINAL_ATTEMPT_STATUSES },
-        taskRunId: run.id,
-      },
-      orderBy: {
-        createdAt: "desc",
-      },
-    });
+    const isFinished = isFinalRunStatus(run.status);
+
+    const finishedAttempt = isFinished
+      ? await this._replica.taskRunAttempt.findFirst({
+          select: {
+            output: true,
+            outputType: true,
+            error: true,
+          },
+          where: {
+            status: { in: FINAL_ATTEMPT_STATUSES },
+            taskRunId: run.id,
+          },
+          orderBy: {
+            createdAt: "desc",
+          },
+        })
+      : null;
 
     const output =
       finishedAttempt === null
@@ -258,7 +262,7 @@ export class SpanPresenter extends BasePresenter {
       costInCents: run.costInCents,
       totalCostInCents: run.costInCents + run.baseCostInCents,
       usageDurationMs: run.usageDurationMs,
-      isFinished: isFinalRunStatus(run.status),
+      isFinished,
       isRunning: RUNNING_STATUSES.includes(run.status),
       payload,
       payloadType: run.payloadType,
diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts
@@ -1,4 +1,4 @@
-import { TaskRunFailedExecutionResult } from "@trigger.dev/core/v3";
+import { sanitizeError, TaskRunFailedExecutionResult } from "@trigger.dev/core/v3";
 import { logger } from "~/services/logger.server";
 import { createExceptionPropertiesFromError, eventRepository } from "./eventRepository.server";
 import { BaseService } from "./services/baseService.server";
@@ -44,6 +44,24 @@ export class FailedTaskRunService extends BaseService {
       completedAt: new Date(),
     });
 
+    // Get the final attempt and add the error to it, if it's not already set
+    const finalAttempt = await this._prisma.taskRunAttempt.findFirst({
+      where: {
+        taskRunId: taskRun.id,
+      },
+      orderBy: { id: "desc" },
+    });
+
+    if (finalAttempt && !finalAttempt.error) {
+      // Haven't set the status because the attempt might still be running
+      await this._prisma.taskRunAttempt.update({
+        where: { id: finalAttempt.id },
+        data: {
+          error: sanitizeError(completion.error),
+        },
+      });
+    }
+
     // Now we need to "complete" the task run event/span
     await eventRepository.completeEvent(taskRun.spanId, {
       endTime: new Date(),
diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts
@@ -542,6 +542,60 @@ export class MarQS {
     );
   }
 
+  public async releaseConcurrency(messageId: string, releaseForRun: boolean = false) {
+    return this.#trace(
+      "releaseConcurrency",
+      async (span) => {
+        span.setAttributes({
+          [SemanticAttributes.MESSAGE_ID]: messageId,
+        });
+
+        const message = await this.readMessage(messageId);
+
+        if (!message) {
+          return;
+        }
+
+        span.setAttributes({
+          [SemanticAttributes.QUEUE]: message.queue,
+          [SemanticAttributes.MESSAGE_ID]: message.messageId,
+          [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey,
+          [SemanticAttributes.PARENT_QUEUE]: message.parentQueue,
+        });
+
+        const concurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue);
+        const envConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue);
+        const orgConcurrencyKey = this.keys.orgCurrentConcurrencyKeyFromQueue(message.queue);
+
+        logger.debug("Calling releaseConcurrency", {
+          messageId,
+          queue: message.queue,
+          concurrencyKey,
+          envConcurrencyKey,
+          orgConcurrencyKey,
+          service: this.name,
+          releaseForRun,
+        });
+
+        return this.redis.releaseConcurrency(
+          //don't release the for the run, it breaks concurrencyLimits
+          releaseForRun ? concurrencyKey : "",
+          envConcurrencyKey,
+          orgConcurrencyKey,
+          message.messageId
+        );
+      },
+      {
+        kind: SpanKind.CONSUMER,
+        attributes: {
+          [SEMATTRS_MESSAGING_OPERATION]: "releaseConcurrency",
+          [SEMATTRS_MESSAGE_ID]: messageId,
+          [SEMATTRS_MESSAGING_SYSTEM]: "marqs",
+        },
+      }
+    );
+  }
+
   async #trace<T>(
     name: string,
     fn: (span: Span) => Promise<T>,
@@ -1488,6 +1542,24 @@ end
 `,
     });
 
+    this.redis.defineCommand("releaseConcurrency", {
+      numberOfKeys: 3,
+      lua: `
+local concurrencyKey = KEYS[1]
+local envCurrentConcurrencyKey = KEYS[2]
+local orgCurrentConcurrencyKey = KEYS[3]
+
+local messageId = ARGV[1]
+
+-- Update the concurrency keys
+if concurrencyKey ~= "" then
+  redis.call('SREM', concurrencyKey, messageId)
+end
+redis.call('SREM', envCurrentConcurrencyKey, messageId)
+redis.call('SREM', orgCurrentConcurrencyKey, messageId)
+`,
+    });
+
     this.redis.defineCommand("heartbeatMessage", {
       numberOfKeys: 1,
       lua: `
@@ -1699,6 +1771,14 @@ declare module "ioredis" {
       callback?: Callback<void>
     ): Result<void, Context>;
 
+    releaseConcurrency(
+      concurrencyKey: string,
+      envConcurrencyKey: string,
+      orgConcurrencyKey: string,
+      messageId: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
     heartbeatMessage(
       visibilityQueue: string,
       messageId: string,
diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts
@@ -107,6 +107,7 @@ export class TriggerTaskService extends BaseService {
                 select: {
                   id: true,
                   status: true,
+                  taskIdentifier: true,
                 },
               },
             },
@@ -143,6 +144,7 @@ export class TriggerTaskService extends BaseService {
                     select: {
                       id: true,
                       status: true,
+                      taskIdentifier: true,
                     },
                   },
                 },
@@ -391,6 +393,20 @@ export class TriggerTaskService extends BaseService {
             this._prisma
           );
 
+          //release the concurrency for the env and org, if part of a (batch)triggerAndWait
+          if (dependentAttempt) {
+            const isSameTask = dependentAttempt.taskRun.taskIdentifier === taskId;
+            await marqs?.releaseConcurrency(dependentAttempt.taskRun.id, isSameTask);
+          }
+          if (dependentBatchRun?.dependentTaskAttempt) {
+            const isSameTask =
+              dependentBatchRun.dependentTaskAttempt.taskRun.taskIdentifier === taskId;
+            await marqs?.releaseConcurrency(
+              dependentBatchRun.dependentTaskAttempt.taskRun.id,
+              isSameTask
+            );
+          }
+
           if (!run) {
             return;
           }
diff --git a/packages/core/src/v3/workers/taskExecutor.ts b/packages/core/src/v3/workers/taskExecutor.ts
@@ -202,17 +202,6 @@ export class TaskExecutor {
         kind: SpanKind.CONSUMER,
         attributes: {
           [SemanticInternalAttributes.STYLE_ICON]: "attempt",
-          ...accessoryAttributes({
-            items: [
-              {
-                text: ctx.task.filePath,
-              },
-              {
-                text: `${ctx.task.exportName}.run()`,
-              },
-            ],
-            style: "codepath",
-          }),
         },
       },
       this._tracer.extractContext(traceContext)
diff --git a/references/v3-catalog/src/trigger/checkpoints.ts b/references/v3-catalog/src/trigger/checkpoints.ts
@@ -18,13 +18,16 @@ export const checkpointBatchResumer = task({
 /** Test that checkpoints and resuming works if the checkpoint isn't created before the resume */
 export const checkpointResumer = task({
   id: "checkpoint-resume",
+  queue: {
+    concurrencyLimit: 1,
+  },
   run: async ({ count = 1 }: Payload) => {
-    await noop.triggerAndWait();
-    logger.info(`Successfully 1/3 resumed`);
-    await noop.triggerAndWait();
-    logger.info(`Successfully 2/3 resumed`);
-    await noop.triggerAndWait();
-    logger.info(`Successfully 3/3 resumed`);
+    logger.info(`Starting ${count} runs`);
+
+    for (let i = 0; i < count; i++) {
+      await noop.triggerAndWait();
+      logger.info(`Successfully ${i + 1}/${count} resumed`);
+    }
   },
 });
 

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@trigger.dev/core": patch
 +---
++
 +Removed the folder/filepath from Attempt spans