Skip to content

Commit 35dbaed

Browse files
authored
v3: self-hosting (#1147)
* add admin email regex env var * fix displayed init command for self-hosted setups * shared env var to disable telemetry in cli and webapp * pin sdk version during init * if specified, add api url to dev command shown after init * improve checkpoint support detection * control forced checkpoint simulation via env var * add public init to providers * better checkpoint support check for coordinator * add docker to coordinator image * update docker provider containerfile * bump remaining containers to node 20 * add infra image build to default publish workflow * lockfile * remove concurrency group from infra workflow * add docker provider to build matrix * fix var subst * checkpoint test is docker specific * enable v3 projects by default on self-hosted instances * fix v3 setup command again * add default posthog key * self-hosting docs * add latest tags to versioned infra and webapp builds * some checkpoint errors should skip retrying * add changeset * shorten paragraph * some docs updates * update tunnelling section * add registry setup section * use correct cli push flag * add checkout to v3 branch * update the worker machine setup steps * fix infra build * small docs update * remove unused feature function * Revert "remove unused feature function" This reverts commit cfe0788. * fix self-hosted v3 feature gate * add note about missing arm support * simplify helper script syntax
1 parent c11a77f commit 35dbaed

File tree

28 files changed

+644
-146
lines changed

28 files changed

+644
-146
lines changed

.changeset/spicy-terms-bow.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
"@trigger.dev/core-apps": patch
3+
"trigger.dev": patch
4+
---
5+
6+
- Fix init command SDK pinning
7+
- Show --api-url / -a flag where needed
8+
- CLI now also respects `TRIGGER_TELEMETRY_DISABLED`
9+
- Dedicated docker checkpoint test function

.env.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ DEV_OTEL_BATCH_PROCESSING_ENABLED="0"
2525
# OPTIONAL VARIABLES
2626
# This is used for validating emails that are allowed to log in. Every email that does not match this regex will be rejected.
2727
# WHITELISTED_EMAILS="authorized@yahoo\.com|authorized@gmail\.com"
28+
# Accounts with these emails will get global admin rights. This grants access to the admin UI.
29+
# ADMIN_EMAILS="admin@example\.com|another-admin@example\.com"
2830
# This is used for logging in via GitHub. You can leave these commented out if you don't want to use GitHub for authentication.
2931
# AUTH_GITHUB_CLIENT_ID=
3032
# AUTH_GITHUB_CLIENT_SECRET=

.github/workflows/publish-docker.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,25 @@ jobs:
3939
exit 1
4040
fi
4141
echo "::set-output name=version::${IMAGE_TAG}"
42+
4243
- name: 🔢 Get the commit hash
4344
id: get_commit
4445
run: |
4546
echo ::set-output name=sha_short::$(echo ${{ github.sha }} | cut -c1-7)
4647
48+
- name: 📛 Set the tags
49+
id: set_tags
50+
run: |
51+
ref_without_tag=ghcr.io/triggerdotdev/trigger.dev
52+
image_tags=$ref_without_tag:${{ steps.get_version.outputs.version }}
53+
54+
# if it's a versioned tag, also tag it as latest
55+
if [[ "${{ github.ref_name }}" == v.docker.* ]]; then
56+
image_tags=$image_tags,$ref_without_tag:latest
57+
fi
58+
59+
echo "IMAGE_TAGS=${image_tags}" >> "$GITHUB_OUTPUT"
60+
4761
- name: 🐙 Login to GitHub Container Registry
4862
uses: docker/login-action@v2
4963
with:
@@ -56,6 +70,5 @@ jobs:
5670
with:
5771
file: ./docker/Dockerfile
5872
platforms: linux/amd64,linux/arm64
59-
tags: |
60-
ghcr.io/triggerdotdev/trigger.dev:${{ steps.get_version.outputs.version }}
73+
tags: ${{ steps.set_tags.outputs.IMAGE_TAGS }}
6174
push: true

.github/workflows/publish-infra.yml

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: "🚢 Publish Infra Images"
22

33
on:
4+
workflow_call:
45
push:
56
tags:
67
- "infra-dev-*"
@@ -29,17 +30,14 @@ permissions:
2930
packages: write
3031
contents: read
3132

32-
concurrency:
33-
group: ${{ github.workflow }}-${{ github.ref }}
34-
3533
env:
3634
AWS_REGION: us-east-1
3735

3836
jobs:
3937
build:
4038
strategy:
4139
matrix:
42-
package: [coordinator, kubernetes-provider]
40+
package: [coordinator, docker-provider, kubernetes-provider]
4341
runs-on: buildjet-16vcpu-ubuntu-2204
4442
env:
4543
DOCKER_BUILDKIT: "1"
@@ -48,20 +46,40 @@ jobs:
4846

4947
- name: Generate image reference
5048
id: prep
51-
# WARNING: This step expects the workflow to have been triggered by a specific tag format of: infra-${env}-*
5249
run: |
53-
env=$(echo ${{ github.ref_name }} | cut -d- -f2)
54-
sha=${GITHUB_SHA::7}
55-
ts=$(date +%s)
50+
# set image repo
5651
if [[ "${{ matrix.package }}" == *-provider ]]; then
57-
provider_type=$(echo ${{ matrix.package }} | cut -d- -f1)
52+
provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1)
5853
repository=provider/${provider_type}
5954
else
60-
repository=${{ matrix.package }}
55+
repository="${{ matrix.package }}"
6156
fi
62-
echo "IMAGE_TAG=${env}-${sha}-${ts}" >> "$GITHUB_OUTPUT"
6357
echo "REPOSITORY=${repository}" >> "$GITHUB_OUTPUT"
6458
59+
# set image tag
60+
if [[ "${{ github.ref_type }}" == "tag" ]]; then
61+
if [[ "${{ github.ref_name }}" == infra-*-* ]]; then
62+
env=$(echo ${{ github.ref_name }} | cut -d- -f2)
63+
sha=$(echo ${{ github.sha }} | head -c7)
64+
ts=$(date +%s)
65+
image_tag=${env}-${sha}-${ts}
66+
elif [[ "${{ github.ref_name }}" == v.docker.* ]]; then
67+
version="${GITHUB_REF_NAME#v.docker.}"
68+
image_tag="v${version}"
69+
elif [[ "${{ github.ref_name }}" == build-* ]]; then
70+
image_tag="${GITHUB_REF_NAME#build-}"
71+
else
72+
echo "Invalid tag: ${{ github.ref_name }}"
73+
exit 1
74+
fi
75+
elif [[ "${{ github.ref_name }}" == "main" ]]; then
76+
image_tag="main"
77+
else
78+
echo "Invalid reference: ${{ github.ref }}"
79+
exit 1
80+
fi
81+
echo "IMAGE_TAG=${image_tag}" >> "$GITHUB_OUTPUT"
82+
6583
- name: Set up Docker Buildx
6684
uses: docker/setup-buildx-action@v3
6785

@@ -92,3 +110,12 @@ jobs:
92110
REGISTRY: ghcr.io/triggerdotdev
93111
REPOSITORY: ${{ steps.prep.outputs.REPOSITORY }}
94112
IMAGE_TAG: ${{ steps.prep.outputs.IMAGE_TAG }}
113+
114+
- name: 🐙 Push 'latest' to GitHub Container Registry
115+
if: startsWith(github.ref_name, 'v.docker.')
116+
run: |
117+
docker tag infra_image $REGISTRY/$REPOSITORY:latest
118+
docker push $REGISTRY/$REPOSITORY:latest
119+
env:
120+
REGISTRY: ghcr.io/triggerdotdev
121+
REPOSITORY: ${{ steps.prep.outputs.REPOSITORY }}

.github/workflows/publish.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,8 @@ jobs:
5757
needs: [typecheck, units]
5858
uses: ./.github/workflows/publish-docker.yml
5959
secrets: inherit
60+
61+
publish-infra:
62+
needs: [typecheck, units]
63+
uses: ./.github/workflows/publish-infra.yml
64+
secrets: inherit

apps/coordinator/Containerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
# syntax=docker/dockerfile:labs
22

3-
FROM node:18-bullseye-slim@sha256:a4edd54dcfdcacc8a4100fee71498e8671d99556a1acf5614539214a70092426 AS node-18
3+
FROM node:20-bookworm-slim@sha256:72f2f046a5f8468db28730b990b37de63ce93fd1a72a40f531d6aa82afdf0d46 AS node-20
44

55
WORKDIR /app
66

7-
FROM node-18 AS pruner
7+
FROM node-20 AS pruner
88

99
COPY --chown=node:node . .
1010
RUN npx -q [email protected] prune --scope=coordinator --docker
1111
RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' +
1212

13-
FROM node-18 AS base
13+
FROM node-20 AS base
1414

1515
RUN apt-get update \
16-
&& apt-get install -y buildah ca-certificates dumb-init \
16+
&& apt-get install -y buildah ca-certificates dumb-init docker.io \
1717
&& rm -rf /var/lib/apt/lists/*
1818

1919
COPY --chown=node:node .gitignore .gitignore

apps/coordinator/src/index.ts

Lines changed: 45 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import {
1212
} from "@trigger.dev/core/v3";
1313
import { ZodNamespace } from "@trigger.dev/core/v3/zodNamespace";
1414
import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
15-
import { HttpReply, getTextBody, SimpleLogger } from "@trigger.dev/core-apps";
15+
import { HttpReply, getTextBody, SimpleLogger, testDockerCheckpoint } from "@trigger.dev/core-apps";
1616
import { ExponentialBackoff } from "./backoff";
1717

1818
import { collectDefaultMetrics, register, Gauge } from "prom-client";
@@ -72,7 +72,10 @@ type CheckpointAndPushOptions = {
7272

7373
type CheckpointAndPushResult =
7474
| { success: true; checkpoint: CheckpointData }
75-
| { success: false; reason?: "CANCELED" | "DISABLED" | "ERROR" | "IN_PROGRESS" | "NO_SUPPORT" };
75+
| {
76+
success: false;
77+
reason?: "CANCELED" | "DISABLED" | "ERROR" | "IN_PROGRESS" | "NO_SUPPORT" | "SKIP_RETRYING";
78+
};
7679

7780
type CheckpointData = {
7881
location: string;
@@ -125,65 +128,53 @@ class Checkpointer {
125128

126129
constructor(private opts = { forceSimulate: false }) {}
127130

128-
async initialize(): Promise<CheckpointerInitializeReturn> {
131+
async init(): Promise<CheckpointerInitializeReturn> {
129132
if (this.#initialized) {
130-
return this.#getInitializeReturn();
133+
return this.#getInitReturn(this.#canCheckpoint);
131134
}
132135

133136
this.#logger.log(`${this.#dockerMode ? "Docker" : "Kubernetes"} mode`);
134137

135138
if (this.#dockerMode) {
136-
try {
137-
await $`criu --version`;
138-
} catch (error) {
139-
this.#logger.error("No checkpoint support: Missing CRIU binary");
140-
this.#logger.error("Will simulate instead");
141-
this.#canCheckpoint = false;
142-
this.#initialized = true;
139+
const testCheckpoint = await testDockerCheckpoint();
143140

144-
return this.#getInitializeReturn();
141+
if (testCheckpoint.ok) {
142+
return this.#getInitReturn(true);
145143
}
146144

147-
try {
148-
await $`docker checkpoint`;
149-
} catch (error) {
150-
this.#logger.error(
151-
"No checkpoint support: Docker needs to have experimental features enabled"
152-
);
153-
this.#logger.error("Will simulate instead");
154-
this.#canCheckpoint = false;
155-
this.#initialized = true;
156-
157-
return this.#getInitializeReturn();
158-
}
145+
this.#logger.error(testCheckpoint.message, testCheckpoint.error ?? "");
146+
return this.#getInitReturn(false);
159147
} else {
160148
try {
161149
await $`buildah login --get-login ${REGISTRY_HOST}`;
162150
} catch (error) {
163151
this.#logger.error(`No checkpoint support: Not logged in to registry ${REGISTRY_HOST}`);
164-
this.#canCheckpoint = false;
165-
this.#initialized = true;
166-
167-
return this.#getInitializeReturn();
152+
return this.#getInitReturn(false);
168153
}
169154
}
170155

171-
this.#logger.log(
172-
`Full checkpoint support${
173-
this.#dockerMode && this.opts.forceSimulate ? " with forced simulation enabled." : "!"
174-
}`
175-
);
156+
return this.#getInitReturn(true);
157+
}
176158

159+
#getInitReturn(canCheckpoint: boolean): CheckpointerInitializeReturn {
177160
this.#initialized = true;
178-
this.#canCheckpoint = true;
161+
this.#canCheckpoint = canCheckpoint;
179162

180-
return this.#getInitializeReturn();
181-
}
163+
if (canCheckpoint) {
164+
this.#logger.log("Full checkpoint support!");
165+
}
166+
167+
const willSimulate = this.#dockerMode && (!this.#canCheckpoint || this.opts.forceSimulate);
168+
169+
if (willSimulate) {
170+
this.#logger.log("Simulation mode enabled. Containers will be paused, not checkpointed.", {
171+
forceSimulate: this.opts.forceSimulate,
172+
});
173+
}
182174

183-
#getInitializeReturn(): CheckpointerInitializeReturn {
184175
return {
185-
canCheckpoint: this.#canCheckpoint,
186-
willSimulate: this.#dockerMode && (!this.#canCheckpoint || this.opts.forceSimulate),
176+
canCheckpoint,
177+
willSimulate,
187178
};
188179
}
189180

@@ -327,6 +318,11 @@ class Checkpointer {
327318
return result;
328319
}
329320

321+
if (result.reason === "SKIP_RETRYING") {
322+
this.#logger.log("Skipping retrying", { runId });
323+
return result;
324+
}
325+
330326
continue;
331327
} catch (error) {
332328
this.#logger.error("Checkpoint error", {
@@ -355,7 +351,7 @@ class Checkpointer {
355351
projectRef,
356352
deploymentVersion,
357353
}: CheckpointAndPushOptions): Promise<CheckpointAndPushResult> {
358-
await this.initialize();
354+
await this.init();
359355

360356
const options = {
361357
runId,
@@ -473,7 +469,8 @@ class Checkpointer {
473469

474470
// Create checkpoint (CRI)
475471
if (!this.#canCheckpoint) {
476-
throw new Error("No checkpoint support in kubernetes mode.");
472+
this.#logger.error("No checkpoint support in kubernetes mode.");
473+
return { success: false, reason: "SKIP_RETRYING" };
477474
}
478475

479476
const containerId = this.#logger.debug(
@@ -484,7 +481,8 @@ class Checkpointer {
484481
);
485482

486483
if (!containerId.stdout) {
487-
throw new Error("could not find container id");
484+
this.#logger.error("could not find container id", { options, containterName });
485+
return { success: false, reason: "SKIP_RETRYING" };
488486
}
489487

490488
const start = performance.now();
@@ -617,7 +615,7 @@ class TaskCoordinator {
617615
private host = "0.0.0.0"
618616
) {
619617
this.#httpServer = this.#createHttpServer();
620-
this.#checkpointer.initialize();
618+
this.#checkpointer.init();
621619
this.#delayThresholdInMs = this.#getDelayThreshold();
622620

623621
if (process.env.DELAY_THRESHOLD_IN_MS) {
@@ -1034,7 +1032,7 @@ class TaskCoordinator {
10341032
return;
10351033
}
10361034

1037-
const { canCheckpoint, willSimulate } = await this.#checkpointer.initialize();
1035+
const { canCheckpoint, willSimulate } = await this.#checkpointer.init();
10381036

10391037
const willCheckpointAndRestore = canCheckpoint || willSimulate;
10401038

@@ -1131,7 +1129,7 @@ class TaskCoordinator {
11311129
return;
11321130
}
11331131

1134-
const { canCheckpoint, willSimulate } = await this.#checkpointer.initialize();
1132+
const { canCheckpoint, willSimulate } = await this.#checkpointer.init();
11351133

11361134
const willCheckpointAndRestore = canCheckpoint || willSimulate;
11371135

@@ -1185,7 +1183,7 @@ class TaskCoordinator {
11851183
socket.on("WAIT_FOR_TASK", async (message, callback) => {
11861184
logger.log("[WAIT_FOR_TASK]", message);
11871185

1188-
const { canCheckpoint, willSimulate } = await this.#checkpointer.initialize();
1186+
const { canCheckpoint, willSimulate } = await this.#checkpointer.init();
11891187

11901188
const willCheckpointAndRestore = canCheckpoint || willSimulate;
11911189

@@ -1227,7 +1225,7 @@ class TaskCoordinator {
12271225
socket.on("WAIT_FOR_BATCH", async (message, callback) => {
12281226
logger.log("[WAIT_FOR_BATCH]", message);
12291227

1230-
const { canCheckpoint, willSimulate } = await this.#checkpointer.initialize();
1228+
const { canCheckpoint, willSimulate } = await this.#checkpointer.init();
12311229

12321230
const willCheckpointAndRestore = canCheckpoint || willSimulate;
12331231

0 commit comments

Comments
 (0)