Skip to content

Commit 394f1de

Browse files
authored
Misc v4 fixes and improved logging (#1831)
* logs for optional services * print env vars on startup in debug mode * routes need to explicitly ask to keep connection alive * log indicators for now * make workload api listen host configurable * expose supervisor metrics and make more configurable * configurable pull secrets, no defaults * remove restore route * run controller to handle queued executing * fix v3 deploys in v4 project * update admin worker route * only start pod cleaner et al in k8s mode * set new worker group as default if none yet
1 parent c963dcd commit 394f1de

15 files changed

+151
-122
lines changed

apps/supervisor/README.md

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,15 @@
88
api_url=http://localhost:3030
99
wg_name=my-worker
1010

11-
# edit these
11+
# edit this
1212
admin_pat=tr_pat_...
13-
project_id=clsw6q8wz...
1413

1514
curl -sS \
1615
-X POST \
1716
"$api_url/admin/api/v1/workers" \
1817
-H "Authorization: Bearer $admin_pat" \
1918
-H "Content-Type: application/json" \
20-
-d "{
21-
\"name\": \"$wg_name\",
22-
\"makeDefault\": true,
23-
\"projectId\": \"$project_id\"
24-
}"
19+
-d "{\"name\": \"$wg_name\"}"
2520
```
2621

2722
2. Create `.env` and set the worker token
@@ -47,3 +42,26 @@ pnpm exec trigger deploy --self-hosted
4742
# The additional network flag is required on linux
4843
pnpm exec trigger deploy --self-hosted --network host
4944
```
45+
46+
## Additional worker groups
47+
48+
When adding more worker groups you might also want to make them the default for a specific project. This will allow you to test it without having to change the global default:
49+
50+
```sh
51+
api_url=http://localhost:3030
52+
wg_name=my-worker
53+
54+
# edit these
55+
admin_pat=tr_pat_...
56+
project_id=clsw6q8wz...
57+
58+
curl -sS \
59+
-X POST \
60+
"$api_url/admin/api/v1/workers" \
61+
-H "Authorization: Bearer $admin_pat" \
62+
-H "Content-Type: application/json" \
63+
-d "{
64+
\"name\": \"$wg_name\",
65+
\"makeDefaultForProjectId\": \"$project_id\"
66+
}"
67+
```

apps/supervisor/src/env.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const Env = z.object({
2626
.transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase()))
2727
.default("http"),
2828
TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default
29+
TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"),
2930
TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on
3031
TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller
3132

@@ -41,6 +42,7 @@ const Env = z.object({
4142
DOCKER_NETWORK: z.string().default("host"),
4243
OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(),
4344
ENFORCE_MACHINE_PRESETS: z.coerce.boolean().default(false),
45+
KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv
4446

4547
// Used by the resource monitor
4648
OVERRIDE_CPU_TOTAL: z.coerce.number().optional(),
@@ -53,7 +55,10 @@ const Env = z.object({
5355
EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"),
5456

5557
// Metrics
58+
METRICS_ENABLED: BoolEnv.default(true),
5659
METRICS_COLLECT_DEFAULTS: BoolEnv.default(true),
60+
METRICS_HOST: z.string().default("127.0.0.1"),
61+
METRICS_PORT: z.coerce.number().int().default(9090),
5762

5863
// Pod cleaner
5964
POD_CLEANER_ENABLED: BoolEnv.default(true),
@@ -63,6 +68,9 @@ const Env = z.object({
6368
// Failed pod handler
6469
FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true),
6570
FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000),
71+
72+
// Debug
73+
DEBUG: BoolEnv.default(false),
6674
});
6775

6876
export const env = Env.parse(stdEnv);

apps/supervisor/src/index.ts

Lines changed: 58 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ if (env.METRICS_COLLECT_DEFAULTS) {
3030

3131
class ManagedSupervisor {
3232
private readonly workerSession: SupervisorSession;
33-
private readonly httpServer: HttpServer;
33+
private readonly metricsServer?: HttpServer;
3434
private readonly workloadServer: WorkloadServer;
3535
private readonly workloadManager: WorkloadManager;
3636
private readonly logger = new SimpleStructuredLogger("managed-worker");
@@ -44,24 +44,15 @@ class ManagedSupervisor {
4444
private readonly warmStartUrl = env.TRIGGER_WARM_START_URL;
4545

4646
constructor() {
47-
const workloadApiProtocol = env.TRIGGER_WORKLOAD_API_PROTOCOL;
48-
const workloadApiDomain = env.TRIGGER_WORKLOAD_API_DOMAIN;
49-
const workloadApiPortExternal = env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL;
47+
const { TRIGGER_WORKER_TOKEN, MANAGED_WORKER_SECRET, ...envWithoutSecrets } = env;
5048

51-
if (env.POD_CLEANER_ENABLED) {
52-
this.podCleaner = new PodCleaner({
53-
namespace: env.KUBERNETES_NAMESPACE,
54-
batchSize: env.POD_CLEANER_BATCH_SIZE,
55-
intervalMs: env.POD_CLEANER_INTERVAL_MS,
56-
});
49+
if (env.DEBUG) {
50+
console.debug("[ManagedSupervisor] Starting up", { envWithoutSecrets });
5751
}
5852

59-
if (env.FAILED_POD_HANDLER_ENABLED) {
60-
this.failedPodHandler = new FailedPodHandler({
61-
namespace: env.KUBERNETES_NAMESPACE,
62-
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
63-
});
64-
}
53+
const workloadApiProtocol = env.TRIGGER_WORKLOAD_API_PROTOCOL;
54+
const workloadApiDomain = env.TRIGGER_WORKLOAD_API_DOMAIN;
55+
const workloadApiPortExternal = env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL;
6556

6657
if (this.warmStartUrl) {
6758
this.logger.log("[ManagedWorker] 🔥 Warm starts enabled", {
@@ -70,12 +61,43 @@ class ManagedSupervisor {
7061
}
7162

7263
if (this.isKubernetes) {
64+
if (env.POD_CLEANER_ENABLED) {
65+
this.logger.log("[ManagedWorker] 🧹 Pod cleaner enabled", {
66+
namespace: env.KUBERNETES_NAMESPACE,
67+
batchSize: env.POD_CLEANER_BATCH_SIZE,
68+
intervalMs: env.POD_CLEANER_INTERVAL_MS,
69+
});
70+
this.podCleaner = new PodCleaner({
71+
register,
72+
namespace: env.KUBERNETES_NAMESPACE,
73+
batchSize: env.POD_CLEANER_BATCH_SIZE,
74+
intervalMs: env.POD_CLEANER_INTERVAL_MS,
75+
});
76+
} else {
77+
this.logger.warn("[ManagedWorker] Pod cleaner disabled");
78+
}
79+
80+
if (env.FAILED_POD_HANDLER_ENABLED) {
81+
this.logger.log("[ManagedWorker] 🔁 Failed pod handler enabled", {
82+
namespace: env.KUBERNETES_NAMESPACE,
83+
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
84+
});
85+
this.failedPodHandler = new FailedPodHandler({
86+
register,
87+
namespace: env.KUBERNETES_NAMESPACE,
88+
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
89+
});
90+
} else {
91+
this.logger.warn("[ManagedWorker] Failed pod handler disabled");
92+
}
93+
7394
this.resourceMonitor = new KubernetesResourceMonitor(createK8sApi(), "");
7495
this.workloadManager = new KubernetesWorkloadManager({
7596
workloadApiProtocol,
7697
workloadApiDomain,
7798
workloadApiPort: workloadApiPortExternal,
7899
warmStartUrl: this.warmStartUrl,
100+
imagePullSecrets: env.KUBERNETES_IMAGE_PULL_SECRETS?.split(","),
79101
});
80102
} else {
81103
this.resourceMonitor = new DockerResourceMonitor(new Docker());
@@ -224,16 +246,21 @@ class ManagedSupervisor {
224246
}
225247
});
226248

227-
// Used for health checks and metrics
228-
this.httpServer = new HttpServer({ port: 8080, host: "0.0.0.0" }).route("/health", "GET", {
229-
handler: async ({ reply }) => {
230-
reply.text("OK");
231-
},
232-
});
249+
if (env.METRICS_ENABLED) {
250+
this.metricsServer = new HttpServer({
251+
port: env.METRICS_PORT,
252+
host: env.METRICS_HOST,
253+
metrics: {
254+
register,
255+
expose: true,
256+
},
257+
});
258+
}
233259

234260
// Responds to workload requests only
235261
this.workloadServer = new WorkloadServer({
236262
port: env.TRIGGER_WORKLOAD_API_PORT_INTERNAL,
263+
host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL,
237264
workerClient: this.workerSession.httpClient,
238265
checkpointClient: this.checkpointClient,
239266
});
@@ -299,13 +326,10 @@ class ManagedSupervisor {
299326
async start() {
300327
this.logger.log("[ManagedWorker] Starting up");
301328

302-
if (this.podCleaner) {
303-
await this.podCleaner.start();
304-
}
305-
306-
if (this.failedPodHandler) {
307-
await this.failedPodHandler.start();
308-
}
329+
// Optional services
330+
await this.podCleaner?.start();
331+
await this.failedPodHandler?.start();
332+
await this.metricsServer?.start();
309333

310334
if (env.TRIGGER_WORKLOAD_API_ENABLED) {
311335
this.logger.log("[ManagedWorker] Workload API enabled", {
@@ -319,21 +343,16 @@ class ManagedSupervisor {
319343
}
320344

321345
await this.workerSession.start();
322-
323-
await this.httpServer.start();
324346
}
325347

326348
async stop() {
327349
this.logger.log("[ManagedWorker] Shutting down");
328-
await this.httpServer.stop();
350+
await this.workerSession.stop();
329351

330-
if (this.podCleaner) {
331-
await this.podCleaner.stop();
332-
}
333-
334-
if (this.failedPodHandler) {
335-
await this.failedPodHandler.stop();
336-
}
352+
// Optional services
353+
await this.podCleaner?.stop();
354+
await this.failedPodHandler?.stop();
355+
await this.metricsServer?.stop();
337356
}
338357
}
339358

apps/supervisor/src/workloadManager/kubernetes.ts

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -182,18 +182,15 @@ export class KubernetesWorkloadManager implements WorkloadManager {
182182
}
183183
}
184184

185+
private getImagePullSecrets(): k8s.V1LocalObjectReference[] | undefined {
186+
return this.opts.imagePullSecrets?.map((name) => ({ name }));
187+
}
188+
185189
get #defaultPodSpec(): Omit<k8s.V1PodSpec, "containers"> {
186190
return {
187191
restartPolicy: "Never",
188192
automountServiceAccountToken: false,
189-
imagePullSecrets: [
190-
{
191-
name: "registry-trigger",
192-
},
193-
{
194-
name: "registry-trigger-failover",
195-
},
196-
],
193+
imagePullSecrets: this.getImagePullSecrets(),
197194
nodeSelector: {
198195
nodetype: "worker-re2",
199196
},

apps/supervisor/src/workloadManager/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export interface WorkloadManagerOptions {
55
workloadApiDomain?: string; // If unset, will use orchestrator-specific default
66
workloadApiPort: number;
77
warmStartUrl?: string;
8+
imagePullSecrets?: string[];
89
}
910

1011
export interface WorkloadManager {

apps/supervisor/src/workloadServer/index.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
} from "@trigger.dev/core/v3/workers";
2323
import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOnly";
2424
import { type IncomingMessage } from "node:http";
25+
import { register } from "../metrics.js";
2526

2627
// Use the official export when upgrading to [email protected]
2728
interface DefaultEventsMap {
@@ -121,7 +122,19 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
121122
}
122123

123124
private createHttpServer({ host, port }: { host: string; port: number }) {
124-
return new HttpServer({ port, host })
125+
return new HttpServer({
126+
port,
127+
host,
128+
metrics: {
129+
register,
130+
expose: false,
131+
},
132+
})
133+
.route("/health", "GET", {
134+
handler: async ({ reply }) => {
135+
reply.text("OK");
136+
},
137+
})
125138
.route(
126139
"/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/start",
127140
"POST",

apps/webapp/app/routes/admin.api.v1.workers.ts

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ import { WorkerGroupService } from "~/v3/services/worker/workerGroupService.serv
77
const RequestBodySchema = z.object({
88
name: z.string().optional(),
99
description: z.string().optional(),
10-
projectId: z.string().optional(),
11-
makeDefault: z.boolean().optional(),
10+
makeDefaultForProjectId: z.string().optional(),
1211
});
1312

1413
export async function action({ request }: ActionFunctionArgs) {
@@ -35,22 +34,21 @@ export async function action({ request }: ActionFunctionArgs) {
3534

3635
try {
3736
const rawBody = await request.json();
38-
const { name, description, projectId, makeDefault } = RequestBodySchema.parse(rawBody ?? {});
37+
const { name, description, makeDefaultForProjectId } = RequestBodySchema.parse(rawBody ?? {});
3938

4039
const service = new WorkerGroupService();
4140
const { workerGroup, token } = await service.createWorkerGroup({
4241
name,
4342
description,
4443
});
4544

46-
if (makeDefault && projectId) {
45+
if (makeDefaultForProjectId) {
4746
await prisma.project.update({
4847
where: {
49-
id: projectId,
48+
id: makeDefaultForProjectId,
5049
},
5150
data: {
5251
defaultWorkerGroupId: workerGroup.id,
53-
engine: "V2",
5452
},
5553
});
5654
}

apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.restore.ts

Lines changed: 0 additions & 51 deletions
This file was deleted.

0 commit comments

Comments
 (0)