Skip to content

[spicedb] Fix intermittent "Error: 4 DEADLINE_EXCEEDED...Waiting for LB pick" #20637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/content-service-api/typescript/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"lib"
],
"dependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1",
"inversify": "^6.0.1",
"opentracing": "^0.14.4"
Expand Down
2 changes: 1 addition & 1 deletion components/gitpod-protocol/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"src"
],
"devDependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"@testdeck/mocha": "^0.3.3",
"@types/analytics-node": "^3.1.9",
"@types/chai-subset": "^1.3.3",
Expand Down
87 changes: 87 additions & 0 deletions components/gitpod-protocol/src/util/grpc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import * as grpc from "@grpc/grpc-js";
import { Status } from "@grpc/grpc-js/build/src/constants";
import { log } from "./logging";
import { TrustedValue } from "./scrubbing";

export const defaultGRPCOptions = {
"grpc.keepalive_timeout_ms": 10000,
Expand Down Expand Up @@ -108,6 +110,91 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
};
}

export function createDebugLogInterceptor(additionalContextF: (() => object) | undefined): grpc.Interceptor {
const FAILURE_STATUS_CODES = new Map([
[Status.ABORTED, true],
[Status.CANCELLED, true],
[Status.DATA_LOSS, true],
[Status.DEADLINE_EXCEEDED, true],
[Status.FAILED_PRECONDITION, true],
[Status.INTERNAL, true],
[Status.PERMISSION_DENIED, true],
[Status.RESOURCE_EXHAUSTED, true],
[Status.UNAUTHENTICATED, true],
[Status.UNAVAILABLE, true],
[Status.UNIMPLEMENTED, true],
[Status.UNKNOWN, true],
]);

return (options, nextCall): grpc.InterceptingCall => {
const methodDef = options.method_definition;
const method = methodDef.path.substring(methodDef.path.lastIndexOf("/") + 1);
const service = methodDef.path.substring(1, methodDef.path.length - method.length - 1);
const labels = {
service,
method,
type: getGrpcMethodType(options.method_definition.requestStream, options.method_definition.responseStream),
};
const requester = new grpc.RequesterBuilder()
.withStart((metadata, listener, next) => {
const newListener = new grpc.ListenerBuilder()
.withOnReceiveStatus((status, next) => {
// If given, call the additionalContext function and log the result
let additionalContext = {};
try {
if (additionalContextF) {
additionalContext = additionalContextF();
}
} catch (e) {}

try {
const info = {
labels: new TrustedValue(labels),
metadata: new TrustedValue(metadata.toJSON()),
code: Status[status.code],
details: status.details,
additionalContext: new TrustedValue(additionalContext),
};
if (FAILURE_STATUS_CODES.has(status.code)) {
log.warn(`grpc call failed`, info);
} else {
log.debug(`grpc call status`, info);
}
} finally {
next(status);
}
})
.build();
try {
log.debug(`grpc call started`, {
labels: new TrustedValue(labels),
metadata: new TrustedValue(metadata.toJSON()),
});
} finally {
next(metadata, newListener);
}
})
.withCancel((next) => {
try {
log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) });
} finally {
next();
}
})
.build();
return new grpc.InterceptingCall(nextCall(options), requester);
};
}

export function isGrpcError(err: any): err is grpc.StatusObject {
return err.code && err.details;
}

export function isConnectionAlive(client: grpc.Client) {
const cs = client.getChannel().getConnectivityState(false);
return (
cs == grpc.connectivityState.CONNECTING ||
cs == grpc.connectivityState.IDLE ||
cs == grpc.connectivityState.READY
);
}
2 changes: 1 addition & 1 deletion components/ide-metrics-api/typescript-grpc/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"lib"
],
"dependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion components/image-builder-api/typescript/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"dependencies": {
"@gitpod/content-service": "0.1.5",
"@gitpod/gitpod-protocol": "0.1.5",
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1",
"inversify": "^6.0.1",
"opentracing": "^0.14.4"
Expand Down
9 changes: 2 additions & 7 deletions components/image-builder-api/typescript/src/sugar.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { ImageBuilderClient } from "./imgbuilder_grpc_pb";
import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing";
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
import { createClientCallMetricsInterceptor, IClientCallMetrics } from "@gitpod/gitpod-protocol/lib/util/grpc";
import { createClientCallMetricsInterceptor, IClientCallMetrics, isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc";
import * as opentracing from "opentracing";
import { Metadata } from "@grpc/grpc-js";
import {
Expand Down Expand Up @@ -130,12 +130,7 @@ export class PromisifiedImageBuilderClient {
) {}

public isConnectionAlive() {
const cs = this.client.getChannel().getConnectivityState(false);
return (
cs == grpc.connectivityState.CONNECTING ||
cs == grpc.connectivityState.IDLE ||
cs == grpc.connectivityState.READY
);
return isConnectionAlive(this.client);
}

public resolveBaseImage(ctx: TraceContext, request: ResolveBaseImageRequest): Promise<ResolveBaseImageResponse> {
Expand Down
2 changes: 1 addition & 1 deletion components/server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"/src"
],
"dependencies": {
"@authzed/authzed-node": "^0.15.0",
"@authzed/authzed-node": "^1.2.2",
"@connectrpc/connect": "1.1.2",
"@connectrpc/connect-express": "1.1.2",
"@gitbeaker/rest": "^39.12.0",
Expand Down
118 changes: 62 additions & 56 deletions components/server/src/authorization/spicedb-authorizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error";
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";

async function tryThree<T>(errMessage: string, code: (attempt: number) => Promise<T>): Promise<T> {
let attempt = 0;
// we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
// last time we checked it was 15 times per day (check logs)
while (attempt++ < 3) {
try {
return await code(attempt);
} catch (err) {
if (
(err.code === grpc.status.INTERNAL ||
err.code === grpc.status.DEADLINE_EXCEEDED ||
err.code === grpc.status.UNAVAILABLE) &&
attempt < 3
) {
let delay = 500 * attempt;
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
// we already waited for timeout, so let's try again immediately
delay = 0;
}

log.warn(errMessage, err, {
attempt,
delay,
code: err.code,
});
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
}

log.error(errMessage, err, {
attempt,
code: err.code,
});
// we don't try again on other errors
throw err;
}
}
throw new Error("unreachable");
}

export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
return new SpiceDBAuthorizer(clientProvider, new RequestLocalZedTokenCache());
}
Expand All @@ -71,13 +31,11 @@ interface DeletionResult {
deletedAt?: string;
}

const GRPC_DEADLINE = 10_000;

export class SpiceDBAuthorizer {
constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {}

private get client(): v1.ZedPromiseClientInterface {
return this.clientProvider.getClient();
}

public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise<boolean> {
req.consistency = await this.tokenCache.consistency(req.resource);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
Expand All @@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to perform authorization check.", () =>
this.client.checkPermission(req, this.callOptions),
const response = await this.call("[spicedb] Failed to perform authorization check.", (client) =>
client.checkPermission(req, this.callOptions),
);
const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION;
return { permitted, checkedAt: response.checkedAt?.token };
Expand Down Expand Up @@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to write relationships.", () =>
this.client.writeRelationships(
const response = await this.call("[spicedb] Failed to write relationships.", (client) =>
client.writeRelationships(
v1.WriteRelationshipsRequest.create({
updates,
}),
Expand Down Expand Up @@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
let error: Error | undefined;
try {
let deletedAt: string | undefined = undefined;
const existing = await tryThree("readRelationships before deleteRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const existing = await this.call("readRelationships before deleteRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (existing.length > 0) {
const response = await tryThree("deleteRelationships failed.", () =>
this.client.deleteRelationships(req, this.callOptions),
const response = await this.call("deleteRelationships failed.", (client) =>
client.deleteRelationships(req, this.callOptions),
);
deletedAt = response.deletedAt?.token;
const after = await tryThree("readRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const after = await this.call("readRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (after.length > 0) {
log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req });
Expand Down Expand Up @@ -213,7 +171,55 @@ export class SpiceDBAuthorizer {
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
req.consistency = await this.tokenCache.consistency(undefined);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions));
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
}

/**
* call retrieves a Spicedb client and executes the given code block.
* In addition to the gRPC-level retry mechanisms, it retries on "Waiting for LB pick" errors.
* This is required, because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
* @param description
* @param code
* @returns
*/
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
const MAX_ATTEMPTS = 3;
let attempt = 0;
while (attempt++ < 3) {
try {
const checkClient = attempt > 1; // the last client error'd out, so check if we should get a new one
const client = this.clientProvider.getClient(checkClient);
return code(client);
} catch (err) {
// Check: Is this a "no connection to upstream" error? If yes, retry here, to work around grpc/grpc-js bugs introducing high latency for re-tries
if (
(err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE) &&
attempt < MAX_ATTEMPTS
) {
let delay = 500 * attempt;
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
// we already waited for timeout, so let's try again immediately
delay = 0;
}

log.warn(description, err, {
attempt,
delay,
code: err.code,
});
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
}

// Some other error: log and rethrow
log.error(description, err, {
attempt,
code: err.code,
});
throw err;
}
}
throw new Error("unreachable");
}

/**
Expand All @@ -223,7 +229,7 @@ export class SpiceDBAuthorizer {
*/
private get callOptions(): grpc.Metadata {
return (<grpc.CallOptions>{
deadline: Date.now() + 8000,
deadline: Date.now() + GRPC_DEADLINE,
}) as any as grpc.Metadata;
}
}
Expand Down
Loading
Loading