Skip to content

Commit 560e4dc

Browse files
committed
[server] Streamline spicedb gRPC client usage and creation options
- instead of doing retries on two levels, rely on the gRPC-level retries - to mitigate the loss of insights, introduce createDebugLogInterceptor - client options: use sane defaults derived from the documentation instead of the excessive ones we had in place before - use "waitForReady" option: it should a) make our calls for responsive on re-connects, while b) - because we keep re-trying on DEADLINE_EXCEEDED - should be as reliable as before Tool: gitpod/catfood.gitpod.cloud
1 parent 2a69b53 commit 560e4dc

File tree

4 files changed

+113
-65
lines changed

4 files changed

+113
-65
lines changed

components/gitpod-protocol/src/util/grpc.ts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import * as grpc from "@grpc/grpc-js";
88
import { Status } from "@grpc/grpc-js/build/src/constants";
9+
import { log } from "./logging";
10+
import { TrustedValue } from "./scrubbing";
911

1012
export const defaultGRPCOptions = {
1113
"grpc.keepalive_timeout_ms": 10000,
@@ -108,6 +110,72 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
108110
};
109111
}
110112

113+
export function createDebugLogInterceptor(): grpc.Interceptor {
114+
const FAILURE_STATUS_CODES = new Map([
115+
[Status.ABORTED, true],
116+
[Status.CANCELLED, true],
117+
[Status.DATA_LOSS, true],
118+
[Status.DEADLINE_EXCEEDED, true],
119+
[Status.FAILED_PRECONDITION, true],
120+
[Status.INTERNAL, true],
121+
[Status.PERMISSION_DENIED, true],
122+
[Status.RESOURCE_EXHAUSTED, true],
123+
[Status.UNAUTHENTICATED, true],
124+
[Status.UNAVAILABLE, true],
125+
[Status.UNIMPLEMENTED, true],
126+
[Status.UNKNOWN, true],
127+
]);
128+
129+
return (options, nextCall): grpc.InterceptingCall => {
130+
const methodDef = options.method_definition;
131+
const method = methodDef.path.substring(methodDef.path.lastIndexOf("/") + 1);
132+
const service = methodDef.path.substring(1, methodDef.path.length - method.length - 1);
133+
const labels = {
134+
service,
135+
method,
136+
type: getGrpcMethodType(options.method_definition.requestStream, options.method_definition.responseStream),
137+
};
138+
const requester = new grpc.RequesterBuilder()
139+
.withStart((metadata, listener, next) => {
140+
const newListener = new grpc.ListenerBuilder()
141+
.withOnReceiveStatus((status, next) => {
142+
try {
143+
const info = {
144+
labels: new TrustedValue(labels),
145+
metadata: new TrustedValue(metadata.toJSON()),
146+
code: Status[status.code],
147+
};
148+
if (FAILURE_STATUS_CODES.has(status.code)) {
149+
log.warn(`grpc call failed`, info);
150+
} else {
151+
log.debug(`grpc call status`, info);
152+
}
153+
} finally {
154+
next(status);
155+
}
156+
})
157+
.build();
158+
try {
159+
log.debug(`grpc call started`, {
160+
labels: new TrustedValue(labels),
161+
metadata: new TrustedValue(metadata.toJSON()),
162+
});
163+
} finally {
164+
next(metadata, newListener);
165+
}
166+
})
167+
.withCancel((next) => {
168+
try {
169+
log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) });
170+
} finally {
171+
next();
172+
}
173+
})
174+
.build();
175+
return new grpc.InterceptingCall(nextCall(options), requester);
176+
};
177+
}
178+
111179
export function isGrpcError(err: any): err is grpc.StatusObject {
112180
return err.code && err.details;
113181
}

components/server/src/authorization/spicedb-authorizer.ts

Lines changed: 26 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
1717
import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error";
1818
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
1919

20-
async function tryThree<T>(errMessage: string, code: (attempt: number) => Promise<T>): Promise<T> {
21-
let attempt = 0;
22-
// we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
23-
// last time we checked it was 15 times per day (check logs)
24-
while (attempt++ < 3) {
25-
try {
26-
return await code(attempt);
27-
} catch (err) {
28-
if (
29-
(err.code === grpc.status.INTERNAL ||
30-
err.code === grpc.status.DEADLINE_EXCEEDED ||
31-
err.code === grpc.status.UNAVAILABLE) &&
32-
attempt < 3
33-
) {
34-
let delay = 500 * attempt;
35-
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
36-
// we already waited for timeout, so let's try again immediately
37-
delay = 0;
38-
}
39-
40-
log.warn(errMessage, err, {
41-
attempt,
42-
delay,
43-
code: err.code,
44-
});
45-
await new Promise((resolve) => setTimeout(resolve, delay));
46-
continue;
47-
}
48-
49-
log.error(errMessage, err, {
50-
attempt,
51-
code: err.code,
52-
});
53-
// we don't try again on other errors
54-
throw err;
55-
}
56-
}
57-
throw new Error("unreachable");
58-
}
59-
6020
export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
6121
return new SpiceDBAuthorizer(clientProvider, new RequestLocalZedTokenCache());
6222
}
@@ -71,13 +31,11 @@ interface DeletionResult {
7131
deletedAt?: string;
7232
}
7333

34+
const GRPC_DEADLINE = 10_000;
35+
7436
export class SpiceDBAuthorizer {
7537
constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {}
7638

77-
private get client(): v1.ZedPromiseClientInterface {
78-
return this.clientProvider.getClient();
79-
}
80-
8139
public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise<boolean> {
8240
req.consistency = await this.tokenCache.consistency(req.resource);
8341
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
9957
const timer = spicedbClientLatency.startTimer();
10058
let error: Error | undefined;
10159
try {
102-
const response = await tryThree("[spicedb] Failed to perform authorization check.", () =>
103-
this.client.checkPermission(req, this.callOptions),
60+
const response = await this.call("[spicedb] Failed to perform authorization check.", (client) =>
61+
client.checkPermission(req, this.callOptions),
10462
);
10563
const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION;
10664
return { permitted, checkedAt: response.checkedAt?.token };
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
13997
const timer = spicedbClientLatency.startTimer();
14098
let error: Error | undefined;
14199
try {
142-
const response = await tryThree("[spicedb] Failed to write relationships.", () =>
143-
this.client.writeRelationships(
100+
const response = await this.call("[spicedb] Failed to write relationships.", (client) =>
101+
client.writeRelationships(
144102
v1.WriteRelationshipsRequest.create({
145103
updates,
146104
}),
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
175133
let error: Error | undefined;
176134
try {
177135
let deletedAt: string | undefined = undefined;
178-
const existing = await tryThree("readRelationships before deleteRelationships failed.", () =>
179-
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
136+
const existing = await this.call("readRelationships before deleteRelationships failed.", (client) =>
137+
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
180138
);
181139
if (existing.length > 0) {
182-
const response = await tryThree("deleteRelationships failed.", () =>
183-
this.client.deleteRelationships(req, this.callOptions),
140+
const response = await this.call("deleteRelationships failed.", (client) =>
141+
client.deleteRelationships(req, this.callOptions),
184142
);
185143
deletedAt = response.deletedAt?.token;
186-
const after = await tryThree("readRelationships failed.", () =>
187-
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
144+
const after = await this.call("readRelationships failed.", (client) =>
145+
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
188146
);
189147
if (after.length > 0) {
190148
log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req });
@@ -213,7 +171,19 @@ export class SpiceDBAuthorizer {
213171
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
214172
req.consistency = await this.tokenCache.consistency(undefined);
215173
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
216-
return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions));
174+
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
175+
}
176+
177+
private async call<T>(errMessage: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
178+
try {
179+
const client = this.clientProvider.getClient();
180+
return code(client);
181+
} catch (err) {
182+
log.error(errMessage, err, {
183+
code: err.code,
184+
});
185+
throw err;
186+
}
217187
}
218188

219189
/**
@@ -223,7 +193,7 @@ export class SpiceDBAuthorizer {
223193
*/
224194
private get callOptions(): grpc.Metadata {
225195
return (<grpc.CallOptions>{
226-
deadline: Date.now() + 8000,
196+
deadline: Date.now() + GRPC_DEADLINE,
227197
}) as any as grpc.Metadata;
228198
}
229199
}

components/server/src/authorization/spicedb.ts

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,24 +17,33 @@ export interface SpiceDBClientConfig {
1717

1818
export type SpiceDBClient = v1.ZedPromiseClientInterface;
1919
type Client = v1.ZedClientInterface & grpc.Client;
20+
2021
const DEFAULT_FEATURE_FLAG_VALUE = "undefined";
2122
const DefaultClientOptions: grpc.ClientOptions = {
2223
// we ping frequently to check if the connection is still alive
23-
"grpc.keepalive_time_ms": 1000,
24-
"grpc.keepalive_timeout_ms": 1000,
24+
"grpc.keepalive_time_ms": 30_000,
25+
"grpc.keepalive_timeout_ms": 4_000,
26+
27+
"grpc.max_reconnect_backoff_ms": 5_000,
28+
"grpc.initial_reconnect_backoff_ms": 1_000,
2529

26-
"grpc.max_reconnect_backoff_ms": 5000,
27-
"grpc.initial_reconnect_backoff_ms": 500,
30+
// docs on client-side retry support: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/examples/retry/README.md?plain=1#L3
31+
"grpc.service_config_disable_resolution": 1, // don't resolve from external, but guarantee to take this config
2832
"grpc.service_config": JSON.stringify({
2933
methodConfig: [
3034
{
35+
// here is the code that shows how an empty shape matches every method: https://github.com/grpc/grpc-node/blob/bfd87a9bf62ebc438bcf98a7af223d5353f4c8b2/packages/grpc-js/src/resolving-load-balancer.ts#L62-L147
3136
name: [{}],
37+
// docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L88C29-L88C43
38+
waitForReady: true,
39+
// docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L136
3240
retryPolicy: {
33-
maxAttempts: 10,
34-
initialBackoff: "0.1s",
41+
maxAttempts: 5,
42+
initialBackoff: "1s",
3543
maxBackoff: "5s",
3644
backoffMultiplier: 2.0,
37-
retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED"],
45+
// validation code: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/packages/grpc-js/src/service-config.ts#L182C1-L197C4
46+
retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED", "INTERNAL"],
3847
},
3948
},
4049
],
@@ -43,7 +52,7 @@ const DefaultClientOptions: grpc.ClientOptions = {
4352

4453
// Governs how log DNS resolution results are cached (at minimum!)
4554
// default is 30s, which is too long for us during rollouts (where service DNS entries are updated)
46-
"grpc.dns_min_time_between_resolutions_ms": 2000,
55+
"grpc.dns_min_time_between_resolutions_ms": 2_000,
4756
};
4857

4958
export function spiceDBConfigFromEnv(): SpiceDBClientConfig | undefined {

components/server/src/container-module.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import { DebugApp } from "@gitpod/gitpod-protocol/lib/util/debug-app";
1515
import {
1616
IClientCallMetrics,
1717
createClientCallMetricsInterceptor,
18+
createDebugLogInterceptor,
1819
defaultGRPCOptions,
1920
} from "@gitpod/gitpod-protocol/lib/util/grpc";
2021
import { prometheusClientMiddleware } from "@gitpod/gitpod-protocol/lib/util/nice-grpc";
@@ -341,7 +342,7 @@ export const productionContainerModule = new ContainerModule(
341342
const clientCallMetrics = ctx.container.get<IClientCallMetrics>(IClientCallMetrics);
342343
return new SpiceDBClientProvider(
343344
config, //
344-
[createClientCallMetricsInterceptor(clientCallMetrics)],
345+
[createClientCallMetricsInterceptor(clientCallMetrics), createDebugLogInterceptor()],
345346
);
346347
})
347348
.inSingletonScope();

0 commit comments

Comments
 (0)