Skip to content

Commit c875f6d

Browse files
committed
[server] SpiceDB client: retry with new client on "Waiting for LB pick" error
Tool: gitpod/catfood.gitpod.cloud
1 parent 99193d9 commit c875f6d

File tree

4 files changed

+101
-26
lines changed

4 files changed

+101
-26
lines changed

components/gitpod-protocol/src/util/grpc.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
110110
};
111111
}
112112

113-
export function createDebugLogInterceptor(): grpc.Interceptor {
113+
export function createDebugLogInterceptor(additionalContextF: (() => object) | undefined): grpc.Interceptor {
114114
const FAILURE_STATUS_CODES = new Map([
115115
[Status.ABORTED, true],
116116
[Status.CANCELLED, true],
@@ -139,11 +139,21 @@ export function createDebugLogInterceptor(): grpc.Interceptor {
139139
.withStart((metadata, listener, next) => {
140140
const newListener = new grpc.ListenerBuilder()
141141
.withOnReceiveStatus((status, next) => {
142+
// If given, call the additionalContext function and log the result
143+
let additionalContext = {};
144+
try {
145+
if (additionalContextF) {
146+
additionalContext = additionalContextF();
147+
}
148+
} catch (e) {}
149+
142150
try {
143151
const info = {
144152
labels: new TrustedValue(labels),
145153
metadata: new TrustedValue(metadata.toJSON()),
146154
code: Status[status.code],
155+
details: status.details,
156+
additionalContext: new TrustedValue(additionalContext),
147157
};
148158
if (FAILURE_STATUS_CODES.has(status.code)) {
149159
log.warn(`grpc call failed`, info);

components/server/src/authorization/spicedb-authorizer.ts

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -174,16 +174,52 @@ export class SpiceDBAuthorizer {
174174
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
175175
}
176176

177-
private async call<T>(errMessage: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
178-
try {
179-
const client = this.clientProvider.getClient();
180-
return code(client);
181-
} catch (err) {
182-
log.error(errMessage, err, {
183-
code: err.code,
184-
});
185-
throw err;
177+
/**
178+
* call retrieves a Spicedb client and executes the given code block.
179+
* In addition to the gRPC-level retry mechanisms, it retries on "Waiting for LB pick" errors.
180+
* This is required, because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
181+
* @param description
182+
* @param code
183+
* @returns
184+
*/
185+
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
186+
const MAX_ATTEMPTS = 3;
187+
let attempt = 0;
188+
while (attempt++ < 3) {
189+
try {
190+
const checkClient = attempt > 1; // the last client error'd out, so check if we should get a new one
191+
const client = this.clientProvider.getClient(checkClient);
192+
return code(client);
193+
} catch (err) {
194+
// Check: Is this a "no connection to upstream" error? If yes, retry here, to work around grpc/grpc-js bugs introducing high latency for re-tries
195+
if (
196+
(err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE) &&
197+
attempt < MAX_ATTEMPTS
198+
) {
199+
let delay = 500 * attempt;
200+
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
201+
// we already waited for timeout, so let's try again immediately
202+
delay = 0;
203+
}
204+
205+
log.warn(description, err, {
206+
attempt,
207+
delay,
208+
code: err.code,
209+
});
210+
await new Promise((resolve) => setTimeout(resolve, delay));
211+
continue;
212+
}
213+
214+
// Some other error: log and rethrow
215+
log.error(description, err, {
216+
attempt,
217+
code: err.code,
218+
});
219+
throw err;
220+
}
186221
}
222+
throw new Error("unreachable");
187223
}
188224

189225
/**

components/server/src/authorization/spicedb.ts

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
99
import * as grpc from "@grpc/grpc-js";
1010
import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server";
1111
import { TrustedValue } from "@gitpod/gitpod-protocol/lib/util/scrubbing";
12+
import { createDebugLogInterceptor } from "@gitpod/gitpod-protocol/lib/util/grpc";
13+
import { isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc";
1214

1315
export interface SpiceDBClientConfig {
1416
address: string;
@@ -54,6 +56,7 @@ const DefaultClientOptions: grpc.ClientOptions = {
5456
// default is 30s, which is too long for us during rollouts (where service DNS entries are updated)
5557
"grpc.dns_min_time_between_resolutions_ms": 2_000,
5658
};
59+
const CLIENT_CLOSE_DELAY = 60_000; // 60 [s]
5760

5861
export function spiceDBConfigFromEnv(): SpiceDBClientConfig | undefined {
5962
const token = process.env["SPICEDB_PRESHARED_KEY"];
@@ -110,9 +113,7 @@ export class SpiceDBClientProvider {
110113
});
111114

112115
// close client after 2 minutes to make sure most pending requests on the previous client are finished.
113-
setTimeout(() => {
114-
this.closeClient(oldClient);
115-
}, 2 * 60 * 1000);
116+
this.closeClientAfter(oldClient, CLIENT_CLOSE_DELAY);
116117
}
117118
this.clientOptions = clientOptions;
118119
// `createClient` will use the `DefaultClientOptions` to create client if the value on Feature Flag is not able to create a client
@@ -132,48 +133,77 @@ export class SpiceDBClientProvider {
132133
})();
133134
}
134135

135-
private closeClient(client: Client) {
136-
try {
137-
client.close();
138-
} catch (error) {
139-
log.error("[spicedb] Error closing client", error);
140-
}
136+
private closeClientAfter(client: Client, timeout: number): void {
137+
setTimeout(() => {
138+
try {
139+
client.close();
140+
} catch (error) {
141+
log.error("[spicedb] Error closing client", error);
142+
}
143+
}, timeout);
141144
}
142145

143146
private createClient(clientOptions: grpc.ClientOptions): Client {
144147
log.debug("[spicedb] Creating client", {
145148
clientOptions: new TrustedValue(clientOptions),
146149
});
150+
151+
// Inject debugLogInterceptor to log details especially on failed grpc calls
152+
let client: Client;
153+
const debugInfo = (): object => {
154+
if (!client) {
155+
return {};
156+
}
157+
const ch = client.getChannel();
158+
return {
159+
target: ch.getTarget(),
160+
state: ch.getConnectivityState(false),
161+
};
162+
};
163+
const interceptors = [...(this.interceptors || []), createDebugLogInterceptor(debugInfo)];
164+
165+
// Create the actual client
147166
try {
148-
return v1.NewClient(
167+
client = v1.NewClient(
149168
this.clientConfig.token,
150169
this.clientConfig.address,
151170
v1.ClientSecurity.INSECURE_PLAINTEXT_CREDENTIALS,
152171
undefined,
153172
{
154173
...clientOptions,
155-
interceptors: this.interceptors,
174+
interceptors,
156175
},
157176
) as Client;
158177
} catch (error) {
159178
log.error("[spicedb] Error create client, fallback to default options", error);
160-
return v1.NewClient(
179+
client = v1.NewClient(
161180
this.clientConfig.token,
162181
this.clientConfig.address,
163182
v1.ClientSecurity.INSECURE_PLAINTEXT_CREDENTIALS,
164183
undefined,
165184
{
166185
...DefaultClientOptions,
167-
interceptors: this.interceptors,
186+
interceptors,
168187
},
169188
) as Client;
170189
}
190+
return client;
171191
}
172192

173-
getClient(): SpiceDBClient {
193+
getClient(checkClient: boolean = false): SpiceDBClient {
174194
if (!this.client) {
175195
this.client = this.createClient(this.clientOptions);
176196
}
197+
198+
if (checkClient) {
199+
const oldClient = this.client;
200+
if (!isConnectionAlive(oldClient)) {
201+
const connectivityState = oldClient.getChannel().getConnectivityState(false);
202+
log.warn("[spicedb] Client is not alive, creating a new one", { connectivityState });
203+
this.closeClientAfter(oldClient, CLIENT_CLOSE_DELAY);
204+
this.client = this.createClient(this.clientOptions);
205+
}
206+
}
177207
return this.client.promises;
178208
}
179209
}

components/server/src/container-module.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import { DebugApp } from "@gitpod/gitpod-protocol/lib/util/debug-app";
1515
import {
1616
IClientCallMetrics,
1717
createClientCallMetricsInterceptor,
18-
createDebugLogInterceptor,
1918
defaultGRPCOptions,
2019
} from "@gitpod/gitpod-protocol/lib/util/grpc";
2120
import { prometheusClientMiddleware } from "@gitpod/gitpod-protocol/lib/util/nice-grpc";
@@ -342,7 +341,7 @@ export const productionContainerModule = new ContainerModule(
342341
const clientCallMetrics = ctx.container.get<IClientCallMetrics>(IClientCallMetrics);
343342
return new SpiceDBClientProvider(
344343
config, //
345-
[createClientCallMetricsInterceptor(clientCallMetrics), createDebugLogInterceptor()],
344+
[createClientCallMetricsInterceptor(clientCallMetrics)],
346345
);
347346
})
348347
.inSingletonScope();

0 commit comments

Comments
 (0)