Skip to content

Commit 9b6132d

Browse files
committed
Retry certain errors between server and gateway
1 parent 19c716b commit 9b6132d

File tree

2 files changed

+83
-23
lines changed

2 files changed

+83
-23
lines changed

jupyter_server/gateway/gateway_client.py

Lines changed: 81 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# Copyright (c) Jupyter Development Team.
22
# Distributed under the terms of the Modified BSD License.
3+
import asyncio
34
import json
5+
import logging
46
import os
7+
import typing as ty
58
from socket import gaierror
69

710
from tornado import web
8-
from tornado.httpclient import AsyncHTTPClient, HTTPError
11+
from tornado.httpclient import AsyncHTTPClient, HTTPClientError, HTTPResponse
912
from traitlets import Bool, Float, Int, TraitError, Unicode, default, validate
1013
from traitlets.config import SingletonConfigurable
1114

@@ -417,40 +420,96 @@ def load_connection_args(self, **kwargs):
417420
return kwargs
418421

419422

420-
async def gateway_request(endpoint, **kwargs):
423+
class RetryableHTTPClient:
    """
    Wrapper around tornado's `AsyncHTTPClient` that retries selected failures.

    Inspired by urllib3.util.Retry (https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html),
    this class is configured with the desired retry characteristics and exposes a
    `fetch()` method that re-issues applicable requests against an instance of
    `AsyncHTTPClient`. The current retry count is tracked on the instance, so one
    instance is meant to serve a single logical request.
    """

    MAX_RETRIES_DEFAULT = 2
    MAX_RETRIES_CAP = 10  # The upper limit to max_retries value.

    # The environment variable is read once, when this class body executes.
    # A value that is not an integer must not break the import of this module,
    # so it is reported and replaced with the default.
    try:
        max_retries: int = int(
            os.getenv("JUPYTER_GATEWAY_MAX_REQUEST_RETRIES", str(MAX_RETRIES_DEFAULT))
        )
    except ValueError:
        logging.getLogger("ServerApp").warning(
            "Ignoring non-integer JUPYTER_GATEWAY_MAX_REQUEST_RETRIES value; using default of %s.",
            MAX_RETRIES_DEFAULT,
        )
        max_retries = MAX_RETRIES_DEFAULT
    # Clamp to the supported range [0, MAX_RETRIES_CAP].
    max_retries = max(0, min(max_retries, MAX_RETRIES_CAP))

    # Only requests using these HTTP methods are retried.
    retried_methods: ty.Set[str] = {"GET", "DELETE"}
    # HTTPClientError codes that qualify for a retry.
    retried_errors: ty.Set[int] = {502, 503, 504, 599}
    # Exception types (other than HTTPClientError) that qualify for a retry.
    retried_exceptions: ty.Set[type] = {ConnectionError}
    # Base delay in seconds; the wait before retry n is backoff_factor * 2**n.
    backoff_factor: float = 0.1

    def __init__(self):
        """Start with a zero retry count and a fresh `AsyncHTTPClient`."""
        self.retry_count: int = 0
        self.client: AsyncHTTPClient = AsyncHTTPClient()

    async def fetch(self, endpoint: str, **kwargs: ty.Any) -> HTTPResponse:
        """
        Retryable AsyncHTTPClient.fetch method.

        When the request fails and the condition deserves a retry, the request is
        re-issued, up to max_retries times in total for this instance.

        :param endpoint: the URL to request.
        :param kwargs: keyword arguments forwarded unchanged to `AsyncHTTPClient.fetch`.
        :returns: the response of the first attempt that succeeds.
        :raises Exception: the exception of the failing attempt when it is not
            retryable or the retries are exhausted.
        """
        # tornado issues a GET when no method is given; mirror that default so a
        # missing "method" keyword cannot mask the real failure with a KeyError.
        method: str = kwargs.get("method", "GET")
        while True:
            try:
                return await self.client.fetch(endpoint, **kwargs)
            except Exception as e:
                if not await self._is_retryable(method, e):
                    # Bare raise preserves the original traceback.
                    raise
                logging.getLogger("ServerApp").info(
                    "Attempting retry (%s) against endpoint '%s'. Retried error: '%r'",
                    self.retry_count,
                    endpoint,
                    e,
                )
            # The next attempt runs outside the except block so that a later
            # failure is not chained onto every earlier one.

    async def _is_retryable(self, method: str, exception: Exception) -> bool:
        """
        Determines if the given exception is retryable based on object's configuration.

        When the answer is True this method has already slept for the backoff
        interval and incremented `retry_count`, so the caller can retry at once.

        :param method: HTTP method of the failed request.
        :param exception: the exception raised by the failed request.
        :returns: True when the request should be attempted again.
        """
        if method not in self.retried_methods:
            return False
        # >= rather than == so a count that ever exceeds the limit still stops.
        if self.retry_count >= self.max_retries:
            return False

        # Determine if error is retryable...
        if isinstance(exception, HTTPClientError):
            if exception.code not in self.retried_errors:
                return False
        elif not isinstance(exception, tuple(self.retried_exceptions)):
            return False

        # Is retryable, wait for backoff, then increment count
        await asyncio.sleep(self.backoff_factor * (2**self.retry_count))
        self.retry_count += 1
        return True
484+
485+
486+
async def gateway_request(endpoint: str, **kwargs: ty.Any) -> HTTPResponse:
    """
    Issue an asynchronous request to a kernel gateway endpoint and return its response.

    The keyword arguments are first passed through
    `GatewayClient.load_connection_args`, then the request is sent with a new
    `RetryableHTTPClient`. The failures handled below are re-raised as
    `tornado.web.HTTPError` so the user is told that the Gateway url is
    incorrect or that the server is not running.
    """
    kwargs = GatewayClient.instance().load_connection_args(**kwargs)
    retrying_client = RetryableHTTPClient()
    # NOTE: These errors are translated here because this function is called
    # during the server's startup and subsequent refreshes of the tree view.
    try:
        return await retrying_client.fetch(endpoint, **kwargs)
    except HTTPClientError as http_err:
        # Keep the status code reported by the HTTP client.
        gateway_url = GatewayClient.instance().url
        message = (
            f"Error attempting to connect to Gateway server url '{gateway_url}'. "
            "Ensure gateway url is valid and the Gateway instance is running."
        )
        raise web.HTTPError(http_err.code, message) from http_err
    except ConnectionError as conn_err:
        gateway_url = GatewayClient.instance().url
        message = (
            f"ConnectionError was received from Gateway server url '{gateway_url}'. "
            "Check to be sure the Gateway instance is running."
        )
        raise web.HTTPError(503, message) from conn_err
    except gaierror as addr_err:
        # Address lookup for the gateway host failed.
        gateway_url = GatewayClient.instance().url
        message = (
            f"The Gateway server specified in the gateway_url '{gateway_url}' doesn't "
            "appear to be valid. Ensure gateway url is valid and the Gateway instance is running."
        )
        raise web.HTTPError(404, message) from addr_err

jupyter_server/gateway/managers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,8 @@ def __init__(self, **kwargs):
326326
self.kernels_url = url_path_join(
327327
GatewayClient.instance().url, GatewayClient.instance().kernels_endpoint
328328
)
329-
self.kernel_url = self.kernel = self.kernel_id = None
329+
self.kernel_url: str
330+
self.kernel = self.kernel_id = None
330331
# simulate busy/activity markers:
331332
self.execution_state = self.last_activity = None
332333

0 commit comments

Comments
 (0)