Skip to content

Commit 9c5fe16

Browse files
committed
PYTHON-2462 Avoid connection storms: implement pool PAUSED state
Mark the server unknown and clear the pool when background connections fail. Eagerly evict threads from the wait queue when the pool is paused. Evicted threads will raise the following error: AutoReconnect('localhost:27017: connection pool paused'). CMAP spec test changes: (1) CMAP unit tests should not use real monitors; (2) assert that CMAP threads complete all scheduled operations.
1 parent 61232b7 commit 9c5fe16

38 files changed

+686
-245
lines changed

doc/changelog.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ Changelog
44
Changes in Version 4.0
55
----------------------
66

7+
Breaking Changes in 4.0
8+
```````````````````````
9+
10+
- Removed :mod:`~pymongo.thread_util`.
11+
712
Issues Resolved
813
...............
914

pymongo/event_loggers.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ class ConnectionPoolLogger(monitoring.ConnectionPoolListener):
171171
def pool_created(self, event):
172172
logging.info("[pool {0.address}] pool created".format(event))
173173

174+
def pool_ready(self, event):
    """Log, at INFO level, that the pool at ``event.address`` is ready.

    :Parameters:
      - `event`: the pool-ready event; only its ``address`` attribute is
        read here.
    """
    message = "[pool {0.address}] pool ready".format(event)
    logging.info(message)
176+
174177
def pool_cleared(self, event):
175178
logging.info("[pool {0.address}] pool cleared".format(event))
176179

pymongo/mongo_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -737,7 +737,7 @@ def target():
737737

738738
executor = periodic_executor.PeriodicExecutor(
739739
interval=common.KILL_CURSOR_FREQUENCY,
740-
min_interval=0.5,
740+
min_interval=common.MIN_HEARTBEAT_INTERVAL,
741741
target=target,
742742
name="pymongo_kill_cursors_thread")
743743

pymongo/monitoring.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,18 @@ def pool_created(self, event):
255255
"""
256256
raise NotImplementedError
257257

258+
def pool_ready(self, event):
    """Abstract method to handle a :class:`PoolReadyEvent`.

    Invoked when a Connection Pool has been marked ready. This base
    implementation always raises :exc:`NotImplementedError`.

    :Parameters:
      - `event`: An instance of :class:`PoolReadyEvent`.

    .. versionadded:: 4.0
    """
    raise NotImplementedError
269+
258270
def pool_cleared(self, event):
259271
"""Abstract method to handle a `PoolClearedEvent`.
260272
@@ -692,6 +704,18 @@ def __repr__(self):
692704
self.__class__.__name__, self.address, self.__options)
693705

694706

707+
class PoolReadyEvent(_PoolEvent):
    """Event published when a Connection Pool is marked ready.

    :Parameters:
     - `address`: The address (host, port) pair of the server this Pool is
       attempting to connect to.

    .. versionadded:: 4.0
    """
    # This subclass declares no additional slots of its own.
    __slots__ = ()
717+
718+
695719
class PoolClearedEvent(_PoolEvent):
696720
"""Published when a Connection Pool is cleared.
697721
@@ -1475,6 +1499,16 @@ def publish_pool_created(self, address, options):
14751499
except Exception:
14761500
_handle_exception()
14771501

1502+
def publish_pool_ready(self, address):
    """Publish a :class:`PoolReadyEvent` to all pool listeners.

    :Parameters:
      - `address`: passed to the :class:`PoolReadyEvent` constructor.
    """
    ready_event = PoolReadyEvent(address)
    for listener in self.__cmap_listeners:
        try:
            listener.pool_ready(ready_event)
        except Exception:
            # Listener errors are routed to _handle_exception() instead of
            # being raised directly from this loop.
            _handle_exception()
1511+
14781512
def publish_pool_cleared(self, address):
14791513
"""Publish a :class:`PoolClearedEvent` to all pool listeners.
14801514
"""

pymongo/pool.py

Lines changed: 101 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from bson import DEFAULT_CODEC_OPTIONS
3131
from bson.py3compat import imap, itervalues, _unicode, PY3
3232
from bson.son import SON
33-
from pymongo import auth, helpers, thread_util, __version__
33+
from pymongo import auth, helpers, __version__
3434
from pymongo.client_session import _validate_session_write_concern
3535
from pymongo.common import (MAX_BSON_SIZE,
3636
MAX_CONNECTING,
@@ -46,6 +46,7 @@
4646
CertificateError,
4747
ConnectionFailure,
4848
ConfigurationError,
49+
ExceededMaxWaiters,
4950
InvalidOperation,
5051
DocumentTooLarge,
5152
NetworkTimeout,
@@ -309,7 +310,8 @@ class PoolOptions(object):
309310
'__wait_queue_timeout', '__wait_queue_multiple',
310311
'__ssl_context', '__ssl_match_hostname', '__socket_keepalive',
311312
'__event_listeners', '__appname', '__driver', '__metadata',
312-
'__compression_settings', '__max_connecting')
313+
'__compression_settings', '__max_connecting',
314+
'__pause_enabled')
313315

314316
def __init__(self, max_pool_size=MAX_POOL_SIZE,
315317
min_pool_size=MIN_POOL_SIZE,
@@ -318,7 +320,8 @@ def __init__(self, max_pool_size=MAX_POOL_SIZE,
318320
wait_queue_multiple=None, ssl_context=None,
319321
ssl_match_hostname=True, socket_keepalive=True,
320322
event_listeners=None, appname=None, driver=None,
321-
compression_settings=None, max_connecting=MAX_CONNECTING):
323+
compression_settings=None, max_connecting=MAX_CONNECTING,
324+
pause_enabled=True):
322325

323326
self.__max_pool_size = max_pool_size
324327
self.__min_pool_size = min_pool_size
@@ -335,6 +338,7 @@ def __init__(self, max_pool_size=MAX_POOL_SIZE,
335338
self.__driver = driver
336339
self.__compression_settings = compression_settings
337340
self.__max_connecting = max_connecting
341+
self.__pause_enabled = pause_enabled
338342
self.__metadata = copy.deepcopy(_METADATA)
339343
if appname:
340344
self.__metadata['application'] = {'name': appname}
@@ -406,6 +410,10 @@ def max_connecting(self):
406410
"""
407411
return self.__max_connecting
408412

413+
@property
def pause_enabled(self):
    """The ``pause_enabled`` value these pool options were created with."""
    return self.__pause_enabled
416+
409417
@property
410418
def max_idle_time_seconds(self):
411419
"""The maximum number of seconds that a connection can remain
@@ -1058,6 +1066,8 @@ class _PoolClosedError(PyMongoError):
10581066
pass
10591067

10601068

1069+
# Values assigned to Pool.state.
PAUSED, READY, CLOSED = 0, 1, 2
1070+
10611071
# Do *not* explicitly inherit from object or Jython won't call __del__
10621072
# http://bugs.jython.org/issue1057
10631073
class Pool:
@@ -1068,6 +1078,9 @@ def __init__(self, address, options, handshake=True):
10681078
- `options`: a PoolOptions instance
10691079
- `handshake`: whether to call ismaster for each new SocketInfo
10701080
"""
1081+
self.state = READY
1082+
if options.pause_enabled:
1083+
self.state = PAUSED
10711084
# Check a socket's health with socket_closed() every once in a while.
10721085
# Can override for testing: 0 to always check, None to never check.
10731086
self._check_interval_seconds = 1
@@ -1079,7 +1092,6 @@ def __init__(self, address, options, handshake=True):
10791092
self.active_sockets = 0
10801093
# Monotonically increasing connection ID required for CMAP Events.
10811094
self.next_connection_id = 1
1082-
self.closed = False
10831095
# Track whether the sockets in this pool are writeable or not.
10841096
self.is_writable = None
10851097

@@ -1098,13 +1110,23 @@ def __init__(self, address, options, handshake=True):
10981110

10991111
if (self.opts.wait_queue_multiple is None or
11001112
self.opts.max_pool_size is None):
1101-
max_waiters = None
1113+
max_waiters = float('inf')
11021114
else:
11031115
max_waiters = (
11041116
self.opts.max_pool_size * self.opts.wait_queue_multiple)
1105-
1106-
self._socket_semaphore = thread_util.create_semaphore(
1107-
self.opts.max_pool_size, max_waiters)
1117+
# The first portion of the wait queue.
1118+
# Enforces: maxPoolSize and waitQueueMultiple
1119+
# Also used for: clearing the wait queue
1120+
self.size_cond = threading.Condition(self.lock)
1121+
self.requests = 0
1122+
self.max_pool_size = self.opts.max_pool_size
1123+
if self.max_pool_size is None:
1124+
self.max_pool_size = float('inf')
1125+
self.waiters = 0
1126+
self.max_waiters = max_waiters
1127+
# The second portion of the wait queue.
1128+
# Enforces: maxConnecting
1129+
# Also used for: clearing the wait queue
11081130
self._max_connecting_cond = threading.Condition(self.lock)
11091131
self._max_connecting = self.opts.max_connecting
11101132
self._pending = 0
@@ -1114,10 +1136,22 @@ def __init__(self, address, options, handshake=True):
11141136
# Similar to active_sockets but includes threads in the wait queue.
11151137
self.operation_count = 0
11161138

1139+
def ready(self):
    """Mark this pool READY and announce the transition.

    A pool-ready event is published only when the previous state was not
    already READY and CMAP events are enabled for this pool.
    """
    previous_state = self.state
    self.state = READY
    if previous_state == READY or not self.enabled_for_cmap:
        return
    self.opts.event_listeners.publish_pool_ready(self.address)
1144+
1145+
@property
def closed(self):
    """True when this pool's state is CLOSED, False otherwise."""
    return self.state == CLOSED
1148+
11171149
def _reset(self, close):
1118-
with self.lock:
1150+
with self.size_cond:
11191151
if self.closed:
11201152
return
1153+
if self.opts.pause_enabled:
1154+
self.state = PAUSED
11211155
self.generation += 1
11221156
newpid = os.getpid()
11231157
if self.pid != newpid:
@@ -1126,7 +1160,10 @@ def _reset(self, close):
11261160
self.operation_count = 0
11271161
sockets, self.sockets = self.sockets, collections.deque()
11281162
if close:
1129-
self.closed = True
1163+
self.state = CLOSED
1164+
# Clear the wait queue
1165+
self._max_connecting_cond.notify_all()
1166+
self.size_cond.notify_all()
11301167

11311168
listeners = self.opts.event_listeners
11321169
# CMAP spec says that close() MUST close sockets before publishing the
@@ -1164,6 +1201,9 @@ def remove_stale_sockets(self, reference_generation, all_credentials):
11641201
`generation` at the point in time this operation was requested on the
11651202
pool.
11661203
"""
1204+
if self.state != READY:
1205+
return
1206+
11671207
if self.opts.max_idle_time_seconds is not None:
11681208
with self.lock:
11691209
while (self.sockets and
@@ -1172,15 +1212,14 @@ def remove_stale_sockets(self, reference_generation, all_credentials):
11721212
sock_info.close_socket(ConnectionClosedReason.IDLE)
11731213

11741214
while True:
1175-
with self.lock:
1215+
with self.size_cond:
1216+
# There are enough sockets in the pool.
11761217
if (len(self.sockets) + self.active_sockets >=
11771218
self.opts.min_pool_size):
1178-
# There are enough sockets in the pool.
11791219
return
1180-
1181-
# We must acquire the semaphore to respect max_pool_size.
1182-
if not self._socket_semaphore.acquire(False):
1183-
return
1220+
if self.requests >= self.opts.min_pool_size:
1221+
return
1222+
self.requests += 1
11841223
incremented = False
11851224
try:
11861225
with self._max_connecting_cond:
@@ -1204,7 +1243,10 @@ def remove_stale_sockets(self, reference_generation, all_credentials):
12041243
with self._max_connecting_cond:
12051244
self._pending -= 1
12061245
self._max_connecting_cond.notify()
1207-
self._socket_semaphore.release()
1246+
1247+
with self.size_cond:
1248+
self.requests -= 1
1249+
self.size_cond.notify()
12081250

12091251
def connect(self, all_credentials=None):
12101252
"""Connect to Mongo and return a new SocketInfo.
@@ -1289,6 +1331,15 @@ def get_socket(self, all_credentials, checkout=False):
12891331
if not checkout:
12901332
self.return_socket(sock_info)
12911333

1334+
def _raise_if_not_ready(self):
    """Raise a connection failure if this pool is currently PAUSED.

    Returns without doing anything unless pausing is enabled in the pool
    options and the pool's state is PAUSED. In the paused case a
    check-out-failed event with reason CONN_ERROR is published (when CMAP
    events are enabled) and then _raise_connection_failure() is called
    with an AutoReconnect('connection pool paused') error.
    """
    paused = self.opts.pause_enabled and self.state == PAUSED
    if not paused:
        return
    if self.enabled_for_cmap:
        listeners = self.opts.event_listeners
        listeners.publish_connection_check_out_failed(
            self.address, ConnectionCheckOutFailedReason.CONN_ERROR)
    # TODO: ensure this error is retryable
    _raise_connection_failure(
        self.address, AutoReconnect('connection pool paused'))
1342+
12921343
def _get_socket(self, all_credentials):
12931344
"""Get or create a SocketInfo. Can raise ConnectionFailure."""
12941345
# We use the pid here to avoid issues with fork / multiprocessing.
@@ -1313,9 +1364,26 @@ def _get_socket(self, all_credentials):
13131364
deadline = _time() + self.opts.wait_queue_timeout
13141365
else:
13151366
deadline = None
1316-
if not self._socket_semaphore.acquire(
1317-
True, self.opts.wait_queue_timeout):
1318-
self._raise_wait_queue_timeout()
1367+
1368+
with self.size_cond:
1369+
self._raise_if_not_ready()
1370+
if self.waiters >= self.max_waiters:
1371+
raise ExceededMaxWaiters(
1372+
'exceeded max waiters: %s threads already waiting' % (
1373+
self.waiters))
1374+
self.waiters += 1
1375+
try:
1376+
while not (self.requests < self.max_pool_size):
1377+
if not _cond_wait(self.size_cond, deadline):
1378+
# Timed out, notify the next thread to ensure a
1379+
# timeout doesn't consume the condition.
1380+
if self.requests < self.max_pool_size:
1381+
self.size_cond.notify()
1382+
self._raise_wait_queue_timeout()
1383+
self._raise_if_not_ready()
1384+
finally:
1385+
self.waiters -= 1
1386+
self.requests += 1
13191387

13201388
# We've now reserved a pool slot by incrementing self.requests and must decrement it on error.
13211389
sock_info = None
@@ -1330,6 +1398,9 @@ def _get_socket(self, all_credentials):
13301398
# CMAP: we MUST wait for either maxConnecting OR for a socket
13311399
# to be checked back into the pool.
13321400
with self._max_connecting_cond:
1401+
emitted_event = True
1402+
self._raise_if_not_ready()
1403+
emitted_event = False
13331404
while not (self.sockets or
13341405
self._pending < self._max_connecting):
13351406
if not _cond_wait(self._max_connecting_cond, deadline):
@@ -1340,6 +1411,9 @@ def _get_socket(self, all_credentials):
13401411
self._max_connecting_cond.notify()
13411412
emitted_event = True
13421413
self._raise_wait_queue_timeout()
1414+
emitted_event = True
1415+
self._raise_if_not_ready()
1416+
emitted_event = False
13431417

13441418
try:
13451419
sock_info = self.sockets.popleft()
@@ -1361,11 +1435,11 @@ def _get_socket(self, all_credentials):
13611435
if sock_info:
13621436
# We checked out a socket but authentication failed.
13631437
sock_info.close_socket(ConnectionClosedReason.ERROR)
1364-
self._socket_semaphore.release()
1365-
1366-
if incremented:
1367-
with self.lock:
1438+
with self.size_cond:
1439+
self.requests -= 1
1440+
if incremented:
13681441
self.active_sockets -= 1
1442+
self.size_cond.notify()
13691443

13701444
if self.enabled_for_cmap and not emitted_event:
13711445
self.opts.event_listeners.publish_connection_check_out_failed(
@@ -1401,10 +1475,11 @@ def return_socket(self, sock_info):
14011475
# Notify any threads waiting to create a connection.
14021476
self._max_connecting_cond.notify()
14031477

1404-
self._socket_semaphore.release()
1405-
with self.lock:
1478+
with self.size_cond:
1479+
self.requests -= 1
14061480
self.active_sockets -= 1
14071481
self.operation_count -= 1
1482+
self.size_cond.notify()
14081483

14091484
def _perished(self, sock_info):
14101485
"""Return True and close the connection if it is "perished".

0 commit comments

Comments
 (0)