Skip to content

Commit 78377c4

Browse files
committed
Back to single thread for connection recovery
This does not solve the problem of recovery when there are too many connections for the number of available processors. The sleep operations between connection retry attempts block threads, so an approached based on scheduling should be more appropriate.
1 parent 764cf0d commit 78377c4

File tree

9 files changed

+197
-162
lines changed

9 files changed

+197
-162
lines changed

src/main/java/com/rabbitmq/client/amqp/impl/AmqpConnection.java

Lines changed: 109 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -90,52 +90,27 @@ final class AmqpConnection extends ResourceBase implements Connection {
9090

9191
if (recoveryConfiguration.activated()) {
9292
this.recoveryRequestQueue = new ArrayBlockingQueue<>(10);
93-
Duration recoveryCheckPeriod = Duration.ofMillis(100L);
9493
this.recoveryTask =
95-
this.scheduledExecutorService()
96-
.scheduleAtFixedRate(
94+
this.executorService()
95+
.submit(
9796
() -> {
98-
Runnable recoveryTask = this.recoveryRequestQueue.poll();
99-
if (recoveryTask != null) {
100-
this.executorService()
101-
.submit(
102-
() -> {
103-
try {
104-
LOGGER.debug(
105-
"Running recovery task for connection '{}'...", this.name());
106-
recoveryTask.run();
107-
LOGGER.debug(
108-
"Recovery task for connection '{}' has run.", this.name());
109-
} catch (Exception e) {
110-
LOGGER.warn(
111-
"Error during connection recovery for '{}'", this.name(), e);
112-
}
113-
});
97+
LOGGER.debug("Starting recovery loop for '{}'", this.name());
98+
boolean keepGoing = true;
99+
while (keepGoing) {
100+
try {
101+
Runnable recoveryTask = this.recoveryRequestQueue.take();
102+
LOGGER.debug("Running recovery task for connection '{}'...", this.name());
103+
recoveryTask.run();
104+
LOGGER.debug("Recovery task for connection '{}' has run.", this.name());
105+
} catch (InterruptedException e) {
106+
LOGGER.info("Recovery loop for '{}' has been interrupted.", this.name());
107+
keepGoing = false;
108+
} catch (Exception e) {
109+
LOGGER.warn("Error during connection recovery for '{}'", this.name(), e);
110+
}
114111
}
115-
},
116-
recoveryCheckPeriod.toMillis(),
117-
recoveryCheckPeriod.toMillis(),
118-
TimeUnit.MILLISECONDS);
119-
// this.recoveryLoop =
120-
// this.executorService()
121-
// .submit(
122-
// () -> {
123-
// while (!Thread.currentThread().isInterrupted()) {
124-
// try {
125-
// Runnable recoveryTask = this.recoveryRequestQueue.take();
126-
// LOGGER.debug("Running recovery task for connection '{}'...",
127-
// this.name());
128-
// recoveryTask.run();
129-
// LOGGER.debug("Recovery task for connection '{}' has run.",
130-
// this.name());
131-
// } catch (InterruptedException e) {
132-
// return;
133-
// } catch (Exception e) {
134-
// LOGGER.warn("Error during connection recovery for '{}'",
135-
// this.name(), e);
136-
// }
137-
// }
138-
// });
112+
LOGGER.info("Closing recovery loop for '{}'.", this.name());
113+
});
139114
disconnectHandler = recoveryDisconnectHandler(recoveryConfiguration, builder.name());
140115
} else {
141116
disconnectHandler =
@@ -171,7 +146,8 @@ final class AmqpConnection extends ResourceBase implements Connection {
171146
this.affinity,
172147
this.environment.affinityCache(),
173148
this.affinityStrategy,
174-
ConnectionUtils.NO_RETRY_STRATEGY);
149+
ConnectionUtils.NO_RETRY_STRATEGY,
150+
this.name());
175151
this.sync(ncw);
176152
LOGGER.debug("Opened connection '{}' on node '{}'.", this.name(), this.connectionNodename());
177153
this.state(OPEN);
@@ -259,17 +235,18 @@ private NativeConnectionWrapper connect(
259235
Address address = connectionSettings.selectAddress(addresses);
260236
StopWatch stopWatch = new StopWatch();
261237
try {
262-
LOGGER.trace("Connecting...");
238+
LOGGER.trace("Connecting '{}' to {}...", this.name(), address);
263239
org.apache.qpid.protonj2.client.Connection connection =
264240
this.environment.client().connect(address.host(), address.port(), connectionOptions);
241+
LOGGER.debug("Created native connection instance for '{}'", this.name());
265242
ExceptionUtils.wrapGet(connection.openFuture());
266-
LOGGER.trace("Connection attempt succeeded");
243+
LOGGER.debug("Connection attempt '{}' succeeded", this.name());
267244
checkBrokerVersion(connection);
268245
return new NativeConnectionWrapper(connection, extractNode(connection), address);
269246
} catch (ClientException e) {
270247
throw ExceptionUtils.convert(e);
271248
} finally {
272-
LOGGER.trace("Connection attempt took {}", stopWatch.stop());
249+
LOGGER.debug("Connection attempt for '{}' took {}", this.name(), stopWatch.stop());
273250
}
274251
}
275252

@@ -302,7 +279,9 @@ private static String extractNode(org.apache.qpid.protonj2.client.Connection con
302279
TopologyListener createTopologyListener(AmqpConnectionBuilder builder) {
303280
TopologyListener topologyListener;
304281
if (builder.recoveryConfiguration().topology()) {
305-
RecordingTopologyListener rtl = new RecordingTopologyListener(this.executorService());
282+
RecordingTopologyListener rtl =
283+
new RecordingTopologyListener(
284+
"topology-listener-connection-" + this.name(), this.executorService());
306285
this.entityRecovery = new EntityRecovery(this, rtl);
307286
topologyListener = rtl;
308287
} else {
@@ -346,9 +325,12 @@ TopologyListener createTopologyListener(AmqpConnectionBuilder builder) {
346325
this.name(),
347326
exception.getMessage());
348327
this.recoveryRequestQueue.add(
349-
() ->
328+
() -> {
329+
if (!this.recoveringConnection.get()) {
350330
recoverAfterConnectionFailure(
351-
recoveryConfiguration, name, exception, resultReference));
331+
recoveryConfiguration, name, exception, resultReference);
332+
}
333+
});
352334
} else {
353335
LOGGER.debug(
354336
"Not recovering connection '{}' for error {}",
@@ -364,7 +346,7 @@ TopologyListener createTopologyListener(AmqpConnectionBuilder builder) {
364346
private void recoverAfterConnectionFailure(
365347
AmqpConnectionBuilder.AmqpRecoveryConfiguration recoveryConfiguration,
366348
String connectionName,
367-
AmqpException failureCause,
349+
Exception failureCause,
368350
AtomicReference<BiConsumer<org.apache.qpid.protonj2.client.Connection, DisconnectionEvent>>
369351
disconnectedHandlerReference) {
370352
LOGGER.info(
@@ -380,41 +362,59 @@ private void recoverAfterConnectionFailure(
380362
this.connectionAddress = null;
381363
LOGGER.debug("Releasing management resource of connection '{}'.", this.name());
382364
this.releaseManagementResources();
383-
try {
384-
this.recoveringConnection.set(true);
385-
LOGGER.debug("Connection attempt for '{}'.", this.name());
386-
NativeConnectionWrapper ncw =
387-
recoverNativeConnection(
388-
recoveryConfiguration, connectionName, disconnectedHandlerReference);
389-
this.sync(ncw);
390-
LOGGER.debug("Reconnected '{}' to {}", this.name(), this.currentConnectionLabel());
391-
} catch (Exception ex) {
392-
if (ex instanceof InterruptedException) {
393-
Thread.currentThread().interrupt();
365+
if (this.recoveringConnection.compareAndSet(false, true)) {
366+
try {
367+
this.recoveringConnection.set(true);
368+
LOGGER.debug("Connection attempt for '{}'.", this.name());
369+
NativeConnectionWrapper ncw =
370+
recoverNativeConnection(
371+
recoveryConfiguration, connectionName, disconnectedHandlerReference);
372+
this.sync(ncw);
373+
LOGGER.debug("Reconnected '{}' to {}", this.name(), this.currentConnectionLabel());
374+
} catch (Exception ex) {
375+
if (ex instanceof InterruptedException) {
376+
Thread.currentThread().interrupt();
377+
}
378+
this.close(ex);
379+
return;
380+
} finally {
381+
this.recoveringConnection.set(false);
394382
}
395-
this.closed.set(true);
396-
this.changeStateOfPublishers(CLOSED, ex);
397-
this.changeStateOfConsumers(CLOSED, ex);
398-
this.state(CLOSED, ex);
383+
} else {
384+
LOGGER.debug("Connection '{}' already recovering, returning.", this.name());
399385
return;
400-
} finally {
401-
this.recoveringConnection.set(false);
402386
}
403387

404388
try {
405389
if (recoveryConfiguration.topology()) {
390+
this.management.init();
406391
LOGGER.debug("Recovering topology of connection '{}'...", this.name());
407392
this.recoverTopology();
408393
this.recoverConsumers();
409394
this.recoverPublishers();
410395
LOGGER.debug("Recovered topology of connection '{}'.", this.name());
411396
}
412-
413-
LOGGER.info("Recovered connection to {}", this.currentConnectionLabel());
397+
LOGGER.info("Recovered connection '{}' to {}", this.name(), this.currentConnectionLabel());
414398
this.state(OPEN);
415399
} catch (Exception ex) {
416400
// likely InterruptedException or IO exception
417-
LOGGER.info("Error while trying to recover connection", ex);
401+
LOGGER.info(
402+
"Error while trying to recover topology for connection '{}': {}",
403+
this.name(),
404+
ex.getMessage());
405+
if (RECOVERY_PREDICATE.test(ex)) {
406+
LOGGER.debug(
407+
"Error during topology recoverable, queueing recovery task for '{}', error is {}",
408+
this.name(),
409+
ex.getMessage());
410+
this.recoveryRequestQueue.add(
411+
() -> {
412+
if (!this.recoveringConnection.get()) {
413+
recoverAfterConnectionFailure(
414+
recoveryConfiguration, name, ex, disconnectedHandlerReference);
415+
}
416+
});
417+
}
418418
}
419419
}
420420

@@ -428,10 +428,14 @@ private NativeConnectionWrapper recoverNativeConnection(
428428
while (true) {
429429
Duration delay = recoveryConfiguration.backOffDelayPolicy().delay(attempt);
430430
if (BackOffDelayPolicy.TIMEOUT.equals(delay)) {
431+
LOGGER.debug("Reached timeout to recover '{}'", this.name());
431432
throw new AmqpException("Recovery retry timed out");
432433
} else {
433434
try {
435+
LOGGER.debug(
436+
"Waiting for {} before trying to recover connection '{}'", delay, this.name());
434437
Thread.sleep(delay.toMillis());
438+
LOGGER.debug("Done waiting for '{}' recovery.", this.name());
435439
} catch (InterruptedException ex) {
436440
Thread.currentThread().interrupt();
437441
LOGGER.info("Thread interrupted while waiting during connection recovery");
@@ -440,38 +444,40 @@ private NativeConnectionWrapper recoverNativeConnection(
440444
}
441445

442446
try {
443-
NativeConnectionWrapper result =
444-
ConnectionUtils.enforceAffinity(
445-
addrs -> {
446-
NativeConnectionWrapper wrapper =
447-
connect(
448-
this.connectionSettings,
449-
connectionName,
450-
disconnectedHandlerReference.get(),
451-
addrs);
452-
this.nativeConnection = wrapper.connection();
453-
return wrapper;
454-
},
455-
this.management,
456-
this.affinity,
457-
this.environment.affinityCache(),
458-
this.affinityStrategy,
459-
new ConnectionUtils.RetryStrategy() {
460-
@Override
461-
public <T> T maybeRetry(Supplier<T> task) {
462-
return RetryUtils.callAndMaybeRetry(
463-
task::get,
464-
e -> true,
465-
recoveryConfiguration.backOffDelayPolicy(),
466-
"Connection affinity operation");
467-
}
468-
});
469-
return result;
447+
LOGGER.debug("Trying to create native connection for '{}'.", this.name());
448+
return ConnectionUtils.enforceAffinity(
449+
addrs -> {
450+
NativeConnectionWrapper wrapper =
451+
connect(
452+
this.connectionSettings,
453+
connectionName,
454+
disconnectedHandlerReference.get(),
455+
addrs);
456+
this.nativeConnection = wrapper.connection();
457+
return wrapper;
458+
},
459+
this.management,
460+
this.affinity,
461+
this.environment.affinityCache(),
462+
this.affinityStrategy,
463+
new ConnectionUtils.RetryStrategy() {
464+
@Override
465+
public <T> T maybeRetry(Supplier<T> task) {
466+
return RetryUtils.callAndMaybeRetry(
467+
task::get,
468+
e -> true,
469+
recoveryConfiguration.backOffDelayPolicy(),
470+
"Connection affinity operation");
471+
}
472+
},
473+
this.name());
470474
} catch (Exception ex) {
471475
LOGGER.info("Error while trying to recover connection", ex);
472476
if (!RECOVERY_PREDICATE.test(ex)) {
473477
LOGGER.info(
474-
"Stopping connection recovery, exception is not recoverable: {}", ex.getMessage());
478+
"Stopping connection '{}' recovery, exception is not recoverable: {}",
479+
this.name(),
480+
ex.getMessage());
475481
throw new AmqpException("Could not recover connection after fatal exception", ex);
476482
}
477483
}
@@ -721,7 +727,10 @@ private void close(Throwable cause) {
721727
consumer.close();
722728
}
723729
try {
724-
this.nativeConnection.close();
730+
org.apache.qpid.protonj2.client.Connection nc = this.nativeConnection;
731+
if (nc != null) {
732+
nc.close();
733+
}
725734
} catch (Exception e) {
726735
LOGGER.warn("Error while closing native connection", e);
727736
}

src/main/java/com/rabbitmq/client/amqp/impl/AmqpEnvironment.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import java.util.concurrent.atomic.AtomicBoolean;
2727
import java.util.concurrent.atomic.AtomicLong;
2828
import org.apache.qpid.protonj2.client.*;
29-
import org.apache.qpid.protonj2.client.futures.ClientFutureFactory;
3029

3130
class AmqpEnvironment implements Environment {
3231

@@ -56,7 +55,7 @@ class AmqpEnvironment implements Environment {
5655
this.id = ID_SEQUENCE.getAndIncrement();
5756
connectionSettings.copyTo(this.connectionSettings);
5857
this.connectionSettings.consolidate();
59-
ClientOptions clientOptions = new ClientOptions().futureType(ClientFutureFactory.PROGRESSIVE);
58+
ClientOptions clientOptions = new ClientOptions();
6059
this.client = Client.create(clientOptions);
6160

6261
if (executorService == null) {

0 commit comments

Comments
 (0)