Merge pull request #1269 from TheBlueMatt/022-01-no-disconnect-on-slow-persist

valentinewallace · web-flow · commit b1bdba52653c · 2022-01-21T11:52:05.000-05:00
Avoid disconnecting all peers if user code is slow
diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
@@ -61,7 +61,7 @@ const FRESHNESS_TIMER: u64 = 60;
 const FRESHNESS_TIMER: u64 = 1;
 
 #[cfg(all(not(test), not(debug_assertions)))]
-const PING_TIMER: u64 = 5;
+const PING_TIMER: u64 = 10;
 /// Signature operations take a lot longer without compiler optimisations.
 /// Increasing the ping timer allows for this but slower devices will be disconnected if the
 /// timeout is reached.
@@ -219,11 +219,17 @@ impl BackgroundProcessor {
 			let mut have_pruned = false;
 
 			loop {
-				peer_manager.process_events();
+				peer_manager.process_events(); // Note that this may block on ChannelManager's locking
 				channel_manager.process_pending_events(&event_handler);
 				chain_monitor.process_pending_events(&event_handler);
+
+				// We wait up to 100ms, but track how long it takes to detect being put to sleep,
+				// see `await_start`'s use below.
+				let await_start = Instant::now();
 				let updates_available =
 					channel_manager.await_persistable_update_timeout(Duration::from_millis(100));
+				let await_time = await_start.elapsed();
+
 				if updates_available {
 					log_trace!(logger, "Persisting ChannelManager...");
 					persister.persist_manager(&*channel_manager)?;
@@ -239,15 +245,20 @@ impl BackgroundProcessor {
 					channel_manager.timer_tick_occurred();
 					last_freshness_call = Instant::now();
 				}
-				if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 {
+				if await_time > Duration::from_secs(1) {
 					// On various platforms, we may be starved of CPU cycles for several reasons.
 					// E.g. on iOS, if we've been in the background, we will be entirely paused.
 					// Similarly, if we're on a desktop platform and the device has been asleep, we
 					// may not get any cycles.
-					// In any case, if we've been entirely paused for more than double our ping
-					// timer, we should have disconnected all sockets by now (and they're probably
-					// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
-					log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
+					// We detect this by checking if our max-100ms-sleep, above, ran longer than a
+					// full second, at which point we assume sockets may have been killed (they
+					// appear to be at least on some platforms, even if it has only been a second).
+					// Note that we have to take care to not get here just because user event
+					// processing was slow at the top of the loop. For example, the sample client
+					// may call Bitcoin Core RPCs during event handling, which very often takes
+					// more than a handful of seconds to complete, and shouldn't disconnect all our
+					// peers.
+					log_trace!(logger, "100ms sleep took more than a second, disconnecting peers.");
 					peer_manager.disconnect_all_peers();
 					last_ping_call = Instant::now();
 				} else if last_ping_call.elapsed().as_secs() > PING_TIMER {