Avoid disconnecting all peers if user code is slow

TheBlueMatt · TheBlueMatt · commit 87bd47067ddb · 2022-01-20T18:23:51.000Z
In the sample client (and likely other downstream users), event
processing may block on slow operations (e.g. Bitcoin Core RPCs)
and ChannelManager persistence may take some time. This should be
fine, except that we consider this a case of possible backgrounding
and disconnect all of our peers when it happens.

Instead, we avoid counting time spent in event processing as part
of the time elapsed between PeerManager events.
diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
@@ -218,16 +218,24 @@ impl BackgroundProcessor {
 			let mut last_prune_call = Instant::now();
 			let mut have_pruned = false;
 
+			// When considering how long it's taken since the last timer tick, we don't want to
+			// count any time spent in user code, especially since event processing can block on
+			// disk writes. Thus, we track how long we spent in event handling here.
+			let mut ev_handle_time_since_last_ping = Duration::from_millis(0);
 			loop {
-				peer_manager.process_events();
+				let ev_handle_start = Instant::now();
+				peer_manager.process_events(); // Note that this may block on ChannelManager's locking
 				channel_manager.process_pending_events(&event_handler);
 				chain_monitor.process_pending_events(&event_handler);
+				ev_handle_time_since_last_ping += ev_handle_start.elapsed();
 				let updates_available =
 					channel_manager.await_persistable_update_timeout(Duration::from_millis(100));
 				if updates_available {
+					let persist_start = Instant::now();
 					log_trace!(logger, "Persisting ChannelManager...");
 					persister.persist_manager(&*channel_manager)?;
 					log_trace!(logger, "Done persisting ChannelManager.");
+					ev_handle_time_since_last_ping += Instant::now() - persist_start;
 				}
 				// Exit the loop if the background processor was requested to stop.
 				if stop_thread.load(Ordering::Acquire) == true {
@@ -239,21 +247,28 @@ impl BackgroundProcessor {
 					channel_manager.timer_tick_occurred();
 					last_freshness_call = Instant::now();
 				}
-				if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 {
+				if (last_ping_call.elapsed() - ev_handle_time_since_last_ping).as_secs() > PING_TIMER * 2 {
 					// On various platforms, we may be starved of CPU cycles for several reasons.
 					// E.g. on iOS, if we've been in the background, we will be entirely paused.
 					// Similarly, if we're on a desktop platform and the device has been asleep, we
 					// may not get any cycles.
 					// In any case, if we've been entirely paused for more than double our ping
 					// timer, we should have disconnected all sockets by now (and they're probably
 					// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
+					// Note that we have to take care to not get here just because user event
+					// processing was slow at the top of the loop. For example, the sample client
+					// may call Bitcoin Core RPCs during event handling, which very often takes
+					// more than a handful of seconds to complete, and shouldn't disconnect all our
+					// peers.
 					log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
 					peer_manager.disconnect_all_peers();
 					last_ping_call = Instant::now();
+					ev_handle_time_since_last_ping = Duration::from_millis(0);
 				} else if last_ping_call.elapsed().as_secs() > PING_TIMER {
 					log_trace!(logger, "Calling PeerManager's timer_tick_occurred");
 					peer_manager.timer_tick_occurred();
 					last_ping_call = Instant::now();
+					ev_handle_time_since_last_ping = Duration::from_millis(0);
 				}
 
 				// Note that we want to run a graph prune once not long after startup before