
Commit 5c2ff2c

Persist ChannelMonitors after new blocks are connected
This resolves several user complaints (and issues in the sample node) where startup is substantially delayed as we're always waiting for the chain data to sync.

Further, in an upcoming PR, we'll be reloading pending payments from ChannelMonitors on restart, at which point we'll need the change here which avoids handling events until after the user has confirmed the `ChannelMonitor` has been persisted to disk. It avoids a race where we

* send a payment/HTLC (persisting the monitor to disk with the HTLC pending),
* force-close the channel, removing the channel entry from the ChannelManager entirely,
* persist the ChannelManager,
* connect a block which contains a fulfill of the HTLC, generating a claim event,
* handle the claim event while the `ChannelMonitor` is being persisted,
* persist the ChannelManager (before the ChannelMonitor is persisted fully),
* restart, reloading the HTLC as a pending payment in the ChannelManager, which now has no references to it except from the ChannelMonitor which still has the pending HTLC,
* replay the block connection, generating a duplicate PaymentSent event.
1 parent 89ad059 commit 5c2ff2c

7 files changed (+184, -46 lines)

fuzz/src/utils/test_persister.rs

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ impl chainmonitor::Persist<EnforcingSigner> for TestPersister {
 		self.update_ret.lock().unwrap().clone()
 	}
 
-	fn update_persisted_channel(&self, _funding_txo: OutPoint, _update: &channelmonitor::ChannelMonitorUpdate, _data: &channelmonitor::ChannelMonitor<EnforcingSigner>, _update_id: MonitorUpdateId) -> Result<(), chain::ChannelMonitorUpdateErr> {
+	fn update_persisted_channel(&self, _funding_txo: OutPoint, _update: &Option<channelmonitor::ChannelMonitorUpdate>, _data: &channelmonitor::ChannelMonitor<EnforcingSigner>, _update_id: MonitorUpdateId) -> Result<(), chain::ChannelMonitorUpdateErr> {
 		self.update_ret.lock().unwrap().clone()
 	}
 }

lightning-persister/src/lib.rs

Lines changed: 6 additions & 1 deletion
@@ -159,13 +159,18 @@ impl FilesystemPersister {
 }
 
 impl<ChannelSigner: Sign> chainmonitor::Persist<ChannelSigner> for FilesystemPersister {
+	// TODO: We really need a way for the persister to inform the user that its time to crash/shut
+	// down once these start returning failure.
+	// A PermanentFailure implies we need to shut down since we're force-closing channels without
+	// even broadcasting!
+
 	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>, _update_id: chainmonitor::MonitorUpdateId) -> Result<(), chain::ChannelMonitorUpdateErr> {
 		let filename = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index);
 		util::write_to_file(self.path_to_monitor_data(), filename, monitor)
 			.map_err(|_| chain::ChannelMonitorUpdateErr::PermanentFailure)
 	}
 
-	fn update_persisted_channel(&self, funding_txo: OutPoint, _update: &ChannelMonitorUpdate, monitor: &ChannelMonitor<ChannelSigner>, _update_id: chainmonitor::MonitorUpdateId) -> Result<(), chain::ChannelMonitorUpdateErr> {
+	fn update_persisted_channel(&self, funding_txo: OutPoint, _update: &Option<ChannelMonitorUpdate>, monitor: &ChannelMonitor<ChannelSigner>, _update_id: chainmonitor::MonitorUpdateId) -> Result<(), chain::ChannelMonitorUpdateErr> {
 		let filename = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index);
 		util::write_to_file(self.path_to_monitor_data(), filename, monitor)
 			.map_err(|_| chain::ChannelMonitorUpdateErr::PermanentFailure)
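The TODO above notes that a PermanentFailure from the persister means channels are being force-closed without broadcasting, so the node really needs to shut down; this commit only records the failure internally (and emits `MonitorEvent::UpdateFailed`). As a rough sketch only — not part of this commit, with import paths assumed from the crate layout visible in this diff — a user could wrap their `Persist` implementation to surface that condition to their main loop:

use std::sync::atomic::{AtomicBool, Ordering};

use lightning::chain::ChannelMonitorUpdateErr;
use lightning::chain::chainmonitor::{self, MonitorUpdateId};
use lightning::chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate};
use lightning::chain::keysinterface::Sign;
use lightning::chain::transaction::OutPoint;

/// Hypothetical wrapper around any `Persist` implementation that records whether a
/// PermanentFailure was ever returned, so the node operator can notice and shut down.
pub struct ShutdownFlaggingPersister<P> {
	inner: P,
	perm_failed: AtomicBool,
}

impl<P> ShutdownFlaggingPersister<P> {
	pub fn new(inner: P) -> Self {
		Self { inner, perm_failed: AtomicBool::new(false) }
	}
	/// Poll this from the main loop; once it returns true, begin an orderly shutdown.
	pub fn should_shut_down(&self) -> bool {
		self.perm_failed.load(Ordering::Acquire)
	}
	fn flag<T>(&self, res: Result<T, ChannelMonitorUpdateErr>) -> Result<T, ChannelMonitorUpdateErr> {
		// Remember that persistence has permanently failed, then pass the result through.
		if let Err(ChannelMonitorUpdateErr::PermanentFailure) = &res {
			self.perm_failed.store(true, Ordering::Release);
		}
		res
	}
}

impl<ChannelSigner: Sign, P: chainmonitor::Persist<ChannelSigner>> chainmonitor::Persist<ChannelSigner> for ShutdownFlaggingPersister<P> {
	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr> {
		self.flag(self.inner.persist_new_channel(funding_txo, monitor, update_id))
	}

	fn update_persisted_channel(&self, funding_txo: OutPoint, update: &Option<ChannelMonitorUpdate>, monitor: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr> {
		self.flag(self.inner.update_persisted_channel(funding_txo, update, monitor, update_id))
	}
}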

lightning/src/chain/chainmonitor.rs

Lines changed: 125 additions & 27 deletions
@@ -32,6 +32,7 @@ use chain::chaininterface::{BroadcasterInterface, FeeEstimator};
 use chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate, Balance, MonitorEvent, TransactionOutputs};
 use chain::transaction::{OutPoint, TransactionData};
 use chain::keysinterface::Sign;
+use util::atomic_counter::AtomicCounter;
 use util::logger::Logger;
 use util::errors::APIError;
 use util::events;
@@ -41,10 +42,19 @@ use ln::channelmanager::ChannelDetails;
 use prelude::*;
 use sync::{RwLock, RwLockReadGuard, Mutex, MutexGuard};
 use core::ops::Deref;
+use core::sync::atomic::{AtomicBool, Ordering};
 
 #[derive(Clone, Copy, Hash, PartialEq, Eq)]
+/// A specific update's ID stored in a `MonitorUpdateId`, separated out to make the contents
+/// entirely opaque.
 enum UpdateOrigin {
+	/// An update that was generated by the `ChannelManager` (via our `chain::Watch`
+	/// implementation). This corresponds to an actual [`ChannelMonitorUpdate::update_id`] field
+	/// and [`ChannelMonitor::get_latest_update_id`].
 	OffChain(u64),
+	/// An update that was generated during blockchain processing. The ID here is specific to the
+	/// generating [`ChainMonitor`] and does *not* correspond to any on-disk IDs.
+	ChainSync(u64),
 }
 
 /// An opaque identifier describing a specific [`Persist`] method call.
@@ -103,6 +113,12 @@ pub trait Persist<ChannelSigner: Sign> {
 	/// updated monitor itself to disk/backups. See the [`Persist`] trait documentation for more
 	/// details.
 	///
+	/// During blockchain synchronization operations, this may be called with no
+	/// [`ChannelMonitorUpdate`], in which case the full [`ChannelMonitor`] needs to be persisted.
+	/// Note that after the full [`ChannelMonitor`] is persisted any previous
+	/// [`ChannelMonitorUpdate`]s which were persisted should be discarded - they can no longer be
+	/// applied to the persisted [`ChannelMonitor`] as they were already applied.
+	///
 	/// If an implementer chooses to persist the updates only, they need to make
 	/// sure that all the updates are applied to the `ChannelMonitors` *before*
 	/// the set of channel monitors is given to the `ChannelManager`
@@ -123,7 +139,7 @@ pub trait Persist<ChannelSigner: Sign> {
 	/// [`ChannelMonitorUpdateErr`] for requirements when returning errors.
 	///
 	/// [`Writeable::write`]: crate::util::ser::Writeable::write
-	fn update_persisted_channel(&self, channel_id: OutPoint, update: &ChannelMonitorUpdate, data: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr>;
+	fn update_persisted_channel(&self, channel_id: OutPoint, update: &Option<ChannelMonitorUpdate>, data: &ChannelMonitor<ChannelSigner>, update_id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr>;
 }
 
 struct MonitorHolder<ChannelSigner: Sign> {
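To make the new contract concrete, here is a rough sketch — not part of this commit — of an update-only persister that follows the documentation above: `Some(update)` calls store just the delta, while a `None` (chain-sync) call writes the full monitor and discards the now-stale deltas. The in-memory maps stand in for a real durable store, and the import paths are assumptions based on this diff:

use std::collections::HashMap;
use std::sync::Mutex;

use lightning::chain::ChannelMonitorUpdateErr;
use lightning::chain::chainmonitor::{self, MonitorUpdateId};
use lightning::chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate};
use lightning::chain::keysinterface::Sign;
use lightning::chain::transaction::OutPoint;
use lightning::util::ser::Writeable;

/// Toy in-memory "store", purely to illustrate the Some(update)-vs-None handling; a real
/// implementation must write durably to disk before returning Ok(()).
#[derive(Default)]
struct DeltaPersister {
	full_monitors: Mutex<HashMap<OutPoint, Vec<u8>>>,
	pending_updates: Mutex<HashMap<OutPoint, Vec<Vec<u8>>>>,
}

impl<ChannelSigner: Sign> chainmonitor::Persist<ChannelSigner> for DeltaPersister {
	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>, _id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr> {
		self.full_monitors.lock().unwrap().insert(funding_txo, monitor.encode());
		Ok(())
	}

	fn update_persisted_channel(&self, funding_txo: OutPoint, update: &Option<ChannelMonitorUpdate>, monitor: &ChannelMonitor<ChannelSigner>, _id: MonitorUpdateId) -> Result<(), ChannelMonitorUpdateErr> {
		match update {
			// Off-chain update generated by the ChannelManager: persisting only the delta is
			// allowed, provided every stored delta is re-applied to the monitor before it is
			// handed to the ChannelManager on startup.
			Some(update) => {
				self.pending_updates.lock().unwrap().entry(funding_txo).or_default().push(update.encode());
			},
			// Chain-sync persistence: there is no ChannelMonitorUpdate, so the full monitor must
			// be written. Previously stored deltas have already been applied to it and must be
			// discarded once the full write is durable.
			None => {
				self.full_monitors.lock().unwrap().insert(funding_txo, monitor.encode());
				self.pending_updates.lock().unwrap().remove(&funding_txo);
			},
		}
		Ok(())
	}
}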
@@ -134,14 +150,35 @@ struct MonitorHolder<ChannelSigner: Sign> {
 	/// update_persisted_channel, the user returns a TemporaryFailure, and then calls
 	/// channel_monitor_updated immediately, racing our insertion of the pending update into the
 	/// contained Vec.
+	///
+	/// Beyond the synchronization of updates themselves, we cannot handle user events until after
+	/// any chain updates have been stored on disk. Thus, we scan this list when returning updates
+	/// to the ChannelManager, refusing to return any updates for a ChannelMonitor which is still
+	/// being persisted fully to disk after a chain update.
+	///
+	/// This avoids the possibility of handling, e.g. an on-chain claim, generating a claim monitor
+	/// event, resulting in the relevant ChannelManager generating a PaymentSent event and dropping
+	/// the pending payment entry, and then reloading before the monitor is persisted, resulting in
+	/// the ChannelManager re-adding the same payment entry, before the same block is replayed,
+	/// resulting in a duplicate PaymentSent event.
 	pending_monitor_updates: Mutex<Vec<MonitorUpdateId>>,
+	/// When the user returns a PermanentFailure error from an update_persisted_channel call during
+	/// block processing, we inform the ChannelManager that the channel should be closed
+	/// asynchronously. In order to ensure no further changes happen before the ChannelManager has
+	/// processed the closure event, we set this to true and return PermanentFailure for any other
+	/// chain::Watch events.
+	channel_perm_failed: AtomicBool,
 }
 
 impl<ChannelSigner: Sign> MonitorHolder<ChannelSigner> {
 	fn has_pending_offchain_updates(&self, pending_monitor_updates_lock: &MutexGuard<Vec<MonitorUpdateId>>) -> bool {
 		pending_monitor_updates_lock.iter().any(|update_id|
 			if let UpdateOrigin::OffChain(_) = update_id.contents { true } else { false })
 	}
+	fn has_pending_chainsync_updates(&self, pending_monitor_updates_lock: &MutexGuard<Vec<MonitorUpdateId>>) -> bool {
+		pending_monitor_updates_lock.iter().any(|update_id|
+			if let UpdateOrigin::ChainSync(_) = update_id.contents { true } else { false })
+	}
 }
 
 /// A read-only reference to a current ChannelMonitor.
@@ -177,11 +214,17 @@ pub struct ChainMonitor<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: De
 	P::Target: Persist<ChannelSigner>,
 {
 	monitors: RwLock<HashMap<OutPoint, MonitorHolder<ChannelSigner>>>,
+	/// When we generate a [`MonitorUpdateId`] for a chain-event monitor persistence, we need a
+	/// unique ID, which we calculate by simply getting the next value from this counter. Note that
+	/// the ID is never persisted so it's ok that they reset on restart.
+	sync_persistence_id: AtomicCounter,
 	chain_source: Option<C>,
 	broadcaster: T,
 	logger: L,
 	fee_estimator: F,
 	persister: P,
+	/// "User-provided" (ie persistence-completion/-failed) [`MonitorEvent`]s. These came directly
+	/// from the user and not from a [`ChannelMonitor`].
 	pending_monitor_events: Mutex<Vec<MonitorEvent>>,
 }
 
@@ -206,26 +249,50 @@ where C::Target: chain::Filter,
 		FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs>
 	{
 		let mut dependent_txdata = Vec::new();
-		let monitor_states = self.monitors.read().unwrap();
-		for monitor_state in monitor_states.values() {
-			let mut txn_outputs = process(&monitor_state.monitor, txdata);
+		{
+			let monitor_states = self.monitors.write().unwrap();
+			for (funding_outpoint, monitor_state) in monitor_states.iter() {
+				let monitor = &monitor_state.monitor;
+				let mut txn_outputs;
+				{
+					txn_outputs = process(monitor, txdata);
+					let update_id = MonitorUpdateId {
+						contents: UpdateOrigin::ChainSync(self.sync_persistence_id.get_increment()),
+					};
+					let mut pending_monitor_updates = monitor_state.pending_monitor_updates.lock().unwrap();
+
+					log_trace!(self.logger, "Syncing Channel Monitor for channel {}", log_funding_info!(monitor));
+					match self.persister.update_persisted_channel(*funding_outpoint, &None, monitor, update_id) {
+						Ok(()) =>
+							log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)),
+						Err(ChannelMonitorUpdateErr::PermanentFailure) => {
+							monitor_state.channel_perm_failed.store(true, Ordering::Release);
+							self.pending_monitor_events.lock().unwrap().push(MonitorEvent::UpdateFailed(*funding_outpoint));
+						},
+						Err(ChannelMonitorUpdateErr::TemporaryFailure) => {
+							log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor));
+							pending_monitor_updates.push(update_id);
+						},
+					}
+				}
 
-			// Register any new outputs with the chain source for filtering, storing any dependent
-			// transactions from within the block that previously had not been included in txdata.
-			if let Some(ref chain_source) = self.chain_source {
-				let block_hash = header.block_hash();
-				for (txid, mut outputs) in txn_outputs.drain(..) {
-					for (idx, output) in outputs.drain(..) {
-						// Register any new outputs with the chain source for filtering and recurse
-						// if it indicates that there are dependent transactions within the block
-						// that had not been previously included in txdata.
-						let output = WatchedOutput {
-							block_hash: Some(block_hash),
-							outpoint: OutPoint { txid, index: idx as u16 },
-							script_pubkey: output.script_pubkey,
-						};
-						if let Some(tx) = chain_source.register_output(output) {
-							dependent_txdata.push(tx);
+				// Register any new outputs with the chain source for filtering, storing any dependent
+				// transactions from within the block that previously had not been included in txdata.
+				if let Some(ref chain_source) = self.chain_source {
+					let block_hash = header.block_hash();
+					for (txid, mut outputs) in txn_outputs.drain(..) {
+						for (idx, output) in outputs.drain(..) {
+							// Register any new outputs with the chain source for filtering and recurse
+							// if it indicates that there are dependent transactions within the block
+							// that had not been previously included in txdata.
+							let output = WatchedOutput {
+								block_hash: Some(block_hash),
+								outpoint: OutPoint { txid, index: idx as u16 },
+								script_pubkey: output.script_pubkey,
+							};
+							if let Some(tx) = chain_source.register_output(output) {
+								dependent_txdata.push(tx);
+							}
 						}
 					}
 				}
@@ -251,6 +318,7 @@ where C::Target: chain::Filter,
 	pub fn new(chain_source: Option<C>, broadcaster: T, logger: L, feeest: F, persister: P) -> Self {
 		Self {
 			monitors: RwLock::new(HashMap::new()),
+			sync_persistence_id: AtomicCounter::new(),
 			chain_source,
 			broadcaster,
 			logger,
@@ -337,7 +405,7 @@ where C::Target: chain::Filter,
 		pending_monitor_updates.retain(|update_id| *update_id != completed_update_id);
 
 		match completed_update_id {
-			MonitorUpdateId { .. } => {
+			MonitorUpdateId { contents: UpdateOrigin::OffChain(_) } => {
 				// Note that we only check for `UpdateOrigin::OffChain` failures here - if
 				// we're being told that a `UpdateOrigin::OffChain` monitor update completed,
 				// we only care about ensuring we don't tell the `ChannelManager` to restore
@@ -348,16 +416,22 @@ where C::Target: chain::Filter,
 				// `MonitorEvent`s from the monitor back to the `ChannelManager` until they
 				// complete.
 				let monitor_is_pending_updates = monitor_data.has_pending_offchain_updates(&pending_monitor_updates);
-				if monitor_is_pending_updates {
-					// If there are still monitor updates pending, we cannot yet construct an
+				if monitor_is_pending_updates || monitor_data.channel_perm_failed.load(Ordering::Acquire) {
+					// If there are still monitor updates pending (or an old monitor update
+					// finished after a later one perm-failed), we cannot yet construct an
 					// UpdateCompleted event.
 					return Ok(());
 				}
 				self.pending_monitor_events.lock().unwrap().push(MonitorEvent::UpdateCompleted {
 					funding_txo,
 					monitor_update_id: monitor_data.monitor.get_latest_update_id(),
 				});
-			}
+			},
+			MonitorUpdateId { contents: UpdateOrigin::ChainSync(_) } => {
+				// We've already done everything we need to, the next time
+				// release_pending_monitor_events is called, any events for this ChannelMonitor
+				// will be returned if there's no more SyncPersistId events left.
+			},
 		}
 		Ok(())
 	}
@@ -502,7 +576,11 @@ where C::Target: chain::Filter,
 				monitor.load_outputs_to_watch(chain_source);
 			}
 		}
-		entry.insert(MonitorHolder { monitor, pending_monitor_updates: Mutex::new(pending_monitor_updates) });
+		entry.insert(MonitorHolder {
+			monitor,
+			pending_monitor_updates: Mutex::new(pending_monitor_updates),
+			channel_perm_failed: AtomicBool::new(false),
+		});
 		persist_res
 	}
 
@@ -534,15 +612,19 @@ where C::Target: chain::Filter,
 		// still be changed. So, persist the updated monitor despite the error.
 		let update_id = MonitorUpdateId::from_monitor_update(&update);
 		let mut pending_monitor_updates = monitor_state.pending_monitor_updates.lock().unwrap();
-		let persist_res = self.persister.update_persisted_channel(funding_txo, &update, monitor, update_id);
+		let persist_res = self.persister.update_persisted_channel(funding_txo, &Some(update), monitor, update_id);
 		if let Err(e) = persist_res {
 			if e == ChannelMonitorUpdateErr::TemporaryFailure {
 				pending_monitor_updates.push(update_id);
+			} else {
+				monitor_state.channel_perm_failed.store(true, Ordering::Release);
 			}
 			log_error!(self.logger, "Failed to persist channel monitor update: {:?}", e);
 		}
 		if update_res.is_err() {
 			Err(ChannelMonitorUpdateErr::PermanentFailure)
+		} else if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
+			Err(ChannelMonitorUpdateErr::PermanentFailure)
 		} else {
 			persist_res
 		}
@@ -553,7 +635,23 @@ where C::Target: chain::Filter,
 	fn release_pending_monitor_events(&self) -> Vec<MonitorEvent> {
 		let mut pending_monitor_events = self.pending_monitor_events.lock().unwrap().split_off(0);
 		for monitor_state in self.monitors.read().unwrap().values() {
-			pending_monitor_events.append(&mut monitor_state.monitor.get_and_clear_pending_monitor_events());
+			let is_pending_monitor_update = monitor_state.has_pending_chainsync_updates(&monitor_state.pending_monitor_updates.lock().unwrap());
+			if is_pending_monitor_update {
+				log_info!(self.logger, "A Channel Monitor sync is still in progress, refusing to provide monitor events!");
+			} else {
+				if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
+					// If a `UpdateOrigin::ChainSync` persistence failed with `PermanantFailure`,
+					// we don't really know if the latest `ChannelMonitor` state is on disk or not.
+					// We're supposed to hold monitor updates until the latest state is on disk to
+					// avoid duplicate events, but the user told us persistence is screw-y and may
+					// not complete. We can't hold events forever because we may learn some payment
+					// preimage, so instead we just log and hope the user complied with the
+					// `PermanentFailure` requirements of having at least the local-disk copy
+					// updated.
+					log_info!(self.logger, "A Channel Monitor sync returned PermanentFailure. Returning monitor events but duplicate events may appear after reload!");
+				}
+				pending_monitor_events.append(&mut monitor_state.monitor.get_and_clear_pending_monitor_events());
+			}
 		}
 		pending_monitor_events
 	}

lightning/src/chain/channelmonitor.rs

Lines changed: 19 additions & 1 deletion
@@ -146,16 +146,24 @@ pub enum MonitorEvent {
 		/// same [`ChannelMonitor`] have been applied and persisted.
 		monitor_update_id: u64,
 	},
+
+	/// Indicates a [`ChannelMonitor`] update has failed. See
+	/// [`ChannelMonitorUpdateErr::PermanentFailure`] for more information on how this is used.
+	///
+	/// [`ChannelMonitorUpdateErr::PermanentFailure`]: super::ChannelMonitorUpdateErr::PermanentFailure
+	UpdateFailed(OutPoint),
 }
 impl_writeable_tlv_based_enum_upgradable!(MonitorEvent,
-	// Note that UpdateCompleted is currently never serialized to disk as it is generated only in ChainMonitor
+	// Note that UpdateCompleted and UpdateFailed are currently never serialized to disk as they are
+	// generated only in ChainMonitor
 	(0, UpdateCompleted) => {
 		(0, funding_txo, required),
 		(2, monitor_update_id, required),
 	},
 	;
 	(2, HTLCEvent),
 	(4, CommitmentTxConfirmed),
+	(6, UpdateFailed),
 );
 
 /// Simple structure sent back by `chain::Watch` when an HTLC from a forward channel is detected on
@@ -649,7 +657,17 @@ pub(crate) struct ChannelMonitorImpl<Signer: Sign> {
 
 	payment_preimages: HashMap<PaymentHash, PaymentPreimage>,
 
+	// Note that `MonitorEvent`s MUST NOT be generated during update processing, only generated
+	// during chain data processing. This prevents a race in `ChainMonitor::update_channel` (and
+	// presumably user implementations thereof as well) where we update the in-memory channel
+	// object, then before the persistence finishes (as it's all under a read-lock), we return
+	// pending events to the user or to the relevant `ChannelManager`. Then, on reload, we'll have
+	// the pre-event state here, but have processed the event in the `ChannelManager`.
+	// Note that because the `event_lock` in `ChainMonitor` is only taken in
+	// block/transaction-connected events and *not* during block/transaction-disconnected events,
+	// we further MUST NOT generate events during block/transaction-disconnection.
 	pending_monitor_events: Vec<MonitorEvent>,
+
 	pending_events: Vec<Event>,
 
 	// Used to track on-chain events (i.e., transactions part of channels confirmed on chain) on

lightning/src/chain/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -285,6 +285,10 @@ pub trait Watch<ChannelSigner: Sign> {
 	/// Returns any monitor events since the last call. Subsequent calls must only return new
 	/// events.
 	///
+	/// Note that after any block- or transaction-connection calls to a [`ChannelMonitor`], no
+	/// further events may be returned here until the [`ChannelMonitor`] has been fully persisted
+	/// to disk.
+	///
 	/// For details on asynchronous [`ChannelMonitor`] updating and returning
 	/// [`MonitorEvent::UpdateCompleted`] here, see [`ChannelMonitorUpdateErr::TemporaryFailure`].
 	fn release_pending_monitor_events(&self) -> Vec<MonitorEvent>;
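As a small illustration of the requirement above (not LDK API, just the shape of the check), a custom `chain::Watch` implementation would gate its event draining roughly as follows, where the boolean tracks whether the latest block-connection persistence for a given monitor is still in flight:

use lightning::chain::channelmonitor::MonitorEvent;

// Hold back events for any monitor whose latest chain-sync persistence has not completed;
// releasing them early is exactly what allows duplicate events (e.g. PaymentSent) after a
// restart replays the same block.
fn drain_releasable_events(per_monitor: Vec<(bool, Vec<MonitorEvent>)>) -> Vec<MonitorEvent> {
	let mut releasable = Vec::new();
	for (chain_sync_persist_in_flight, events) in per_monitor {
		if chain_sync_persist_in_flight {
			continue;
		}
		releasable.extend(events);
	}
	releasable
}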
