Commit b48230f
Stop failing back HTLCs on temporary monitor update failures
Previously, if we got a temporary monitor update failure while there were HTLCs pending forwarding in the holding cell, we'd clear them and fail them all backwards. This makes sense if temporary failures are rare, but in an async environment, temporary monitor update failures may be the normal case. In such a world, this can result in a large number of spurious HTLC forwarding failures (which is the topic of #661).
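
The behavioral change can be sketched as follows. This is a minimal, self-contained illustration with hypothetical stand-in types (HoldingCellUpdate, on_disconnect_old/on_disconnect_new), not LDK's actual HTLCUpdateAwaitingACK handling:

    type PaymentHash = [u8; 32]; // stand-in for lightning's PaymentHash

    #[allow(dead_code)]
    #[derive(Clone)]
    enum HoldingCellUpdate {
        AddHtlc { payment_hash: PaymentHash },
        ClaimHtlc,
        FailHtlc,
    }

    // Old behavior: on disconnect, drain pending adds out of the holding cell
    // and hand them back so the caller fails them backwards -- a spurious
    // forwarding failure every time the update is interrupted.
    fn on_disconnect_old(holding_cell: &mut Vec<HoldingCellUpdate>) -> Vec<PaymentHash> {
        let mut dropped = Vec::new();
        holding_cell.retain(|upd| match upd {
            HoldingCellUpdate::AddHtlc { payment_hash } => {
                dropped.push(*payment_hash);
                false // drop the pending add; the caller fails it backwards
            }
            // Claims and fails stay queued, exactly as before this commit.
            HoldingCellUpdate::ClaimHtlc | HoldingCellUpdate::FailHtlc => true,
        });
        dropped
    }

    fn on_disconnect_new(_holding_cell: &mut Vec<HoldingCellUpdate>) {
        // New behavior: nothing is dropped. Pending adds survive and get
        // committed or failed when the holding cell is freed on reestablish.
    }

    fn main() {
        let cell = vec![HoldingCellUpdate::AddHtlc { payment_hash: [0; 32] }];

        let mut old_cell = cell.clone();
        let dropped = on_disconnect_old(&mut old_cell);
        println!("old: {} failed backwards, {} still queued", dropped.len(), old_cell.len());

        let mut new_cell = cell;
        on_disconnect_new(&mut new_cell);
        println!("new: 0 failed backwards, {} still queued", new_cell.len());
    }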
1 parent: f106e1c

2 files changed: 17 additions & 62 deletions
lightning/src/ln/channel.rs

Lines changed: 12 additions & 43 deletions
@@ -2517,13 +2517,11 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {
 	/// implicitly dropping) and the payment_hashes of HTLCs we tried to add but are dropping.
 	/// No further message handling calls may be made until a channel_reestablish dance has
 	/// completed.
-	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) -> Vec<(HTLCSource, PaymentHash)> where L::Target: Logger {
-		let mut outbound_drops = Vec::new();
-
+	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) where L::Target: Logger {
 		assert_eq!(self.channel_state & ChannelState::ShutdownComplete as u32, 0);
 		if self.channel_state < ChannelState::FundingSent as u32 {
 			self.channel_state = ChannelState::ShutdownComplete as u32;
-			return outbound_drops;
+			return;
 		}
 		// Upon reconnect we have to start the closing_signed dance over, but shutdown messages
 		// will be retransmitted.
@@ -2566,23 +2564,8 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {
 			}
 		}

-		self.holding_cell_htlc_updates.retain(|htlc_update| {
-			match htlc_update {
-				// Note that currently on channel reestablish we assert that there are
-				// no holding cell HTLC update_adds, so if in the future we stop
-				// dropping added HTLCs here and failing them backwards, then there will
-				// need to be corresponding changes made in the Channel's re-establish
-				// logic.
-				&HTLCUpdateAwaitingACK::AddHTLC { ref payment_hash, ref source, .. } => {
-					outbound_drops.push((source.clone(), payment_hash.clone()));
-					false
-				},
-				&HTLCUpdateAwaitingACK::ClaimHTLC {..} | &HTLCUpdateAwaitingACK::FailHTLC {..} => true,
-			}
-		});
 		self.channel_state |= ChannelState::PeerDisconnected as u32;
-		log_debug!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops and {} waiting-to-locally-announced HTLC drops on channel {}", outbound_drops.len(), inbound_drop_count, log_bytes!(self.channel_id()));
-		outbound_drops
+		log_debug!(logger, "Peer disconnection resulted in {} waiting-to-locally-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
 	}

 	/// Indicates that a ChannelMonitor update failed to be stored by the client and further
@@ -2757,7 +2740,7 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {

 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
-	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
+	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Vec<(HTLCSource, PaymentHash)>, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
 		if self.channel_state & (ChannelState::PeerDisconnected as u32) == 0 {
 			// While BOLT 2 doesn't indicate explicitly we should error this channel here, it
 			// almost certainly indicates we are going to end up out-of-sync in some way, so we
@@ -2808,15 +2791,15 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {
 				return Err(ChannelError::Close("Peer claimed they saw a revoke_and_ack but we haven't sent funding_locked yet".to_owned()));
 			}
 			// Short circuit the whole handler as there is nothing we can resend them
-			return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+			return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 		}

 		// We have OurFundingLocked set!
 		let next_per_commitment_point = self.holder_keys.get_per_commitment_point(self.cur_holder_commitment_transaction_number, &self.secp_ctx);
 		return Ok((Some(msgs::FundingLocked {
 			channel_id: self.channel_id(),
 			next_per_commitment_point,
-		}), None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+		}), None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 	}

 	let required_revoke = if msg.next_remote_commitment_number + 1 == INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number {
@@ -2857,14 +2840,6 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {
 		}

 		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 {
-			// Note that if in the future we no longer drop holding cell update_adds on peer
-			// disconnect, this logic will need to be updated.
-			for htlc_update in self.holding_cell_htlc_updates.iter() {
-				if let &HTLCUpdateAwaitingACK::AddHTLC { .. } = htlc_update {
-					debug_assert!(false, "There shouldn't be any add-HTLCs in the holding cell now because they should have been dropped on peer disconnect. Panic here because said HTLCs won't be handled correctly.");
-				}
-			}
-
 			// We're up-to-date and not waiting on a remote revoke (if we are our
 			// channel_reestablish should result in them sending a revoke_and_ack), but we may
 			// have received some updates while we were disconnected. Free the holding cell
@@ -2873,20 +2848,14 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {
 				Err(ChannelError::Close(msg)) => return Err(ChannelError::Close(msg)),
 				Err(ChannelError::Ignore(_)) | Err(ChannelError::CloseDelayBroadcast(_)) => panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
 				Ok((Some((commitment_update, monitor_update)), htlcs_to_fail)) => {
-					// If in the future we no longer drop holding cell update_adds on peer
-					// disconnect, we may be handed some HTLCs to fail backwards here.
-					assert!(htlcs_to_fail.is_empty());
-					return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), shutdown_msg));
+					return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 				},
 				Ok((None, htlcs_to_fail)) => {
-					// If in the future we no longer drop holding cell update_adds on peer
-					// disconnect, we may be handed some HTLCs to fail backwards here.
-					assert!(htlcs_to_fail.is_empty());
-					return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+					return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 				},
 			}
 		} else {
-			return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+			return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 		}
 	} else if msg.next_local_commitment_number == next_counterparty_commitment_number - 1 {
 		if required_revoke.is_some() {
@@ -2897,10 +2866,10 @@ impl<ChanSigner: ChannelKeys> Channel<ChanSigner> {

 		if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
 			self.monitor_pending_commitment_signed = true;
-			return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), shutdown_msg));
+			return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 		}

-		return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), shutdown_msg));
+		return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 	} else {
 		return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction".to_owned()));
 	}
@@ -4094,7 +4063,7 @@ impl Readable for InboundHTLCRemovalReason {
 impl<ChanSigner: ChannelKeys + Writeable> Writeable for Channel<ChanSigner> {
 	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ::std::io::Error> {
 		// Note that we write out as if remove_uncommitted_htlcs_and_mark_paused had just been
-		// called but include holding cell updates (and obviously we don't modify self).
+		// called.

 		writer.write_all(&[SERIALIZATION_VERSION; 1])?;
 		writer.write_all(&[MIN_SERIALIZATION_VERSION; 1])?;
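
To summarize the Channel-side API change: channel_reestablish grows a Vec<(HTLCSource, PaymentHash)> slot in its return tuple and forwards whatever free_holding_cell_htlcs hands it, where it previously asserted that list was empty. A condensed, hypothetical sketch of that data flow, with stand-in types heavily simplified from the real signatures above:

    // Stand-in types for illustration; not LDK's real definitions.
    type HTLCSource = u32;
    type PaymentHash = [u8; 32];
    type CommitmentUpdate = ();

    // Simplified: freeing the holding cell can yield a commitment update plus
    // HTLCs that could not be forwarded and must be failed backwards.
    fn free_holding_cell_htlcs() -> (Option<CommitmentUpdate>, Vec<(HTLCSource, PaymentHash)>) {
        (Some(()), vec![(7, [0u8; 32])])
    }

    // Before this commit the caller asserted `htlcs_to_fail.is_empty()`; now
    // the failed HTLCs ride along in channel_reestablish's return value.
    fn channel_reestablish() -> (Option<CommitmentUpdate>, Vec<(HTLCSource, PaymentHash)>) {
        let (commitment_update, htlcs_to_fail) = free_holding_cell_htlcs();
        (commitment_update, htlcs_to_fail)
    }

    fn main() {
        let (_update, htlcs_to_fail) = channel_reestablish();
        println!("{} HTLC(s) handed back to fail backwards", htlcs_to_fail.len());
    }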

lightning/src/ln/channelmanager.rs

Lines changed: 5 additions & 19 deletions
@@ -2921,7 +2921,7 @@ impl<ChanSigner: ChannelKeys, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
 	}

 	fn internal_channel_reestablish(&self, counterparty_node_id: &PublicKey, msg: &msgs::ChannelReestablish) -> Result<(), MsgHandleErrInternal> {
-		let chan_restoration_res = {
+		let (htlcs_failed_forward, chan_restoration_res) = {
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
 			let channel_state = &mut *channel_state_lock;

@@ -2934,20 +2934,20 @@ impl<ChanSigner: ChannelKeys, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
 				// disconnect, so Channel's reestablish will never hand us any holding cell
 				// freed HTLCs to fail backwards. If in the future we no longer drop pending
 				// add-HTLCs on disconnect, we may be handed HTLCs to fail backwards here.
-				let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, shutdown) =
+				let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, htlcs_failed_forward, shutdown) =
 					try_chan_entry!(self, chan.get_mut().channel_reestablish(msg, &self.logger), channel_state, chan);
 				if let Some(msg) = shutdown {
 					channel_state.pending_msg_events.push(events::MessageSendEvent::SendShutdown {
 						node_id: counterparty_node_id.clone(),
 						msg,
 					});
 				}
-				handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked)
+				(htlcs_failed_forward, handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked))
 			},
 			hash_map::Entry::Vacant(_) => return Err(MsgHandleErrInternal::send_err_msg_no_close("Failed to find corresponding channel".to_owned(), msg.channel_id))
 		}
 	};
-	post_handle_chan_restoration!(self, chan_restoration_res, Vec::new(), Vec::new());
+	post_handle_chan_restoration!(self, chan_restoration_res, Vec::new(), htlcs_failed_forward);
 	Ok(())
 }

@@ -3334,7 +3334,6 @@ impl<ChanSigner: ChannelKeys, M: Deref + Sync + Send, T: Deref + Sync + Send, K:
 	fn peer_disconnected(&self, counterparty_node_id: &PublicKey, no_connection_possible: bool) {
 		let _consistency_lock = self.total_consistency_lock.read().unwrap();
 		let mut failed_channels = Vec::new();
-		let mut failed_payments = Vec::new();
 		let mut no_channels_remain = true;
 		{
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
@@ -3363,16 +3362,8 @@ impl<ChanSigner: ChannelKeys, M: Deref + Sync + Send, T: Deref + Sync + Send, K:
 			log_debug!(self.logger, "Marking channels with {} disconnected and generating channel_updates", log_pubkey!(counterparty_node_id));
 			channel_state.by_id.retain(|_, chan| {
 				if chan.get_counterparty_node_id() == *counterparty_node_id {
-					// Note that currently on channel reestablish we assert that there are no
-					// holding cell add-HTLCs, so if in the future we stop removing uncommitted HTLCs
-					// on peer disconnect here, there will need to be corresponding changes in
-					// reestablish logic.
-					let failed_adds = chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
+					chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
 					chan.to_disabled_marked();
-					if !failed_adds.is_empty() {
-						let chan_update = self.get_channel_update(&chan).map(|u| u.encode_with_len()).unwrap(); // Cannot add/recv HTLCs before we have a short_id so unwrap is safe
-						failed_payments.push((chan_update, failed_adds));
-					}
 					if chan.is_shutdown() {
 						if let Some(short_id) = chan.get_short_channel_id() {
 							short_to_id.remove(&short_id);
@@ -3415,11 +3406,6 @@ impl<ChanSigner: ChannelKeys, M: Deref + Sync + Send, T: Deref + Sync + Send, K:
 		for failure in failed_channels.drain(..) {
 			self.finish_force_close_channel(failure);
 		}
-		for (chan_update, mut htlc_sources) in failed_payments {
-			for (htlc_source, payment_hash) in htlc_sources.drain(..) {
-				self.fail_htlc_backwards_internal(self.channel_state.lock().unwrap(), htlc_source, &payment_hash, HTLCFailReason::Reason { failure_code: 0x1000 | 7, data: chan_update.clone() });
-			}
-		}
 	}

 	fn peer_connected(&self, counterparty_node_id: &PublicKey, init_msg: &msgs::Init) {
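
On the ChannelManager side, the failed-forward HTLCs are collected inside the channel_state lock scope and only failed backwards after the lock is released, since fail_htlc_backwards_internal takes the lock itself (visible in the deleted peer_disconnected code above, and in how post_handle_chan_restoration! runs after the lock scope ends). A minimal sketch of that locking pattern, with hypothetical stand-in types:

    use std::sync::Mutex;

    // Hypothetical stand-ins; this sketches only the locking pattern, not
    // LDK's real ChannelManager internals.
    type HTLCSource = u32;
    type PaymentHash = [u8; 32];

    struct ChannelState; // stands in for the by_id map, pending events, etc.

    struct Manager {
        channel_state: Mutex<ChannelState>,
    }

    impl Manager {
        fn internal_channel_reestablish(&self) {
            // Gather the HTLCs to fail while the channel_state lock is held...
            let htlcs_failed_forward: Vec<(HTLCSource, PaymentHash)> = {
                let _state = self.channel_state.lock().unwrap();
                // ...Channel::channel_reestablish now hands these back instead
                // of the caller asserting the list is empty.
                vec![(1, [0u8; 32])]
            }; // lock dropped at the end of this scope

            // ...then fail them backwards afterwards, re-taking the lock per
            // HTLC rather than holding it across the whole operation.
            for (_source, _hash) in htlcs_failed_forward {
                let _relock = self.channel_state.lock().unwrap();
                // fail_htlc_backwards_internal(_relock, _source, &_hash, ...)
            }
        }
    }

    fn main() {
        let mgr = Manager { channel_state: Mutex::new(ChannelState) };
        mgr.internal_channel_reestablish();
    }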
