
Commit 5613be8

Stop failing back HTLCs on temporary monitor update failures
Previously, if we got a temporary monitor update failure while there were HTLCs pending forwarding in the holding cell, we'd clear them and fail them all backwards. This makes sense if temporary failures are rare, but in an async environment temporary monitor update failures may be the normal case. In such a world, this results in potentially a lot of spurious HTLC forwarding failures (which is the topic of #661). Instead, we now leave the holding cell untouched on peer disconnect and let channel_reestablish hand back any holding-cell HTLCs that genuinely need to be failed.
1 parent bb9d109 · commit 5613be8
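To make the behavior change concrete, here is a minimal, self-contained Rust sketch of the two disconnect strategies. The enum and function names are illustrative stand-ins, not LDK's actual types (the real holding cell stores HTLCUpdateAwaitingACK values, and the real drop path returned (HTLCSource, PaymentHash) pairs):

// Hypothetical stand-in for the holding cell contents; not LDK's
// HTLCUpdateAwaitingACK definition.
#[derive(Clone)]
enum HtlcUpdate {
    Add { payment_hash: [u8; 32] },
    Claim,
    Fail,
}

// Old behavior: peer disconnect drained pending update_adds out of the
// holding cell so the caller could fail them backwards immediately.
fn old_disconnect(holding_cell: &mut Vec<HtlcUpdate>) -> Vec<[u8; 32]> {
    let mut dropped = Vec::new();
    holding_cell.retain(|upd| match upd {
        HtlcUpdate::Add { payment_hash } => {
            dropped.push(*payment_hash); // will be failed backwards
            false
        }
        HtlcUpdate::Claim | HtlcUpdate::Fail => true,
    });
    dropped
}

// New behavior: the holding cell survives the disconnect untouched; any
// forwards that truly cannot complete are only failed later, when the
// holding cell is freed during channel_reestablish.
fn new_disconnect(_holding_cell: &mut Vec<HtlcUpdate>) {}

fn main() {
    let cell = vec![
        HtlcUpdate::Add { payment_hash: [0x42; 32] },
        HtlcUpdate::Claim,
        HtlcUpdate::Fail,
    ];
    let dropped = old_disconnect(&mut cell.clone());
    println!("old: {} add(s) spuriously failed backwards", dropped.len());
    let mut cell2 = cell;
    new_disconnect(&mut cell2);
    println!("new: all {} update(s) kept pending", cell2.len());
}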

File tree

2 files changed: +17, -61 lines

lightning/src/ln/channel.rs

Lines changed: 12 additions & 43 deletions
@@ -2690,13 +2690,11 @@ impl<Signer: Sign> Channel<Signer> {
 	/// implicitly dropping) and the payment_hashes of HTLCs we tried to add but are dropping.
 	/// No further message handling calls may be made until a channel_reestablish dance has
 	/// completed.
-	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) -> Vec<(HTLCSource, PaymentHash)> where L::Target: Logger {
-		let mut outbound_drops = Vec::new();
-
+	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) where L::Target: Logger {
 		assert_eq!(self.channel_state & ChannelState::ShutdownComplete as u32, 0);
 		if self.channel_state < ChannelState::FundingSent as u32 {
 			self.channel_state = ChannelState::ShutdownComplete as u32;
-			return outbound_drops;
+			return;
 		}
 		// Upon reconnect we have to start the closing_signed dance over, but shutdown messages
 		// will be retransmitted.
@@ -2739,23 +2737,8 @@ impl<Signer: Sign> Channel<Signer> {
 			}
 		}

-		self.holding_cell_htlc_updates.retain(|htlc_update| {
-			match htlc_update {
-				// Note that currently on channel reestablish we assert that there are
-				// no holding cell HTLC update_adds, so if in the future we stop
-				// dropping added HTLCs here and failing them backwards, then there will
-				// need to be corresponding changes made in the Channel's re-establish
-				// logic.
-				&HTLCUpdateAwaitingACK::AddHTLC { ref payment_hash, ref source, .. } => {
-					outbound_drops.push((source.clone(), payment_hash.clone()));
-					false
-				},
-				&HTLCUpdateAwaitingACK::ClaimHTLC {..} | &HTLCUpdateAwaitingACK::FailHTLC {..} => true,
-			}
-		});
 		self.channel_state |= ChannelState::PeerDisconnected as u32;
-		log_debug!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops and {} waiting-to-locally-announced HTLC drops on channel {}", outbound_drops.len(), inbound_drop_count, log_bytes!(self.channel_id()));
-		outbound_drops
+		log_debug!(logger, "Peer disconnection resulted in {} waiting-to-locally-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
 	}

 	/// Indicates that a ChannelMonitor update failed to be stored by the client and further
@@ -2913,7 +2896,7 @@ impl<Signer: Sign> Channel<Signer> {

 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
-	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
+	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Vec<(HTLCSource, PaymentHash)>, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
 		if self.channel_state & (ChannelState::PeerDisconnected as u32) == 0 {
 			// While BOLT 2 doesn't indicate explicitly we should error this channel here, it
 			// almost certainly indicates we are going to end up out-of-sync in some way, so we
@@ -2964,15 +2947,15 @@ impl<Signer: Sign> Channel<Signer> {
 					return Err(ChannelError::Close("Peer claimed they saw a revoke_and_ack but we haven't sent funding_locked yet".to_owned()));
 				}
 				// Short circuit the whole handler as there is nothing we can resend them
-				return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+				return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 			}

 			// We have OurFundingLocked set!
 			let next_per_commitment_point = self.holder_signer.get_per_commitment_point(self.cur_holder_commitment_transaction_number, &self.secp_ctx);
 			return Ok((Some(msgs::FundingLocked {
 				channel_id: self.channel_id(),
 				next_per_commitment_point,
-			}), None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+			}), None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 		}

 		let required_revoke = if msg.next_remote_commitment_number + 1 == INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number {
@@ -3013,14 +2996,6 @@ impl<Signer: Sign> Channel<Signer> {
 			}

 			if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 {
-				// Note that if in the future we no longer drop holding cell update_adds on peer
-				// disconnect, this logic will need to be updated.
-				for htlc_update in self.holding_cell_htlc_updates.iter() {
-					if let &HTLCUpdateAwaitingACK::AddHTLC { .. } = htlc_update {
-						debug_assert!(false, "There shouldn't be any add-HTLCs in the holding cell now because they should have been dropped on peer disconnect. Panic here because said HTLCs won't be handled correctly.");
-					}
-				}
-
 				// We're up-to-date and not waiting on a remote revoke (if we are our
 				// channel_reestablish should result in them sending a revoke_and_ack), but we may
 				// have received some updates while we were disconnected. Free the holding cell
@@ -3029,20 +3004,14 @@ impl<Signer: Sign> Channel<Signer> {
 					Err(ChannelError::Close(msg)) => return Err(ChannelError::Close(msg)),
 					Err(ChannelError::Ignore(_)) | Err(ChannelError::CloseDelayBroadcast(_)) => panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
 					Ok((Some((commitment_update, monitor_update)), htlcs_to_fail)) => {
-						// If in the future we no longer drop holding cell update_adds on peer
-						// disconnect, we may be handed some HTLCs to fail backwards here.
-						assert!(htlcs_to_fail.is_empty());
-						return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), shutdown_msg));
+						return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 					},
 					Ok((None, htlcs_to_fail)) => {
-						// If in the future we no longer drop holding cell update_adds on peer
-						// disconnect, we may be handed some HTLCs to fail backwards here.
-						assert!(htlcs_to_fail.is_empty());
-						return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+						return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 					},
 				}
 			} else {
-				return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+				return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 			}
 		} else if msg.next_local_commitment_number == next_counterparty_commitment_number - 1 {
 			if required_revoke.is_some() {
@@ -3053,10 +3022,10 @@ impl<Signer: Sign> Channel<Signer> {

 			if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
 				self.monitor_pending_commitment_signed = true;
-				return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), shutdown_msg));
+				return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 			}

-			return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), shutdown_msg));
+			return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 		} else {
 			return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction".to_owned()));
 		}
@@ -4311,7 +4280,7 @@ impl Readable for InboundHTLCRemovalReason {
 impl<Signer: Sign> Writeable for Channel<Signer> {
 	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ::std::io::Error> {
 		// Note that we write out as if remove_uncommitted_htlcs_and_mark_paused had just been
-		// called but include holding cell updates (and obviously we don't modify self).
+		// called.

 		writer.write_all(&[SERIALIZATION_VERSION; 1])?;
 		writer.write_all(&[MIN_SERIALIZATION_VERSION; 1])?;
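The net effect on Channel's API: remove_uncommitted_htlcs_and_mark_paused no longer returns the dropped (HTLCSource, PaymentHash) pairs, and channel_reestablish instead gains a Vec<(HTLCSource, PaymentHash)> slot in its return tuple, populated only on the path that frees the holding cell (every other return site supplies Vec::new()). A rough sketch of that contract, using stand-in types and eliding the message/monitor elements of the real tuple:

// Stand-ins for LDK's HTLCSource and PaymentHash; illustrative only.
type Source = u64;
type Hash = [u8; 32];

// A pending forward sitting in the holding cell. `still_forwardable` is a
// hypothetical flag standing in for whatever free_holding_cell_htlcs
// actually decides per-HTLC.
struct PendingAdd {
    source: Source,
    payment_hash: Hash,
    still_forwardable: bool,
}

struct Channel {
    holding_cell: Vec<PendingAdd>,
}

impl Channel {
    // Before this commit: drained pending adds and returned them as
    // Vec<(Source, Hash)>. After: returns nothing; the holding cell
    // survives the disconnect untouched.
    fn remove_uncommitted_htlcs_and_mark_paused(&mut self) {}

    // The failures formerly produced at disconnect time now surface here,
    // as the extra Vec in channel_reestablish's return tuple (the real
    // method also returns messages, a monitor update, ordering, etc.).
    fn channel_reestablish(&mut self) -> Vec<(Source, Hash)> {
        let mut htlcs_to_fail = Vec::new();
        self.holding_cell.retain(|add| {
            if add.still_forwardable {
                true // keep it pending; it will be freed normally
            } else {
                htlcs_to_fail.push((add.source, add.payment_hash));
                false // hand it back to the caller to fail backwards
            }
        });
        htlcs_to_fail
    }
}

fn main() {
    let mut chan = Channel {
        holding_cell: vec![
            PendingAdd { source: 1, payment_hash: [0; 32], still_forwardable: true },
            PendingAdd { source: 2, payment_hash: [1; 32], still_forwardable: false },
        ],
    };
    chan.remove_uncommitted_htlcs_and_mark_paused(); // no drops any more
    let failed = chan.channel_reestablish();
    println!("{} to fail backwards, {} still pending", failed.len(), chan.holding_cell.len());
}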

lightning/src/ln/channelmanager.rs

Lines changed: 5 additions & 18 deletions
@@ -3103,7 +3103,7 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 	}

 	fn internal_channel_reestablish(&self, counterparty_node_id: &PublicKey, msg: &msgs::ChannelReestablish) -> Result<(), MsgHandleErrInternal> {
-		let chan_restoration_res = {
+		let (htlcs_failed_forward, chan_restoration_res) = {
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
 			let channel_state = &mut *channel_state_lock;

@@ -3116,20 +3116,21 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 					// disconnect, so Channel's reestablish will never hand us any holding cell
 					// freed HTLCs to fail backwards. If in the future we no longer drop pending
 					// add-HTLCs on disconnect, we may be handed HTLCs to fail backwards here.
-					let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, shutdown) =
+					let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, htlcs_failed_forward, shutdown) =
 						try_chan_entry!(self, chan.get_mut().channel_reestablish(msg, &self.logger), channel_state, chan);
 					if let Some(msg) = shutdown {
 						channel_state.pending_msg_events.push(events::MessageSendEvent::SendShutdown {
 							node_id: counterparty_node_id.clone(),
 							msg,
 						});
 					}
-					handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked)
+					(htlcs_failed_forward, handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked))
 				},
 				hash_map::Entry::Vacant(_) => return Err(MsgHandleErrInternal::send_err_msg_no_close("Failed to find corresponding channel".to_owned(), msg.channel_id))
 			}
 		};
 		post_handle_chan_restoration!(self, chan_restoration_res);
+		self.fail_holding_cell_htlcs(htlcs_failed_forward, msg.channel_id);
 		Ok(())
 	}

@@ -3647,7 +3648,6 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 	fn peer_disconnected(&self, counterparty_node_id: &PublicKey, no_connection_possible: bool) {
 		let _persistence_guard = PersistenceNotifierGuard::new(&self.total_consistency_lock, &self.persistence_notifier);
 		let mut failed_channels = Vec::new();
-		let mut failed_payments = Vec::new();
 		let mut no_channels_remain = true;
 		{
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
@@ -3676,16 +3676,8 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 			log_debug!(self.logger, "Marking channels with {} disconnected and generating channel_updates", log_pubkey!(counterparty_node_id));
 			channel_state.by_id.retain(|_, chan| {
 				if chan.get_counterparty_node_id() == *counterparty_node_id {
-					// Note that currently on channel reestablish we assert that there are no
-					// holding cell add-HTLCs, so if in the future we stop removing uncommitted HTLCs
-					// on peer disconnect here, there will need to be corresponding changes in
-					// reestablish logic.
-					let failed_adds = chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
+					chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
 					chan.to_disabled_marked();
-					if !failed_adds.is_empty() {
-						let chan_update = self.get_channel_update(&chan).map(|u| u.encode_with_len()).unwrap(); // Cannot add/recv HTLCs before we have a short_id so unwrap is safe
-						failed_payments.push((chan_update, failed_adds));
-					}
 					if chan.is_shutdown() {
 						if let Some(short_id) = chan.get_short_channel_id() {
 							short_to_id.remove(&short_id);
@@ -3729,11 +3721,6 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 		for failure in failed_channels.drain(..) {
 			self.finish_force_close_channel(failure);
 		}
-		for (chan_update, mut htlc_sources) in failed_payments {
-			for (htlc_source, payment_hash) in htlc_sources.drain(..) {
-				self.fail_htlc_backwards_internal(self.channel_state.lock().unwrap(), htlc_source, &payment_hash, HTLCFailReason::Reason { failure_code: 0x1000 | 7, data: chan_update.clone() });
-			}
-		}
 	}

 	fn peer_connected(&self, counterparty_node_id: &PublicKey, init_msg: &msgs::Init) {
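One detail worth noting on the ChannelManager side: the new fail_holding_cell_htlcs call in internal_channel_reestablish runs after the block holding channel_state_lock ends, because failing HTLCs backwards re-takes that lock (as the deleted peer_disconnected code shows with its explicit self.channel_state.lock()). A rough sketch of the pattern, with illustrative names rather than the real ChannelManager internals:

use std::sync::Mutex;

// Hypothetical stand-in for ChannelManager; only the locking shape matters.
struct Manager {
    channel_state: Mutex<Vec<&'static str>>, // stand-in for the channel map
}

impl Manager {
    fn fail_holding_cell_htlcs(&self, htlcs: Vec<&'static str>) {
        // Failing backwards acquires channel_state internally, so the caller
        // must invoke this with the lock already released to avoid deadlock.
        let _guard = self.channel_state.lock().unwrap();
        for h in htlcs {
            println!("failing backwards: {}", h);
        }
    }

    fn internal_channel_reestablish(&self) {
        let htlcs_failed_forward = {
            let _channel_state_lock = self.channel_state.lock().unwrap();
            // ... channel_reestablish(...) runs here and returns the
            // freed-holding-cell failures as part of its tuple ...
            vec!["htlc-a", "htlc-b"]
        }; // lock dropped at the end of this block
        self.fail_holding_cell_htlcs(htlcs_failed_forward);
    }
}

fn main() {
    let mgr = Manager { channel_state: Mutex::new(Vec::new()) };
    mgr.internal_channel_reestablish();
}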
