Commit 6e865ea

Stop failing back HTLCs on temporary monitor update failures
Previously, if we got a temporary monitor update failure while there were HTLCs pending forwarding in the holding cell, we'd clear them and fail them all backwards. This makes sense if temporary failures are rare, but in an async persistence environment temporary monitor update failures may be the normal case. In such a world, this behavior results in potentially many spurious HTLC forwarding failures (the topic of #661).
1 parent 49c7b01 commit 6e865ea
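
To illustrate the change with simplified stand-in types (not LDK's actual Channel API), here is a minimal sketch of the old versus new disconnect handling: previously, holding-cell add-HTLCs were drained and failed backwards on peer disconnect; now they are retained so they can be retried once the peer reconnects.

    #[allow(dead_code)]
    #[derive(Clone)]
    enum HoldingCellUpdate {
        AddHtlc { payment_hash: [u8; 32] }, // an HTLC we were asked to forward
        ClaimHtlc,                          // a fulfill waiting to be sent
        FailHtlc,                           // a failure waiting to be sent
    }

    // Old behavior: drain adds out of the holding cell and return them so the
    // caller can fail them backwards toward the previous hop.
    fn disconnect_old(cell: &mut Vec<HoldingCellUpdate>) -> Vec<[u8; 32]> {
        let mut dropped = Vec::new();
        cell.retain(|upd| match upd {
            HoldingCellUpdate::AddHtlc { payment_hash } => {
                dropped.push(*payment_hash);
                false // drop the pending add; it will be failed backwards
            }
            _ => true, // claims and fails always survive disconnection
        });
        dropped
    }

    // New behavior: leave the holding cell untouched, so pending adds are
    // simply retried (or failed then) when the cell is freed on reestablish.
    fn disconnect_new(_cell: &mut Vec<HoldingCellUpdate>) {}

    fn main() {
        let mut cell = vec![
            HoldingCellUpdate::AddHtlc { payment_hash: [1; 32] },
            HoldingCellUpdate::ClaimHtlc,
        ];
        assert_eq!(disconnect_old(&mut cell.clone()).len(), 1);
        disconnect_new(&mut cell);
        assert_eq!(cell.len(), 2); // the pending add survives the disconnect
    }

Keeping the adds in the holding cell means a flaky persistence backend no longer turns every reconnect into a batch of 0x1000|7 (temporary_channel_failure) errors like the deleted fail-back path below generated.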

File tree: 2 files changed, +17 -62 lines

lightning/src/ln/channel.rs (12 additions, 43 deletions)
@@ -2668,13 +2668,11 @@ impl<Signer: Sign> Channel<Signer> {
 	/// implicitly dropping) and the payment_hashes of HTLCs we tried to add but are dropping.
 	/// No further message handling calls may be made until a channel_reestablish dance has
 	/// completed.
-	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) -> Vec<(HTLCSource, PaymentHash)> where L::Target: Logger {
-		let mut outbound_drops = Vec::new();
-
+	pub fn remove_uncommitted_htlcs_and_mark_paused<L: Deref>(&mut self, logger: &L) where L::Target: Logger {
 		assert_eq!(self.channel_state & ChannelState::ShutdownComplete as u32, 0);
 		if self.channel_state < ChannelState::FundingSent as u32 {
 			self.channel_state = ChannelState::ShutdownComplete as u32;
-			return outbound_drops;
+			return;
 		}
 		// Upon reconnect we have to start the closing_signed dance over, but shutdown messages
 		// will be retransmitted.
@@ -2717,23 +2715,8 @@ impl<Signer: Sign> Channel<Signer> {
 			}
 		}
 
-		self.holding_cell_htlc_updates.retain(|htlc_update| {
-			match htlc_update {
-				// Note that currently on channel reestablish we assert that there are
-				// no holding cell HTLC update_adds, so if in the future we stop
-				// dropping added HTLCs here and failing them backwards, then there will
-				// need to be corresponding changes made in the Channel's re-establish
-				// logic.
-				&HTLCUpdateAwaitingACK::AddHTLC { ref payment_hash, ref source, .. } => {
-					outbound_drops.push((source.clone(), payment_hash.clone()));
-					false
-				},
-				&HTLCUpdateAwaitingACK::ClaimHTLC {..} | &HTLCUpdateAwaitingACK::FailHTLC {..} => true,
-			}
-		});
 		self.channel_state |= ChannelState::PeerDisconnected as u32;
-		log_debug!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops and {} waiting-to-locally-announced HTLC drops on channel {}", outbound_drops.len(), inbound_drop_count, log_bytes!(self.channel_id()));
-		outbound_drops
+		log_debug!(logger, "Peer disconnection resulted in {} waiting-to-locally-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
 	}
 
 	/// Indicates that a ChannelMonitor update failed to be stored by the client and further
@@ -2908,7 +2891,7 @@ impl<Signer: Sign> Channel<Signer> {
 
 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
-	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
+	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L) -> Result<(Option<msgs::FundingLocked>, Option<msgs::RevokeAndACK>, Option<msgs::CommitmentUpdate>, Option<ChannelMonitorUpdate>, RAACommitmentOrder, Vec<(HTLCSource, PaymentHash)>, Option<msgs::Shutdown>), ChannelError> where L::Target: Logger {
 		if self.channel_state & (ChannelState::PeerDisconnected as u32) == 0 {
 			// While BOLT 2 doesn't indicate explicitly we should error this channel here, it
 			// almost certainly indicates we are going to end up out-of-sync in some way, so we
@@ -2959,15 +2942,15 @@ impl<Signer: Sign> Channel<Signer> {
 					return Err(ChannelError::Close("Peer claimed they saw a revoke_and_ack but we haven't sent funding_locked yet".to_owned()));
 				}
 				// Short circuit the whole handler as there is nothing we can resend them
-				return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+				return Ok((None, None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 			}
 
 			// We have OurFundingLocked set!
 			let next_per_commitment_point = self.holder_signer.get_per_commitment_point(self.cur_holder_commitment_transaction_number, &self.secp_ctx);
 			return Ok((Some(msgs::FundingLocked {
 				channel_id: self.channel_id(),
 				next_per_commitment_point,
-			}), None, None, None, RAACommitmentOrder::CommitmentFirst, shutdown_msg));
+			}), None, None, None, RAACommitmentOrder::CommitmentFirst, Vec::new(), shutdown_msg));
 		}
 
 		let required_revoke = if msg.next_remote_commitment_number + 1 == INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number {
@@ -3008,14 +2991,6 @@ impl<Signer: Sign> Channel<Signer> {
 		}
 
 		if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 {
-			// Note that if in the future we no longer drop holding cell update_adds on peer
-			// disconnect, this logic will need to be updated.
-			for htlc_update in self.holding_cell_htlc_updates.iter() {
-				if let &HTLCUpdateAwaitingACK::AddHTLC { .. } = htlc_update {
-					debug_assert!(false, "There shouldn't be any add-HTLCs in the holding cell now because they should have been dropped on peer disconnect. Panic here because said HTLCs won't be handled correctly.");
-				}
-			}
-
 			// We're up-to-date and not waiting on a remote revoke (if we are our
 			// channel_reestablish should result in them sending a revoke_and_ack), but we may
 			// have received some updates while we were disconnected. Free the holding cell
@@ -3024,20 +2999,14 @@ impl<Signer: Sign> Channel<Signer> {
 				Err(ChannelError::Close(msg)) => return Err(ChannelError::Close(msg)),
 				Err(ChannelError::Ignore(_)) | Err(ChannelError::CloseDelayBroadcast(_)) => panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
 				Ok((Some((commitment_update, monitor_update)), htlcs_to_fail)) => {
-					// If in the future we no longer drop holding cell update_adds on peer
-					// disconnect, we may be handed some HTLCs to fail backwards here.
-					assert!(htlcs_to_fail.is_empty());
-					return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), shutdown_msg));
+					return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(monitor_update), self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 				},
 				Ok((None, htlcs_to_fail)) => {
-					// If in the future we no longer drop holding cell update_adds on peer
-					// disconnect, we may be handed some HTLCs to fail backwards here.
-					assert!(htlcs_to_fail.is_empty());
-					return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+					return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), htlcs_to_fail, shutdown_msg));
 				},
 			}
 		} else {
-			return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
+			return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 		}
 	} else if msg.next_local_commitment_number == next_counterparty_commitment_number - 1 {
 		if required_revoke.is_some() {
@@ -3048,10 +3017,10 @@ impl<Signer: Sign> Channel<Signer> {
 
 		if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
 			self.monitor_pending_commitment_signed = true;
-			return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), shutdown_msg));
+			return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 		}
 
-		return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), shutdown_msg));
+		return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update(logger)), None, self.resend_order.clone(), Vec::new(), shutdown_msg));
 	} else {
 		return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction".to_owned()));
 	}
@@ -4289,7 +4258,7 @@ impl Readable for InboundHTLCRemovalReason {
 impl<Signer: Sign> Writeable for Channel<Signer> {
 	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ::std::io::Error> {
 		// Note that we write out as if remove_uncommitted_htlcs_and_mark_paused had just been
-		// called but include holding cell updates (and obviously we don't modify self).
+		// called.
 
 		writer.write_all(&[SERIALIZATION_VERSION; 1])?;
 		writer.write_all(&[MIN_SERIALIZATION_VERSION; 1])?;
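
With the disconnect-time drop gone, freeing the holding cell on reestablish can legitimately surface HTLCs that need failing backwards, which is why the asserts above give way to an extra Vec<(HTLCSource, PaymentHash)> element in channel_reestablish's return tuple. A rough sketch of that flow, with simplified stand-in types rather than the real signatures (the real tuple also carries funding_locked, revoke_and_ack, and so on):

    type HtlcSource = u64;       // stand-in for HTLCSource
    type PaymentHash = [u8; 32]; // stand-in for PaymentHash

    struct CommitmentUpdate;     // stand-in for msgs::CommitmentUpdate

    // Freeing the holding cell may build a commitment update and may also
    // yield HTLCs that could not be forwarded and must be failed backwards.
    fn free_holding_cell_htlcs() -> (Option<CommitmentUpdate>, Vec<(HtlcSource, PaymentHash)>) {
        (Some(CommitmentUpdate), vec![(1, [0; 32])])
    }

    // Reestablish now forwards those failures to its caller instead of
    // asserting none exist (which only held while adds were dropped on
    // disconnect).
    fn channel_reestablish() -> (Option<CommitmentUpdate>, Vec<(HtlcSource, PaymentHash)>) {
        free_holding_cell_htlcs()
    }

    fn main() {
        let (_commitment_update, htlcs_to_fail) = channel_reestablish();
        assert_eq!(htlcs_to_fail.len(), 1); // caller must fail these back
    }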

lightning/src/ln/channelmanager.rs (5 additions, 19 deletions)
@@ -2975,7 +2975,7 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 	}
 
 	fn internal_channel_reestablish(&self, counterparty_node_id: &PublicKey, msg: &msgs::ChannelReestablish) -> Result<(), MsgHandleErrInternal> {
-		let chan_restoration_res = {
+		let (htlcs_failed_forward, chan_restoration_res) = {
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
 			let channel_state = &mut *channel_state_lock;
 
@@ -2988,20 +2988,20 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 					// disconnect, so Channel's reestablish will never hand us any holding cell
 					// freed HTLCs to fail backwards. If in the future we no longer drop pending
 					// add-HTLCs on disconnect, we may be handed HTLCs to fail backwards here.
-					let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, shutdown) =
+					let (funding_locked, revoke_and_ack, commitment_update, monitor_update_opt, order, htlcs_failed_forward, shutdown) =
 						try_chan_entry!(self, chan.get_mut().channel_reestablish(msg, &self.logger), channel_state, chan);
 					if let Some(msg) = shutdown {
 						channel_state.pending_msg_events.push(events::MessageSendEvent::SendShutdown {
 							node_id: counterparty_node_id.clone(),
 							msg,
 						});
 					}
-					handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked)
+					(htlcs_failed_forward, handle_chan_restoration_locked!(self, channel_state_lock, channel_state, chan, revoke_and_ack, commitment_update, order, monitor_update_opt, Vec::new(), false, funding_locked))
 				},
 				hash_map::Entry::Vacant(_) => return Err(MsgHandleErrInternal::send_err_msg_no_close("Failed to find corresponding channel".to_owned(), msg.channel_id))
 			}
 		};
-		post_handle_chan_restoration!(self, chan_restoration_res, Vec::new(), Vec::new());
+		post_handle_chan_restoration!(self, chan_restoration_res, Vec::new(), htlcs_failed_forward);
 		Ok(())
 	}
 
@@ -3429,7 +3429,6 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 	fn peer_disconnected(&self, counterparty_node_id: &PublicKey, no_connection_possible: bool) {
 		let _persistence_guard = PersistenceNotifierGuard::new(&self.total_consistency_lock, &self.persistence_notifier);
 		let mut failed_channels = Vec::new();
-		let mut failed_payments = Vec::new();
 		let mut no_channels_remain = true;
 		{
 			let mut channel_state_lock = self.channel_state.lock().unwrap();
@@ -3458,16 +3457,8 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 			log_debug!(self.logger, "Marking channels with {} disconnected and generating channel_updates", log_pubkey!(counterparty_node_id));
 			channel_state.by_id.retain(|_, chan| {
 				if chan.get_counterparty_node_id() == *counterparty_node_id {
-					// Note that currently on channel reestablish we assert that there are no
-					// holding cell add-HTLCs, so if in the future we stop removing uncommitted HTLCs
-					// on peer disconnect here, there will need to be corresponding changes in
-					// reestablish logic.
-					let failed_adds = chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
+					chan.remove_uncommitted_htlcs_and_mark_paused(&self.logger);
 					chan.to_disabled_marked();
-					if !failed_adds.is_empty() {
-						let chan_update = self.get_channel_update(&chan).map(|u| u.encode_with_len()).unwrap(); // Cannot add/recv HTLCs before we have a short_id so unwrap is safe
-						failed_payments.push((chan_update, failed_adds));
-					}
 					if chan.is_shutdown() {
 						if let Some(short_id) = chan.get_short_channel_id() {
 							short_to_id.remove(&short_id);
@@ -3510,11 +3501,6 @@ impl<Signer: Sign, M: Deref + Sync + Send, T: Deref + Sync + Send, K: Deref + Sy
 		for failure in failed_channels.drain(..) {
 			self.finish_force_close_channel(failure);
 		}
-		for (chan_update, mut htlc_sources) in failed_payments {
-			for (htlc_source, payment_hash) in htlc_sources.drain(..) {
-				self.fail_htlc_backwards_internal(self.channel_state.lock().unwrap(), htlc_source, &payment_hash, HTLCFailReason::Reason { failure_code: 0x1000 | 7, data: chan_update.clone() });
-			}
-		}
 	}
 
 	fn peer_connected(&self, counterparty_node_id: &PublicKey, init_msg: &msgs::Init) {
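
The deleted fail-back loop is not simply dropped: equivalent handling now lives in internal_channel_reestablish, where the htlcs_failed_forward returned by Channel::channel_reestablish are handed to post_handle_chan_restoration! after the reestablish work completes. A rough sketch of that two-phase ordering, with simplified stand-in types and a hypothetical lock shape rather than LDK's real API:

    use std::sync::Mutex;

    type HtlcSource = u64;       // stand-in for HTLCSource
    type PaymentHash = [u8; 32]; // stand-in for PaymentHash

    struct ChannelState;         // stand-in for the per-manager channel map

    fn fail_htlc_backwards_internal(source: HtlcSource, hash: &PaymentHash) {
        println!("failing HTLC {} (hash {:x?}...) back toward the previous hop", source, &hash[..2]);
    }

    fn internal_channel_reestablish(channel_state: &Mutex<ChannelState>) {
        // Phase 1: with the lock held, run the reestablish handshake; freeing
        // the holding cell may hand us HTLCs to fail backwards.
        let htlcs_failed_forward: Vec<(HtlcSource, PaymentHash)> = {
            let _guard = channel_state.lock().unwrap();
            vec![(7, [0; 32])] // as returned by Channel::channel_reestablish
        };
        // Phase 2: lock released; now fail them backwards, which may itself
        // need to re-take the lock.
        for (source, hash) in htlcs_failed_forward {
            fail_htlc_backwards_internal(source, &hash);
        }
    }

    fn main() {
        internal_channel_reestablish(&Mutex::new(ChannelState));
    }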
