Skip to content

Commit 50e16c9

Browse files
committed
Rewrite Channel resend tracking to make it much more reliable
Resending revoke_and_ack and commitment_signed (+update) messages after monitor-update-failure or disconnection has been a highly unreliable part of our codebase for some time (as evidenced by the number of bugs caught in the chanmon_fail_consistency fuzz target). This is due to its rather ad-hoc nature and tracking/behavior which consists of checking a number of different flags to try to deduce which messages were/were not delivered and go from there. Instead, this commit rewrites it to simply keep track of the order messages were generated originally, as we always resend in the originally-generated order. I'm anticipating this will be way more robust than the old code, in addition to its simplicity.
1 parent 63ed8fe commit 50e16c9

File tree

2 files changed

+43
-68
lines changed

2 files changed

+43
-68
lines changed

src/ln/channel.rs

Lines changed: 42 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -237,19 +237,21 @@ pub(super) struct Channel {
237237
cur_local_commitment_transaction_number: u64,
238238
cur_remote_commitment_transaction_number: u64,
239239
value_to_self_msat: u64, // Excluding all pending_htlcs, excluding fees
240-
/// Upon receipt of a channel_reestablish we have to figure out whether to send a
241-
/// revoke_and_ack first or a commitment update first. Generally, we prefer to send
242-
/// revoke_and_ack first, but if we had a pending commitment update of our own waiting on a
243-
/// remote revoke when we received the latest commitment update from the remote we have to make
244-
/// sure that commitment update gets resent first.
245-
received_commitment_while_awaiting_raa: bool,
246240
pending_inbound_htlcs: Vec<InboundHTLCOutput>,
247241
pending_outbound_htlcs: Vec<OutboundHTLCOutput>,
248242
holding_cell_htlc_updates: Vec<HTLCUpdateAwaitingACK>,
249243

244+
/// When resending CS/RAA messages on channel monitor restoration or on reconnect, we always
245+
/// need to ensure we resend them in the order we originally generated them. Note that because
246+
/// there can only ever be one in-flight CS and/or one in-flight RAA at any time, it is
247+
/// sufficient to simply set this to the opposite of any message we are generating as we
248+
/// generate it. ie when we generate a CS, we set this to RAAFirst as, if there is a pending
249+
/// in-flight RAA to resend, it will have been the first thing we generated, and thus we should
250+
/// send it first.
251+
resend_order: RAACommitmentOrder,
252+
250253
monitor_pending_revoke_and_ack: bool,
251254
monitor_pending_commitment_signed: bool,
252-
monitor_pending_order: Option<RAACommitmentOrder>,
253255
monitor_pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>,
254256
monitor_pending_failures: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>,
255257

@@ -457,7 +459,6 @@ impl Channel {
457459
cur_local_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
458460
cur_remote_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
459461
value_to_self_msat: channel_value_satoshis * 1000 - push_msat,
460-
received_commitment_while_awaiting_raa: false,
461462

462463
pending_inbound_htlcs: Vec::new(),
463464
pending_outbound_htlcs: Vec::new(),
@@ -468,9 +469,10 @@ impl Channel {
468469
next_remote_htlc_id: 0,
469470
channel_update_count: 1,
470471

472+
resend_order: RAACommitmentOrder::CommitmentFirst,
473+
471474
monitor_pending_revoke_and_ack: false,
472475
monitor_pending_commitment_signed: false,
473-
monitor_pending_order: None,
474476
monitor_pending_forwards: Vec::new(),
475477
monitor_pending_failures: Vec::new(),
476478

@@ -646,7 +648,6 @@ impl Channel {
646648
cur_local_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
647649
cur_remote_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
648650
value_to_self_msat: msg.push_msat,
649-
received_commitment_while_awaiting_raa: false,
650651

651652
pending_inbound_htlcs: Vec::new(),
652653
pending_outbound_htlcs: Vec::new(),
@@ -657,9 +658,10 @@ impl Channel {
657658
next_remote_htlc_id: 0,
658659
channel_update_count: 1,
659660

661+
resend_order: RAACommitmentOrder::CommitmentFirst,
662+
660663
monitor_pending_revoke_and_ack: false,
661664
monitor_pending_commitment_signed: false,
662-
monitor_pending_order: None,
663665
monitor_pending_forwards: Vec::new(),
664666
monitor_pending_failures: Vec::new(),
665667

@@ -1812,12 +1814,6 @@ impl Channel {
18121814
}
18131815
}
18141816

1815-
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
1816-
// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
1817-
// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
1818-
self.monitor_pending_order = None;
1819-
}
1820-
18211817
self.channel_monitor.provide_latest_local_commitment_tx_info(local_commitment_tx.0, local_keys, self.feerate_per_kw, htlcs_and_sigs);
18221818

18231819
for htlc in self.pending_inbound_htlcs.iter_mut() {
@@ -1840,14 +1836,13 @@ impl Channel {
18401836

18411837
self.cur_local_commitment_transaction_number -= 1;
18421838
self.last_local_commitment_txn = new_local_commitment_txn;
1843-
self.received_commitment_while_awaiting_raa = (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) != 0;
1839+
// Note that if we need_our_commitment & !AwaitingRemoteRevoke we'll call
1840+
// send_commitment_no_status_check() next which will reset this to RAAFirst.
1841+
self.resend_order = RAACommitmentOrder::CommitmentFirst;
18441842

18451843
if (self.channel_state & ChannelState::MonitorUpdateFailed as u32) != 0 {
18461844
// In case we initially failed monitor updating without requiring a response, we need
18471845
// to make sure the RAA gets sent first.
1848-
if !self.monitor_pending_commitment_signed {
1849-
self.monitor_pending_order = Some(RAACommitmentOrder::RevokeAndACKFirst);
1850-
}
18511846
self.monitor_pending_revoke_and_ack = true;
18521847
if need_our_commitment && (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) == 0 {
18531848
// If we were going to send a commitment_signed after the RAA, go ahead and do all
@@ -2021,12 +2016,6 @@ impl Channel {
20212016
self.their_prev_commitment_point = self.their_cur_commitment_point;
20222017
self.their_cur_commitment_point = Some(msg.next_per_commitment_point);
20232018
self.cur_remote_commitment_transaction_number -= 1;
2024-
self.received_commitment_while_awaiting_raa = false;
2025-
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
2026-
// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
2027-
// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
2028-
self.monitor_pending_order = None;
2029-
}
20302019

20312020
log_trace!(self, "Updating HTLCs on receipt of RAA...");
20322021
let mut to_forward_infos = Vec::new();
@@ -2144,7 +2133,7 @@ impl Channel {
21442133
// When the monitor updating is restored we'll call get_last_commitment_update(),
21452134
// which does not update state, but we're definitely now awaiting a remote revoke
21462135
// before we can step forward any more, so set it here.
2147-
self.channel_state |= ChannelState::AwaitingRemoteRevoke as u32;
2136+
self.send_commitment_no_status_check()?;
21482137
}
21492138
self.monitor_pending_forwards.append(&mut to_forward_infos);
21502139
self.monitor_pending_failures.append(&mut revoked_htlcs);
@@ -2292,15 +2281,13 @@ impl Channel {
22922281
/// Indicates that a ChannelMonitor update failed to be stored by the client and further
22932282
/// updates are partially paused.
22942283
/// This must be called immediately after the call which generated the ChannelMonitor update
2295-
/// which failed, with the order argument set to the type of call it represented (ie a
2296-
/// commitment update or a revoke_and_ack generation). The messages which were generated from
2297-
/// that original call must *not* have been sent to the remote end, and must instead have been
2298-
/// dropped. They will be regenerated when monitor_updating_restored is called.
2299-
pub fn monitor_update_failed(&mut self, order: RAACommitmentOrder, resend_raa: bool, resend_commitment: bool, mut pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>, mut pending_fails: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>) {
2284+
/// which failed. The messages which were generated from that call which generated the
2285+
/// monitor update failure must *not* have been sent to the remote end, and must instead
2286+
/// have been dropped. They will be regenerated when monitor_updating_restored is called.
2287+
pub fn monitor_update_failed(&mut self, resend_raa: bool, resend_commitment: bool, mut pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>, mut pending_fails: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>) {
23002288
assert_eq!(self.channel_state & ChannelState::MonitorUpdateFailed as u32, 0);
23012289
self.monitor_pending_revoke_and_ack = resend_raa;
23022290
self.monitor_pending_commitment_signed = resend_commitment;
2303-
self.monitor_pending_order = Some(order);
23042291
assert!(self.monitor_pending_forwards.is_empty());
23052292
mem::swap(&mut pending_forwards, &mut self.monitor_pending_forwards);
23062293
assert!(self.monitor_pending_failures.is_empty());
@@ -2321,7 +2308,6 @@ impl Channel {
23212308
mem::swap(&mut failures, &mut self.monitor_pending_failures);
23222309

23232310
if self.channel_state & (ChannelState::PeerDisconnected as u32) != 0 {
2324-
// Leave monitor_pending_order so we can order our channel_reestablish responses
23252311
self.monitor_pending_revoke_and_ack = false;
23262312
self.monitor_pending_commitment_signed = false;
23272313
return (None, None, RAACommitmentOrder::RevokeAndACKFirst, forwards, failures);
@@ -2336,7 +2322,7 @@ impl Channel {
23362322

23372323
self.monitor_pending_revoke_and_ack = false;
23382324
self.monitor_pending_commitment_signed = false;
2339-
let order = self.monitor_pending_order.clone().unwrap();
2325+
let order = self.resend_order.clone();
23402326
log_trace!(self, "Restored monitor updating resulting in {} commitment update and {} RAA, with {} first",
23412327
if commitment_update.is_some() { "a" } else { "no" },
23422328
if raa.is_some() { "an" } else { "no" },
@@ -2499,33 +2485,26 @@ impl Channel {
24992485
})
25002486
} else { None };
25012487

2502-
let order = self.monitor_pending_order.clone().unwrap_or(if self.received_commitment_while_awaiting_raa {
2503-
RAACommitmentOrder::CommitmentFirst
2504-
} else {
2505-
RAACommitmentOrder::RevokeAndACKFirst
2506-
});
2507-
25082488
if msg.next_local_commitment_number == our_next_remote_commitment_number {
25092489
if required_revoke.is_some() {
25102490
log_debug!(self, "Reconnected channel {} with only lost outbound RAA", log_bytes!(self.channel_id()));
25112491
} else {
25122492
log_debug!(self, "Reconnected channel {} with no loss", log_bytes!(self.channel_id()));
25132493
}
25142494

2515-
if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 &&
2516-
self.monitor_pending_order.is_none() { // monitor_pending_order indicates we're waiting on a response to a unfreeze
2495+
if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 {
25172496
// We're up-to-date and not waiting on a remote revoke (if we are our
25182497
// channel_reestablish should result in them sending a revoke_and_ack), but we may
25192498
// have received some updates while we were disconnected. Free the holding cell
25202499
// now!
25212500
match self.free_holding_cell_htlcs() {
25222501
Err(ChannelError::Close(msg)) => return Err(ChannelError::Close(msg)),
25232502
Err(ChannelError::Ignore(_)) => panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
2524-
Ok(Some((commitment_update, channel_monitor))) => return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(channel_monitor), order, shutdown_msg)),
2525-
Ok(None) => return Ok((resend_funding_locked, required_revoke, None, None, order, shutdown_msg)),
2503+
Ok(Some((commitment_update, channel_monitor))) => return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(channel_monitor), self.resend_order.clone(), shutdown_msg)),
2504+
Ok(None) => return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg)),
25262505
}
25272506
} else {
2528-
return Ok((resend_funding_locked, required_revoke, None, None, order, shutdown_msg));
2507+
return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
25292508
}
25302509
} else if msg.next_local_commitment_number == our_next_remote_commitment_number - 1 {
25312510
if required_revoke.is_some() {
@@ -2536,10 +2515,10 @@ impl Channel {
25362515

25372516
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
25382517
self.monitor_pending_commitment_signed = true;
2539-
return Ok((resend_funding_locked, None, None, None, order, shutdown_msg));
2518+
return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), shutdown_msg));
25402519
}
25412520

2542-
return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update()), None, order, shutdown_msg));
2521+
return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update()), None, self.resend_order.clone(), shutdown_msg));
25432522
} else {
25442523
return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction"));
25452524
}
@@ -3360,6 +3339,7 @@ impl Channel {
33603339
htlc.state = OutboundHTLCState::AwaitingRemovedRemoteRevoke(fail_reason);
33613340
}
33623341
}
3342+
self.resend_order = RAACommitmentOrder::RevokeAndACKFirst;
33633343

33643344
let (res, remote_commitment_tx, htlcs) = match self.send_commitment_no_state_update() {
33653345
Ok((res, (remote_commitment_tx, mut htlcs))) => {
@@ -3570,8 +3550,6 @@ impl Writeable for Channel {
35703550
self.cur_remote_commitment_transaction_number.write(writer)?;
35713551
self.value_to_self_msat.write(writer)?;
35723552

3573-
self.received_commitment_while_awaiting_raa.write(writer)?;
3574-
35753553
let mut dropped_inbound_htlcs = 0;
35763554
for htlc in self.pending_inbound_htlcs.iter() {
35773555
if let InboundHTLCState::RemoteAnnounced(_) = htlc.state {
@@ -3671,13 +3649,13 @@ impl Writeable for Channel {
36713649
}
36723650
}
36733651

3652+
match self.resend_order {
3653+
RAACommitmentOrder::CommitmentFirst => 0u8.write(writer)?,
3654+
RAACommitmentOrder::RevokeAndACKFirst => 1u8.write(writer)?,
3655+
}
3656+
36743657
self.monitor_pending_revoke_and_ack.write(writer)?;
36753658
self.monitor_pending_commitment_signed.write(writer)?;
3676-
match self.monitor_pending_order {
3677-
None => 0u8.write(writer)?,
3678-
Some(RAACommitmentOrder::CommitmentFirst) => 1u8.write(writer)?,
3679-
Some(RAACommitmentOrder::RevokeAndACKFirst) => 2u8.write(writer)?,
3680-
}
36813659

36823660
(self.monitor_pending_forwards.len() as u64).write(writer)?;
36833661
for &(ref pending_forward, ref htlc_id) in self.monitor_pending_forwards.iter() {
@@ -3775,8 +3753,6 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
37753753
let cur_remote_commitment_transaction_number = Readable::read(reader)?;
37763754
let value_to_self_msat = Readable::read(reader)?;
37773755

3778-
let received_commitment_while_awaiting_raa = Readable::read(reader)?;
3779-
37803756
let pending_inbound_htlc_count: u64 = Readable::read(reader)?;
37813757
let mut pending_inbound_htlcs = Vec::with_capacity(cmp::min(pending_inbound_htlc_count as usize, OUR_MAX_HTLCS as usize));
37823758
for _ in 0..pending_inbound_htlc_count {
@@ -3839,16 +3815,15 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
38393815
});
38403816
}
38413817

3842-
let monitor_pending_revoke_and_ack = Readable::read(reader)?;
3843-
let monitor_pending_commitment_signed = Readable::read(reader)?;
3844-
3845-
let monitor_pending_order = match <u8 as Readable<R>>::read(reader)? {
3846-
0 => None,
3847-
1 => Some(RAACommitmentOrder::CommitmentFirst),
3848-
2 => Some(RAACommitmentOrder::RevokeAndACKFirst),
3818+
let resend_order = match <u8 as Readable<R>>::read(reader)? {
3819+
0 => RAACommitmentOrder::CommitmentFirst,
3820+
1 => RAACommitmentOrder::RevokeAndACKFirst,
38493821
_ => return Err(DecodeError::InvalidValue),
38503822
};
38513823

3824+
let monitor_pending_revoke_and_ack = Readable::read(reader)?;
3825+
let monitor_pending_commitment_signed = Readable::read(reader)?;
3826+
38523827
let monitor_pending_forwards_count: u64 = Readable::read(reader)?;
38533828
let mut monitor_pending_forwards = Vec::with_capacity(cmp::min(monitor_pending_forwards_count as usize, OUR_MAX_HTLCS as usize));
38543829
for _ in 0..monitor_pending_forwards_count {
@@ -3935,14 +3910,14 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
39353910
cur_remote_commitment_transaction_number,
39363911
value_to_self_msat,
39373912

3938-
received_commitment_while_awaiting_raa,
39393913
pending_inbound_htlcs,
39403914
pending_outbound_htlcs,
39413915
holding_cell_htlc_updates,
39423916

3917+
resend_order,
3918+
39433919
monitor_pending_revoke_and_ack,
39443920
monitor_pending_commitment_signed,
3945-
monitor_pending_order,
39463921
monitor_pending_forwards,
39473922
monitor_pending_failures,
39483923

src/ln/channelmanager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ macro_rules! handle_monitor_err {
494494
if !$resend_raa {
495495
debug_assert!($action_type == RAACommitmentOrder::CommitmentFirst || !$resend_commitment);
496496
}
497-
$entry.get_mut().monitor_update_failed($action_type, $resend_raa, $resend_commitment, $failed_forwards, $failed_fails);
497+
$entry.get_mut().monitor_update_failed($resend_raa, $resend_commitment, $failed_forwards, $failed_fails);
498498
Err(MsgHandleErrInternal::from_chan_no_close(ChannelError::Ignore("Failed to update ChannelMonitor"), *$entry.key()))
499499
},
500500
}

0 commit comments

Comments
 (0)