Commit 9cd702a
Refactor payment-claim logic to ensure MPP-claim atomicity
Previously, if we claimed an MPP payment where a previous-hop channel had been closed while we were waiting for the user to provide the preimage, we'd simply skip claiming that HTLC without letting the user know. This refactors the claim logic to first check that all the relevant channels are still available (which is all we actually need: we mostly care about updating the channel monitors, not the channels themselves) and then claim the HTLCs under the same lock, ensuring atomicity.
Parent: a563a8b

2 files changed: 122 additions and 68 deletions
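
To make the shape of the change concrete before the diffs: the claim path now runs two passes under a single channel-state lock, a check pass and a claim pass. The sketch below shows that pattern in isolation; the names (Claimer, HtlcPart, claim_mpp) are illustrative stand-ins, not the actual ChannelManager API, which works against channels and monitor updates rather than a bare map.

	use std::collections::HashMap;
	use std::sync::Mutex;

	// Illustrative stand-ins; not the real rust-lightning types.
	struct HtlcPart { short_channel_id: u64 }

	struct Claimer {
		// Stand-in for the short_to_id map guarded by the channel_state lock.
		short_to_id: Mutex<HashMap<u64, [u8; 32]>>,
	}

	impl Claimer {
		// Claim every part of an MPP payment or none of them: both passes run
		// under a single lock, so no channel can close between check and claim.
		fn claim_mpp(&self, parts: &[HtlcPart]) -> bool {
			let map = self.short_to_id.lock().unwrap();

			// Pass 1: every previous-hop channel must still exist. Existence is
			// all we need to check; the monitor update does the real enforcement.
			if !parts.iter().all(|p| map.contains_key(&p.short_channel_id)) {
				return false; // caller fails all parts back to the sender
			}

			// Pass 2: claim each part; the check above guarantees every lookup
			// succeeds while we still hold the lock.
			for part in parts {
				let _chan_id = map[&part.short_channel_id];
				// ... issue update_fulfill_htlc + a monitor update here ...
			}
			true
		}
	}

	fn main() {
		let c = Claimer { short_to_id: Mutex::new(HashMap::from([(42u64, [0u8; 32])])) };
		assert!(c.claim_mpp(&[HtlcPart { short_channel_id: 42 }]));
		assert!(!c.claim_mpp(&[HtlcPart { short_channel_id: 7 }]));
	}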

lightning/src/ln/chanmon_update_fail_tests.rs (2 additions, 2 deletions)

@@ -1458,7 +1458,7 @@ fn test_monitor_update_fail_claim() {
 	nodes[1].node.handle_update_add_htlc(&nodes[2].node.get_our_node_id(), &payment_event.msgs[0]);
 	let events = nodes[1].node.get_and_clear_pending_msg_events();
 	assert_eq!(events.len(), 0);
-	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Failed to update ChannelMonitor".to_string(), 1);
+	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Temporary failure claiming HTLC, treating as success: Failed to update ChannelMonitor".to_string(), 1);
 	commitment_signed_dance!(nodes[1], nodes[2], payment_event.commitment_msg, false, true);
 
 	let bs_fail_update = get_htlc_update_msgs!(nodes[1], nodes[2].node.get_our_node_id());
@@ -1599,7 +1599,7 @@ fn monitor_update_claim_fail_no_response() {
 	check_added_monitors!(nodes[1], 1);
 	let events = nodes[1].node.get_and_clear_pending_msg_events();
 	assert_eq!(events.len(), 0);
-	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Failed to update ChannelMonitor".to_string(), 1);
+	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Temporary failure claiming HTLC, treating as success: Failed to update ChannelMonitor".to_string(), 1);
 
 	*nodes[1].chan_monitor.update_ret.lock().unwrap() = Ok(());
 	let (outpoint, latest_update) = nodes[1].chan_monitor.latest_monitor_update_id.lock().unwrap().get(&channel_id).unwrap().clone();
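
Both assertion updates pin down the same new behavior: a temporary ChannelMonitor update failure during a claim is no longer surfaced as a bare update failure, but logged as a claim that will complete once monitor updating is restored. A self-contained sketch of that decision, with an illustrative error enum standing in for the crate's own monitor-update error type:

	// Illustrative sketch of the behavior these updated assertions pin down;
	// the enum stands in for the crate's monitor-update error type.
	#[derive(Debug)]
	enum MonitorUpdateErr { TemporaryFailure, PermanentFailure }

	fn claim_one_htlc(update_monitor: impl FnOnce() -> Result<(), MonitorUpdateErr>) -> bool {
		match update_monitor() {
			Ok(()) => true,
			Err(MonitorUpdateErr::TemporaryFailure) => {
				// The claim completes once monitor updating is restored (or on
				// chain), so log loudly but count the HTLC as claimed. This is
				// the log line the tests now expect.
				eprintln!("Temporary failure claiming HTLC, treating as success: Failed to update ChannelMonitor");
				true
			},
			Err(e) => {
				eprintln!("Failed to claim HTLC: {:?}", e);
				false
			},
		}
	}

	fn main() {
		assert!(claim_one_htlc(|| Err(MonitorUpdateErr::TemporaryFailure)));
		assert!(!claim_one_htlc(|| Err(MonitorUpdateErr::PermanentFailure)));
		assert!(claim_one_htlc(|| Ok(())));
	}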

lightning/src/ln/channelmanager.rs (120 additions, 66 deletions)

@@ -1859,98 +1859,152 @@ impl<ChanSigner: ChannelKeys, M: Deref, T: Deref, K: Deref, F: Deref> ChannelMan
 		let removed_source = channel_state.as_mut().unwrap().claimable_htlcs.remove(&(payment_hash, *payment_secret));
 		if let Some(mut sources) = removed_source {
 			assert!(!sources.is_empty());
-			let valid_mpp_amount = if let &Some(ref data) = &sources[0].payment_data {
+
+			// If we are claiming an MPP payment, we have to take special care to ensure that each
+			// channel exists before claiming all of the payments (inside one lock).
+			// Note that channel existence is sufficient as we should always get a monitor update
+			// which will take care of the real HTLC claim enforcement.
+			//
+			// If we find an HTLC which we would need to claim but for which we do not have a
+			// channel, we will fail all parts of the MPP payment. While we could wait and see if
+			// the sender retries the already-failed path(s), it should be a pretty rare case where
+			// we got all the HTLCs and then a channel closed while we were waiting for the user to
+			// provide the preimage, so worrying too much about the optimal handling isn't worth
+			// it.
+			let (is_mpp, mut valid_mpp) = if let &Some(ref data) = &sources[0].payment_data {
 				assert!(payment_secret.is_some());
-				data.total_msat == expected_amount
+				(true, data.total_msat == expected_amount)
 			} else {
 				assert!(payment_secret.is_none());
-				false
+				(false, false)
 			};
 
+			for htlc in sources.iter() {
+				if !is_mpp || !valid_mpp { break; }
+				if let None = channel_state.as_ref().unwrap().short_to_id.get(&htlc.prev_hop.short_channel_id) {
+					valid_mpp = false;
+				}
+			}
+
+			let mut errs = Vec::new();
 			let mut claimed_any_htlcs = false;
 			for htlc in sources.drain(..) {
 				if channel_state.is_none() { channel_state = Some(self.channel_state.lock().unwrap()); }
-				if !valid_mpp_amount && (htlc.value < expected_amount || htlc.value > expected_amount * 2) {
+				if (is_mpp && !valid_mpp) || (!is_mpp && (htlc.value < expected_amount || htlc.value > expected_amount * 2)) {
 					let mut htlc_msat_data = byte_utils::be64_to_array(htlc.value).to_vec();
 					let mut height_data = byte_utils::be32_to_array(self.latest_block_height.load(Ordering::Acquire) as u32).to_vec();
 					htlc_msat_data.append(&mut height_data);
 					self.fail_htlc_backwards_internal(channel_state.take().unwrap(),
 									 HTLCSource::PreviousHopData(htlc.prev_hop), &payment_hash,
 									 HTLCFailReason::Reason { failure_code: 0x4000|15, data: htlc_msat_data });
 				} else {
-					self.claim_funds_internal(channel_state.take().unwrap(), HTLCSource::PreviousHopData(htlc.prev_hop), payment_preimage);
-					claimed_any_htlcs = true;
+					match self.claim_funds_from_hop(channel_state.as_mut().unwrap(), htlc.prev_hop, payment_preimage) {
+						Err(Some(e)) => {
+							if let msgs::ErrorAction::IgnoreError = e.1.err.action {
+								// We got a temporary failure updating monitor, but will claim the
+								// HTLC when the monitor updating is restored (or on chain).
+								log_error!(self, "Temporary failure claiming HTLC, treating as success: {}", e.1.err.err);
+								claimed_any_htlcs = true;
+							} else { errs.push(e); }
+						},
+						Err(None) if is_mpp => unreachable!("We already checked for channel existence, we can't fail here!"),
+						Err(None) => {
+							log_warn!(self, "Channel we expected to claim an HTLC from was closed.");
+						},
+						Ok(()) => claimed_any_htlcs = true,
+					}
 				}
 			}
+
+			// Now that we've done the entire above loop in one lock, we can handle any errors
+			// which were generated.
+			channel_state.take();
+
+			for (their_node_id, err) in errs.drain(..) {
+				let res: Result<(), _> = Err(err);
+				let _ = handle_error!(self, res, their_node_id);
+			}
+
 			claimed_any_htlcs
 		} else { false }
 	}
-	fn claim_funds_internal(&self, mut channel_state_lock: MutexGuard<ChannelHolder<ChanSigner>>, source: HTLCSource, payment_preimage: PaymentPreimage) {
-		let (their_node_id, err) = loop {
-			match source {
-				HTLCSource::OutboundRoute { .. } => {
-					mem::drop(channel_state_lock);
-					let mut pending_events = self.pending_events.lock().unwrap();
-					pending_events.push(events::Event::PaymentSent {
-						payment_preimage
-					});
-				},
-				HTLCSource::PreviousHopData(HTLCPreviousHopData { short_channel_id, htlc_id, .. }) => {
-					//TODO: Delay the claimed_funds relaying just like we do outbound relay!
-					let channel_state = &mut *channel_state_lock;
 
-					let chan_id = match channel_state.short_to_id.get(&short_channel_id) {
-						Some(chan_id) => chan_id.clone(),
-						None => {
-							// TODO: There is probably a channel manager somewhere that needs to
-							// learn the preimage as the channel already hit the chain and that's
-							// why it's missing.
-							return
-						}
-					};
+	fn claim_funds_from_hop(&self, channel_state_lock: &mut MutexGuard<ChannelHolder<ChanSigner>>, prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage) -> Result<(), Option<(PublicKey, MsgHandleErrInternal)>> {
+		//TODO: Delay the claimed_funds relaying just like we do outbound relay!
+		let channel_state = &mut **channel_state_lock;
+		let chan_id = match channel_state.short_to_id.get(&prev_hop.short_channel_id) {
+			Some(chan_id) => chan_id.clone(),
+			None => {
+				return Err(None)
+			}
+		};
 
-					if let hash_map::Entry::Occupied(mut chan) = channel_state.by_id.entry(chan_id) {
-						let was_frozen_for_monitor = chan.get().is_awaiting_monitor_update();
-						match chan.get_mut().get_update_fulfill_htlc_and_commit(htlc_id, payment_preimage) {
-							Ok((msgs, monitor_option)) => {
-								if let Some(monitor_update) = monitor_option {
-									if let Err(e) = self.monitor.update_monitor(chan.get().get_funding_txo().unwrap(), monitor_update) {
-										if was_frozen_for_monitor {
-											assert!(msgs.is_none());
-										} else {
-											break (chan.get().get_their_node_id(), handle_monitor_err!(self, e, channel_state, chan, RAACommitmentOrder::CommitmentFirst, false, msgs.is_some()));
-										}
-									}
-								}
-								if let Some((msg, commitment_signed)) = msgs {
-									channel_state.pending_msg_events.push(events::MessageSendEvent::UpdateHTLCs {
-										node_id: chan.get().get_their_node_id(),
-										updates: msgs::CommitmentUpdate {
-											update_add_htlcs: Vec::new(),
-											update_fulfill_htlcs: vec![msg],
-											update_fail_htlcs: Vec::new(),
-											update_fail_malformed_htlcs: Vec::new(),
-											update_fee: None,
-											commitment_signed,
-										}
-									});
-								}
-							},
-							Err(_e) => {
-								// TODO: There is probably a channel manager somewhere that needs to
-								// learn the preimage as the channel may be about to hit the chain.
-								//TODO: Do something with e?
-								return
-							},
+		if let hash_map::Entry::Occupied(mut chan) = channel_state.by_id.entry(chan_id) {
+			let was_frozen_for_monitor = chan.get().is_awaiting_monitor_update();
+			match chan.get_mut().get_update_fulfill_htlc_and_commit(prev_hop.htlc_id, payment_preimage) {
+				Ok((msgs, monitor_option)) => {
+					if let Some(monitor_update) = monitor_option {
+						if let Err(e) = self.monitor.update_monitor(chan.get().get_funding_txo().unwrap(), monitor_update) {
+							if was_frozen_for_monitor {
+								assert!(msgs.is_none());
+							} else {
+								return Err(Some((chan.get().get_their_node_id(), handle_monitor_err!(self, e, channel_state, chan, RAACommitmentOrder::CommitmentFirst, false, msgs.is_some()).unwrap_err())));
+							}
 						}
-					} else { unreachable!(); }
+					}
+					if let Some((msg, commitment_signed)) = msgs {
+						channel_state.pending_msg_events.push(events::MessageSendEvent::UpdateHTLCs {
+							node_id: chan.get().get_their_node_id(),
+							updates: msgs::CommitmentUpdate {
+								update_add_htlcs: Vec::new(),
+								update_fulfill_htlcs: vec![msg],
+								update_fail_htlcs: Vec::new(),
+								update_fail_malformed_htlcs: Vec::new(),
+								update_fee: None,
+								commitment_signed,
+							}
+						});
+					}
+					return Ok(())
+				},
+				Err(e) => {
+					// TODO: There is probably a channel manager somewhere that needs to
+					// learn the preimage as the channel may be about to hit the chain.
+					// TODO: Do something with e?
+					debug_assert!(false, "This shouldn't be reachable except in absurdly rare cases between monitor updates and HTLC timeouts: {:?}", e);
+					return Err(None)
 				},
 			}
-			return;
-		};
+		} else { unreachable!(); }
+	}
 
-		mem::drop(channel_state_lock);
-		let _ = handle_error!(self, err, their_node_id);
+	fn claim_funds_internal(&self, mut channel_state_lock: MutexGuard<ChannelHolder<ChanSigner>>, source: HTLCSource, payment_preimage: PaymentPreimage) {
+		match source {
+			HTLCSource::OutboundRoute { .. } => {
+				mem::drop(channel_state_lock);
+				let mut pending_events = self.pending_events.lock().unwrap();
+				pending_events.push(events::Event::PaymentSent {
+					payment_preimage
+				});
+			},
+			HTLCSource::PreviousHopData(hop_data) => {
+				if let Err((their_node_id, err)) = match self.claim_funds_from_hop(&mut channel_state_lock, hop_data, payment_preimage) {
+					Ok(()) => Ok(()),
+					Err(None) => {
+						// TODO: There is probably a channel manager somewhere that needs to
+						// learn the preimage as the channel already hit the chain and that's
+						// why it's missing.
+						Ok(())
+					},
+					Err(Some(res)) => Err(res),
+				} {
+					mem::drop(channel_state_lock);
+					let res: Result<(), _> = Err(err);
+					let _ = handle_error!(self, res, their_node_id);
+				}
+			},
+		}
 	}
 
 	/// Gets the node_id held by this ChannelManager
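
The new claim_funds_from_hop helper encodes three outcomes in its return type, Result<(), Option<(PublicKey, MsgHandleErrInternal)>>: claimed, channel gone, or an error to surface. A compact restatement of how the two callers above treat each case, using stand-in types (PeerId, HandleErr) rather than the real ones:

	// Compact restatement, with stand-in types, of how callers treat each case
	// of claim_funds_from_hop's Result<(), Option<(PublicKey, MsgHandleErrInternal)>>.
	struct PeerId;
	struct HandleErr;

	fn handle_claim_result(res: Result<(), Option<(PeerId, HandleErr)>>, is_mpp: bool) {
		match res {
			// Fulfill and monitor update issued: count the HTLC as claimed.
			Ok(()) => {},
			// Previous-hop channel is gone. The MPP path checked existence under
			// the same lock, so this arm is unreachable there; the single-payment
			// path merely logs a warning.
			Err(None) if is_mpp => unreachable!("existence was checked under the same lock"),
			Err(None) => eprintln!("Channel we expected to claim an HTLC from was closed."),
			// A real error: stash it and run handle_error! only after the
			// channel-state lock has been dropped.
			Err(Some((_peer, _err))) => { /* push onto errs for later */ },
		}
	}

	fn main() {
		handle_claim_result(Ok(()), true);
		handle_claim_result(Err(None), false);
	}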
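
Separately, note the errs vector in claim_funds: errors are collected while the lock is held and only handled after channel_state.take() drops it, presumably because handle_error! can need channel state itself. A minimal, self-contained sketch of that deferral pattern under illustrative names:

	use std::sync::Mutex;

	// Minimal sketch, with illustrative names, of the error-deferral pattern in
	// claim_funds above: errors are only collected while the channel-state lock
	// is held and handled after it is released, since the error path may need
	// locks of its own.
	struct Manager {
		channel_state: Mutex<Vec<u64>>, // stand-in for ChannelHolder
	}

	impl Manager {
		fn claim_all(&self, htlcs: &[u64]) {
			let mut errs: Vec<(u64, String)> = Vec::new();
			{
				let _state = self.channel_state.lock().unwrap();
				for &htlc in htlcs {
					// Every claim happens inside this one critical section; a
					// failure is recorded here, never handled.
					if htlc % 2 == 1 {
						errs.push((htlc, format!("claiming HTLC {} failed", htlc)));
					}
				}
			} // lock dropped here, mirroring channel_state.take()

			// Only now is it safe to run handling that may take other locks.
			for (peer, msg) in errs.drain(..) {
				eprintln!("error from peer {}: {}", peer, msg);
			}
		}
	}

	fn main() {
		Manager { channel_state: Mutex::new(Vec::new()) }.claim_all(&[1, 2, 3]);
	}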
