Commit 28fe326

Refactor payment-claim logic to ensure MPP-claim atomicity
Previously, if we claimed an MPP payment where a previous-hop channel had been closed while we were waiting for the user to provide the preimage, we'd simply skip claiming that HTLC without letting the user know. This refactors the claim logic to first check that all the previous-hop channels are still available (which is all we actually need, as we mostly care about updating the channel monitors, not the channels themselves) and to then claim the HTLCs while holding the same lock, ensuring atomicity.
1 parent ba9e674 commit 28fe326
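For context, a minimal self-contained sketch of the check-then-claim pattern this commit introduces (hypothetical types and names, not LDK's actual API): verify every previous-hop channel under a single channel-state lock, then claim under that same lock, so a channel closing between parts can no longer cause a silent partial claim.

```rust
use std::collections::HashMap;
use std::sync::Mutex;

struct Htlc { short_channel_id: u64 }

struct ChannelHolder { short_to_id: HashMap<u64, [u8; 32]> }

// Phase 1 checks that every part's previous-hop channel still exists;
// phase 2 claims all parts. Both run under one lock, so the set of live
// channels cannot change between the check and the claims.
fn claim_mpp(state: &Mutex<ChannelHolder>, parts: &[Htlc]) -> bool {
    let holder = state.lock().unwrap(); // held across both phases

    // Phase 1: if any part's channel is gone, claim nothing (the caller
    // would fail all parts back to the sender instead).
    if !parts.iter().all(|h| holder.short_to_id.contains_key(&h.short_channel_id)) {
        return false;
    }

    // Phase 2: claim every part; existence was verified above, so no
    // claim can silently no-op here.
    for htlc in parts {
        let _chan_id = holder.short_to_id[&htlc.short_channel_id];
        // ... queue update_fulfill_htlc and the monitor update here ...
    }
    true
}

fn main() {
    let mut short_to_id = HashMap::new();
    short_to_id.insert(42, [0u8; 32]);
    let state = Mutex::new(ChannelHolder { short_to_id });
    assert!(claim_mpp(&state, &[Htlc { short_channel_id: 42 }]));
    assert!(!claim_mpp(&state, &[Htlc { short_channel_id: 7 }]));
}
```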

File tree: 2 files changed, +129 −68 lines

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -1458,7 +1458,7 @@ fn test_monitor_update_fail_claim() {
 	nodes[1].node.handle_update_add_htlc(&nodes[2].node.get_our_node_id(), &payment_event.msgs[0]);
 	let events = nodes[1].node.get_and_clear_pending_msg_events();
 	assert_eq!(events.len(), 0);
-	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Failed to update ChannelMonitor".to_string(), 1);
+	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Temporary failure claiming HTLC, treating as success: Failed to update ChannelMonitor".to_string(), 1);
 	commitment_signed_dance!(nodes[1], nodes[2], payment_event.commitment_msg, false, true);
 
 	let bs_fail_update = get_htlc_update_msgs!(nodes[1], nodes[2].node.get_our_node_id());
@@ -1599,7 +1599,7 @@ fn monitor_update_claim_fail_no_response() {
 	check_added_monitors!(nodes[1], 1);
 	let events = nodes[1].node.get_and_clear_pending_msg_events();
 	assert_eq!(events.len(), 0);
-	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Failed to update ChannelMonitor".to_string(), 1);
+	nodes[1].logger.assert_log("lightning::ln::channelmanager".to_string(), "Temporary failure claiming HTLC, treating as success: Failed to update ChannelMonitor".to_string(), 1);
 
 	*nodes[1].chan_monitor.update_ret.lock().unwrap() = Ok(());
 	let (outpoint, latest_update) = nodes[1].chan_monitor.latest_monitor_update_id.lock().unwrap().get(&channel_id).unwrap().clone();
```

lightning/src/ln/channelmanager.rs

Lines changed: 127 additions & 66 deletions
```diff
@@ -1850,6 +1850,9 @@ impl<ChanSigner: ChannelKeys, M: Deref, T: Deref, K: Deref, F: Deref> ChannelMan
 	/// privacy-breaking recipient-probing attacks which may reveal payment activity to
 	/// motivated attackers.
 	///
+	/// Note that the privacy concerns in (b) are not relevant in payments with a payment_secret
+	/// set. Thus, for such payments we will claim any payments which do not under-pay.
+	///
 	/// May panic if called except in response to a PaymentReceived event.
 	pub fn claim_funds(&self, payment_preimage: PaymentPreimage, payment_secret: &Option<PaymentSecret>, expected_amount: u64) -> bool {
 		let payment_hash = PaymentHash(Sha256::hash(&payment_preimage.0).into_inner());
```
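As the doc comment above notes, claim_funds is only valid in response to a PaymentReceived event. A rough caller sketch follows; the event's field names are approximate for this vintage of the API, and `preimages` is a hypothetical store mapping payment hashes to the preimages we generated when handing out invoices:

```rust
// Hypothetical event-loop excerpt; claim_funds echoes back the
// payment_secret the PaymentReceived event carried.
match event {
    Event::PaymentReceived { payment_hash, payment_secret, amt } => {
        if let Some(payment_preimage) = preimages.get(&payment_hash) {
            // Returns false if no matching claimable HTLCs were found.
            channel_manager.claim_funds(*payment_preimage, &payment_secret, amt);
        }
    },
    _ => {},
}
```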
```diff
@@ -1860,98 +1863,156 @@ impl<ChanSigner: ChannelKeys, M: Deref, T: Deref, K: Deref, F: Deref> ChannelMan
 		let removed_source = channel_state.as_mut().unwrap().claimable_htlcs.remove(&(payment_hash, *payment_secret));
 		if let Some(mut sources) = removed_source {
 			assert!(!sources.is_empty());
-			let valid_mpp_amount = if let &Some(ref data) = &sources[0].payment_data {
+
+			// If we are claiming an MPP payment, we have to take special care to ensure that each
+			// channel exists before claiming all of the payments (inside one lock).
+			// Note that channel existence is sufficient as we should always get a monitor update
+			// which will take care of the real HTLC claim enforcement.
+			//
+			// If we find an HTLC which we would need to claim but for which we do not have a
+			// channel, we will fail all parts of the MPP payment. While we could wait and see if
+			// the sender retries the already-failed path(s), it should be a pretty rare case where
+			// we got all the HTLCs and then a channel closed while we were waiting for the user to
+			// provide the preimage, so worrying too much about the optimal handling isn't worth
+			// it.
+
+			let (is_mpp, mut valid_mpp) = if let &Some(ref data) = &sources[0].payment_data {
 				assert!(payment_secret.is_some());
-				data.total_msat == expected_amount
+				(true, data.total_msat >= expected_amount)
 			} else {
 				assert!(payment_secret.is_none());
-				false
+				(false, false)
 			};
 
+			for htlc in sources.iter() {
+				if !is_mpp || !valid_mpp { break; }
+				if let None = channel_state.as_ref().unwrap().short_to_id.get(&htlc.prev_hop.short_channel_id) {
+					valid_mpp = false;
+				}
+			}
+
+			let mut errs = Vec::new();
 			let mut claimed_any_htlcs = false;
 			for htlc in sources.drain(..) {
 				if channel_state.is_none() { channel_state = Some(self.channel_state.lock().unwrap()); }
-				if !valid_mpp_amount && (htlc.value < expected_amount || htlc.value > expected_amount * 2) {
+				if (is_mpp && !valid_mpp) || (!is_mpp && (htlc.value < expected_amount || htlc.value > expected_amount * 2)) {
 					let mut htlc_msat_data = byte_utils::be64_to_array(htlc.value).to_vec();
 					let mut height_data = byte_utils::be32_to_array(self.latest_block_height.load(Ordering::Acquire) as u32).to_vec();
 					htlc_msat_data.append(&mut height_data);
 					self.fail_htlc_backwards_internal(channel_state.take().unwrap(),
 							HTLCSource::PreviousHopData(htlc.prev_hop), &payment_hash,
 							HTLCFailReason::Reason { failure_code: 0x4000|15, data: htlc_msat_data });
 				} else {
-					self.claim_funds_internal(channel_state.take().unwrap(), HTLCSource::PreviousHopData(htlc.prev_hop), payment_preimage);
-					claimed_any_htlcs = true;
+					match self.claim_funds_from_hop(channel_state.as_mut().unwrap(), htlc.prev_hop, payment_preimage) {
+						Err(Some(e)) => {
+							if let msgs::ErrorAction::IgnoreError = e.1.err.action {
+								// We got a temporary failure updating monitor, but will claim the
+								// HTLC when the monitor updating is restored (or on chain).
+								log_error!(self, "Temporary failure claiming HTLC, treating as success: {}", e.1.err.err);
+								claimed_any_htlcs = true;
+							} else { errs.push(e); }
+						},
+						Err(None) if is_mpp => unreachable!("We already checked for channel existence, we can't fail here!"),
+						Err(None) => {
+							log_warn!(self, "Channel we expected to claim an HTLC from was closed.");
+						},
+						Ok(()) => claimed_any_htlcs = true,
+					}
 				}
 			}
+
+			// Now that we've done the entire above loop in one lock, we can handle any errors
+			// which were generated.
+			channel_state.take();
+
+			for (their_node_id, err) in errs.drain(..) {
+				let res: Result<(), _> = Err(err);
+				let _ = handle_error!(self, res, their_node_id);
+			}
+
 			claimed_any_htlcs
 		} else { false }
 	}
-	fn claim_funds_internal(&self, mut channel_state_lock: MutexGuard<ChannelHolder<ChanSigner>>, source: HTLCSource, payment_preimage: PaymentPreimage) {
-		let (their_node_id, err) = loop {
-			match source {
-				HTLCSource::OutboundRoute { .. } => {
-					mem::drop(channel_state_lock);
-					let mut pending_events = self.pending_events.lock().unwrap();
-					pending_events.push(events::Event::PaymentSent {
-						payment_preimage
-					});
-				},
-				HTLCSource::PreviousHopData(HTLCPreviousHopData { short_channel_id, htlc_id, .. }) => {
-					//TODO: Delay the claimed_funds relaying just like we do outbound relay!
-					let channel_state = &mut *channel_state_lock;
 
-					let chan_id = match channel_state.short_to_id.get(&short_channel_id) {
-						Some(chan_id) => chan_id.clone(),
-						None => {
-							// TODO: There is probably a channel manager somewhere that needs to
-							// learn the preimage as the channel already hit the chain and that's
-							// why it's missing.
-							return
-						}
-					};
+	fn claim_funds_from_hop(&self, channel_state_lock: &mut MutexGuard<ChannelHolder<ChanSigner>>, prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage) -> Result<(), Option<(PublicKey, MsgHandleErrInternal)>> {
+		//TODO: Delay the claimed_funds relaying just like we do outbound relay!
+		let channel_state = &mut **channel_state_lock;
+		let chan_id = match channel_state.short_to_id.get(&prev_hop.short_channel_id) {
+			Some(chan_id) => chan_id.clone(),
+			None => {
+				return Err(None)
+			}
+		};
 
-					if let hash_map::Entry::Occupied(mut chan) = channel_state.by_id.entry(chan_id) {
-						let was_frozen_for_monitor = chan.get().is_awaiting_monitor_update();
-						match chan.get_mut().get_update_fulfill_htlc_and_commit(htlc_id, payment_preimage) {
-							Ok((msgs, monitor_option)) => {
-								if let Some(monitor_update) = monitor_option {
-									if let Err(e) = self.monitor.update_monitor(chan.get().get_funding_txo().unwrap(), monitor_update) {
-										if was_frozen_for_monitor {
-											assert!(msgs.is_none());
-										} else {
-											break (chan.get().get_their_node_id(), handle_monitor_err!(self, e, channel_state, chan, RAACommitmentOrder::CommitmentFirst, false, msgs.is_some()));
-										}
-									}
-								}
-								if let Some((msg, commitment_signed)) = msgs {
-									channel_state.pending_msg_events.push(events::MessageSendEvent::UpdateHTLCs {
-										node_id: chan.get().get_their_node_id(),
-										updates: msgs::CommitmentUpdate {
-											update_add_htlcs: Vec::new(),
-											update_fulfill_htlcs: vec![msg],
-											update_fail_htlcs: Vec::new(),
-											update_fail_malformed_htlcs: Vec::new(),
-											update_fee: None,
-											commitment_signed,
-										}
-									});
-								}
-							},
-							Err(_e) => {
-								// TODO: There is probably a channel manager somewhere that needs to
-								// learn the preimage as the channel may be about to hit the chain.
-								//TODO: Do something with e?
-								return
-							},
+		if let hash_map::Entry::Occupied(mut chan) = channel_state.by_id.entry(chan_id) {
+			let was_frozen_for_monitor = chan.get().is_awaiting_monitor_update();
+			match chan.get_mut().get_update_fulfill_htlc_and_commit(prev_hop.htlc_id, payment_preimage) {
+				Ok((msgs, monitor_option)) => {
+					if let Some(monitor_update) = monitor_option {
+						if let Err(e) = self.monitor.update_monitor(chan.get().get_funding_txo().unwrap(), monitor_update) {
+							if was_frozen_for_monitor {
+								assert!(msgs.is_none());
+							} else {
+								return Err(Some((chan.get().get_their_node_id(), handle_monitor_err!(self, e, channel_state, chan, RAACommitmentOrder::CommitmentFirst, false, msgs.is_some()).unwrap_err())));
+							}
 						}
-					} else { unreachable!(); }
+					}
+					if let Some((msg, commitment_signed)) = msgs {
+						channel_state.pending_msg_events.push(events::MessageSendEvent::UpdateHTLCs {
+							node_id: chan.get().get_their_node_id(),
+							updates: msgs::CommitmentUpdate {
+								update_add_htlcs: Vec::new(),
+								update_fulfill_htlcs: vec![msg],
+								update_fail_htlcs: Vec::new(),
+								update_fail_malformed_htlcs: Vec::new(),
+								update_fee: None,
+								commitment_signed,
+							}
+						});
+					}
+					return Ok(())
+				},
+				Err(e) => {
+					// TODO: Do something with e?
+					// This should only occur if we are claiming an HTLC at the same time as the
+					// HTLC is being failed (eg because a block is being connected and this caused
+					// an HTLC to time out). This should, of course, only occur if the user is the
+					// one doing the claiming (as it being a part of a peer claim would imply we're
+					// about to lose funds) and only if the lock in claim_funds was dropped as a
+					// previous HTLC was failed (thus not for an MPP payment).
+					debug_assert!(false, "This shouldn't be reachable except in absurdly rare cases between monitor updates and HTLC timeouts: {:?}", e);
+					return Err(None)
 				},
 			}
-			return;
-		};
+		} else { unreachable!(); }
+	}
 
-		mem::drop(channel_state_lock);
-		let _ = handle_error!(self, err, their_node_id);
+	fn claim_funds_internal(&self, mut channel_state_lock: MutexGuard<ChannelHolder<ChanSigner>>, source: HTLCSource, payment_preimage: PaymentPreimage) {
+		match source {
+			HTLCSource::OutboundRoute { .. } => {
+				mem::drop(channel_state_lock);
+				let mut pending_events = self.pending_events.lock().unwrap();
+				pending_events.push(events::Event::PaymentSent {
+					payment_preimage
+				});
+			},
+			HTLCSource::PreviousHopData(hop_data) => {
+				if let Err((their_node_id, err)) = match self.claim_funds_from_hop(&mut channel_state_lock, hop_data, payment_preimage) {
+					Ok(()) => Ok(()),
+					Err(None) => {
+						// TODO: There is probably a channel manager somewhere that needs to
+						// learn the preimage as the channel already hit the chain and that's
+						// why it's missing.
+						Ok(())
+					},
+					Err(Some(res)) => Err(res),
+				} {
+					mem::drop(channel_state_lock);
+					let res: Result<(), _> = Err(err);
+					let _ = handle_error!(self, res, their_node_id);
+				}
+			},
+		}
 	}
 
 	/// Gets the node_id held by this ChannelManager
```
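The new claim_funds_from_hop helper encodes three outcomes in its Result<(), Option<(PublicKey, MsgHandleErrInternal)>> return type: Ok(()) means the fulfill and monitor update were queued, Err(None) means the previous-hop channel is gone (so there is no peer to notify and the claim happens on chain), and Err(Some(..)) carries a real error to report only after the channel-state lock is dropped. A standalone sketch of that contract, with stand-in types:

```rust
// Standalone illustration (hypothetical types) of the helper's contract,
// mirroring how claim_funds collects Err(Some(..)) values into an errs
// vec and handles them once the lock is released.
type NodeId = [u8; 33];

#[derive(Debug)]
struct HandleError { msg: &'static str }

fn claim_from_hop(channel_exists: bool, monitor_ok: bool)
	-> Result<(), Option<(NodeId, HandleError)>> {
	if !channel_exists {
		// Channel already hit the chain; the monitor claims on chain,
		// so there is no peer to notify.
		return Err(None);
	}
	if !monitor_ok {
		// Bubble the failure up; the caller reports it only after
		// dropping the channel-state lock.
		return Err(Some(([2u8; 33], HandleError { msg: "monitor update failed" })));
	}
	Ok(())
}

fn main() {
	assert!(claim_from_hop(true, true).is_ok());
	assert!(matches!(claim_from_hop(false, true), Err(None)));
	assert!(matches!(claim_from_hop(true, false), Err(Some(_))));
}
```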
