
Commit d2ea0a1

Replay MPP claims via background events using new CM metadata
When we claim an MPP payment and then crash before persisting all the relevant `ChannelMonitor`s, we rely on the payment data being available in the `ChannelManager` on restart to re-claim any parts that haven't yet been claimed. This is fine as long as the `ChannelManager` was persisted before the `PaymentClaimable` event was processed, which is generally the case in our `lightning-background-processor`, but may not be in other cases or in a somewhat rare race.

In order to fix this, we need to track where all the MPP parts of a payment are in the `ChannelMonitor`, allowing us to re-claim any missing pieces without reference to any `ChannelManager` data. Further, in order to properly generate a `PaymentClaimed` event against the restarted claim, we have to store various payment metadata with the HTLC list as well.

Here we finally implement claiming using the new MPP part list and metadata stored in `ChannelMonitor`s. In doing so, we use much more of the existing HTLC-claiming pipeline in `ChannelManager`, utilizing the on-startup background-events flow as well as properly re-applying the RAA blockers to ensure preimages cannot be lost.
1 parent 671a6cd commit d2ea0a1
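
The shape of the change, roughly: the `ChannelMonitor` now persists, alongside each preimage, the full list of MPP parts the claim covers, and on startup the `ChannelManager` simply re-runs a claim for every part. A simplified, self-contained sketch of that idea (hypothetical types standing in for `MPPClaimHTLCSource`/`HTLCClaimSource` and the stored-preimage map, not the actual API):

use std::collections::HashMap;

// Illustrative stand-ins only; the real types live in channelmanager.rs and
// chain/channelmonitor.rs.
struct MppPart {
	counterparty_node_id: [u8; 33], // previous-hop peer
	channel_id: [u8; 32],
	htlc_id: u64,
}

struct StoredClaim {
	preimage: [u8; 32],
	// Every MPP part the claim covers, persisted alongside the preimage so a
	// restarted node can re-claim parts whose monitors missed the update.
	mpp_parts: Vec<MppPart>,
}

// Walk the claims a monitor persisted and re-run the claim for every part.
fn replay_claims<F: FnMut(&MppPart, &[u8; 32])>(
	stored: &HashMap<[u8; 32], StoredClaim>, mut claim_part: F,
) {
	for (_payment_hash, claim) in stored.iter() {
		for part in claim.mpp_parts.iter() {
			claim_part(part, &claim.preimage);
		}
	}
}

fn main() {
	let mut stored = HashMap::new();
	stored.insert([1; 32], StoredClaim {
		preimage: [2; 32],
		mpp_parts: vec![
			MppPart { counterparty_node_id: [3; 33], channel_id: [4; 32], htlc_id: 0 },
			MppPart { counterparty_node_id: [5; 33], channel_id: [6; 32], htlc_id: 1 },
		],
	});
	replay_claims(&stored, |part, preimage| {
		println!("re-claiming htlc {} on channel {:02x?} with preimage {:02x?}",
			part.htlc_id, part.channel_id, preimage);
	});
}

In the real code the per-part claim goes through `claim_mpp_part` below, which queues the monitor update as a background event and installs an RAA blocker so the preimage cannot be lost before the update lands.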

3 files changed: +195 -75 lines changed

lightning/src/ln/channel.rs

Lines changed: 1 addition & 1 deletion
@@ -1290,7 +1290,7 @@ pub(super) struct ChannelContext<SP: Deref> where SP::Target: SignerProvider {
 	// further `send_update_fee` calls, dropping the previous holding cell update entirely.
 	holding_cell_update_fee: Option<u32>,
 	next_holder_htlc_id: u64,
-	next_counterparty_htlc_id: u64,
+	pub(super) next_counterparty_htlc_id: u64,
 	feerate_per_kw: u32,
 
 	/// The timestamp set on our latest `channel_update` message for this channel. It is updated

lightning/src/ln/channelmanager.rs

Lines changed: 172 additions & 69 deletions
@@ -1130,6 +1130,24 @@ impl_writeable_tlv_based_enum!(EventCompletionAction,
 	}
 );
 
+struct HTLCClaimSource {
+	counterparty_node_id: Option<PublicKey>,
+	funding_txo: OutPoint,
+	channel_id: ChannelId,
+	htlc_id: u64,
+}
+
+impl From<&MPPClaimHTLCSource> for HTLCClaimSource {
+	fn from(o: &MPPClaimHTLCSource) -> HTLCClaimSource {
+		HTLCClaimSource {
+			counterparty_node_id: Some(o.counterparty_node_id),
+			funding_txo: o.funding_txo,
+			channel_id: o.channel_id,
+			htlc_id: o.htlc_id,
+		}
+	}
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 /// The source of an HTLC which is being claimed as a part of an incoming payment. Each part is
 /// tracked in [`PendingMPPClaim`].
@@ -6896,6 +6914,27 @@ where
 	>(
 		&self, prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage,
 		payment_info: Option<PaymentClaimDetails>, completion_action: ComplFunc,
+	) {
+		let counterparty_node_id =
+			match self.short_to_chan_info.read().unwrap().get(&prev_hop.short_channel_id) {
+				Some((cp_id, _dup_chan_id)) => Some(cp_id.clone()),
+				None => None
+			};
+
+		let htlc_source = HTLCClaimSource {
+			counterparty_node_id,
+			funding_txo: prev_hop.outpoint,
+			channel_id: prev_hop.channel_id,
+			htlc_id: prev_hop.htlc_id,
+		};
+		self.claim_mpp_part(htlc_source, payment_preimage, payment_info, completion_action)
+	}
+
+	fn claim_mpp_part<
+		ComplFunc: FnOnce(Option<u64>, bool) -> (Option<MonitorUpdateCompletionAction>, Option<RAAMonitorUpdateBlockingAction>)
+	>(
+		&self, prev_hop: HTLCClaimSource, payment_preimage: PaymentPreimage,
+		payment_info: Option<PaymentClaimDetails>, completion_action: ComplFunc,
 	) {
 		//TODO: Delay the claimed_funds relaying just like we do outbound relay!
 
@@ -6912,12 +6951,8 @@ where
 		{
 			let per_peer_state = self.per_peer_state.read().unwrap();
 			let chan_id = prev_hop.channel_id;
-			let counterparty_node_id_opt = match self.short_to_chan_info.read().unwrap().get(&prev_hop.short_channel_id) {
-				Some((cp_id, _dup_chan_id)) => Some(cp_id.clone()),
-				None => None
-			};
 
-			let peer_state_opt = counterparty_node_id_opt.as_ref().map(
+			let peer_state_opt = prev_hop.counterparty_node_id.as_ref().map(
 				|counterparty_node_id| per_peer_state.get(counterparty_node_id)
 					.map(|peer_mutex| peer_mutex.lock().unwrap())
 			).unwrap_or(None);
@@ -6944,7 +6979,7 @@ where
 					peer_state.actions_blocking_raa_monitor_updates.entry(chan_id).or_insert_with(Vec::new).push(raa_blocker);
 				}
 				if !during_init {
-					handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
+					handle_new_monitor_update!(self, prev_hop.funding_txo, monitor_update, peer_state_lock,
 						peer_state, per_peer_state, chan);
 				} else {
 					// If we're running during init we cannot update a monitor directly -
@@ -6953,7 +6988,7 @@ where
 					self.pending_background_events.lock().unwrap().push(
 						BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
 							counterparty_node_id,
-							funding_txo: prev_hop.outpoint,
+							funding_txo: prev_hop.funding_txo,
 							channel_id: prev_hop.channel_id,
 							update: monitor_update.clone(),
 						});
@@ -7027,7 +7062,7 @@ where
 		}
 		let preimage_update = ChannelMonitorUpdate {
 			update_id: CLOSED_CHANNEL_UPDATE_ID,
-			counterparty_node_id: None,
+			counterparty_node_id: prev_hop.counterparty_node_id,
 			updates: vec![ChannelMonitorUpdateStep::PaymentPreimage {
 				payment_preimage,
 				payment_info,
@@ -7038,7 +7073,7 @@ where
 		if !during_init {
 			// We update the ChannelMonitor on the backward link, after
 			// receiving an `update_fulfill_htlc` from the forward link.
-			let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
+			let update_res = self.chain_monitor.update_channel(prev_hop.funding_txo, &preimage_update);
 			if update_res != ChannelMonitorUpdateStatus::Completed {
 				// TODO: This needs to be handled somehow - if we receive a monitor update
 				// with a preimage we *must* somehow manage to propagate it to the upstream
@@ -7061,7 +7096,7 @@ where
 			// complete the monitor update completion action from `completion_action`.
 			self.pending_background_events.lock().unwrap().push(
 				BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((
-					prev_hop.outpoint, prev_hop.channel_id, preimage_update,
+					prev_hop.funding_txo, prev_hop.channel_id, preimage_update,
 				)));
 		}
 		// Note that we do process the completion action here. This totally could be a
@@ -7312,7 +7347,7 @@ where
 						onion_fields,
 						payment_id,
 					}) = payment {
-						self.pending_events.lock().unwrap().push_back((events::Event::PaymentClaimed {
+						let event = events::Event::PaymentClaimed {
 							payment_hash,
 							purpose,
 							amount_msat,
@@ -7321,7 +7356,16 @@ where
 							sender_intended_total_msat,
 							onion_fields,
 							payment_id,
-						}, None));
+						};
+						let event_action = (event, None);
+						let mut pending_events = self.pending_events.lock().unwrap();
+						// If we're replaying a claim on startup we may end up duplicating an event
+						// that's already in our queue, so check before we push another one. The
+						// `payment_id` should suffice to ensure we never spuriously drop a second
+						// event for a duplicate payment.
+						if !pending_events.contains(&event_action) {
+							pending_events.push_back(event_action);
+						}
 					}
 				},
 				MonitorUpdateCompletionAction::EmitEventAndFreeOtherChannel {
@@ -13130,67 +13174,126 @@ where
 	};
 
 	for (_, monitor) in args.channel_monitors.iter() {
-		for (payment_hash, (payment_preimage, _)) in monitor.get_stored_preimages() {
-			let per_peer_state = channel_manager.per_peer_state.read().unwrap();
-			let mut claimable_payments = channel_manager.claimable_payments.lock().unwrap();
-			let payment = claimable_payments.claimable_payments.remove(&payment_hash);
-			mem::drop(claimable_payments);
-			if let Some(payment) = payment {
-				log_info!(channel_manager.logger, "Re-claiming HTLCs with payment hash {} as we've released the preimage to a ChannelMonitor!", &payment_hash);
-				let mut claimable_amt_msat = 0;
-				let mut receiver_node_id = Some(our_network_pubkey);
-				let phantom_shared_secret = payment.htlcs[0].prev_hop.phantom_shared_secret;
-				if phantom_shared_secret.is_some() {
-					let phantom_pubkey = channel_manager.node_signer.get_node_id(Recipient::PhantomNode)
-						.expect("Failed to get node_id for phantom node recipient");
-					receiver_node_id = Some(phantom_pubkey)
-				}
-				for claimable_htlc in &payment.htlcs {
-					claimable_amt_msat += claimable_htlc.value;
-
-					// Add a holding-cell claim of the payment to the Channel, which should be
-					// applied ~immediately on peer reconnection. Because it won't generate a
-					// new commitment transaction we can just provide the payment preimage to
-					// the corresponding ChannelMonitor and nothing else.
-					//
-					// We do so directly instead of via the normal ChannelMonitor update
-					// procedure as the ChainMonitor hasn't yet been initialized, implying
-					// we're not allowed to call it directly yet. Further, we do the update
-					// without incrementing the ChannelMonitor update ID as there isn't any
-					// reason to.
-					// If we were to generate a new ChannelMonitor update ID here and then
-					// crash before the user finishes block connect we'd end up force-closing
-					// this channel as well. On the flip side, there's no harm in restarting
-					// without the new monitor persisted - we'll end up right back here on
-					// restart.
-					let previous_channel_id = claimable_htlc.prev_hop.channel_id;
-					let peer_node_id_opt = channel_manager.outpoint_to_peer.lock().unwrap()
-						.get(&claimable_htlc.prev_hop.outpoint).cloned();
-					if let Some(peer_node_id) = peer_node_id_opt {
-						let peer_state_mutex = per_peer_state.get(&peer_node_id).unwrap();
-						let mut peer_state_lock = peer_state_mutex.lock().unwrap();
-						let peer_state = &mut *peer_state_lock;
-						if let Some(ChannelPhase::Funded(channel)) = peer_state.channel_by_id.get_mut(&previous_channel_id) {
-							let logger = WithChannelContext::from(&channel_manager.logger, &channel.context, Some(payment_hash));
-							channel.claim_htlc_while_disconnected_dropping_mon_update(claimable_htlc.prev_hop.htlc_id, payment_preimage, &&logger);
+		for (payment_hash, (payment_preimage, payment_claims)) in monitor.get_stored_preimages() {
+			if !payment_claims.is_empty() {
+				for payment_claim in payment_claims {
+					if payment_claim.mpp_parts.is_empty() {
+						return Err(DecodeError::InvalidValue);
+					}
+					let pending_claims = PendingMPPClaim {
+						channels_without_preimage: payment_claim.mpp_parts.clone(),
+						channels_with_preimage: Vec::new(),
+					};
+					let pending_claim_ptr_opt = Some(Arc::new(Mutex::new(pending_claims)));
+
+					// While it may be duplicative to generate a PaymentClaimed here, trying to
+					// figure out if the user definitely saw it before shutdown would require some
+					// nontrivial logic and may break as we move away from regularly persisting
+					// ChannelManager. Instead, we rely on the users' event handler being
+					// idempotent and just blindly generate one no matter what, letting the
+					// preimages eventually timing out from ChannelMonitors to prevent us from
+					// doing so forever.
+
+					let claim_found =
+						channel_manager.claimable_payments.lock().unwrap().begin_claiming_payment(
+							payment_hash, &channel_manager.node_signer, &channel_manager.logger,
+							&channel_manager.inbound_payment_id_secret, |_| Ok(()),
+						);
+					if claim_found.is_err() {
+						let mut claimable_payments = channel_manager.claimable_payments.lock().unwrap();
+						match claimable_payments.pending_claiming_payments.entry(payment_hash) {
+							hash_map::Entry::Occupied(_) => {
+								debug_assert!(false, "Entry was added in begin_claiming_payment");
+								return Err(DecodeError::InvalidValue);
+							},
+							hash_map::Entry::Vacant(entry) => {
+								entry.insert(payment_claim.claiming_payment);
+							},
 						}
 					}
-					if let Some(previous_hop_monitor) = args.channel_monitors.get(&claimable_htlc.prev_hop.outpoint) {
-						previous_hop_monitor.provide_payment_preimage(&payment_hash, &payment_preimage, &channel_manager.tx_broadcaster, &channel_manager.fee_estimator, &channel_manager.logger);
+
+					for part in payment_claim.mpp_parts.iter() {
+						let pending_mpp_claim = pending_claim_ptr_opt.as_ref().map(|ptr| (
+							part.counterparty_node_id, part.channel_id, part.htlc_id,
+							PendingMPPClaimPointer(Arc::clone(&ptr))
+						));
+						let pending_claim_ptr = pending_claim_ptr_opt.as_ref().map(|ptr|
+							RAAMonitorUpdateBlockingAction::ClaimedMPPPayment {
+								pending_claim: PendingMPPClaimPointer(Arc::clone(&ptr)),
+							}
+						);
+						// Note that we don't need to pass the `payment_info` here - its
+						// already (clearly) durably on disk in the `ChannelMonitor` so there's
+						// no need to worry about getting it into others.
+						channel_manager.claim_mpp_part(
+							part.into(), payment_preimage, None,
+							|_, _|
+								(Some(MonitorUpdateCompletionAction::PaymentClaimed { payment_hash, pending_mpp_claim }), pending_claim_ptr)
+						);
 					}
 				}
-				let mut pending_events = channel_manager.pending_events.lock().unwrap();
-				let payment_id = payment.inbound_payment_id(&inbound_payment_id_secret.unwrap());
-				pending_events.push_back((events::Event::PaymentClaimed {
-					receiver_node_id,
-					payment_hash,
-					purpose: payment.purpose,
-					amount_msat: claimable_amt_msat,
-					htlcs: payment.htlcs.iter().map(events::ClaimedHTLC::from).collect(),
-					sender_intended_total_msat: payment.htlcs.first().map(|htlc| htlc.total_msat),
-					onion_fields: payment.onion_fields,
-					payment_id: Some(payment_id),
-				}, None));
+			} else {
+				let per_peer_state = channel_manager.per_peer_state.read().unwrap();
+				let mut claimable_payments = channel_manager.claimable_payments.lock().unwrap();
+				let payment = claimable_payments.claimable_payments.remove(&payment_hash);
+				mem::drop(claimable_payments);
+				if let Some(payment) = payment {
+					log_info!(channel_manager.logger, "Re-claiming HTLCs with payment hash {} as we've released the preimage to a ChannelMonitor!", &payment_hash);
+					let mut claimable_amt_msat = 0;
+					let mut receiver_node_id = Some(our_network_pubkey);
+					let phantom_shared_secret = payment.htlcs[0].prev_hop.phantom_shared_secret;
+					if phantom_shared_secret.is_some() {
+						let phantom_pubkey = channel_manager.node_signer.get_node_id(Recipient::PhantomNode)
+							.expect("Failed to get node_id for phantom node recipient");
+						receiver_node_id = Some(phantom_pubkey)
+					}
+					for claimable_htlc in &payment.htlcs {
+						claimable_amt_msat += claimable_htlc.value;
+
+						// Add a holding-cell claim of the payment to the Channel, which should be
+						// applied ~immediately on peer reconnection. Because it won't generate a
+						// new commitment transaction we can just provide the payment preimage to
+						// the corresponding ChannelMonitor and nothing else.
+						//
+						// We do so directly instead of via the normal ChannelMonitor update
+						// procedure as the ChainMonitor hasn't yet been initialized, implying
+						// we're not allowed to call it directly yet. Further, we do the update
+						// without incrementing the ChannelMonitor update ID as there isn't any
+						// reason to.
+						// If we were to generate a new ChannelMonitor update ID here and then
+						// crash before the user finishes block connect we'd end up force-closing
+						// this channel as well. On the flip side, there's no harm in restarting
+						// without the new monitor persisted - we'll end up right back here on
+						// restart.
+						let previous_channel_id = claimable_htlc.prev_hop.channel_id;
+						let peer_node_id_opt = channel_manager.outpoint_to_peer.lock().unwrap()
+							.get(&claimable_htlc.prev_hop.outpoint).cloned();
+						if let Some(peer_node_id) = peer_node_id_opt {
+							let peer_state_mutex = per_peer_state.get(&peer_node_id).unwrap();
+							let mut peer_state_lock = peer_state_mutex.lock().unwrap();
+							let peer_state = &mut *peer_state_lock;
+							if let Some(ChannelPhase::Funded(channel)) = peer_state.channel_by_id.get_mut(&previous_channel_id) {
+								let logger = WithChannelContext::from(&channel_manager.logger, &channel.context, Some(payment_hash));
+								channel.claim_htlc_while_disconnected_dropping_mon_update(claimable_htlc.prev_hop.htlc_id, payment_preimage, &&logger);
							}
						}
						if let Some(previous_hop_monitor) = args.channel_monitors.get(&claimable_htlc.prev_hop.outpoint) {
							previous_hop_monitor.provide_payment_preimage(&payment_hash, &payment_preimage, &channel_manager.tx_broadcaster, &channel_manager.fee_estimator, &channel_manager.logger);
						}
					}
					let mut pending_events = channel_manager.pending_events.lock().unwrap();
					let payment_id = payment.inbound_payment_id(&inbound_payment_id_secret.unwrap());
					pending_events.push_back((events::Event::PaymentClaimed {
						receiver_node_id,
						payment_hash,
						purpose: payment.purpose,
						amount_msat: claimable_amt_msat,
						htlcs: payment.htlcs.iter().map(events::ClaimedHTLC::from).collect(),
						sender_intended_total_msat: payment.htlcs.first().map(|htlc| htlc.total_msat),
						onion_fields: payment.onion_fields,
						payment_id: Some(payment_id),
					}, None));
				}
 			}
 		}
 	}
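
As the comment in the replay path above notes, a duplicate `PaymentClaimed` may be regenerated on each restart until the preimage ages out of the `ChannelMonitor`s, so the user's event handler needs to be idempotent. A minimal sketch of what that can look like on the application side (hypothetical application types, not part of this change):

use std::collections::HashSet;

// Hypothetical application-side bookkeeping: deduplicate `PaymentClaimed`
// handling so a replayed event after restart becomes a no-op.
struct PaymentLedger {
	settled: HashSet<[u8; 32]>, // keyed by payment id (or payment hash)
}

impl PaymentLedger {
	// Returns true only the first time a given payment is recorded.
	fn record_claimed(&mut self, payment_id: [u8; 32], amount_msat: u64) -> bool {
		let newly_recorded = self.settled.insert(payment_id);
		if newly_recorded {
			println!("claimed {} msat for payment {:02x?}", amount_msat, payment_id);
		}
		newly_recorded
	}
}

fn main() {
	let mut ledger = PaymentLedger { settled: HashSet::new() };
	assert!(ledger.record_claimed([7; 32], 15_000_000)); // first delivery
	assert!(!ledger.record_claimed([7; 32], 15_000_000)); // replayed after restart: ignored
}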

lightning/src/ln/reload_tests.rs

Lines changed: 22 additions & 5 deletions
@@ -878,27 +878,39 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool) {
 	// Now restart nodes[3].
 	reload_node!(nodes[3], original_manager, &[&updated_monitor.0, &original_monitor.0], persister, new_chain_monitor, nodes_3_deserialized);
 
-	// On startup the preimage should have been copied into the non-persisted monitor:
+	// Until the startup background events are processed (in `get_and_clear_pending_events`,
+	// below), the preimage is not copied to the non-persisted monitor...
 	assert!(get_monitor!(nodes[3], chan_id_persisted).get_stored_preimages().contains_key(&payment_hash));
-	assert!(get_monitor!(nodes[3], chan_id_not_persisted).get_stored_preimages().contains_key(&payment_hash));
+	assert_eq!(
+		get_monitor!(nodes[3], chan_id_not_persisted).get_stored_preimages().contains_key(&payment_hash),
+		persist_both_monitors,
+	);
 
 	nodes[1].node.peer_disconnected(nodes[3].node.get_our_node_id());
 	nodes[2].node.peer_disconnected(nodes[3].node.get_our_node_id());
 
 	// During deserialization, we should have closed one channel and broadcast its latest
 	// commitment transaction. We should also still have the original PaymentClaimable event we
-	// never finished processing.
+	// never finished processing as well as a PaymentClaimed event regenerated when we replayed the
+	// preimage onto the non-persisted monitor.
 	let events = nodes[3].node.get_and_clear_pending_events();
 	assert_eq!(events.len(), if persist_both_monitors { 4 } else { 3 });
 	if let Event::PaymentClaimable { amount_msat: 15_000_000, .. } = events[0] { } else { panic!(); }
 	if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[1] { } else { panic!(); }
 	if persist_both_monitors {
 		if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[2] { } else { panic!(); }
-		check_added_monitors(&nodes[3], 2);
+		if let Event::PaymentClaimed { amount_msat: 15_000_000, .. } = events[3] { } else { panic!(); }
+		check_added_monitors(&nodes[3], 6);
 	} else {
-		check_added_monitors(&nodes[3], 1);
+		if let Event::PaymentClaimed { amount_msat: 15_000_000, .. } = events[2] { } else { panic!(); }
+		check_added_monitors(&nodes[3], 3);
 	}
 
+	// Now that we've processed background events, the preimage should have been copied into the
+	// non-persisted monitor:
+	assert!(get_monitor!(nodes[3], chan_id_persisted).get_stored_preimages().contains_key(&payment_hash));
+	assert!(get_monitor!(nodes[3], chan_id_not_persisted).get_stored_preimages().contains_key(&payment_hash));
+
 	// On restart, we should also get a duplicate PaymentClaimed event as we persisted the
 	// ChannelManager prior to handling the original one.
 	if let Event::PaymentClaimed { payment_hash: our_payment_hash, amount_msat: 15_000_000, .. } =
@@ -948,6 +960,11 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool) {
 	nodes[0].node.handle_update_fulfill_htlc(nodes[2].node.get_our_node_id(), &cs_updates.update_fulfill_htlcs[0]);
 	commitment_signed_dance!(nodes[0], nodes[2], cs_updates.commitment_signed, false, true);
 	expect_payment_sent!(nodes[0], payment_preimage);
+
+	// Ensure that the remaining channel is fully operation and not blocked (and that after a
+	// cycle of commitment updates the payment preimage is ultimately pruned).
+	send_payment(&nodes[0], &[&nodes[2], &nodes[3]], 100_000);
+	assert!(!get_monitor!(nodes[3], chan_id_not_persisted).get_stored_preimages().contains_key(&payment_hash));
 }
 