lightningd: allow up to 100 "slow open" channels before forgetting them.

Michael of Boltz told me this long ago, that when fees spiked some of their clients' opens got stuck. After two weeks their node forgot them, and when fees came back down the opens still failed. Unfortunately, I can't see an issue for this! We can give some grace: we don't want to waste too many resources, but 100 channels is nothing. The test needs to be adjusted to have *two* slow open channels, now. Changelog-Changed: Protocol: we won't forget still-opening channels after 2016 blocks, unless there are more than 100. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-04-29 09:55:08 +09:30
parent baf3c831dd
commit 3b80a81031
3 changed files with 90 additions and 47 deletions
--- a/lightningd/channel_control.c
+++ b/lightningd/channel_control.c
@@ -1,4 +1,5 @@
 #include "config.h"
+#include <ccan/asort/asort.h>
 #include <ccan/cast/cast.h>
 #include <ccan/mem/mem.h>
 #include <ccan/tal/str/str.h>
@@ -1927,18 +1928,8 @@ is_fundee_should_forget(struct lightningd *ld,
 			struct channel *channel,
 			u32 block_height)
 {
-	/* BOLT #2:
-	 *
-	 * A non-funding node (fundee):
-	 *   - SHOULD forget the channel if it does not see the
-	 * correct funding transaction after a timeout of 2016 blocks.
-	 */
-	u32 max_funding_unconfirmed;
-
-	if (ld->developer)
-		max_funding_unconfirmed = ld->dev_max_funding_unconfirmed;
-	else
-		max_funding_unconfirmed = 2016;
+	/* 2016 by default */
+	u32 max_funding_unconfirmed = ld->dev_max_funding_unconfirmed;

 	/* Only applies if we are fundee. */
 	if (channel->opener == LOCAL)
@@ -1969,16 +1960,42 @@ is_fundee_should_forget(struct lightningd *ld,
 	return true;
 }

+/* We want in ascending order, so oldest channels first */
+static int cmp_channel_start(struct channel *const *a, struct channel *const *b, void *unused)
+{
+	if ((*a)->first_blocknum < (*b)->first_blocknum)
+		return -1;
+	else if ((*a)->first_blocknum > (*b)->first_blocknum)
+		return 1;
+	return 0;
+}
+
 /* Notify all channels of new blocks. */
 void channel_notify_new_block(struct lightningd *ld,
 			      u32 block_height)
 {
 	struct peer *peer;
 	struct channel *channel;
-	struct channel **to_forget = tal_arr(NULL, struct channel *, 0);
+	struct channel **to_forget = tal_arr(tmpctx, struct channel *, 0);
 	size_t i;
 	struct peer_node_id_map_iter it;

+	/* BOLT #2:
+	 *
+	 * A non-funding node (fundee):
+	 *   - SHOULD forget the channel if it does not see the
+	 * correct funding transaction after a timeout of 2016 blocks.
+	 */
+
+	/* But we give some latitude!  Boltz reported that after a months-long
+	 * fee spike, they had some failed opens when the tx finally got mined.
+	 * They're individually cheap, keep the latest 100. */
+	size_t forgettable_channels_to_keep = 100;
+
+	/* For testing */
+	if (ld->dev_max_funding_unconfirmed != 2016)
+		forgettable_channels_to_keep = 1;
+
 	/* FIXME: keep separate block-aware channel structure instead? */
 	for (peer = peer_node_id_map_first(ld->peers, &it);
 	     peer;
@@ -1986,13 +2003,12 @@ void channel_notify_new_block(struct lightningd *ld,
 		list_for_each(&peer->channels, channel, list) {
 			if (channel_state_uncommitted(channel->state))
 				continue;
-			if (is_fundee_should_forget(ld, channel, block_height)) {
+			if (is_fundee_should_forget(ld, channel, block_height))
 				tal_arr_expand(&to_forget, channel);
-			} else
-				/* Let channels know about new blocks,
-				 * required for lease updates */
-				try_update_blockheight(ld, channel,
-						       block_height);
+
+			/* Let channels know about new blocks,
+			 * required for lease updates */
+			try_update_blockheight(ld, channel, block_height);
 		}
 	}

@@ -2004,7 +2020,11 @@ void channel_notify_new_block(struct lightningd *ld,
 	 * just the freeing of the channel that occurs, but the
 	 * potential destruction of the peer that invalidates
 	 * memory the inner loop is accessing. */
-	for (i = 0; i < tal_count(to_forget); ++i) {
+	if (tal_count(to_forget) < forgettable_channels_to_keep)
+		return;
+
+	asort(to_forget, tal_count(to_forget), cmp_channel_start, NULL);
+	for (i = 0; i + forgettable_channels_to_keep < tal_count(to_forget); ++i) {
 		channel = to_forget[i];
 		/* Report it first. */
 		log_unusual(channel->log,
@@ -2020,8 +2040,6 @@ void channel_notify_new_block(struct lightningd *ld,
 		/* And forget it. COMPLETELY. */
 		delete_channel(channel, true);
 	}
-
-	tal_free(to_forget);
 }

 /* Since this could vanish while we're checking with bitcoind, we need to save