From 3b80a81031712a37f918e04ffb1771fcc762daa6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Apr 2025 09:55:08 +0930
Subject: [PATCH] lightningd: allow up to 100 "slow open" channels before
 forgetting them.

Michael of Boltz told me this long ago, that when fees spiked some of their clients' opens got stuck.  After two weeks their node forgot them, and when fees came back down the opens still failed.  Unfortunately, I can't see an issue for this!

We can give some grace: we don't want to waste too many resources, but 100 channels is nothing.

The test needs to be adjusted to have *two* slow open channels, now.

Changelog-Changed: Protocol: we won't forget still-opening channels after 2016 blocks, unless there are more than 100.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lightningd/channel_control.c | 62 +++++++++++++++++++++++-------------
 tests/test_connection.py     | 52 +++++++++++++++++++-----------
 tests/test_opening.py        | 23 +++++++++----
 3 files changed, 90 insertions(+), 47 deletions(-)

diff --git a/lightningd/channel_control.c b/lightningd/channel_control.c
index f1d3d6477..bcfe93af9 100644
--- a/lightningd/channel_control.c
+++ b/lightningd/channel_control.c
@@ -1,4 +1,5 @@
 #include "config.h"
+#include <ccan/asort/asort.h>
 #include <ccan/cast/cast.h>
 #include <ccan/mem/mem.h>
 #include <ccan/tal/str/str.h>
@@ -1927,18 +1928,8 @@ is_fundee_should_forget(struct lightningd *ld,
 			struct channel *channel,
 			u32 block_height)
 {
-	/* BOLT #2:
-	 *
-	 * A non-funding node (fundee):
-	 *   - SHOULD forget the channel if it does not see the
-	 * correct funding transaction after a timeout of 2016 blocks.
-	 */
-	u32 max_funding_unconfirmed;
-
-	if (ld->developer)
-		max_funding_unconfirmed = ld->dev_max_funding_unconfirmed;
-	else
-		max_funding_unconfirmed = 2016;
+	/* 2016 by default */
+	u32 max_funding_unconfirmed = ld->dev_max_funding_unconfirmed;
 
 	/* Only applies if we are fundee. */
 	if (channel->opener == LOCAL)
@@ -1969,16 +1960,42 @@ is_fundee_should_forget(struct lightningd *ld,
 	return true;
 }
 
+/* We want in ascending order, so oldest channels first */
+static int cmp_channel_start(struct channel *const *a, struct channel *const *b, void *unused)
+{
+	if ((*a)->first_blocknum < (*b)->first_blocknum)
+		return -1;
+	else if ((*a)->first_blocknum > (*b)->first_blocknum)
+		return 1;
+	return 0;
+}
+
 /* Notify all channels of new blocks. */
 void channel_notify_new_block(struct lightningd *ld,
 			      u32 block_height)
 {
 	struct peer *peer;
 	struct channel *channel;
-	struct channel **to_forget = tal_arr(NULL, struct channel *, 0);
+	struct channel **to_forget = tal_arr(tmpctx, struct channel *, 0);
 	size_t i;
 	struct peer_node_id_map_iter it;
 
+	/* BOLT #2:
+	 *
+	 * A non-funding node (fundee):
+	 *   - SHOULD forget the channel if it does not see the
+	 * correct funding transaction after a timeout of 2016 blocks.
+	 */
+
+	/* But we give some latitude!  Boltz reported that after a months-long
+	 * fee spike, they had some failed opens when the tx finally got mined.
+	 * They're individually cheap, keep the latest 100. */
+	size_t forgettable_channels_to_keep = 100;
+
+	/* For testing */
+	if (ld->dev_max_funding_unconfirmed != 2016)
+		forgettable_channels_to_keep = 1;
+
 	/* FIXME: keep separate block-aware channel structure instead? */
 	for (peer = peer_node_id_map_first(ld->peers, &it);
 	     peer;
@@ -1986,13 +2003,12 @@ void channel_notify_new_block(struct lightningd *ld,
 		list_for_each(&peer->channels, channel, list) {
 			if (channel_state_uncommitted(channel->state))
 				continue;
-			if (is_fundee_should_forget(ld, channel, block_height)) {
+			if (is_fundee_should_forget(ld, channel, block_height))
 				tal_arr_expand(&to_forget, channel);
-			} else
-				/* Let channels know about new blocks,
-				 * required for lease updates */
-				try_update_blockheight(ld, channel,
-						       block_height);
+
+			/* Let channels know about new blocks,
+			 * required for lease updates */
+			try_update_blockheight(ld, channel, block_height);
 		}
 	}
 
@@ -2004,7 +2020,11 @@ void channel_notify_new_block(struct lightningd *ld,
 	 * just the freeing of the channel that occurs, but the
 	 * potential destruction of the peer that invalidates
 	 * memory the inner loop is accessing. */
-	for (i = 0; i < tal_count(to_forget); ++i) {
+	if (tal_count(to_forget) < forgettable_channels_to_keep)
+		return;
+
+	asort(to_forget, tal_count(to_forget), cmp_channel_start, NULL);
+	for (i = 0; i + forgettable_channels_to_keep < tal_count(to_forget); ++i) {
 		channel = to_forget[i];
 		/* Report it first. */
 		log_unusual(channel->log,
@@ -2020,8 +2040,6 @@ void channel_notify_new_block(struct lightningd *ld,
 		/* And forget it. COMPLETELY. */
 		delete_channel(channel, true);
 	}
-
-	tal_free(to_forget);
 }
 
 /* Since this could vanish while we're checking with bitcoind, we need to save
diff --git a/tests/test_connection.py b/tests/test_connection.py
index 5d8e150f1..d52d12bc3 100644
--- a/tests/test_connection.py
+++ b/tests/test_connection.py
@@ -2791,19 +2791,19 @@ def test_fundee_forget_funding_tx_unconfirmed(node_factory, bitcoind):
     """Test that fundee will forget the channel if
     the funding tx has been unconfirmed for too long.
     """
-    # Keep this low (default is 2016), since everything
+    # Keep confirms low (default is 2016), since everything
     # is much slower in VALGRIND mode and wait_for_log
     # could time out before lightningd processes all the
-    # blocks.
-    blocks = 50
-    # opener
-    l1 = node_factory.get_node()
-    # peer
-    l2 = node_factory.get_node(options={"dev-max-funding-unconfirmed-blocks": blocks})
-    l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
+    # blocks.  This also reduces the number of "slow confirm" peers
+    # from 100, to 1.
+    blocks = 10
+    l1, l2, l3 = node_factory.line_graph(3, fundchannel=False,
+                                         opts={"dev-max-funding-unconfirmed-blocks": blocks})
 
-    # Give opener some funds.
+    # Give openers some funds.
     l1.fundwallet(10**7)
+    l3.fundwallet(10**7)
+    sync_blockheight(bitcoind, [l1, l2, l3])
 
     def mock_sendrawtransaction(r):
         return {'id': r['id'], 'error': {'code': 100, 'message': 'sendrawtransaction disabled'}}
@@ -2813,23 +2813,37 @@ def test_fundee_forget_funding_tx_unconfirmed(node_factory, bitcoind):
 
     # Prevent opener from broadcasting funding tx (any tx really).
     l1.daemon.rpcproxy.mock_rpc('sendrawtransaction', mock_sendrawtransaction)
+    # (for EXPERIMENTAL_DUAL_FUND=1, have to prevent l2 doing it too!)
     l2.daemon.rpcproxy.mock_rpc('sendrawtransaction', mock_donothing)
 
-    # Fund the channel.
-    # The process will complete, but opener will be unable
-    # to broadcast and confirm funding tx.
+    # l1 tries to open, fails.
     with pytest.raises(RpcError, match=r'sendrawtransaction disabled'):
         l1.rpc.fundchannel(l2.info['id'], 10**6)
 
-    # Generate blocks until unconfirmed.
-    bitcoind.generate_block(blocks)
+    # One block later, l3 tries, fails silently.
+    bitcoind.generate_block(1)
+    sync_blockheight(bitcoind, [l1, l2, l3])
 
-    # fundee will forget channel!
-    # (Note that we let the last number be anything (hence the {}\d)
-    l2.daemon.wait_for_log(r'Forgetting channel: It has been {}\d blocks'.format(str(blocks)[:-1]))
+    l3.daemon.rpcproxy.mock_rpc('sendrawtransaction', mock_donothing)
+    l3.rpc.fundchannel(l2.info['id'], 10**6)
 
-    # fundee will also forget, but not disconnect from peer.
-    wait_for(lambda: l2.rpc.listpeerchannels(l1.info['id'])['channels'] == [])
+    # After 10 blocks, l1's is due to be forgotten, but doesn't since we let 1 linger.
+    bitcoind.generate_block(blocks - 1)
+    assert not l2.daemon.is_in_log(r'Forgetting channel')
+
+    if EXPERIMENTAL_DUAL_FUND:
+        state = 'DUALOPEND_AWAITING_LOCKIN'
+    else:
+        state = 'CHANNELD_AWAITING_LOCKIN'
+    assert [c['state'] for c in l2.rpc.listpeerchannels()['channels']] == [state] * 2
+
+    # But once l3 is also delayed, l1 gets kicked out.
+    bitcoind.generate_block(1)
+
+    # fundee will forget oldest channel: the one with l1!
+    l2.daemon.wait_for_log(rf'Forgetting channel: It has been {blocks + 1} blocks')
+    assert [c['state'] for c in l2.rpc.listpeerchannels()['channels']] == [state]
+    assert l2.rpc.listpeerchannels(l1.info['id']) == {'channels': []}
 
 
 @pytest.mark.openchannel('v2')
diff --git a/tests/test_opening.py b/tests/test_opening.py
index 9fdb22a66..0fd52670b 100644
--- a/tests/test_opening.py
+++ b/tests/test_opening.py
@@ -10,6 +10,7 @@ from pathlib import Path
 import pytest
 import re
 import unittest
+import time
 
 
 def find_next_feerate(node, peer):
@@ -2697,8 +2698,8 @@ def test_zeroconf_forget(node_factory, bitcoind, dopay: bool):
     """
     blocks = 50
     plugin_path = Path(__file__).parent / "plugins" / "zeroconf-selective.py"
-    l1, l2 = node_factory.get_nodes(
-        2,
+    l1, l2, l3 = node_factory.get_nodes(
+        3,
         opts=[
             {},
             {
@@ -2707,6 +2708,7 @@ def test_zeroconf_forget(node_factory, bitcoind, dopay: bool):
                 "zeroconf-mindepth": "0",
                 "dev-max-funding-unconfirmed-blocks": blocks,
             },
+            {},
         ],
     )
 
@@ -2732,11 +2734,17 @@ def test_zeroconf_forget(node_factory, bitcoind, dopay: bool):
         l1.rpc.pay(inv["bolt11"])
         wait_for(lambda: only_one(l2.rpc.listpeerchannels()['channels'])['to_us_msat'] == 1)
 
+    # We need *another* channel to make it forget the first though!
+    l3.connect(l2)
+    l3.fundwallet(10**7)
+    l3.rpc.fundchannel(l2.info["id"], 10**6, mindepth=0)
+    bitcoind.generate_block(1)
+
     # Now stop, in order to cause catchup and re-evaluate whether to forget the channel
     l2.stop()
 
     # Now we generate enough blocks to cause l2 to forget the channel.
-    bitcoind.generate_block(blocks)  # > blocks
+    bitcoind.generate_block(blocks - 1)  # > blocks
     l2.start()
 
     sync_blockheight(bitcoind, [l1, l2])
@@ -2746,13 +2754,16 @@ def test_zeroconf_forget(node_factory, bitcoind, dopay: bool):
     bitcoind.generate_block(1)
     sync_blockheight(bitcoind, [l1, l2])
 
+    # This may take a moment!
+    time.sleep(5)
+
     have_forgotten = l2.daemon.is_in_log(
-        r"Forgetting channel: It has been [0-9]+ blocks without the funding transaction"
+        r"UNUSUAL {}-chan#1: Forgetting channel: It has been 51 blocks without the funding transaction ".format(l1.info['id'])
     )
 
     if dopay:
         assert not have_forgotten
-        assert len(l2.rpc.listpeerchannels()["channels"]) == 1
+        assert set([c['peer_id'] for c in l2.rpc.listpeerchannels()["channels"]]) == set([l1.info['id'], l3.info['id']])
     else:
         assert have_forgotten
-        assert l2.rpc.listpeerchannels() == {"channels": []}
+        assert [c['peer_id'] for c in l2.rpc.listpeerchannels()["channels"]] == [l3.info['id']]