From 65dccea5bde426edc04b746b2c3073636dcd885e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 10 May 2025 20:11:47 +0930
Subject: [PATCH] pytest: fix flake in test_reconnect_signed

We can fail the fundchannel because of a reconnect race: funder tells
us to reconnect so fast that we are still cleaning up from last time.

We deliberately defer clean up, to give the subds a chance to process any
final messages.  However, on reconnection we force them to exit immediately:
but this causes new `connect` commands to see the exit and fail.

The workaround is to do this cleanup when a `connect` command is
issued (as well as the current case, which covers an automatic
reconnection or an incoming reconnection)

```
2025-05-09T00:40:37.1769508Z     def test_reconnect_signed(node_factory):
2025-05-09T00:40:37.1770273Z         # This will fail *after* both sides consider channel opening.
2025-05-09T00:40:37.1770850Z         disconnects = ['<WIRE_FUNDING_SIGNED']
2025-05-09T00:40:37.1771298Z         if EXPERIMENTAL_DUAL_FUND:
2025-05-09T00:40:37.1771735Z             disconnects = ['<WIRE_COMMITMENT_SIGNED']
2025-05-09T00:40:37.1772155Z
2025-05-09T00:40:37.1772598Z         l1 = node_factory.get_node(may_reconnect=True, disconnect=disconnects)
2025-05-09T00:40:37.1773210Z         l2 = node_factory.get_node(may_reconnect=True)
2025-05-09T00:40:37.1773632Z
2025-05-09T00:40:37.1773917Z         l1.fundwallet(2000000)
2025-05-09T00:40:37.1774268Z
2025-05-09T00:40:37.1774611Z         l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
2025-05-09T00:40:37.1775151Z >       l1.rpc.fundchannel(l2.info['id'], CHANNEL_SIZE)
2025-05-09T00:40:37.1775388Z
2025-05-09T00:40:37.1775485Z tests/test_connection.py:667:
...
2025-05-09T00:40:37.1799527Z >           raise RpcError(method, payload, resp['error'])
2025-05-09T00:40:37.1800993Z E           pyln.client.lightning.RpcError: RPC call failed: method: fundchannel, payload: {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'amount': 50000, 'announce': True}, error: {'code': -1, 'message': 'Disconnected', 'data': {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'method': 'openchannel_update'}}
```

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lightningd/connect_control.c |  6 ++++++
 lightningd/peer_control.c    | 22 ++++++++++------------
 lightningd/peer_control.h    |  3 +++
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c
index ceb41d302..761e6d307 100644
--- a/lightningd/connect_control.c
+++ b/lightningd/connect_control.c
@@ -236,6 +236,12 @@ static struct command_result *json_connect(struct command *cmd,
 					   &peer->addr);
 	}
 
+	/* When a peer disconnects, we give subds time to clean themselves up
+	 * (this lets connectd ensure they've seen the final messages).  But
+	 * now it's going to try to reconnect, we've gotta force them out. */
+	if (peer)
+		peer_channels_cleanup(peer);
+
 	subd_send_msg(cmd->ld->connectd,
 		      take(towire_connectd_connect_to_peer(NULL, &id_addr.id, addr, true)));
 
diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c
index 97f50b8b9..56ea80828 100644
--- a/lightningd/peer_control.c
+++ b/lightningd/peer_control.c
@@ -177,16 +177,10 @@ void maybe_delete_peer(struct peer *peer)
 	delete_peer(peer);
 }
 
-static void peer_channels_cleanup(struct lightningd *ld,
-				  const struct node_id *id)
+void peer_channels_cleanup(struct peer *peer)
 {
-	struct peer *peer;
 	struct channel *c, **channels;
 
-	peer = peer_by_id(ld, id);
-	if (!peer)
-		return;
-
 	/* Freeing channels can free peer, so gather first. */
 	channels = tal_arr(tmpctx, struct channel *, 0);
 	list_for_each(&peer->channels, c, list)
@@ -1726,11 +1720,6 @@ void peer_connected(struct lightningd *ld, const u8 *msg)
 		fatal("Connectd gave bad CONNECT_PEER_CONNECTED message %s",
 		      tal_hex(msg, msg));
 
-	/* When a peer disconnects, we give subds time to clean themselves up
-	 * (this lets connectd ensure they've seen the final messages).  But
-	 * now it's reconnected, we've gotta force them out. */
-	peer_channels_cleanup(ld, &id);
-
 	/* If we connected, and it's a normal address */
 	if (!hook_payload->incoming
 	    && hook_payload->addr.itype == ADDR_INTERNAL_WIREADDR
@@ -1743,6 +1732,15 @@ void peer_connected(struct lightningd *ld, const u8 *msg)
 	/* If we're already dealing with this peer, hand off to correct
 	 * subdaemon.  Otherwise, we'll hand to openingd to wait there. */
 	peer = peer_by_id(ld, &id);
+	if (peer) {
+		/* When a peer disconnects, we give subds time to clean themselves up
+		 * (this lets connectd ensure they've seen the final messages).  But
+		 * now it's reconnected, we've gotta force them out.  This might free
+		 * the peer! */
+		peer_channels_cleanup(peer);
+		peer = peer_by_id(ld, &id);
+	}
+
 	if (!peer) {
 		/* If we connected to them, we know this is a good address. */
 		peer = new_peer(ld, 0, &id, &hook_payload->addr,
diff --git a/lightningd/peer_control.h b/lightningd/peer_control.h
index 88a5f8866..84d11f2ca 100644
--- a/lightningd/peer_control.h
+++ b/lightningd/peer_control.h
@@ -165,6 +165,9 @@ command_find_channel(struct command *cmd,
 		     const char *buffer, const jsmntok_t *tok,
 		     struct channel **channel);
 
+/* We do this lazily, when reconnecting */
+void peer_channels_cleanup(struct peer *peer);
+
 /* Ancient (0.7.0 and before) releases could create invalid commitment txs! */
 bool invalid_last_tx(const struct bitcoin_tx *tx);