pytest: make test_no_delay more robust.
Unfortunately the effect of leaving Nagle enabled is subtle. Here it
is in v25.12:
Normal:
tests/test_connection.py::test_no_delay PASSED
====================================================================== 1 passed in 13.87s
Nagle enabled:
tests/test_connection.py::test_no_delay PASSED
====================================================================== 1 passed in 21.70s
So it's hard to both catch this issue and not have false positives. Improve the
test by deliberately running with Nagle enabled, so we can do a direct comparison.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
@@ -515,9 +515,13 @@ static bool get_remote_address(struct io_conn *conn,
|
||||
* inserting a delay, creating a trap for every author of network code
|
||||
* everywhere.
|
||||
*/
|
||||
static void set_tcp_no_delay(int fd)
|
||||
static void set_tcp_no_delay(const struct daemon *daemon, int fd)
|
||||
{
|
||||
int val = 1;
|
||||
|
||||
if (daemon->dev_keep_nagle)
|
||||
return;
|
||||
|
||||
if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) {
|
||||
status_broken("setsockopt TCP_NODELAY=1 fd=%u: %s",
|
||||
fd, strerror(errno));
|
||||
@@ -658,7 +662,7 @@ static struct io_plan *connection_in(struct io_conn *conn,
|
||||
|
||||
/* Don't try to set TCP options on UNIX socket! */
|
||||
if (conn_in_arg.addr.itype == ADDR_INTERNAL_WIREADDR)
|
||||
set_tcp_no_delay(io_conn_fd(conn));
|
||||
set_tcp_no_delay(daemon, io_conn_fd(conn));
|
||||
|
||||
conn_in_arg.daemon = daemon;
|
||||
conn_in_arg.is_websocket = false;
|
||||
@@ -1198,7 +1202,7 @@ static void try_connect_one_addr(struct connecting *connect)
|
||||
|
||||
/* Don't try to set TCP options on UNIX socket! */
|
||||
if (addr->itype == ADDR_INTERNAL_WIREADDR)
|
||||
set_tcp_no_delay(fd);
|
||||
set_tcp_no_delay(connect->daemon, fd);
|
||||
connect->connect_attempted = true;
|
||||
|
||||
/* This creates the new connection using our fd, with the initialization
|
||||
@@ -1685,7 +1689,8 @@ static void connect_init(struct daemon *daemon, const u8 *msg)
|
||||
&dev_throttle_gossip,
|
||||
&daemon->dev_no_reconnect,
|
||||
&daemon->dev_fast_reconnect,
|
||||
&dev_limit_connections_inflight)) {
|
||||
&dev_limit_connections_inflight,
|
||||
&daemon->dev_keep_nagle)) {
|
||||
/* This is a helper which prints the type expected and the actual
|
||||
* message, then exits (it should never be called!). */
|
||||
master_badmsg(WIRE_CONNECTD_INIT, msg);
|
||||
@@ -2555,6 +2560,7 @@ int main(int argc, char *argv[])
|
||||
daemon->custom_msgs = NULL;
|
||||
daemon->dev_exhausted_fds = false;
|
||||
daemon->dev_lightningd_is_slow = false;
|
||||
daemon->dev_keep_nagle = false;
|
||||
/* We generally allow 1MB per second per peer, except for dev testing */
|
||||
daemon->gossip_stream_limit = 1000000;
|
||||
daemon->scid_htable = new_htable(daemon, scid_htable);
|
||||
|
||||
@@ -371,6 +371,8 @@ struct daemon {
|
||||
bool dev_fast_reconnect;
|
||||
/* Don't complain about lightningd being unresponsive. */
|
||||
bool dev_lightningd_is_slow;
|
||||
/* Don't set TCP_NODELAY */
|
||||
bool dev_keep_nagle;
|
||||
};
|
||||
|
||||
/* Called by io_tor_connect once it has a connection out. */
|
||||
|
||||
@@ -28,6 +28,7 @@ msgdata,connectd_init,dev_throttle_gossip,bool,
|
||||
msgdata,connectd_init,dev_no_reconnect,bool,
|
||||
msgdata,connectd_init,dev_fast_reconnect,bool,
|
||||
msgdata,connectd_init,dev_limit_connections_inflight,bool,
|
||||
msgdata,connectd_init,dev_keep_nagle,bool,
|
||||
|
||||
# Connectd->master, here are the addresses I bound, can announce.
|
||||
msgtype,connectd_init_reply,2100
|
||||
|
||||
|
@@ -726,7 +726,8 @@ int connectd_init(struct lightningd *ld)
|
||||
ld->dev_throttle_gossip,
|
||||
!ld->reconnect,
|
||||
ld->dev_fast_reconnect,
|
||||
ld->dev_limit_connections_inflight);
|
||||
ld->dev_limit_connections_inflight,
|
||||
ld->dev_keep_nagle);
|
||||
|
||||
subd_req(ld->connectd, ld->connectd, take(msg), -1, 0,
|
||||
connect_init_done, NULL);
|
||||
|
||||
@@ -150,6 +150,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
|
||||
ld->dev_handshake_no_reply = false;
|
||||
ld->dev_strict_forwarding = false;
|
||||
ld->dev_limit_connections_inflight = false;
|
||||
ld->dev_keep_nagle = false;
|
||||
|
||||
/*~ We try to ensure enough fds for twice the number of channels
|
||||
* we start with. We have a developer option to change that factor
|
||||
|
||||
@@ -369,6 +369,9 @@ struct lightningd {
|
||||
/* Tell connectd to block more than 1 simultaneous connection attempt */
|
||||
bool dev_limit_connections_inflight;
|
||||
|
||||
/* Tell connectd we don't want TCP_NODELAY */
|
||||
bool dev_keep_nagle;
|
||||
|
||||
/* tor support */
|
||||
struct wireaddr *proxyaddr;
|
||||
bool always_use_proxy;
|
||||
|
||||
@@ -948,6 +948,10 @@ static void dev_register_opts(struct lightningd *ld)
|
||||
opt_set_charp, opt_show_charp,
|
||||
&ld->plugins->dev_save_io,
|
||||
"Directory to place all plugin notifications/hooks JSON into.");
|
||||
clnopt_noarg("--dev-keep-nagle", OPT_DEV,
|
||||
opt_set_bool,
|
||||
&ld->dev_keep_nagle,
|
||||
"Tell connectd not to set TCP_NODELAY.");
|
||||
/* This is handled directly in daemon_developer_mode(), so we ignore it here */
|
||||
clnopt_noarg("--dev-debug-self", OPT_DEV,
|
||||
opt_ignore,
|
||||
|
||||
@@ -4581,10 +4581,11 @@ def test_private_channel_no_reconnect(node_factory):
|
||||
assert only_one(l1.rpc.listpeers()['peers'])['connected'] is False
|
||||
|
||||
|
||||
@unittest.skipIf(VALGRIND, "We assume machine is reasonably fast")
|
||||
@pytest.mark.slow_test
|
||||
def test_no_delay(node_factory):
|
||||
"""Is our Nagle disabling for critical messages working?"""
|
||||
l1, l2 = node_factory.line_graph(2)
|
||||
l1, l2 = node_factory.line_graph(2, opts={'dev-keep-nagle': None,
|
||||
'may_reconnect': True})
|
||||
|
||||
scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id']
|
||||
routestep = {
|
||||
@@ -4593,16 +4594,36 @@ def test_no_delay(node_factory):
|
||||
'delay': 5,
|
||||
'channel': scid
|
||||
}
|
||||
|
||||
# Test with nagle
|
||||
start = time.time()
|
||||
# If we were stupid enough to leave Nagle enabled, this would add 200ms
|
||||
# delays each way!
|
||||
for _ in range(100):
|
||||
phash = random.randbytes(32).hex()
|
||||
l1.rpc.sendpay([routestep], phash)
|
||||
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
|
||||
l1.rpc.waitsendpay(phash)
|
||||
end = time.time()
|
||||
assert end < start + 100 * 0.5
|
||||
nagle_time = end - start
|
||||
|
||||
del l1.daemon.opts['dev-keep-nagle']
|
||||
del l2.daemon.opts['dev-keep-nagle']
|
||||
l1.restart()
|
||||
l2.restart()
|
||||
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
|
||||
|
||||
# Test without nagle
|
||||
start = time.time()
|
||||
for _ in range(100):
|
||||
phash = random.randbytes(32).hex()
|
||||
l1.rpc.sendpay([routestep], phash)
|
||||
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
|
||||
l1.rpc.waitsendpay(phash)
|
||||
end = time.time()
|
||||
normal_time = end - start
|
||||
|
||||
# 100 round trips, average delay 1/2 of 200ms -> 10 seconds extra.
|
||||
# Make it half that for variance.
|
||||
assert normal_time < nagle_time - 100 * (0.2 / 2) / 2
|
||||
|
||||
|
||||
def test_listpeerchannels_by_scid(node_factory):
|
||||
|
||||
Reference in New Issue
Block a user