From d0c158a69dcdfd176ec535d7405c6b29415b1ee7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Feb 2026 21:27:15 +1030 Subject: [PATCH] pytest: make test_no_delay more robust. Unfortunately the effect of leaving Nagle enabled is subtle. Here it is in v25.12: Normal: tests/test_connection.py::test_no_delay PASSED ====================================================================== 1 passed in 13.87s Nagle enabled: tests/test_connection.py::test_no_delay PASSED ====================================================================== 1 passed in 21.70s So it's hard to both catch this issue and not have false positives. Improve the test by deliberately running with Nagle enabled, so we can do a direct comparison. Signed-off-by: Rusty Russell --- connectd/connectd.c | 14 ++++++++++---- connectd/connectd.h | 2 ++ connectd/connectd_wire.csv | 1 + lightningd/connect_control.c | 3 ++- lightningd/lightningd.c | 1 + lightningd/lightningd.h | 3 +++ lightningd/options.c | 4 ++++ tests/test_connection.py | 31 ++++++++++++++++++++++++++----- 8 files changed, 49 insertions(+), 10 deletions(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index a1e355b9a..147139118 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -515,9 +515,13 @@ static bool get_remote_address(struct io_conn *conn, * inserting a delay, creating a trap for every author of network code * everywhere. */ -static void set_tcp_no_delay(int fd) +static void set_tcp_no_delay(const struct daemon *daemon, int fd) { int val = 1; + + if (daemon->dev_keep_nagle) + return; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) { status_broken("setsockopt TCP_NODELAY=1 fd=%u: %s", fd, strerror(errno)); @@ -658,7 +662,7 @@ static struct io_plan *connection_in(struct io_conn *conn, /* Don't try to set TCP options on UNIX socket! */ if (conn_in_arg.addr.itype == ADDR_INTERNAL_WIREADDR) - set_tcp_no_delay(io_conn_fd(conn)); + set_tcp_no_delay(daemon, io_conn_fd(conn)); conn_in_arg.daemon = daemon; conn_in_arg.is_websocket = false; @@ -1198,7 +1202,7 @@ static void try_connect_one_addr(struct connecting *connect) /* Don't try to set TCP options on UNIX socket! */ if (addr->itype == ADDR_INTERNAL_WIREADDR) - set_tcp_no_delay(fd); + set_tcp_no_delay(connect->daemon, fd); connect->connect_attempted = true; /* This creates the new connection using our fd, with the initialization @@ -1685,7 +1689,8 @@ static void connect_init(struct daemon *daemon, const u8 *msg) &dev_throttle_gossip, &daemon->dev_no_reconnect, &daemon->dev_fast_reconnect, - &dev_limit_connections_inflight)) { + &dev_limit_connections_inflight, + &daemon->dev_keep_nagle)) { /* This is a helper which prints the type expected and the actual * message, then exits (it should never be called!). */ master_badmsg(WIRE_CONNECTD_INIT, msg); @@ -2555,6 +2560,7 @@ int main(int argc, char *argv[]) daemon->custom_msgs = NULL; daemon->dev_exhausted_fds = false; daemon->dev_lightningd_is_slow = false; + daemon->dev_keep_nagle = false; /* We generally allow 1MB per second per peer, except for dev testing */ daemon->gossip_stream_limit = 1000000; daemon->scid_htable = new_htable(daemon, scid_htable); diff --git a/connectd/connectd.h b/connectd/connectd.h index 797700572..ea012f6a2 100644 --- a/connectd/connectd.h +++ b/connectd/connectd.h @@ -371,6 +371,8 @@ struct daemon { bool dev_fast_reconnect; /* Don't complain about lightningd being unresponsive. */ bool dev_lightningd_is_slow; + /* Don't set TCP_NODELAY */ + bool dev_keep_nagle; }; /* Called by io_tor_connect once it has a connection out. */ diff --git a/connectd/connectd_wire.csv b/connectd/connectd_wire.csv index 7b1491591..d5f6aee33 100644 --- a/connectd/connectd_wire.csv +++ b/connectd/connectd_wire.csv @@ -28,6 +28,7 @@ msgdata,connectd_init,dev_throttle_gossip,bool, msgdata,connectd_init,dev_no_reconnect,bool, msgdata,connectd_init,dev_fast_reconnect,bool, msgdata,connectd_init,dev_limit_connections_inflight,bool, +msgdata,connectd_init,dev_keep_nagle,bool, # Connectd->master, here are the addresses I bound, can announce. msgtype,connectd_init_reply,2100 diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index 09da14aa8..e95a04db9 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -726,7 +726,8 @@ int connectd_init(struct lightningd *ld) ld->dev_throttle_gossip, !ld->reconnect, ld->dev_fast_reconnect, - ld->dev_limit_connections_inflight); + ld->dev_limit_connections_inflight, + ld->dev_keep_nagle); subd_req(ld->connectd, ld->connectd, take(msg), -1, 0, connect_init_done, NULL); diff --git a/lightningd/lightningd.c b/lightningd/lightningd.c index 219784e79..5d6429859 100644 --- a/lightningd/lightningd.c +++ b/lightningd/lightningd.c @@ -150,6 +150,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx) ld->dev_handshake_no_reply = false; ld->dev_strict_forwarding = false; ld->dev_limit_connections_inflight = false; + ld->dev_keep_nagle = false; /*~ We try to ensure enough fds for twice the number of channels * we start with. We have a developer option to change that factor diff --git a/lightningd/lightningd.h b/lightningd/lightningd.h index 2d3f028dc..a9a8c165a 100644 --- a/lightningd/lightningd.h +++ b/lightningd/lightningd.h @@ -369,6 +369,9 @@ struct lightningd { /* Tell connectd to block more than 1 simultanous connection attempt */ bool dev_limit_connections_inflight; + /* Tell connectd we don't want TCP_NODELAY */ + bool dev_keep_nagle; + /* tor support */ struct wireaddr *proxyaddr; bool always_use_proxy; diff --git a/lightningd/options.c b/lightningd/options.c index ce4c54761..95892dce0 100644 --- a/lightningd/options.c +++ b/lightningd/options.c @@ -948,6 +948,10 @@ static void dev_register_opts(struct lightningd *ld) opt_set_charp, opt_show_charp, &ld->plugins->dev_save_io, "Directory to place all plugin notifications/hooks JSON into."); + clnopt_noarg("--dev-keep-nagle", OPT_DEV, + opt_set_bool, + &ld->dev_keep_nagle, + "Tell connectd not to set TCP_NODELAY."); /* This is handled directly in daemon_developer_mode(), so we ignore it here */ clnopt_noarg("--dev-debug-self", OPT_DEV, opt_ignore, diff --git a/tests/test_connection.py b/tests/test_connection.py index 07e38ccb8..0671b958a 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -4581,10 +4581,11 @@ def test_private_channel_no_reconnect(node_factory): assert only_one(l1.rpc.listpeers()['peers'])['connected'] is False -@unittest.skipIf(VALGRIND, "We assume machine is reasonably fast") +@pytest.mark.slow_test def test_no_delay(node_factory): """Is our Nagle disabling for critical messages working?""" - l1, l2 = node_factory.line_graph(2) + l1, l2 = node_factory.line_graph(2, opts={'dev-keep-nagle': None, + 'may_reconnect': True}) scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id'] routestep = { @@ -4593,16 +4594,36 @@ def test_no_delay(node_factory): 'delay': 5, 'channel': scid } + + # Test with nagle start = time.time() - # If we were stupid enough to leave Nagle enabled, this would add 200ms - # seconds delays each way! for _ in range(100): phash = random.randbytes(32).hex() l1.rpc.sendpay([routestep], phash) with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"): l1.rpc.waitsendpay(phash) end = time.time() - assert end < start + 100 * 0.5 + nagle_time = end - start + + del l1.daemon.opts['dev-keep-nagle'] + del l2.daemon.opts['dev-keep-nagle'] + l1.restart() + l2.restart() + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) + + # Test without nagle + start = time.time() + for _ in range(100): + phash = random.randbytes(32).hex() + l1.rpc.sendpay([routestep], phash) + with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"): + l1.rpc.waitsendpay(phash) + end = time.time() + normal_time = end - start + + # 100 round trips, average delay 1/2 of 200ms -> 10 seconds extra. + # Make it half that for variance. + assert normal_time < nagle_time - 100 * (0.2 / 2) / 2 def test_listpeerchannels_by_scid(node_factory):