pytest: make test_no_delay more robust.

Unfortunately the effect of leaving Nagle enabled is subtle.  Here it
is in v25.12:

Normal: 
    tests/test_connection.py::test_no_delay PASSED
    ====================================================================== 1 passed in 13.87s

Nagle enabled:
    tests/test_connection.py::test_no_delay PASSED
    ====================================================================== 1 passed in 21.70s

So it's hard to both catch this issue and not have false positives.  Improve the
test by deliberately running with Nagle enabled, so we can do a direct comparison.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell
2026-02-19 21:27:15 +10:30
parent 10eb2c2ca0
commit d0c158a69d
8 changed files with 49 additions and 10 deletions

View File

@@ -515,9 +515,13 @@ static bool get_remote_address(struct io_conn *conn,
* inserting a delay, creating a trap for every author of network code
* everywhere.
*/
static void set_tcp_no_delay(int fd)
static void set_tcp_no_delay(const struct daemon *daemon, int fd)
{
int val = 1;
if (daemon->dev_keep_nagle)
return;
if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) {
status_broken("setsockopt TCP_NODELAY=1 fd=%u: %s",
fd, strerror(errno));
@@ -658,7 +662,7 @@ static struct io_plan *connection_in(struct io_conn *conn,
/* Don't try to set TCP options on UNIX socket! */
if (conn_in_arg.addr.itype == ADDR_INTERNAL_WIREADDR)
set_tcp_no_delay(io_conn_fd(conn));
set_tcp_no_delay(daemon, io_conn_fd(conn));
conn_in_arg.daemon = daemon;
conn_in_arg.is_websocket = false;
@@ -1198,7 +1202,7 @@ static void try_connect_one_addr(struct connecting *connect)
/* Don't try to set TCP options on UNIX socket! */
if (addr->itype == ADDR_INTERNAL_WIREADDR)
set_tcp_no_delay(fd);
set_tcp_no_delay(connect->daemon, fd);
connect->connect_attempted = true;
/* This creates the new connection using our fd, with the initialization
@@ -1685,7 +1689,8 @@ static void connect_init(struct daemon *daemon, const u8 *msg)
&dev_throttle_gossip,
&daemon->dev_no_reconnect,
&daemon->dev_fast_reconnect,
&dev_limit_connections_inflight)) {
&dev_limit_connections_inflight,
&daemon->dev_keep_nagle)) {
/* This is a helper which prints the type expected and the actual
* message, then exits (it should never be called!). */
master_badmsg(WIRE_CONNECTD_INIT, msg);
@@ -2555,6 +2560,7 @@ int main(int argc, char *argv[])
daemon->custom_msgs = NULL;
daemon->dev_exhausted_fds = false;
daemon->dev_lightningd_is_slow = false;
daemon->dev_keep_nagle = false;
/* We generally allow 1MB per second per peer, except for dev testing */
daemon->gossip_stream_limit = 1000000;
daemon->scid_htable = new_htable(daemon, scid_htable);

View File

@@ -371,6 +371,8 @@ struct daemon {
bool dev_fast_reconnect;
/* Don't complain about lightningd being unresponsive. */
bool dev_lightningd_is_slow;
/* Don't set TCP_NODELAY */
bool dev_keep_nagle;
};
/* Called by io_tor_connect once it has a connection out. */

View File

@@ -28,6 +28,7 @@ msgdata,connectd_init,dev_throttle_gossip,bool,
msgdata,connectd_init,dev_no_reconnect,bool,
msgdata,connectd_init,dev_fast_reconnect,bool,
msgdata,connectd_init,dev_limit_connections_inflight,bool,
msgdata,connectd_init,dev_keep_nagle,bool,
# Connectd->master, here are the addresses I bound, can announce.
msgtype,connectd_init_reply,2100
1 #include <common/channel_id.h>
28 msgdata,connectd_init,dev_fast_reconnect,bool,
29 msgdata,connectd_init,dev_limit_connections_inflight,bool,
30 msgdata,connectd_init,dev_keep_nagle,bool,
31 # Connectd->master, here are the addresses I bound, can announce.
32 msgtype,connectd_init_reply,2100
33 msgdata,connectd_init_reply,num_bindings,u16,
34 msgdata,connectd_init_reply,bindings,wireaddr_internal,num_bindings

View File

@@ -726,7 +726,8 @@ int connectd_init(struct lightningd *ld)
ld->dev_throttle_gossip,
!ld->reconnect,
ld->dev_fast_reconnect,
ld->dev_limit_connections_inflight);
ld->dev_limit_connections_inflight,
ld->dev_keep_nagle);
subd_req(ld->connectd, ld->connectd, take(msg), -1, 0,
connect_init_done, NULL);

View File

@@ -150,6 +150,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
ld->dev_handshake_no_reply = false;
ld->dev_strict_forwarding = false;
ld->dev_limit_connections_inflight = false;
ld->dev_keep_nagle = false;
/*~ We try to ensure enough fds for twice the number of channels
* we start with. We have a developer option to change that factor

View File

@@ -369,6 +369,9 @@ struct lightningd {
/* Tell connectd to block more than 1 simultaneous connection attempt */
bool dev_limit_connections_inflight;
/* Tell connectd we don't want TCP_NODELAY */
bool dev_keep_nagle;
/* tor support */
struct wireaddr *proxyaddr;
bool always_use_proxy;

View File

@@ -948,6 +948,10 @@ static void dev_register_opts(struct lightningd *ld)
opt_set_charp, opt_show_charp,
&ld->plugins->dev_save_io,
"Directory to place all plugin notifications/hooks JSON into.");
clnopt_noarg("--dev-keep-nagle", OPT_DEV,
opt_set_bool,
&ld->dev_keep_nagle,
"Tell connectd not to set TCP_NODELAY.");
/* This is handled directly in daemon_developer_mode(), so we ignore it here */
clnopt_noarg("--dev-debug-self", OPT_DEV,
opt_ignore,

View File

@@ -4581,10 +4581,11 @@ def test_private_channel_no_reconnect(node_factory):
assert only_one(l1.rpc.listpeers()['peers'])['connected'] is False
@unittest.skipIf(VALGRIND, "We assume machine is reasonably fast")
@pytest.mark.slow_test
def test_no_delay(node_factory):
"""Is our Nagle disabling for critical messages working?"""
l1, l2 = node_factory.line_graph(2)
l1, l2 = node_factory.line_graph(2, opts={'dev-keep-nagle': None,
'may_reconnect': True})
scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id']
routestep = {
@@ -4593,16 +4594,36 @@ def test_no_delay(node_factory):
'delay': 5,
'channel': scid
}
# Test with nagle
start = time.time()
# If we were stupid enough to leave Nagle enabled, this would add 200ms
# of delay each way!
for _ in range(100):
phash = random.randbytes(32).hex()
l1.rpc.sendpay([routestep], phash)
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
l1.rpc.waitsendpay(phash)
end = time.time()
assert end < start + 100 * 0.5
nagle_time = end - start
del l1.daemon.opts['dev-keep-nagle']
del l2.daemon.opts['dev-keep-nagle']
l1.restart()
l2.restart()
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
# Test without nagle
start = time.time()
for _ in range(100):
phash = random.randbytes(32).hex()
l1.rpc.sendpay([routestep], phash)
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
l1.rpc.waitsendpay(phash)
end = time.time()
normal_time = end - start
# 100 round trips, average delay 1/2 of 200ms -> 10 seconds extra.
# Make it half that for variance.
assert normal_time < nagle_time - 100 * (0.2 / 2) / 2
def test_listpeerchannels_by_scid(node_factory):