diff --git a/connectd/connectd.c b/connectd/connectd.c index a1e355b9a96d..1471391185d7 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -515,9 +515,13 @@ static bool get_remote_address(struct io_conn *conn, * inserting a delay, creating a trap for every author of network code * everywhere. */ -static void set_tcp_no_delay(int fd) +static void set_tcp_no_delay(const struct daemon *daemon, int fd) { int val = 1; + + if (daemon->dev_keep_nagle) + return; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) { status_broken("setsockopt TCP_NODELAY=1 fd=%u: %s", fd, strerror(errno)); @@ -658,7 +662,7 @@ static struct io_plan *connection_in(struct io_conn *conn, /* Don't try to set TCP options on UNIX socket! */ if (conn_in_arg.addr.itype == ADDR_INTERNAL_WIREADDR) - set_tcp_no_delay(io_conn_fd(conn)); + set_tcp_no_delay(daemon, io_conn_fd(conn)); conn_in_arg.daemon = daemon; conn_in_arg.is_websocket = false; @@ -1198,7 +1202,7 @@ static void try_connect_one_addr(struct connecting *connect) /* Don't try to set TCP options on UNIX socket! */ if (addr->itype == ADDR_INTERNAL_WIREADDR) - set_tcp_no_delay(fd); + set_tcp_no_delay(connect->daemon, fd); connect->connect_attempted = true; /* This creates the new connection using our fd, with the initialization @@ -1685,7 +1689,8 @@ static void connect_init(struct daemon *daemon, const u8 *msg) &dev_throttle_gossip, &daemon->dev_no_reconnect, &daemon->dev_fast_reconnect, - &dev_limit_connections_inflight)) { + &dev_limit_connections_inflight, + &daemon->dev_keep_nagle)) { /* This is a helper which prints the type expected and the actual * message, then exits (it should never be called!). */ master_badmsg(WIRE_CONNECTD_INIT, msg); @@ -2555,6 +2560,7 @@ int main(int argc, char *argv[]) daemon->custom_msgs = NULL; daemon->dev_exhausted_fds = false; daemon->dev_lightningd_is_slow = false; + daemon->dev_keep_nagle = false; /* We generally allow 1MB per second per peer, except for dev testing */ daemon->gossip_stream_limit = 1000000; daemon->scid_htable = new_htable(daemon, scid_htable); diff --git a/connectd/connectd.h b/connectd/connectd.h index 1cce2c64d3aa..ddb106dbce87 100644 --- a/connectd/connectd.h +++ b/connectd/connectd.h @@ -371,6 +371,8 @@ struct daemon { bool dev_fast_reconnect; /* Don't complain about lightningd being unresponsive. */ bool dev_lightningd_is_slow; + /* Don't set TCP_NODELAY */ + bool dev_keep_nagle; }; /* Called by io_tor_connect once it has a connection out. */ diff --git a/connectd/connectd_wire.csv b/connectd/connectd_wire.csv index 7b14915915ab..d5f6aee33b01 100644 --- a/connectd/connectd_wire.csv +++ b/connectd/connectd_wire.csv @@ -28,6 +28,7 @@ msgdata,connectd_init,dev_throttle_gossip,bool, msgdata,connectd_init,dev_no_reconnect,bool, msgdata,connectd_init,dev_fast_reconnect,bool, msgdata,connectd_init,dev_limit_connections_inflight,bool, +msgdata,connectd_init,dev_keep_nagle,bool, # Connectd->master, here are the addresses I bound, can announce. msgtype,connectd_init_reply,2100 diff --git a/contrib/pyln-testing/pyln/testing/fixtures.py b/contrib/pyln-testing/pyln/testing/fixtures.py index c5015ed3af78..e04d6d8e7929 100644 --- a/contrib/pyln-testing/pyln/testing/fixtures.py +++ b/contrib/pyln-testing/pyln/testing/fixtures.py @@ -1,6 +1,6 @@ from concurrent import futures from pyln.testing.db import SqliteDbProvider, PostgresDbProvider -from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, LightningNode, TEST_DEBUG, TEST_NETWORK +from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, LightningNode, TEST_DEBUG, TEST_NETWORK, SLOW_MACHINE, VALGRIND from pyln.client import Millisatoshi from typing import Dict from pathlib import Path @@ -635,6 +635,10 @@ def checkBroken(node): if node.broken_log: ex = re.compile(node.broken_log) broken_lines = [l for l in broken_lines if not ex.search(l)] + # Valgrind under CI can be really slow, so we get spurious alerts + if SLOW_MACHINE and VALGRIND: + slowreq = re.compile("That's weird: Request .* took [0-9]* milliseconds") + broken_lines = [l for l in broken_lines if not slowreq.search(l)] if broken_lines: print(broken_lines) return 1 diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py index ce2a1007cd92..dd60020cc1c1 100644 --- a/contrib/pyln-testing/pyln/testing/utils.py +++ b/contrib/pyln-testing/pyln/testing/utils.py @@ -292,15 +292,43 @@ def cleanup_files(self): except Exception: pass + def readlines_wait_for_end(self, f, timeout=TIMEOUT): + """Read all complete lines from file object `f`. + + If the last line is incomplete (no trailing newline), wait briefly + for it to complete before returning. + + Returns list of lines including trailing newline. + """ + lines = [] + cur = '' + start = time.time() + + while True: + line = f.readline() + + if not line: + if cur != '': + if time.time() - start > timeout: + raise TimeoutError(f"Incomplete line never finished: {cur}") + time.sleep(0.01) + continue + return lines + + cur += line + if cur.endswith('\n'): + lines.append(cur) + cur = '' + def logs_catchup(self): """Save the latest stdout / stderr contents; return true if we got anything. """ - new_stdout = self.stdout_read.readlines() + new_stdout = self.readlines_wait_for_end(self.stdout_read) if self.verbose: for line in new_stdout: sys.stdout.write("{}: {}".format(self.prefix, line)) self.logs += [l.rstrip() for l in new_stdout] - new_stderr = self.stderr_read.readlines() + new_stderr = self.readlines_wait_for_end(self.stderr_read) if self.verbose: for line in new_stderr: sys.stderr.write("{}-stderr: {}".format(self.prefix, line)) diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index 09da14aa8948..e95a04db9167 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -726,7 +726,8 @@ int connectd_init(struct lightningd *ld) ld->dev_throttle_gossip, !ld->reconnect, ld->dev_fast_reconnect, - ld->dev_limit_connections_inflight); + ld->dev_limit_connections_inflight, + ld->dev_keep_nagle); subd_req(ld->connectd, ld->connectd, take(msg), -1, 0, connect_init_done, NULL); diff --git a/lightningd/lightningd.c b/lightningd/lightningd.c index 219784e79b8c..5d6429859a7c 100644 --- a/lightningd/lightningd.c +++ b/lightningd/lightningd.c @@ -150,6 +150,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx) ld->dev_handshake_no_reply = false; ld->dev_strict_forwarding = false; ld->dev_limit_connections_inflight = false; + ld->dev_keep_nagle = false; /*~ We try to ensure enough fds for twice the number of channels * we start with. We have a developer option to change that factor diff --git a/lightningd/lightningd.h b/lightningd/lightningd.h index 2d3f028dcd9a..a9a8c165a3f0 100644 --- a/lightningd/lightningd.h +++ b/lightningd/lightningd.h @@ -369,6 +369,9 @@ struct lightningd { /* Tell connectd to block more than 1 simultanous connection attempt */ bool dev_limit_connections_inflight; + /* Tell connectd we don't want TCP_NODELAY */ + bool dev_keep_nagle; + /* tor support */ struct wireaddr *proxyaddr; bool always_use_proxy; diff --git a/lightningd/options.c b/lightningd/options.c index ce4c547615fd..95892dce081c 100644 --- a/lightningd/options.c +++ b/lightningd/options.c @@ -948,6 +948,10 @@ static void dev_register_opts(struct lightningd *ld) opt_set_charp, opt_show_charp, &ld->plugins->dev_save_io, "Directory to place all plugin notifications/hooks JSON into."); + clnopt_noarg("--dev-keep-nagle", OPT_DEV, + opt_set_bool, + &ld->dev_keep_nagle, + "Tell connectd not to set TCP_NODELAY."); /* This is handled directly in daemon_developer_mode(), so we ignore it here */ clnopt_noarg("--dev-debug-self", OPT_DEV, opt_ignore, diff --git a/tests/test_closing.py b/tests/test_closing.py index beb7542d40ed..5c4a17acc5f4 100644 --- a/tests/test_closing.py +++ b/tests/test_closing.py @@ -4308,6 +4308,10 @@ def no_new_blocks(req): l3.daemon.wait_for_log("peer_in WIRE_ERROR") wait_for(lambda: only_one(l3.rpc.listpeerchannels(l2.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL') + # If we're slow enough, l3 can get upset with the invalid + # responses from bitcoind, so stop that now. + l3.daemon.rpcproxy.mock_rpc('getblockhash', None) + @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd anchors not supportd') def test_onchain_slow_anchor(node_factory, bitcoind): @@ -4395,6 +4399,10 @@ def no_new_blocks(req): # Make sure l2 was happy with the reestablish message. assert not l2.daemon.is_in_log('bad reestablish') + # If we're slow enough, l2 can get upset with the invalid + # responses from bitcoind, so stop that now. + l2.daemon.rpcproxy.mock_rpc('getblockhash', None) + @unittest.skipIf(TEST_NETWORK != 'regtest', "elementsd doesn't use p2tr anyway") def test_onchain_close_no_p2tr(node_factory, bitcoind): diff --git a/tests/test_connection.py b/tests/test_connection.py index 5dd5b768f5c8..0671b958a3ad 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3081,6 +3081,12 @@ def test_dataloss_protection(node_factory, bitcoind): Path(dbpath).write_bytes(orig_db) l2.start() + # l1 will keep trying to reconnect, but it's using exponential backoff, + # which only gets reset after the connection has lasted MAX_WAIT_SECONDS (300) + # which it hasn't. Speed things up (and avoid a timeout flake!) by reconnecting + # manually now. + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) + # l2 should freak out! l2.daemon.wait_for_log("Peer permanent failure in CHANNELD_NORMAL:.*Awaiting unilateral close") @@ -4575,10 +4581,11 @@ def test_private_channel_no_reconnect(node_factory): assert only_one(l1.rpc.listpeers()['peers'])['connected'] is False -@unittest.skipIf(VALGRIND, "We assume machine is reasonably fast") +@pytest.mark.slow_test def test_no_delay(node_factory): """Is our Nagle disabling for critical messages working?""" - l1, l2 = node_factory.line_graph(2) + l1, l2 = node_factory.line_graph(2, opts={'dev-keep-nagle': None, + 'may_reconnect': True}) scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id'] routestep = { @@ -4587,16 +4594,36 @@ def test_no_delay(node_factory): 'delay': 5, 'channel': scid } + + # Test with nagle + start = time.time() + for _ in range(100): + phash = random.randbytes(32).hex() + l1.rpc.sendpay([routestep], phash) + with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"): + l1.rpc.waitsendpay(phash) + end = time.time() + nagle_time = end - start + + del l1.daemon.opts['dev-keep-nagle'] + del l2.daemon.opts['dev-keep-nagle'] + l1.restart() + l2.restart() + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) + + # Test without nagle start = time.time() - # If we were stupid enough to leave Nagle enabled, this would add 200ms - # seconds delays each way! for _ in range(100): phash = random.randbytes(32).hex() l1.rpc.sendpay([routestep], phash) with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"): l1.rpc.waitsendpay(phash) end = time.time() - assert end < start + 100 * 0.5 + normal_time = end - start + + # 100 round trips, average delay 1/2 of 200ms -> 10 seconds extra. + # Make it half that for variance. + assert normal_time < nagle_time - 100 * (0.2 / 2) / 2 def test_listpeerchannels_by_scid(node_factory): diff --git a/tests/test_gossip.py b/tests/test_gossip.py index b4c373da091c..e0b1ed9b0681 100644 --- a/tests/test_gossip.py +++ b/tests/test_gossip.py @@ -2393,7 +2393,7 @@ def test_gossip_force_broadcast_channel_msgs(node_factory, bitcoind): # Make sure the noise is within reasonable bounds assert tally['query_short_channel_ids'] <= 1 assert tally['query_channel_range'] <= 1 - assert tally['ping'] <= 3 + assert tally['ping'] <= 5 assert tally['gossip_filter'] >= 1 del tally['query_short_channel_ids'] del tally['query_channel_range'] diff --git a/tests/test_plugin.py b/tests/test_plugin.py index a5b258487902..e19495297a5e 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -2455,7 +2455,7 @@ def test_important_plugin(node_factory): n = node_factory.get_node(options={"important-plugin": os.path.join(pluginsdir, "nonexistent")}, may_fail=True, expect_fail=True, # Other plugins can complain as lightningd stops suddenly: - broken_log='Plugin marked as important, shutting down lightningd|Reading sync lightningd: Connection reset by peer|Lost connection to the RPC socket|Plugin terminated before replying to RPC call', + broken_log='Plugin marked as important, shutting down lightningd|Reading sync lightningd: Connection reset by peer|Lost connection to the RPC socket|Plugin terminated before replying to RPC call|plugin-cln-xpay: askrene-create-layer failed with.*Unkown command', start=False) n.daemon.start(wait_for_initialized=False, stderr_redir=True) @@ -2984,8 +2984,7 @@ def test_plugin_shutdown(node_factory): def test_commando(node_factory, executor): l1, l2 = node_factory.line_graph(2, fundchannel=False, - # Under valgrind, checkrune of 400k command can be slow! - opts={'log-level': 'io', 'broken_log': "That's weird: Request .* took"}) + opts={'log-level': 'io'}) rune = l1.rpc.createrune()['rune'] diff --git a/tests/test_wallet.py b/tests/test_wallet.py index b8eeb990fa30..5be686d11ef4 100644 --- a/tests/test_wallet.py +++ b/tests/test_wallet.py @@ -1225,8 +1225,7 @@ def test_sign_and_send_psbt(node_factory, bitcoind, chainparams): @unittest.skipIf(TEST_NETWORK == 'liquid-regtest', "BIP86 random_hsm not compatible with liquid-regtest bech32") def test_txsend(node_factory, bitcoind, chainparams): amount = 1000000 - # Under valgrind, we can actually take 5 seconds to sign multiple inputs! - l1 = node_factory.get_node(random_hsm=True, broken_log="That's weird: Request signpsbt took") + l1 = node_factory.get_node(random_hsm=True) addr = chainparams['example_addr'] # Add some funds to withdraw later