Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions connectd/connectd.c
Original file line number Diff line number Diff line change
Expand Up @@ -515,9 +515,13 @@ static bool get_remote_address(struct io_conn *conn,
* inserting a delay, creating a trap for every author of network code
* everywhere.
*/
static void set_tcp_no_delay(int fd)
static void set_tcp_no_delay(const struct daemon *daemon, int fd)
{
int val = 1;

if (daemon->dev_keep_nagle)
return;

if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) {
status_broken("setsockopt TCP_NODELAY=1 fd=%u: %s",
fd, strerror(errno));
Expand Down Expand Up @@ -658,7 +662,7 @@ static struct io_plan *connection_in(struct io_conn *conn,

/* Don't try to set TCP options on UNIX socket! */
if (conn_in_arg.addr.itype == ADDR_INTERNAL_WIREADDR)
set_tcp_no_delay(io_conn_fd(conn));
set_tcp_no_delay(daemon, io_conn_fd(conn));

conn_in_arg.daemon = daemon;
conn_in_arg.is_websocket = false;
Expand Down Expand Up @@ -1198,7 +1202,7 @@ static void try_connect_one_addr(struct connecting *connect)

/* Don't try to set TCP options on UNIX socket! */
if (addr->itype == ADDR_INTERNAL_WIREADDR)
set_tcp_no_delay(fd);
set_tcp_no_delay(connect->daemon, fd);
connect->connect_attempted = true;

/* This creates the new connection using our fd, with the initialization
Expand Down Expand Up @@ -1685,7 +1689,8 @@ static void connect_init(struct daemon *daemon, const u8 *msg)
&dev_throttle_gossip,
&daemon->dev_no_reconnect,
&daemon->dev_fast_reconnect,
&dev_limit_connections_inflight)) {
&dev_limit_connections_inflight,
&daemon->dev_keep_nagle)) {
/* This is a helper which prints the type expected and the actual
* message, then exits (it should never be called!). */
master_badmsg(WIRE_CONNECTD_INIT, msg);
Expand Down Expand Up @@ -2555,6 +2560,7 @@ int main(int argc, char *argv[])
daemon->custom_msgs = NULL;
daemon->dev_exhausted_fds = false;
daemon->dev_lightningd_is_slow = false;
daemon->dev_keep_nagle = false;
/* We generally allow 1MB per second per peer, except for dev testing */
daemon->gossip_stream_limit = 1000000;
daemon->scid_htable = new_htable(daemon, scid_htable);
Expand Down
2 changes: 2 additions & 0 deletions connectd/connectd.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,8 @@ struct daemon {
bool dev_fast_reconnect;
/* Don't complain about lightningd being unresponsive. */
bool dev_lightningd_is_slow;
/* Don't set TCP_NODELAY */
bool dev_keep_nagle;
};

/* Called by io_tor_connect once it has a connection out. */
Expand Down
1 change: 1 addition & 0 deletions connectd/connectd_wire.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ msgdata,connectd_init,dev_throttle_gossip,bool,
msgdata,connectd_init,dev_no_reconnect,bool,
msgdata,connectd_init,dev_fast_reconnect,bool,
msgdata,connectd_init,dev_limit_connections_inflight,bool,
msgdata,connectd_init,dev_keep_nagle,bool,

# Connectd->master, here are the addresses I bound, can announce.
msgtype,connectd_init_reply,2100
Expand Down
6 changes: 5 additions & 1 deletion contrib/pyln-testing/pyln/testing/fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from concurrent import futures
from pyln.testing.db import SqliteDbProvider, PostgresDbProvider
from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, LightningNode, TEST_DEBUG, TEST_NETWORK
from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, LightningNode, TEST_DEBUG, TEST_NETWORK, SLOW_MACHINE, VALGRIND
from pyln.client import Millisatoshi
from typing import Dict
from pathlib import Path
Expand Down Expand Up @@ -635,6 +635,10 @@ def checkBroken(node):
if node.broken_log:
ex = re.compile(node.broken_log)
broken_lines = [l for l in broken_lines if not ex.search(l)]
# Valgrind under CI can be really slow, so we get spurious alerts
if SLOW_MACHINE and VALGRIND:
slowreq = re.compile("That's weird: Request .* took [0-9]* milliseconds")
broken_lines = [l for l in broken_lines if not slowreq.search(l)]
if broken_lines:
print(broken_lines)
return 1
Expand Down
32 changes: 30 additions & 2 deletions contrib/pyln-testing/pyln/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,15 +292,43 @@ def cleanup_files(self):
except Exception:
pass

def readlines_wait_for_end(self, f, timeout=TIMEOUT):
"""Read all complete lines from file object `f`.

If the last line is incomplete (no trailing newline), wait briefly
for it to complete before returning.

Returns list of lines including trailing newline.
"""
lines = []
cur = ''
start = time.time()

while True:
line = f.readline()

if not line:
if cur != '':
if time.time() - start > timeout:
raise TimeoutError(f"Incomplete line never finished: {cur}")
time.sleep(0.01)
continue
return lines

cur += line
if cur.endswith('\n'):
lines.append(cur)
cur = ''

def logs_catchup(self):
"""Save the latest stdout / stderr contents; return true if we got anything.
"""
new_stdout = self.stdout_read.readlines()
new_stdout = self.readlines_wait_for_end(self.stdout_read)
if self.verbose:
for line in new_stdout:
sys.stdout.write("{}: {}".format(self.prefix, line))
self.logs += [l.rstrip() for l in new_stdout]
new_stderr = self.stderr_read.readlines()
new_stderr = self.readlines_wait_for_end(self.stderr_read)
if self.verbose:
for line in new_stderr:
sys.stderr.write("{}-stderr: {}".format(self.prefix, line))
Expand Down
3 changes: 2 additions & 1 deletion lightningd/connect_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,8 @@ int connectd_init(struct lightningd *ld)
ld->dev_throttle_gossip,
!ld->reconnect,
ld->dev_fast_reconnect,
ld->dev_limit_connections_inflight);
ld->dev_limit_connections_inflight,
ld->dev_keep_nagle);

subd_req(ld->connectd, ld->connectd, take(msg), -1, 0,
connect_init_done, NULL);
Expand Down
1 change: 1 addition & 0 deletions lightningd/lightningd.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
ld->dev_handshake_no_reply = false;
ld->dev_strict_forwarding = false;
ld->dev_limit_connections_inflight = false;
ld->dev_keep_nagle = false;

/*~ We try to ensure enough fds for twice the number of channels
* we start with. We have a developer option to change that factor
Expand Down
3 changes: 3 additions & 0 deletions lightningd/lightningd.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,9 @@ struct lightningd {
/* Tell connectd to block more than 1 simultanous connection attempt */
bool dev_limit_connections_inflight;

/* Tell connectd we don't want TCP_NODELAY */
bool dev_keep_nagle;

/* tor support */
struct wireaddr *proxyaddr;
bool always_use_proxy;
Expand Down
4 changes: 4 additions & 0 deletions lightningd/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,10 @@ static void dev_register_opts(struct lightningd *ld)
opt_set_charp, opt_show_charp,
&ld->plugins->dev_save_io,
"Directory to place all plugin notifications/hooks JSON into.");
clnopt_noarg("--dev-keep-nagle", OPT_DEV,
opt_set_bool,
&ld->dev_keep_nagle,
"Tell connectd not to set TCP_NODELAY.");
/* This is handled directly in daemon_developer_mode(), so we ignore it here */
clnopt_noarg("--dev-debug-self", OPT_DEV,
opt_ignore,
Expand Down
8 changes: 8 additions & 0 deletions tests/test_closing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4308,6 +4308,10 @@ def no_new_blocks(req):
l3.daemon.wait_for_log("peer_in WIRE_ERROR")
wait_for(lambda: only_one(l3.rpc.listpeerchannels(l2.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL')

# If we're slow enough, l3 can get upset with the invalid
# responses from bitcoind, so stop that now.
l3.daemon.rpcproxy.mock_rpc('getblockhash', None)


@unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd anchors not supportd')
def test_onchain_slow_anchor(node_factory, bitcoind):
Expand Down Expand Up @@ -4395,6 +4399,10 @@ def no_new_blocks(req):
# Make sure l2 was happy with the reestablish message.
assert not l2.daemon.is_in_log('bad reestablish')

# If we're slow enough, l2 can get upset with the invalid
# responses from bitcoind, so stop that now.
l2.daemon.rpcproxy.mock_rpc('getblockhash', None)


@unittest.skipIf(TEST_NETWORK != 'regtest', "elementsd doesn't use p2tr anyway")
def test_onchain_close_no_p2tr(node_factory, bitcoind):
Expand Down
37 changes: 32 additions & 5 deletions tests/test_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3081,6 +3081,12 @@ def test_dataloss_protection(node_factory, bitcoind):
Path(dbpath).write_bytes(orig_db)
l2.start()

# l1 will keep trying to reconnect, but it's using exponential backoff,
# which only gets reset after the connection has lasted MAX_WAIT_SECONDS (300)
# which it hasn't. Speed things up (and avoid a timeout flake!) by reconnecting
# manually now.
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)

# l2 should freak out!
l2.daemon.wait_for_log("Peer permanent failure in CHANNELD_NORMAL:.*Awaiting unilateral close")

Expand Down Expand Up @@ -4575,10 +4581,11 @@ def test_private_channel_no_reconnect(node_factory):
assert only_one(l1.rpc.listpeers()['peers'])['connected'] is False


@unittest.skipIf(VALGRIND, "We assume machine is reasonably fast")
@pytest.mark.slow_test
def test_no_delay(node_factory):
"""Is our Nagle disabling for critical messages working?"""
l1, l2 = node_factory.line_graph(2)
l1, l2 = node_factory.line_graph(2, opts={'dev-keep-nagle': None,
'may_reconnect': True})

scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id']
routestep = {
Expand All @@ -4587,16 +4594,36 @@ def test_no_delay(node_factory):
'delay': 5,
'channel': scid
}

# Test with nagle
start = time.time()
for _ in range(100):
phash = random.randbytes(32).hex()
l1.rpc.sendpay([routestep], phash)
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
l1.rpc.waitsendpay(phash)
end = time.time()
nagle_time = end - start

del l1.daemon.opts['dev-keep-nagle']
del l2.daemon.opts['dev-keep-nagle']
l1.restart()
l2.restart()
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)

# Test without nagle
start = time.time()
# If we were stupid enough to leave Nagle enabled, this would add 200ms
# seconds delays each way!
for _ in range(100):
phash = random.randbytes(32).hex()
l1.rpc.sendpay([routestep], phash)
with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
l1.rpc.waitsendpay(phash)
end = time.time()
assert end < start + 100 * 0.5
normal_time = end - start

# 100 round trips, average delay 1/2 of 200ms -> 10 seconds extra.
# Make it half that for variance.
assert normal_time < nagle_time - 100 * (0.2 / 2) / 2


def test_listpeerchannels_by_scid(node_factory):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_gossip.py
Original file line number Diff line number Diff line change
Expand Up @@ -2393,7 +2393,7 @@ def test_gossip_force_broadcast_channel_msgs(node_factory, bitcoind):
# Make sure the noise is within reasonable bounds
assert tally['query_short_channel_ids'] <= 1
assert tally['query_channel_range'] <= 1
assert tally['ping'] <= 3
assert tally['ping'] <= 5
assert tally['gossip_filter'] >= 1
del tally['query_short_channel_ids']
del tally['query_channel_range']
Expand Down
5 changes: 2 additions & 3 deletions tests/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2455,7 +2455,7 @@ def test_important_plugin(node_factory):
n = node_factory.get_node(options={"important-plugin": os.path.join(pluginsdir, "nonexistent")},
may_fail=True, expect_fail=True,
# Other plugins can complain as lightningd stops suddenly:
broken_log='Plugin marked as important, shutting down lightningd|Reading sync lightningd: Connection reset by peer|Lost connection to the RPC socket|Plugin terminated before replying to RPC call',
broken_log='Plugin marked as important, shutting down lightningd|Reading sync lightningd: Connection reset by peer|Lost connection to the RPC socket|Plugin terminated before replying to RPC call|plugin-cln-xpay: askrene-create-layer failed with.*Unkown command',
start=False)

n.daemon.start(wait_for_initialized=False, stderr_redir=True)
Expand Down Expand Up @@ -2984,8 +2984,7 @@ def test_plugin_shutdown(node_factory):

def test_commando(node_factory, executor):
l1, l2 = node_factory.line_graph(2, fundchannel=False,
# Under valgrind, checkrune of 400k command can be slow!
opts={'log-level': 'io', 'broken_log': "That's weird: Request .* took"})
opts={'log-level': 'io'})

rune = l1.rpc.createrune()['rune']

Expand Down
3 changes: 1 addition & 2 deletions tests/test_wallet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1225,8 +1225,7 @@ def test_sign_and_send_psbt(node_factory, bitcoind, chainparams):
@unittest.skipIf(TEST_NETWORK == 'liquid-regtest', "BIP86 random_hsm not compatible with liquid-regtest bech32")
def test_txsend(node_factory, bitcoind, chainparams):
amount = 1000000
# Under valgrind, we can actually take 5 seconds to sign multiple inputs!
l1 = node_factory.get_node(random_hsm=True, broken_log="That's weird: Request signpsbt took")
l1 = node_factory.get_node(random_hsm=True)
addr = chainparams['example_addr']

# Add some funds to withdraw later
Expand Down
Loading