From 2b4b1479ed9095ab4439b9e728d73c140a510b2e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 10 Feb 2025 16:37:58 +1030 Subject: [PATCH] gossipd: check that gossmap code sees updates from gossip_store writes. After analyzing various weird cases where we ended up with duplicate gossip_store entries, it could be explained by us not fully processing the gossip store. It's not clear that my assumptions that we would always see our own writes are true: technically this may require an fsync(). So we now add the check, and do an fsync and try again. Signed-off-by: Rusty Russell Changelog-Fixed: gossipd: more sanity checks that we are correctly updating the gossip_store file. --- common/gossmap.c | 14 +++++++++ common/gossmap.h | 3 ++ gossipd/gossip_store.c | 2 +- gossipd/gossip_store.h | 1 + gossipd/gossmap_manage.c | 61 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 80 insertions(+), 1 deletion(-) diff --git a/common/gossmap.c b/common/gossmap.c index 524cf1b4f..42cc1527a 100644 --- a/common/gossmap.c +++ b/common/gossmap.c @@ -1851,3 +1851,17 @@ int gossmap_fd(const struct gossmap *map) { return map->fd; } + +const u8 *gossmap_fetch_tail(const tal_t *ctx, const struct gossmap *map) +{ + size_t len; + u8 *p; + + /* Shouldn't happen... */ + if (map->map_end > map->map_size) + return NULL; + len = map->map_size - map->map_end; + p = tal_arr(ctx, u8, len); + map_copy(map, map->map_size, p, len); + return p; +} diff --git a/common/gossmap.h b/common/gossmap.h index dfdee55ee..14a17a047 100644 --- a/common/gossmap.h +++ b/common/gossmap.h @@ -304,4 +304,7 @@ u64 gossmap_lengths(const struct gossmap *map, u64 *total); /* Debugging: connectd wants to enumerate fds */ int gossmap_fd(const struct gossmap *map); + +/* Fetch unprocessed part of gossmap */ +const u8 *gossmap_fetch_tail(const tal_t *ctx, const struct gossmap *map); #endif /* LIGHTNING_COMMON_GOSSMAP_H */ diff --git a/gossipd/gossip_store.c b/gossipd/gossip_store.c index 0c8b0934e..906b6c10f 100644 --- a/gossipd/gossip_store.c +++ b/gossipd/gossip_store.c @@ -408,7 +408,7 @@ struct gossip_store *gossip_store_new(const tal_t *ctx, return gs; } -static void gossip_store_fsync(const struct gossip_store *gs) +void gossip_store_fsync(const struct gossip_store *gs) { if (fsync(gs->fd) != 0) status_failed(STATUS_FAIL_INTERNAL_ERROR, diff --git a/gossipd/gossip_store.h b/gossipd/gossip_store.h index a75ca0666..c75696f08 100644 --- a/gossipd/gossip_store.h +++ b/gossipd/gossip_store.h @@ -110,5 +110,6 @@ void gossip_store_set_timestamp(struct gossip_store *gs, u64 offset, u32 timesta * For debugging. */ u64 gossip_store_len_written(const struct gossip_store *gs); +void gossip_store_fsync(const struct gossip_store *gs); #endif /* LIGHTNING_GOSSIPD_GOSSIP_STORE_H */ diff --git a/gossipd/gossmap_manage.c b/gossipd/gossmap_manage.c index 93a7c5bad..19e1a9818 100644 --- a/gossipd/gossmap_manage.c +++ b/gossipd/gossmap_manage.c @@ -1365,9 +1365,70 @@ void gossmap_manage_channel_spent(struct gossmap_manage *gm, } } +/* Fetch the part of the gossmap we didn't process via read() */ +static const u8 *fetch_tail_fd(const tal_t *ctx, + int gossmap_fd, + u64 map_used, u64 map_size) +{ + size_t len; + ssize_t r; + u8 *p; + + /* Shouldn't happen... */ + if (map_used > map_size) + return NULL; + len = map_size - map_used; + p = tal_arrz(ctx, u8, len); + r = pread(gossmap_fd, p, len, map_used); + if (r != len) + status_broken("Partial read on gossmap EOF (%zi vs %zu)", + r, len); + return p; +} + struct gossmap *gossmap_manage_get_gossmap(struct gossmap_manage *gm) { + u64 map_used, map_size, written_len; + gossmap_refresh(gm->raw_gossmap); + + /* Sanity check that we see everything we wrote. */ + map_used = gossmap_lengths(gm->raw_gossmap, &map_size); + written_len = gossip_store_len_written(gm->gs); + + if (map_size != written_len) { + status_broken("gossmap size %"PRIu64" != written size %"PRIu64, + map_size, written_len); + /* Push harder! */ + gossip_store_fsync(gm->gs); + gossmap_refresh(gm->raw_gossmap); + + /* Sanity check that we see everything we wrote. */ + map_used = gossmap_lengths(gm->raw_gossmap, &map_size); + written_len = gossip_store_len_written(gm->gs); + if (map_used != written_len || map_size != map_used) + status_failed(STATUS_FAIL_INTERNAL_ERROR, + "gossmap read inconsistent even after sync" + " used=%"PRIu64" seen=%"PRIu64" written=%"PRIu64, + map_used, map_size, written_len); + } else if (map_size != map_used) { + const u8 *remainder_fd, *remainder_mmap; + + /* Sigh. Did gossmap see something different (via mmap) + * from what we see via read? It's possible it's caught up + * now, but just in case, log BOTH */ + remainder_mmap = gossmap_fetch_tail(tmpctx, gm->raw_gossmap); + remainder_fd = fetch_tail_fd(tmpctx, + gossmap_fd(gm->raw_gossmap), + map_used, map_size); + status_failed(STATUS_FAIL_INTERNAL_ERROR, + "Gossmap failed to process entire gossip_store: " + "at %"PRIu64" of %"PRIu64" remaining_mmap=%s remaining_fd=%s", + map_used, map_size, + tal_hex(tmpctx, remainder_mmap), + tal_hex(tmpctx, remainder_fd)); + } + return gm->raw_gossmap; }