Files
palladum-lightning/common/trace.c

455 lines
11 KiB
C
Raw Normal View History

2023-07-27 14:43:53 +09:30
#include "config.h"
#include <assert.h>
#include <ccan/array_size/array_size.h>
#include <ccan/crypto/siphash24/siphash24.h>
#include <ccan/endian/endian.h>
#include <ccan/err/err.h>
#include <ccan/htable/htable_type.h>
2023-07-27 14:43:53 +09:30
#include <ccan/str/hex/hex.h>
#include <ccan/tal/str/str.h>
#include <common/json_stream.h>
#include <common/memleak.h>
#include <common/pseudorand.h>
2023-07-27 14:43:53 +09:30
#include <common/trace.h>
#include <inttypes.h>
2023-07-27 14:43:53 +09:30
#include <stdio.h>
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
#include <unistd.h>
2023-07-27 14:43:53 +09:30
#if HAVE_USDT
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
#include <sys/sdt.h>
/* The traceperent format is defined in W3C Trace Context RFC[1].
* Its format is defined as
*
* ```
* version-format = trace-id "-" parent-id "-" trace-flags
* trace-id = 32HEXDIGLC ; 16 bytes array identifier. All zeroes forbidden
* parent-id = 16HEXDIGLC ; 8 bytes array identifier. All zeroes forbidden
* trace-flags = 2HEXDIGLC ; 8 bit flags. Currently, only one bit is used.
* ```
*
* [1] https://www.w3.org/TR/trace-context/
*/
#define TRACEPARENT_LEN (2 + 1 + 32 + 1 + 16 + 1 + 2)
2023-07-27 14:43:53 +09:30
#ifdef TRACE_DEBUG
#define TRACE_DBG(args...) fprintf(stderr, args)
#else
#define TRACE_DBG(args...)
#endif
2023-07-27 14:43:53 +09:30
const char *trace_service_name = "lightningd";
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
static bool disable_trace = false;
static FILE *trace_to_file = NULL;
2023-07-27 14:43:53 +09:30
#define SPAN_MAX_TAGS 2
2023-07-27 14:43:53 +09:30
struct span_tag {
const char *name;
const char *valuestr;
int valuelen;
2023-07-27 14:43:53 +09:30
};
struct span {
/* Our own id */
u64 id;
/* The trace_id for this span and all its children. */
u64 trace_id_hi, trace_id_lo;
2023-07-27 14:43:53 +09:30
u64 start_time;
u64 end_time;
/* The unique key used to find ourselves in the active
* spans. */
size_t key;
struct span *parent;
struct span_tag tags[SPAN_MAX_TAGS];
const char *name;
bool suspended;
2023-07-27 14:43:53 +09:30
};
static size_t span_keyof(const struct span *span)
{
return span->key;
}
static size_t span_key_hash(size_t key)
{
return siphash24(siphash_seed(), &key, sizeof(key));
}
static bool span_key_eq(const struct span *span, size_t key)
{
return span->key == key;
}
HTABLE_DEFINE_NODUPS_TYPE(struct span, span_keyof, span_key_hash, span_key_eq,
span_htable);
static struct span fixed_spans[8];
static struct span_htable *spans = NULL;
2023-07-27 14:43:53 +09:30
static struct span *current;
static void init_span(struct span *s,
size_t key,
const char *name,
struct span *parent)
{
struct timeabs now = time_now(); /* discouraged: but tracing wants non-dev time */
s->key = key;
s->id = pseudorand_u64();
s->start_time = (now.ts.tv_sec * 1000000) + now.ts.tv_nsec / 1000;
s->parent = parent;
s->name = name;
s->suspended = false;
for (size_t i = 0; i < SPAN_MAX_TAGS; i++)
s->tags[i].name = NULL;
/* If this is a new root span we also need to associate a new
* trace_id with it. */
if (!s->parent) {
s->trace_id_hi = pseudorand_u64();
s->trace_id_lo = pseudorand_u64();
} else {
s->trace_id_hi = current->trace_id_hi;
s->trace_id_lo = current->trace_id_lo;
}
span_htable_add(spans, s);
}
/* FIXME: forward decls for minimal patch size */
static struct span *trace_span_slot(void);
static size_t trace_key(const void *key);
static void trace_span_clear(struct span *s);
/* If the `CLN_TRACEPARENT` envvar is set, we inject that as the
* parent for the startup. This allows us to integrate the startup
* tracing with whatever tooling we build around it. This only has an
* effect if the envvar is set, otherwise the startup will create its
* own parent. */
static void trace_inject_traceparent(void)
2023-07-27 14:43:53 +09:30
{
const char *traceparent;
be64 trace_hi, trace_lo, span;
traceparent = getenv("CLN_TRACEPARENT");
if (!traceparent)
return;
assert(strlen(traceparent) == TRACEPARENT_LEN);
current = trace_span_slot();
assert(current);
init_span(current, trace_key(&spans), "", NULL);
assert(current && !current->parent);
if (!hex_decode(traceparent + 3, 16, &trace_hi, sizeof(trace_hi))
|| !hex_decode(traceparent + 3 + 16, 16, &trace_lo, sizeof(trace_lo))
|| !hex_decode(traceparent + 3 + 16 + 16 + 1, 16, &span, sizeof(span))) {
/* We failed to parse the traceparent, abandon. */
fprintf(stderr, "Failed!");
trace_span_clear(current);
current = NULL;
} else {
current->trace_id_hi = be64_to_cpu(trace_hi);
current->trace_id_lo = be64_to_cpu(trace_lo);
current->id = be64_to_cpu(span);
}
2023-07-27 14:43:53 +09:30
}
static void memleak_scan_spans(struct htable *memtable, struct span_htable *spantable)
{
struct span_htable_iter i;
const struct span *span;
for (span = span_htable_first(spantable, &i);
span;
span = span_htable_next(spantable, &i)) {
memleak_ptr(memtable, span);
memleak_scan_region(memtable, span, sizeof(*span));
}
}
static void trace_init(void)
{
const char *dev_trace_file;
if (spans)
return;
/* We can't use new_htable here because we put non-tal
* objects in our htable, and that breaks memleak_scan_htable! */
spans = notleak(tal(NULL, struct span_htable));
memleak_add_helper(spans, memleak_scan_spans);
span_htable_init(spans);
2023-07-27 14:43:53 +09:30
current = NULL;
dev_trace_file = getenv("CLN_DEV_TRACE_FILE");
if (dev_trace_file) {
const char *fname = tal_fmt(tmpctx, "%s.%u",
dev_trace_file, (unsigned)getpid());
trace_to_file = fopen(fname, "a+");
if (!trace_to_file)
err(1, "Opening CLN_DEV_TRACE_FILE %s", fname);
}
trace_inject_traceparent();
2023-07-27 14:43:53 +09:30
}
/**
* Convert the pointer to a context object to a numeric key.
*/
static size_t trace_key(const void *key)
{
return (size_t)key;
}
static struct span *trace_span_find(size_t key)
{
return span_htable_get(spans, key);
2023-07-27 14:43:53 +09:30
}
/**
* Find an empty slot for a new span.
*/
static struct span *trace_span_slot(void)
{
/* Look for a free fixed slot. */
for (size_t i = 0; i < ARRAY_SIZE(fixed_spans); i++) {
if (fixed_spans[i].key == 0)
return &fixed_spans[i];
}
/* Those are used up, we have to allocate. */
return tal(spans, struct span);
2023-07-27 14:43:53 +09:30
}
#define MAX_BUF_SIZE 2048
2023-07-27 14:43:53 +09:30
static void trace_emit(struct span *s)
{
char span_id[hex_str_size(sizeof(s->id))];
char buffer[MAX_BUF_SIZE + 1];
size_t len;
snprintf(span_id, sizeof(span_id), "%016"PRIx64, s->id);
len = snprintf(buffer, MAX_BUF_SIZE,
"[{\"id\":\"%s\",\"name\":\"%s\","
"\"timestamp\":%"PRIu64",\"duration\":%"PRIu64","
"\"localEndpoint\":{\"serviceName\":\"%s\"},",
span_id, s->name, s->start_time, s->end_time - s->start_time, trace_service_name);
2023-07-27 14:43:53 +09:30
if (s->parent != NULL) {
len += snprintf(buffer + len, MAX_BUF_SIZE - len,
"\"parentId\":\"%016"PRIx64"\",",
s->parent->id);
if (len > MAX_BUF_SIZE)
len = MAX_BUF_SIZE;
2023-07-27 14:43:53 +09:30
}
len += snprintf(buffer + len, MAX_BUF_SIZE - len,
"\"tags\":{");
if (len > MAX_BUF_SIZE)
len = MAX_BUF_SIZE;
for (size_t i = 0; i < SPAN_MAX_TAGS; i++) {
if (!s->tags[i].name)
continue;
len += snprintf(buffer + len, MAX_BUF_SIZE - len,
"%s\"%s\":\"%.*s\"", i == 0 ? "" : ", ",
s->tags[i].name,
s->tags[i].valuelen,
s->tags[i].valuestr);
if (len > MAX_BUF_SIZE)
len = MAX_BUF_SIZE;
2023-07-27 14:43:53 +09:30
}
len += snprintf(buffer + len, MAX_BUF_SIZE - len,
"},\"traceId\":\"%016"PRIx64"%016"PRIx64"\"}]",
s->trace_id_hi, s->trace_id_lo);
if (len > MAX_BUF_SIZE)
len = MAX_BUF_SIZE;
buffer[len] = '\0';
/* FIXME: span_id here is in hex, could be u64? */
DTRACE_PROBE2(lightningd, span_emit, span_id, buffer);
if (trace_to_file) {
fprintf(trace_to_file, "span_emit %s %s\n", span_id, buffer);
fflush(trace_to_file);
}
2023-07-27 14:43:53 +09:30
}
/**
* Release the span back into the pool of available spans.
*/
static void trace_span_clear(struct span *s)
{
if (!span_htable_del(spans, s))
abort();
/* If s is actually in fixed_spans, just zero it out. */
if (s >= fixed_spans && s < fixed_spans + ARRAY_SIZE(fixed_spans)) {
s->key = 0;
return;
}
/* Dynamically allocated, so we need to free it */
tal_free(s);
2023-07-27 14:43:53 +09:30
}
void trace_span_start_(const char *name, const void *key)
2023-07-27 14:43:53 +09:30
{
size_t numkey = trace_key(key);
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (disable_trace)
return;
trace_init();
2023-07-27 14:43:53 +09:30
assert(trace_span_find(numkey) == NULL);
struct span *s = trace_span_slot();
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (!s)
return;
init_span(s, numkey, name, current);
2023-07-27 14:43:53 +09:30
current = s;
DTRACE_PROBE1(lightningd, span_start, s->id);
if (trace_to_file) {
fprintf(trace_to_file, "span_start %016"PRIx64"\n", s->id);
fflush(trace_to_file);
}
2023-07-27 14:43:53 +09:30
}
void trace_span_remote(u64 trace_id_hi, u64 trade_id_lo, u64 span_id)
{
abort();
}
2023-07-27 14:43:53 +09:30
void trace_span_end(const void *key)
{
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (disable_trace)
return;
2023-07-27 14:43:53 +09:30
size_t numkey = trace_key(key);
struct span *s = trace_span_find(numkey);
assert(s && "Span to end not found");
assert(s == current && "Ending a span that isn't the current one");
struct timeabs now = time_now(); /* discouraged: but tracing wants non-dev time */
2023-07-27 14:43:53 +09:30
s->end_time = (now.ts.tv_sec * 1000000) + now.ts.tv_nsec / 1000;
DTRACE_PROBE1(lightningd, span_end, s->id);
if (trace_to_file) {
fprintf(trace_to_file, "span_end %016"PRIx64"\n", s->id);
fflush(trace_to_file);
}
2023-07-27 14:43:53 +09:30
trace_emit(s);
/* Reset the context span we are in. */
current = s->parent;
/* Now reset the span */
trace_span_clear(s);
}
void trace_span_tag(const void *key, const char *name, const char *value)
{
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (disable_trace)
return;
assert(name);
2023-07-27 14:43:53 +09:30
size_t numkey = trace_key(key);
struct span *span = trace_span_find(numkey);
assert(span);
for (size_t i = 0; i < SPAN_MAX_TAGS; i++) {
struct span_tag *t = &span->tags[i];
if (!t->name) {
t->name = name;
t->valuestr = value;
t->valuelen = strlen(value);
if (t->valuestr[0] == '"'
&& t->valuelen > 1
&& t->valuestr[t->valuelen-1] == '"') {
t->valuestr++;
t->valuelen -= 2;
}
return;
}
2024-11-13 14:52:43 +10:30
}
abort();
2023-07-27 14:43:53 +09:30
}
void trace_span_suspend_(const void *key, const char *lbl)
2023-07-27 14:43:53 +09:30
{
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (disable_trace)
return;
size_t numkey = trace_key(key);
struct span *span = trace_span_find(numkey);
TRACE_DBG("Suspending span %s (%zu)\n", current->name, current->key);
assert(current == span);
assert(!span->suspended);
span->suspended = true;
current = current->parent;
DTRACE_PROBE1(lightningd, span_suspend, span->id);
if (trace_to_file) {
fprintf(trace_to_file, "span_suspend %016"PRIx64"\n", span->id);
fflush(trace_to_file);
}
2023-07-27 14:43:53 +09:30
}
static void destroy_trace_span(const void *key)
{
size_t numkey = trace_key(key);
struct span *span = trace_span_find(numkey);
/* It's usually ended normally. */
if (!span)
return;
/* Otherwise resume so we can terminate it */
if (trace_to_file)
fprintf(trace_to_file, "destroying span\n");
trace_span_resume(key);
trace_span_end(key);
}
void trace_span_suspend_may_free_(const void *key, const char *lbl)
{
if (disable_trace)
return;
trace_span_suspend_(key, lbl);
tal_add_destructor(key, destroy_trace_span);
}
void trace_span_resume_(const void *key, const char *lbl)
2023-07-27 14:43:53 +09:30
{
trace: minimal fix to avoid crash when > 128 traces active. chanbackup with many peers can do more than 128 concurrent rpc commands. autoclean is the other plugin which can do many requests at once, so I expect a similar issue there. I tested this by rebuilding with `MAX_ACTIVE_SPANS` 1, which autoclean tests triggered immediately. The real fix is probably to use a hash table with a large initial size. ``` Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: common/trace.c:190: trace_span_slot: Assertion `s' failed. Mar 24 06:30:45 mlbb2 sh[28000]: chanbackup: FATAL SIGNAL 6 (version v25.02) Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232bac4f send_backtrace Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:33 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232baceb crashdump Mar 24 06:30:45 mlbb2 sh[28000]: common/daemon.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd851f ??? Mar 24 06:30:45 mlbb2 sh[28000]: ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_implementation Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:44 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __pthread_kill_internal Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:78 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958d2c9fc __GI___pthread_kill Mar 24 06:30:45 mlbb2 sh[28000]: ./nptl/pthread_kill.c:89 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cd8475 __GI_raise Mar 24 06:30:45 mlbb2 sh[28000]: ../sysdeps/posix/raise.c:26 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe7f2 __GI_abort Mar 24 06:30:45 mlbb2 sh[28000]: ./stdlib/abort.c:79 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958cbe71a __assert_fail_base Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:94 Mar 24 06:30:45 mlbb2 sh[28000]: 0x7f2958ccfe95 __GI___assert_fail Mar 24 06:30:45 mlbb2 sh[28000]: ./assert/assert.c:103 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232ab7fa trace_span_slot Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:190 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232abc9f trace_span_start Mar 24 06:30:45 mlbb2 sh[28000]: common/trace.c:267 Mar 24 06:30:45 mlbb2 sh[28000]: 0x5575232a7c34 send_outreq Mar 24 06:30:45 mlbb2 sh[28000]: plugins/libplugin.c:1112 ``` Changelog-Fixed: autoclean/chanbackup: fixed tracepoint crash on large number of requests. Fixes: https://github.com/ElementsProject/lightning/issues/8177 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-03-31 15:12:01 +10:30
if (disable_trace)
return;
2023-07-27 14:43:53 +09:30
size_t numkey = trace_key(key);
current = trace_span_find(numkey);
assert(current->suspended);
current->suspended = false;
TRACE_DBG("Resuming span %s (%zu)\n", current->name, current->key);
DTRACE_PROBE1(lightningd, span_resume, current->id);
if (trace_to_file) {
fprintf(trace_to_file, "span_resume %016"PRIx64"\n", current->id);
fflush(trace_to_file);
}
2023-07-27 14:43:53 +09:30
}
void trace_cleanup(void)
{
spans = tal_free(spans);
}
2023-07-27 14:43:53 +09:30
#else /* HAVE_USDT */
void trace_span_start_(const char *name, const void *key) {}
2023-07-27 14:43:53 +09:30
void trace_span_end(const void *key) {}
void trace_span_suspend_(const void *key, const char *lbl) {}
void trace_span_suspend_may_free_(const void *key, const char *lbl) {}
void trace_span_resume_(const void *key, const char *lbl) {}
2023-07-27 14:43:53 +09:30
void trace_span_tag(const void *key, const char *name, const char *value) {}
void trace_cleanup(void) {}
2023-07-27 14:43:53 +09:30
#endif /* HAVE_USDT */