From 0950778b3a13fe31ff83223827d6692076cba5e5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Nov 2024 11:20:45 +0100 Subject: [PATCH] MINOR: debug: add a function to dump a stuck thread There's currently no way to just emit a warning informing that a thread is stuck without crashing. This is a problem because sometimes users would benefit from this info to clean up their configuration (e.g. abuse of map_regm, lua-load etc). This commit adds a new function ha_stuck_warning() that will emit a warning indicating that the designated thread has been stuck for XX milliseconds, with a number of streams blocked, and will make that thread dump its own state. The warning will then be sent to stderr, along with some reminders about the impacts of such situations to encourage users to fix their configuration. In order not to disrupt operations, a local 4kB buffer is allocated on the stack. This should be quite sufficient. For now the function is not used. --- include/haproxy/debug.h | 1 + src/debug.c | 78 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/include/haproxy/debug.h b/include/haproxy/debug.h index f01441c9f7..4bdec78da6 100644 --- a/include/haproxy/debug.h +++ b/include/haproxy/debug.h @@ -30,6 +30,7 @@ void ha_thread_dump_one(int thr, int from_signal); void ha_dump_backtrace(struct buffer *buf, const char *prefix, int dump); void ha_backtrace_to_stderr(void); void ha_panic(void); +void ha_stuck_warning(int thr); void post_mortem_add_component(const char *name, const char *version, const char *toolchain, const char *toolchain_opts, diff --git a/src/debug.c b/src/debug.c index 6c3bcbc730..4d79ef17c0 100644 --- a/src/debug.c +++ b/src/debug.c @@ -729,6 +729,84 @@ void ha_panic() abort(); } +/* Emits a warning and dumps the state of the designated thread (thr) on fd #2, then returns. It takes great + * care not to use any global state variable so as to gracefully recover. 
+ */ +void ha_stuck_warning(int thr) +{ + char msg_buf[4096]; + struct buffer buf; + ullong n, p; + + if (get_tainted() & TAINTED_PANIC) { + /* a panic dump is already in progress, let's not disturb it, + * we'll be called via signal DEBUGSIG. By returning we may be + * able to leave a current signal handler (e.g. WDT) so that + * this will ensure more reliable signal delivery. + */ + return; + } + + buf = b_make(msg_buf, sizeof(msg_buf), 0, 0); + + p = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].prev_cpu_time); + n = now_cpu_time_thread(thr); + + chunk_printf(&buf, + "\nWARNING! thread %u has stopped processing traffic for %llu milliseconds\n" + " with %d streams currently blocked, prevented from making any progress.\n" + " While this may occasionally happen with inefficient configurations\n" + " involving excess of regular expressions, map_reg, or heavy Lua processing,\n" + " this must remain exceptional because the system's stability is now at risk.\n" + " Timers in logs may be reported incorrectly, spurious timeouts may happen,\n" + " some incoming connections may silently be dropped, health checks may\n" + " randomly fail, and accesses to the CLI may block the whole process. 
Please\n" + " check the trace below for any clues about configuration elements that need\n" + " to be corrected:\n\n", + thr + 1, (n - p) / 1000000ULL, + HA_ATOMIC_LOAD(&ha_thread_ctx[thr].stream_cnt)); + + DISGUISE(write(2, buf.area, buf.data)); + + /* Note below: the target thread will dump itself */ + chunk_reset(&buf); + if (ha_thread_dump_fill(&buf, thr)) { + DISGUISE(write(2, buf.area, buf.data)); + /* restore the thread's dump pointer for easier post-mortem analysis */ + ha_thread_dump_done(NULL, thr); + } + + chunk_printf(&buf, " => Trying to gracefully recover now.\n"); + DISGUISE(write(2, buf.area, buf.data)); + +#ifdef USE_LUA + if (get_tainted() & TAINTED_LUA_STUCK_SHARED && global.nbthread > 1) { + chunk_printf(&buf, + "### Note: at least one thread was stuck in a Lua context loaded using the\n" + " 'lua-load' directive, which is known for causing heavy contention\n" + " when used with threads. Please consider using 'lua-load-per-thread'\n" + " instead if your code is safe to run in parallel on multiple threads.\n"); + DISGUISE(write(2, buf.area, buf.data)); + } + else if (get_tainted() & TAINTED_LUA_STUCK) { + chunk_printf(&buf, + "### Note: at least one thread was stuck in a Lua context in a way that suggests\n" + " heavy processing inside a dependency or a long loop that can't yield.\n" + " Please make sure any external code you may rely on is safe for use in\n" + " an event-driven engine.\n"); + DISGUISE(write(2, buf.area, buf.data)); + } +#endif + if (get_tainted() & TAINTED_MEM_TRIMMING_STUCK) { + chunk_printf(&buf, + "### Note: one thread was found stuck under malloc_trim(), which can run for a\n" + " very long time on large memory systems. You way want to disable this\n" + " memory reclaiming feature by setting 'no-memory-trimming' in the\n" + " 'global' section of your configuration to avoid this in the future.\n"); + DISGUISE(write(2, buf.area, buf.data)); + } +} + /* Complain with message on stderr. 
If is not NULL, it is * atomically incremented, and the message is only printed when the counter * was zero, so that the message is only printed once. is only checked