MINOR: threads: add a "stuck" flag to the thread_info struct

This flag is constantly cleared by the scheduler and will be set by the
watchdog timer to detect stuck threads. It is also set by the "show
threads" command so that it is easy to spot if the situation has evolved
between two subsequent calls : if the first "show threads" shows no stuck
thread and the second one shows such a stuck thread, it indicates that
this thread didn't manage to make any forward progress since the previous
call, which is extremely suspicious.
This commit is contained in:
Willy Tarreau 2019-05-22 07:06:44 +02:00
parent 578ea8be55
commit e6a02fa65a
4 changed files with 34 additions and 9 deletions

View File

@ -2525,11 +2525,18 @@ show threads
an advanced dump mechanism involving thread signals is used so that each
thread can dump its own state in turn. Without this option, the thread
processing the command shows all its details but the other ones are less
detailed. A stat ('*') is displayed in front of the thread handling the
command. The output format is purposely not documented so that it can easily
evolve as new needs are identified, without having to maintain any backwards
compatibility, and just like with "show activity", the values are only
meaningful with the code at hand.
detailed. A star ('*') is displayed in front of the thread handling the
command. A right angle bracket ('>') may also be displayed in front of
threads which didn't make any progress since last invocation of this command,
indicating a bug in the code which must absolutely be reported. When this
happens between two threads it usually indicates a deadlock. If a thread is
alone, it's a different bug like a corrupted list. In all cases the process
needs is not fully functional anymore and needs to be restarted.
The output format is purposely not documented so that it can easily evolve as
new needs are identified, without having to maintain any form of backwards
compatibility, and just like with "show activity", the values are meaningless
without the code at hand.
show tls-keys [id|*]
Dump all loaded TLS ticket keys references. The TLS ticket key reference ID

View File

@ -38,6 +38,10 @@
* only one thread is enabled, it equals 1.
*/
/* thread info flags, for thread_info[].flags */
#define TI_FL_STUCK 0x00000001
#ifndef USE_THREAD
#define MAX_THREADS 1
@ -57,6 +61,7 @@ extern struct thread_info {
uint64_t prev_cpu_time; /* previous per thread CPU time */
uint64_t prev_mono_time; /* previous system wide monotonic time */
unsigned int idle_pct; /* idle to total ratio over last sample (percent) */
unsigned int flags; /* thread info flags, TI_FL_* */
/* pad to cache line (64B) */
char __pad[0]; /* unused except to check remaining room */
char __end[0] __attribute__((aligned(64)));
@ -405,6 +410,7 @@ extern struct thread_info {
uint64_t prev_cpu_time; /* previous per thread CPU time */
uint64_t prev_mono_time; /* previous system wide monotonic time */
unsigned int idle_pct; /* idle to total ratio over last sample (percent) */
unsigned int flags; /* thread info flags, TI_FL_* */
/* pad to cache line (64B) */
char __pad[0]; /* unused except to check remaining room */
char __end[0] __attribute__((aligned(64)));

View File

@ -33,18 +33,20 @@
* optionally extra info for the current thread. The dump will be appended to
* the buffer, so the caller is responsible for preliminary initializing it.
* The calling thread ID needs to be passed in <calling_tid> to display a star
* in front of the calling thread's line (usually it's tid).
* in front of the calling thread's line (usually it's tid). Any stuck thread
* is also prefixed with a '>'.
*/
void ha_thread_dump(struct buffer *buf, int thr, int calling_tid)
{
unsigned long thr_bit = 1UL << thr;
unsigned long long p = thread_info[thr].prev_cpu_time;
unsigned long long n = now_cpu_time_thread(&thread_info[thr]);
int stuck = !!(thread_info[thr].flags & TI_FL_STUCK);
chunk_appendf(buf,
"%c Thread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n"
" fdcache=%d prof=%d",
(thr == calling_tid) ? '*' : ' ', thr + 1,
"%c%cThread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n"
" stuck=%d fdcache=%d prof=%d",
(thr == calling_tid) ? '*' : ' ', stuck ? '>' : ' ', thr + 1,
!!(active_tasks_mask & thr_bit),
!!(global_tasks_mask & thr_bit),
!eb_is_empty(&task_per_thread[thr].timers),
@ -52,6 +54,7 @@ void ha_thread_dump(struct buffer *buf, int thr, int calling_tid)
!LIST_ISEMPTY(&task_per_thread[thr].task_list),
task_per_thread[thr].task_list_size,
task_per_thread[thr].rqueue_size,
stuck,
!!(fd_cache_mask & thr_bit),
!!(task_profiling_mask & thr_bit));
@ -467,6 +470,12 @@ void debug_handler(int sig, siginfo_t *si, void *arg)
else
ha_thread_relax();
}
/* mark the current thread as stuck to detect it upon next invocation
* if it didn't move.
*/
if (!((threads_harmless_mask|sleeping_thread_mask) & tid_bit))
ti->flags |= TI_FL_STUCK;
}
static int init_debug_per_thread()

View File

@ -281,6 +281,8 @@ void process_runnable_tasks()
struct task *t;
int max_processed;
ti->flags &= ~TI_FL_STUCK; // this thread is still running
if (!(active_tasks_mask & tid_bit)) {
activity[tid].empty_rq++;
return;
@ -372,6 +374,7 @@ void process_runnable_tasks()
__ha_barrier_atomic_store();
__task_remove_from_tasklet_list(t);
ti->flags &= ~TI_FL_STUCK; // this thread is still running
activity[tid].ctxsw++;
ctx = t->context;
process = t->process;