MINOR: task: add one extra tasklet class: TL_HEAVY

This class will be used exclusively for heavy processing tasklets. Keeping
them separate is cleaner than mixing them with the bulk ones. For now it's
allocated ~1% of the CPU bandwidth.

The largest part of the patch consists of re-arranging the fields in the
task_per_thread structure to preserve a clean alignment with one more
list head. Since we're now forced to grow the struct past a second
cache line, it now uses 4 cache lines (for easy multiplication), with
the first two used exclusively by local operations and the third one
mostly by atomic operations. Interestingly, the better arrangement
causes less cache stress and reduced the measured response time by
8 microseconds at 1 million requests per second.
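
As a side note for readers less familiar with the technique, the sketch
below is a minimal, standalone illustration (not part of the patch) of how
a zero-size, over-aligned member forces the following fields onto a new
cache line and rounds the struct size up. FORCE_ALIGN() is a hypothetical
stand-in for the ALWAYS_ALIGN() markers used in the diff below, built on
the same zero-length-array trick as the "char end[0]" field the patch
removes:

#include <assert.h>
#include <stddef.h>

/* hypothetical stand-in for an ALWAYS_ALIGN()-style marker: a zero-size,
 * over-aligned member (GNU C zero-length array) which pushes the next
 * field to an x-byte boundary without consuming space itself
 */
#define FORCE_ALIGN(name, x) char name[0] __attribute__((aligned(x)))

struct demo_per_thread {
	/* first cache line: thread-local fields only */
	unsigned long local_counter;

	FORCE_ALIGN(line2, 64);        /* next field starts a new 64-byte line */
	unsigned long shared_counter;  /* mostly accessed with atomic ops */

	FORCE_ALIGN(pad, 128);         /* round the whole struct up to 128 bytes */
};

int main(void)
{
	assert(offsetof(struct demo_per_thread, shared_counter) == 64);
	assert(sizeof(struct demo_per_thread) == 128);
	return 0;
}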
Willy Tarreau 2021-02-26 09:16:22 +01:00
parent 6ac61e39c4
commit 401135cee6
2 changed files with 29 additions and 13 deletions

Changed file 1 of 2:

@@ -58,7 +58,8 @@
 enum {
 	TL_URGENT = 0, /* urgent tasklets (I/O callbacks) */
 	TL_NORMAL = 1, /* normal tasks */
 	TL_BULK = 2, /* bulk task/tasklets, streaming I/Os */
+	TL_HEAVY = 3, /* heavy computational tasklets (e.g. TLS handshakes) */
 	TL_CLASSES /* must be last */
 };
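
A small self-contained illustration (not HAProxy code) of why TL_CLASSES
must remain last: it doubles as the number of classes, so any array
dimensioned with it automatically gains a slot for the new TL_HEAVY class:

#include <stdio.h>

enum { TL_URGENT, TL_NORMAL, TL_BULK, TL_HEAVY, TL_CLASSES /* must be last */ };

int main(void)
{
	/* per-class arrays grow automatically when a class is inserted
	 * before the TL_CLASSES sentinel
	 */
	unsigned int weights[TL_CLASSES] = { 64, 48, 16, 1 };

	printf("%d classes, heavy weight = %u\n", TL_CLASSES, weights[TL_HEAVY]);
	return 0;
}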
@@ -73,18 +74,25 @@ struct notification {
 /* force to split per-thread stuff into separate cache lines */
 struct task_per_thread {
+	// first and second cache lines on 64 bits: thread-local operations only.
 	struct eb_root timers; /* tree constituting the per-thread wait queue */
 	struct eb_root rqueue; /* tree constituting the per-thread run queue */
-	struct mt_list shared_tasklet_list; /* Tasklet to be run, woken up by other threads */
-	struct list tasklets[TL_CLASSES]; /* tasklets (and/or tasks) to run, by class */
-	unsigned int rqueue_ticks; /* Insertion counter for the run queue */
-	int tasks_in_list; /* Number of tasks in the per-thread tasklets list */
-	int current_queue; /* points to current tasklet list being run, -1 if none */
-	unsigned int rq_total; /* total size of the run queue, prio_tree + tasklets */
 	struct task *current; /* current task (not tasklet) */
+	unsigned int rqueue_ticks; /* Insertion counter for the run queue */
+	int current_queue; /* points to current tasklet list being run, -1 if none */
 	unsigned int nb_tasks; /* number of tasks allocated on this thread */
 	uint8_t tl_class_mask; /* bit mask of non-empty tasklets classes */
-	__attribute__((aligned(64))) char end[0];
+	// 11 bytes hole here
+	ALWAYS_ALIGN(2*sizeof(void*));
+	struct list tasklets[TL_CLASSES]; /* tasklets (and/or tasks) to run, by class */
+	// third cache line here on 64 bits: accessed mostly using atomic ops
+	ALWAYS_ALIGN(64);
+	struct mt_list shared_tasklet_list; /* Tasklet to be run, woken up by other threads */
+	unsigned int rq_total; /* total size of the run queue, prio_tree + tasklets */
+	int tasks_in_list; /* Number of tasks in the per-thread tasklets list */
+	ALWAYS_ALIGN(128);
 };
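
To document the intent of this arrangement, a compile-time guard along the
following lines could sit next to the struct (a sketch only, not part of
the patch; it assumes 64-byte cache lines and the 64-bit layout described
in the commit message):

#include <stddef.h> /* offsetof() */

/* the mostly-atomic fields are expected to open the third cache line,
 * and the whole structure is expected to span exactly four of them
 */
_Static_assert(offsetof(struct task_per_thread, shared_tasklet_list) == 2 * 64,
               "shared_tasklet_list should start the third cache line");
_Static_assert(sizeof(struct task_per_thread) == 4 * 64,
               "task_per_thread should span exactly four cache lines");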

Changed file 2 of 2:

@@ -635,6 +635,7 @@ void process_runnable_tasks()
 		[TL_URGENT] = 64, // ~50% of CPU bandwidth for I/O
 		[TL_NORMAL] = 48, // ~37% of CPU bandwidth for tasks
 		[TL_BULK] = 16, // ~13% of CPU bandwidth for self-wakers
+		[TL_HEAVY] = 1, // never more than 1 heavy task at once
 	};
 	unsigned int max[TL_CLASSES]; // max to be run per class
 	unsigned int max_total; // sum of max above
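
For reference, with all four classes active these default weights sum to
64 + 48 + 16 + 1 = 129, i.e. roughly 49.6%, 37.2%, 12.4% and 0.8% of the
per-call budget respectively, which matches the ~50%/~37%/~13%/~1% figures
quoted in the comments and in the commit message.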
@@ -673,6 +674,14 @@ void process_runnable_tasks()
 	if ((tt->tl_class_mask & (1 << TL_BULK)))
 		max[TL_BULK] = default_weights[TL_BULK];
+	/* heavy tasks are processed only once and never refilled in a
+	 * call round.
+	 */
+	if ((tt->tl_class_mask & (1 << TL_HEAVY)))
+		max[TL_HEAVY] = default_weights[TL_HEAVY];
+	else
+		max[TL_HEAVY] = 0;
 	/* Now compute a fair share of the weights. Total may slightly exceed
 	 * 100% due to rounding, this is not a problem. Note that while in
 	 * theory the sum cannot be NULL as we cannot get there without tasklets
@@ -681,7 +690,7 @@ void process_runnable_tasks()
 	 * a first MT_LIST_ISEMPTY() to succeed for thread_has_task() and the
 	 * one above to finally fail. This is extremely rare and not a problem.
 	 */
-	max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK];
+	max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK] + max[TL_HEAVY];
 	if (!max_total)
 		return;
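
To make the "fair share" comment above concrete, the scaling step that
follows in this function could look roughly like the sketch below
(illustrative only, not the verbatim HAProxy code; "budget" stands for
whatever per-call processing limit applies). Rounding up is what lets the
total slightly exceed 100%, and it also keeps every active class above a
zero budget:

	/* illustrative: turn each class's weight into a share of the
	 * per-call budget, rounding up so active classes never drop to 0
	 */
	unsigned int queue;
	unsigned int budget = 200; /* hypothetical per-call limit */

	for (queue = 0; queue < TL_CLASSES; queue++)
		max[queue] = (budget * max[queue] + max_total - 1) / max_total;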
@@ -881,7 +890,7 @@ void mworker_cleantasks()
 /* perform minimal intializations */
 static void init_task()
 {
-	int i;
+	int i, q;
 #ifdef USE_THREAD
 	memset(&timers, 0, sizeof(timers));
@@ -889,9 +898,8 @@ static void init_task()
 #endif
 	memset(&task_per_thread, 0, sizeof(task_per_thread));
 	for (i = 0; i < MAX_THREADS; i++) {
-		LIST_INIT(&task_per_thread[i].tasklets[TL_URGENT]);
-		LIST_INIT(&task_per_thread[i].tasklets[TL_NORMAL]);
-		LIST_INIT(&task_per_thread[i].tasklets[TL_BULK]);
+		for (q = 0; q < TL_CLASSES; q++)
+			LIST_INIT(&task_per_thread[i].tasklets[q]);
 		MT_LIST_INIT(&task_per_thread[i].shared_tasklet_list);
 	}
 }