MEDIUM: tasks: add a tune.sched.low-latency option

Now that all tasklet queues are scanned at once by run_tasks_from_lists(),
it becomes possible to always check for lower priority classes and jump
back to them when they exist.

This patch adds a tune.sched.low-latency global setting to enable this
behavior. When set, the scheduler sticks to the lowest ranked priority
list in which tasks are still present and budget remains available, and
leaves the loop to refill the tasklet lists whenever the trees receive
new tasks or new work arrives in the shared urgent queue.

Doing so cuts the latency in half when running with extremely deep
run queues (10k-100k), thus allowing forwarding of small and large
objects to coexist better. It remains off by default since it does have
a small impact on large traffic (shorter batches).
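To make the selection rule concrete, here is a small standalone sketch (not
part of the patch; pick_class() and the sample masks are made up for
illustration, only the TL_* ranking and the bit tests mirror the scheduler
code in the diff below):

    #include <stdio.h>
    #include <stdint.h>

    #define TL_URGENT 0   /* served first */
    #define TL_NORMAL 1
    #define TL_BULK   2

    /* Return the lowest-ranked class that both has queued tasklets
     * (class_mask) and budget left (budget_mask), or -1 if none.
     */
    static int pick_class(uint8_t class_mask, uint8_t budget_mask)
    {
        uint8_t m = class_mask & budget_mask;

        if (!m)
            return -1;
        return (m & 1) ? TL_URGENT : (m & 2) ? TL_NORMAL : TL_BULK;
    }

    int main(void)
    {
        /* urgent and bulk pending, every class has budget: urgent wins */
        printf("%d\n", pick_class(0x5, 0x7)); /* prints 0 */

        /* only bulk pending, but its budget is exhausted: nothing to pick */
        printf("%d\n", pick_class(0x4, 0x3)); /* prints -1 */
        return 0;
    }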
Willy Tarreau 2020-06-24 11:11:02 +02:00
parent 59153fef86
commit e7723bddd7
3 changed files with 83 additions and 10 deletions

doc/configuration.txt

@@ -697,6 +697,7 @@ The following keywords are supported in the "global" section :
   - tune.rcvbuf.server
   - tune.recv_enough
   - tune.runqueue-depth
+  - tune.sched.low-latency
   - tune.sndbuf.client
   - tune.sndbuf.server
   - tune.ssl.cachesize
@@ -2095,7 +2096,20 @@ tune.recv_enough <number>
 tune.runqueue-depth <number>
   Sets the maximum amount of tasks that can be processed at once when running
   tasks. The default value is 200. Increasing it may incur latency when
-  dealing with I/Os, making it too small can incur extra overhead.
+  dealing with I/Os, making it too small can incur extra overhead. When
+  experimenting with much larger values, it may be useful to also enable
+  tune.sched.low-latency to keep the maximum latency as low as possible.
+
+tune.sched.low-latency { on | off }
+  Enables ('on') or disables ('off') the low-latency task scheduler. By default
+  haproxy processes tasks from several classes one class at a time, as this is
+  the most efficient, but with large values of tune.runqueue-depth this can
+  have a measurable effect on request or connection latency. When this setting
+  is enabled, tasks from lower priority classes are always executed before
+  other ones when they exist. This makes it possible to lower the maximum
+  latency experienced by new requests or connections in the middle of massive
+  traffic, at the expense of a higher impact on this large traffic. For
+  regular usage it is better to leave this off. The default value is off.
 
 tune.sndbuf.client <number>
 tune.sndbuf.server <number>
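As a quick illustration of the two settings documented above, a hypothetical
"global" section combining a deep run queue with the new option could look
like this (the values are arbitrary examples, not tuning advice):

    global
        tune.runqueue-depth 10000
        tune.sched.low-latency on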
@@ -15838,11 +15852,12 @@ lat_ns_avg : integer
   the value low, it is possible to reduce the scheduler's run queue depth using
   "tune.runqueue-depth", to reduce the number of concurrent events processed at
   once using "tune.maxpollevents", to decrease the stream's nice value using
-  the "nice" option on the "bind" lines or in the frontend, or to look for
-  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
-  whose processing needs to be adjusted or fixed. Compression of large buffers
-  could be a culprit, like heavy regex or long lists of regex.
-  Note: this value is exactly lat_ns_tot divided by cpu_calls.
+  the "nice" option on the "bind" lines or in the frontend, to enable
+  low-latency scheduling using "tune.sched.low-latency", or to look for other
+  heavy requests in logs (those exhibiting large values of "cpu_ns_avg"), whose
+  processing needs to be adjusted or fixed. Compression of large buffers could
+  be a culprit, like heavy regex or long lists of regex. Note: this value is
+  exactly lat_ns_tot divided by cpu_calls.
 
 lat_ns_tot : integer
   Returns the total number of nanoseconds spent between the moment the task
@@ -15854,10 +15869,11 @@ lat_ns_tot : integer
   the value low, it is possible to reduce the scheduler's run queue depth using
   "tune.runqueue-depth", to reduce the number of concurrent events processed at
   once using "tune.maxpollevents", to decrease the stream's nice value using
-  the "nice" option on the "bind" lines or in the frontend, or to look for
-  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
-  whose processing needs to be adjusted or fixed. Compression of large buffers
-  could be a culprit, like heavy regex or long lists of regex. Note: while it
+  the "nice" option on the "bind" lines or in the frontend, to enable
+  low-latency scheduling using "tune.sched.low-latency", or to look for other
+  heavy requests in logs (those exhibiting large values of "cpu_ns_avg"), whose
+  processing needs to be adjusted or fixed. Compression of large buffers could
+  be a culprit, like heavy regex or long lists of regex. Note: while it
   may intuitively seem that the total latency adds to a transfer time, it is
   almost never true because while a task waits for the CPU, network buffers
   continue to fill up and the next call will process more at once. The value

include/haproxy/global-t.h

@@ -67,6 +67,7 @@
#define GTUNE_INSECURE_FORK      (1<<16)
#define GTUNE_INSECURE_SETUID    (1<<17)
#define GTUNE_FD_ET              (1<<18)
#define GTUNE_SCHED_LOW_LATENCY  (1<<19)
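
/* GTUNE_SCHED_LOW_LATENCY is set and cleared by the "tune.sched.low-latency"
 * config parser in src/task.c and tested in run_tasks_from_lists(); see the
 * changes further down.
 */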
/* SSL server verify mode */
enum {

src/task.c

@@ -16,6 +16,7 @@
#include <import/eb32tree.h>
#include <haproxy/api.h>
#include <haproxy/cfgparse.h>
#include <haproxy/fd.h>
#include <haproxy/freq_ctr.h>
#include <haproxy/list.h>
@@ -328,6 +329,7 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
    struct task *(*process)(struct task *t, void *ctx, unsigned short state);
    struct list *tl_queues = sched->tasklets;
    struct task *t;
    uint8_t budget_mask = (1 << TL_CLASSES) - 1;
    unsigned int done = 0;
    unsigned int queue;
    unsigned short state;
@@ -336,6 +338,33 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
    for (queue = 0; queue < TL_CLASSES;) {
        sched->current_queue = queue;

        /* global.tune.sched.low-latency is set */
        if (global.tune.options & GTUNE_SCHED_LOW_LATENCY) {
            if (unlikely(sched->tl_class_mask & budget_mask & ((1 << queue) - 1))) {
                /* a lower queue index has tasks again and still has a
                 * budget to run them. Let's switch to it now.
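                 * For example, while serving the bulk class (queue 2),
                 * if bit 0 becomes set in tl_class_mask, the ternary
                 * below resolves to 0 (TL_URGENT), the lowest set bit.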
                 */
                queue = (sched->tl_class_mask & 1) ? 0 :
                        (sched->tl_class_mask & 2) ? 1 : 2;
                continue;
            }

            if (unlikely(queue > TL_URGENT &&
                         budget_mask & (1 << TL_URGENT) &&
                         !MT_LIST_ISEMPTY(&sched->shared_tasklet_list))) {
                /* an urgent tasklet arrived from another thread */
                break;
            }

            if (unlikely(queue > TL_NORMAL &&
                         budget_mask & (1 << TL_NORMAL) &&
                         ((sched->rqueue_size > 0) ||
                          (global_tasks_mask & tid_bit)))) {
                /* a task was woken up by a bulk tasklet or another thread */
                break;
            }
        }

        if (LIST_ISEMPTY(&tl_queues[queue])) {
            sched->tl_class_mask &= ~(1 << queue);
            queue++;
@@ -343,6 +372,7 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
        }

        if (!budgets[queue]) {
            budget_mask &= ~(1 << queue);
            queue++;
            continue;
        }
@@ -687,6 +717,32 @@ static void init_task()
    }
}

/* config parser for global "tune.sched.low-latency", accepts "on" or "off" */
static int cfg_parse_tune_sched_low_latency(char **args, int section_type, struct proxy *curpx,
                                            struct proxy *defpx, const char *file, int line,
                                            char **err)
{
    if (too_many_args(1, args, err, NULL))
        return -1;

    if (strcmp(args[1], "on") == 0)
        global.tune.options |= GTUNE_SCHED_LOW_LATENCY;
    else if (strcmp(args[1], "off") == 0)
        global.tune.options &= ~GTUNE_SCHED_LOW_LATENCY;
    else {
        memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
        return -1;
    }
    return 0;
}
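
/* Once registered below, a "global" section line such as
 * "tune.sched.low-latency on" is routed to the parser above, which sets or
 * clears GTUNE_SCHED_LOW_LATENCY in global.tune.options.
 */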
/* config keyword parsers */
static struct cfg_kw_list cfg_kws = {ILH, {
    { CFG_GLOBAL, "tune.sched.low-latency", cfg_parse_tune_sched_low_latency },
    { 0, NULL, NULL }
}};

INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
INITCALL0(STG_PREPARE, init_task);
/*