MINOR: ring: make the number of queues configurable

Now the rings have one wait queue per group. This should limit the
contention on systems such as EPYC CPUs where the performance drops
dramatically when using more than one CCX.

Tests were run with different numbers and it was shown that the value
6 outperforms all other ones at 12, 24, 48, 64 and 80 threads on an
EPYC, a Xeon and an Ampere CPU. Value 7 sometimes comes close and
anything around these values degrades quickly. The value has been
left tunable in the global section.

This commit introduces only what is needed to set up the queue count,
so that it is easier to adjust in the forthcoming patches. It was
initially written after the series, which made comparisons harder.

It was also shown that trying to group the threads in queues by their
thread groups is counter-productive and that it was more efficient to
do that by applying a modulo on the thread number. As surprising as it
seems, this has the benefit of evenly spreading any number of threads.
This commit is contained in:
Willy Tarreau 2024-03-14 08:57:02 +01:00
parent e3f101a19a
commit 6c1b29d06f
7 changed files with 78 additions and 6 deletions

View File

@ -1413,6 +1413,7 @@ The following keywords are supported in the "global" section :
- tune.rcvbuf.frontend
- tune.rcvbuf.server
- tune.recv_enough
- tune.ring.queues
- tune.runqueue-depth
- tune.sched.low-latency
- tune.sndbuf.backend
@ -3769,6 +3770,15 @@ tune.recv_enough <number>
may be changed by this setting to better deal with workloads involving lots
of short messages such as telnet or SSH sessions.
tune.ring.queues <number>
Sets the number of write queues in front of ring buffers. This can have an
effect on the CPU usage of traces during debugging sessions, and either too
low or too large a value can have a significant impact. The good value was
determined experimentally by developers and there should be no reason to
try to change it unless instructed to do so in order to try to address
specific issues. Such a setting should not be left in the configuration
across version upgrades because its optimal value may evolve over time.
tune.runqueue-depth <number>
Sets the maximum amount of task that can be processed at once when running
tasks. The default value depends on the number of threads but sits between 35

View File

@ -532,4 +532,24 @@
# endif
#endif
/* Maximum number of ring wait queues compiled in, depending on the
 * number of threads. This is the hard upper bound that the runtime
 * setting (tune.ring.queues) may not exceed.
 */
#ifndef RING_WAIT_QUEUES
# if defined(USE_THREAD) && MAX_THREADS >= 32
# define RING_WAIT_QUEUES 16
# elif defined(USE_THREAD)
# define RING_WAIT_QUEUES ((MAX_THREADS + 1) / 2)
# else
# define RING_WAIT_QUEUES 1
# endif
#endif
/* it has been found that 6 queues was optimal on various archs at various
 * thread counts, so let's use that as the default when tune.ring.queues
 * is not set. Must not exceed RING_WAIT_QUEUES.
 */
#ifndef RING_DFLT_QUEUES
# define RING_DFLT_QUEUES 6
#endif
#endif /* _HAPROXY_DEFAULTS_H */

View File

@ -190,6 +190,7 @@ struct global {
int nb_stk_ctr; /* number of stick counters, defaults to MAX_SESS_STKCTR */
int default_shards; /* default shards for listeners, or -1 (by-thread) or -2 (by-group) */
uint max_checks_per_thread; /* if >0, no more than this concurrent checks per thread */
uint ring_queues; /* if >0, #ring queues, otherwise equals #thread groups */
#ifdef USE_QUIC
unsigned int quic_backend_max_idle_timeout;
unsigned int quic_frontend_max_idle_timeout;

View File

@ -148,10 +148,10 @@ struct ring {
/* keep the queue in a separate cache line below */
THREAD_PAD(64 - 3*sizeof(void*) - 4*sizeof(int));
struct ring_wait_cell *queue; // wait queue
/* and leave a spacer after it to avoid false sharing */
struct {
struct ring_wait_cell *ptr;
THREAD_PAD(64 - sizeof(void*));
} queue[RING_WAIT_QUEUES + 1]; // wait queue + 1 spacer
};
#endif /* _HAPROXY_RING_T_H */

View File

@ -110,7 +110,7 @@ struct thread_info {
uint tid, ltid; /* process-wide and group-wide thread ID (start at 0) */
ulong ltid_bit; /* bit masks for the tid/ltid */
uint tgid; /* ID of the thread group this thread belongs to (starts at 1; 0=unset) */
/* 32-bit hole here */
uint ring_queue; /* queue number for the rings */
ullong pth_id; /* the pthread_t cast to a ullong */
void *stack_top; /* the top of the stack when entering the thread */

View File

@ -3150,6 +3150,18 @@ static void *run_thread_poll_loop(void *data)
#endif
ha_thread_info[tid].stack_top = __builtin_frame_address(0);
/* Assign the ring queue. Contrary to an intuitive thought, this does
* not benefit from locality and it's counter-productive to group
* threads from a same group or range number in the same queue. In some
* sense it arranges us because it means we can use a modulo and ensure
* that even small numbers of threads are well spread.
*/
ha_thread_info[tid].ring_queue =
(tid % MIN(global.nbthread,
(global.tune.ring_queues ?
global.tune.ring_queues :
RING_DFLT_QUEUES))) % RING_WAIT_QUEUES;
/* thread is started, from now on it is not idle nor harmless */
thread_harmless_end();
thread_idle_end();

View File

@ -22,6 +22,7 @@
#include <haproxy/api.h>
#include <haproxy/applet.h>
#include <haproxy/buf.h>
#include <haproxy/cfgparse.h>
#include <haproxy/cli.h>
#include <haproxy/ring.h>
#include <haproxy/sc_strm.h>
@ -51,7 +52,7 @@ void ring_init(struct ring *ring, void *area, size_t size, int reset)
ring->storage = area;
ring->pending = 0;
ring->waking = 0;
ring->queue = NULL;
memset(&ring->queue, 0, sizeof(ring->queue));
if (reset) {
ring->storage->size = size - sizeof(*ring->storage);
@ -646,6 +647,34 @@ size_t ring_max_payload(const struct ring *ring)
return max;
}
/* config parser for global "tune.ring.queues", accepts a number from 0 to
 * RING_WAIT_QUEUES. Zero (the default) means the queue count is derived at
 * runtime instead of being forced. Returns 0 on success, -1 with <err>
 * filled on failure.
 */
static int cfg_parse_tune_ring_queues(char **args, int section_type, struct proxy *curpx,
                                      const struct proxy *defpx, const char *file, int line,
                                      char **err)
{
	long queues;
	char *end;

	if (too_many_args(1, args, err, NULL))
		return -1;

	/* use strtol() instead of atoi() so that an empty or non-numeric
	 * argument (e.g. "6x" or "abc") is rejected instead of being
	 * silently turned into 0 and accepted.
	 */
	queues = strtol(args[1], &end, 10);
	if (end == args[1] || *end != '\0' || queues < 0 || queues > RING_WAIT_QUEUES) {
		memprintf(err, "'%s' expects a number between 0 and %d but got '%s'.", args[0], RING_WAIT_QUEUES, args[1]);
		return -1;
	}
	global.tune.ring_queues = queues;
	return 0;
}
/* config keyword parsers: binds "tune.ring.queues" in the "global" section
 * to its parser above; the list is terminated by the NULL entry.
 */
static struct cfg_kw_list cfg_kws = {ILH, {
{ CFG_GLOBAL, "tune.ring.queues", cfg_parse_tune_ring_queues },
{ 0, NULL, NULL }
}};
/* register the keyword list with the config parser at startup */
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
/*
* Local variables:
* c-indent-level: 8