MINOR: ring: make the number of queues configurable
Now the rings have one wait queue per group. This should limit the contention on systems such as EPYC CPUs, where performance drops dramatically when using more than one CCX.

Tests were run with different numbers and it was shown that the value 6 outperforms all other ones at 12, 24, 48, 64 and 80 threads on an EPYC, a Xeon and an Ampere CPU. Value 7 sometimes comes close, and anything around these values degrades quickly. The value has been left tunable in the global section.

This commit only introduces everything needed to set up the queue count so that it's easier to adjust it in the forthcoming patches, though it was initially added after the series, making it harder to compare.

It was also shown that trying to group the threads into queues by their thread groups is counter-productive, and that it was more efficient to do so by applying a modulo on the thread number. As surprising as it seems, this has the benefit of evenly balancing any number of threads.
parent e3f101a19a
commit 6c1b29d06f
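To illustrate the balancing claim above, here is a small standalone sketch (not part of the patch; the queue count of 6 and the thread counts are the ones quoted in the message): distributing thread IDs over the queues with a plain modulo keeps the per-queue populations within one of each other for any thread count.

#include <stdio.h>

/* Standalone sketch (not from the patch): spread thread IDs over 6
 * queues with a plain modulo, as the commit message describes, and
 * show that per-queue populations differ by at most one.
 */
int main(void)
{
	const int queues = 6; /* the experimentally chosen default */
	int nbthreads[] = { 12, 24, 48, 64, 80 }; /* counts from the tests */

	for (unsigned i = 0; i < sizeof(nbthreads) / sizeof(*nbthreads); i++) {
		int counts[6] = { 0 };

		for (int tid = 0; tid < nbthreads[i]; tid++)
			counts[tid % queues]++;

		printf("%2d threads ->", nbthreads[i]);
		for (int q = 0; q < queues; q++)
			printf(" %d", counts[q]);
		putchar('\n');
	}
	return 0;
}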
doc/configuration.txt
@@ -1413,6 +1413,7 @@ The following keywords are supported in the "global" section :
  - tune.rcvbuf.frontend
  - tune.rcvbuf.server
  - tune.recv_enough
+ - tune.ring.queues
  - tune.runqueue-depth
  - tune.sched.low-latency
  - tune.sndbuf.backend
@@ -3769,6 +3770,15 @@ tune.recv_enough <number>
   may be changed by this setting to better deal with workloads involving lots
   of short messages such as telnet or SSH sessions.
 
+tune.ring.queues <number>
+  Sets the number of write queues in front of ring buffers. This can have an
+  effect on the CPU usage of traces during debugging sessions, and both too
+  low or too large a value can have an important effect. The good value was
+  determined experimentally by developers and there should be no reason to
+  try to change it unless instructed to do so in order to try to address
+  specific issues. Such a setting should not be left in the configuration
+  across version upgrades because its optimal value may evolve over time.
+
 tune.runqueue-depth <number>
   Sets the maximum amount of task that can be processed at once when running
   tasks. The default value depends on the number of threads but sits between 35
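As a usage note, the new keyword lives in the "global" section like the neighbouring tune.* settings. A minimal hypothetical excerpt (the value 6 merely restates the built-in default):

    global
        tune.ring.queues 6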
include/haproxy/defaults.h
@@ -532,4 +532,24 @@
 # endif
 #endif
 
+/* number of ring wait queues depending on the number
+ * of threads.
+ */
+#ifndef RING_WAIT_QUEUES
+# if defined(USE_THREAD) && MAX_THREADS >= 32
+#  define RING_WAIT_QUEUES 16
+# elif defined(USE_THREAD)
+#  define RING_WAIT_QUEUES ((MAX_THREADS + 1) / 2)
+# else
+#  define RING_WAIT_QUEUES 1
+# endif
+#endif
+
+/* it has been found that 6 queues was optimal on various archs at various
+ * thread counts, so let's use that by default.
+ */
+#ifndef RING_DFLT_QUEUES
+# define RING_DFLT_QUEUES 6
+#endif
+
 #endif /* _HAPROXY_DEFAULTS_H */
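To make the fallback arithmetic concrete, here is a throwaway sketch (assumed values, mirroring the macro logic rather than including defaults.h): MAX_THREADS=16 gives (16+1)/2 = 8 wait queues, while anything from 32 up caps at 16.

#include <stdio.h>

/* Hypothetical replay of how RING_WAIT_QUEUES resolves for a few
 * MAX_THREADS values under the new defaults.h logic.
 */
static int ring_wait_queues(int max_threads, int use_thread)
{
	if (use_thread && max_threads >= 32)
		return 16;
	if (use_thread)
		return (max_threads + 1) / 2;
	return 1;
}

int main(void)
{
	int samples[] = { 1, 8, 16, 31, 32, 64, 256 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(*samples); i++)
		printf("MAX_THREADS=%3d -> RING_WAIT_QUEUES=%d\n",
		       samples[i], ring_wait_queues(samples[i], 1));
	return 0;
}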
include/haproxy/global-t.h
@@ -190,6 +190,7 @@ struct global {
 	int nb_stk_ctr;             /* number of stick counters, defaults to MAX_SESS_STKCTR */
 	int default_shards;         /* default shards for listeners, or -1 (by-thread) or -2 (by-group) */
 	uint max_checks_per_thread; /* if >0, no more than this concurrent checks per thread */
+	uint ring_queues;           /* if >0, #ring queues, otherwise equals #thread groups */
 #ifdef USE_QUIC
 	unsigned int quic_backend_max_idle_timeout;
 	unsigned int quic_frontend_max_idle_timeout;
include/haproxy/ring-t.h
@@ -148,10 +148,10 @@ struct ring {
 
 	/* keep the queue in a separate cache line below */
 	THREAD_PAD(64 - 3*sizeof(void*) - 4*sizeof(int));
-	struct ring_wait_cell *queue; // wait queue
-
-	/* and leave a spacer after it to avoid false sharing */
-	THREAD_PAD(64 - sizeof(void*));
+	struct {
+		struct ring_wait_cell *ptr;
+		THREAD_PAD(64 - sizeof(void*));
+	} queue[RING_WAIT_QUEUES + 1]; // wait queue + 1 spacer
 };
 
 #endif /* _HAPROXY_RING_T_H */
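The per-slot THREAD_PAD keeps each queue head alone on its cache line, so waiters parked on different queues never contend for the same line, and the extra array slot acts as a trailing spacer. A simplified sketch of the same idiom (plain char padding instead of HAProxy's THREAD_PAD macro, assuming 64-byte cache lines):

#include <stddef.h>

struct wait_cell; /* opaque, stands in for struct ring_wait_cell */

/* one queue head per 64-byte cache line */
struct padded_head {
	struct wait_cell *ptr;
	char pad[64 - sizeof(struct wait_cell *)];
};

/* 6 usable queues plus one trailing spacer, like queue[RING_WAIT_QUEUES + 1] */
static struct padded_head queue[6 + 1];

_Static_assert(sizeof(struct padded_head) == 64,
               "each queue head must occupy exactly one cache line");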
include/haproxy/tinfo-t.h
@@ -110,7 +110,7 @@ struct thread_info {
 	uint tid, ltid;  /* process-wide and group-wide thread ID (start at 0) */
 	ulong ltid_bit;  /* bit masks for the tid/ltid */
 	uint tgid;       /* ID of the thread group this thread belongs to (starts at 1; 0=unset) */
-	/* 32-bit hole here */
+	uint ring_queue; /* queue number for the rings */
 
 	ullong pth_id;   /* the pthread_t cast to a ullong */
 	void *stack_top; /* the top of the stack when entering the thread */
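The new field costs nothing: it reuses the 4-byte alignment hole between the 4-byte tgid and the 8-byte pth_id, so the struct does not grow. A minimal illustration of such a hole (hypothetical stand-in types, assuming 8-byte alignment for long long):

#include <stdio.h>

struct before {
	unsigned int tgid;       /* 4 bytes */
	                         /* 4-byte hole for 8-byte alignment */
	unsigned long long pth_id;
};

struct after {
	unsigned int tgid;       /* 4 bytes */
	unsigned int ring_queue; /* fills the former hole */
	unsigned long long pth_id;
};

int main(void)
{
	/* both structs are the same size: the new field is free */
	printf("before=%zu after=%zu\n",
	       sizeof(struct before), sizeof(struct after));
	return 0;
}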
src/haproxy.c
@@ -3150,6 +3150,18 @@ static void *run_thread_poll_loop(void *data)
 #endif
 	ha_thread_info[tid].stack_top = __builtin_frame_address(0);
 
+	/* Assign the ring queue. Contrary to an intuitive thought, this does
+	 * not benefit from locality and it's counter-productive to group
+	 * threads from a same group or range number in the same queue. In some
+	 * sense it arranges us because it means we can use a modulo and ensure
+	 * that even small numbers of threads are well spread.
+	 */
+	ha_thread_info[tid].ring_queue =
+		(tid % MIN(global.nbthread,
+			   (global.tune.ring_queues ?
+			    global.tune.ring_queues :
+			    RING_DFLT_QUEUES))) % RING_WAIT_QUEUES;
+
 	/* thread is started, from now on it is not idle nor harmless */
 	thread_harmless_end();
 	thread_idle_end();
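Replaying the expression by hand for a hypothetical 8-thread process with everything left at default: ring_queues is 0, so the default of 6 applies; MIN(8, 6) = 6; tids 0..5 land on queues 0..5 and tids 6..7 wrap to queues 0..1; the final % RING_WAIT_QUEUES changes nothing here since every result stays below 16. A tiny standalone reproduction:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical replay of the assignment with nbthread=8, ring_queues
 * unset (0), RING_DFLT_QUEUES=6 and RING_WAIT_QUEUES=16.
 */
int main(void)
{
	int nbthread = 8, ring_queues = 0;
	int dflt = 6, wait_queues = 16;

	for (int tid = 0; tid < nbthread; tid++) {
		int q = (tid % MIN(nbthread, ring_queues ? ring_queues : dflt))
		        % wait_queues;
		printf("tid %d -> queue %d\n", tid, q);
	}
	return 0;
}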
src/ring.c
@@ -22,6 +22,7 @@
 #include <haproxy/api.h>
 #include <haproxy/applet.h>
 #include <haproxy/buf.h>
+#include <haproxy/cfgparse.h>
 #include <haproxy/cli.h>
 #include <haproxy/ring.h>
 #include <haproxy/sc_strm.h>
@@ -51,7 +52,7 @@ void ring_init(struct ring *ring, void *area, size_t size, int reset)
 	ring->storage = area;
 	ring->pending = 0;
 	ring->waking = 0;
-	ring->queue = NULL;
+	memset(&ring->queue, 0, sizeof(ring->queue));
 
 	if (reset) {
 		ring->storage->size = size - sizeof(*ring->storage);
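Since queue is now an array of padded slots rather than a single pointer, initialization switches from a NULL store to zeroing the whole array. In plain C the idiom looks like this (hypothetical stand-in types; memset(0) equals per-slot NULL stores on platforms where null pointers are all-bits-zero):

#include <string.h>

struct cell;

struct head { struct cell *ptr; char pad[56]; };

struct ring_like {
	struct head queue[7];
};

void init(struct ring_like *r)
{
	/* zero every slot (and its padding) in one call instead of
	 * assigning queue[i].ptr = NULL slot by slot */
	memset(&r->queue, 0, sizeof(r->queue));
}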
@@ -646,6 +647,34 @@ size_t ring_max_payload(const struct ring *ring)
 	return max;
 }
 
+/* config parser for global "tune.ring.queues", accepts a number from 0 to RING_WAIT_QUEUES */
+static int cfg_parse_tune_ring_queues(char **args, int section_type, struct proxy *curpx,
+                                      const struct proxy *defpx, const char *file, int line,
+                                      char **err)
+{
+	int queues;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	queues = atoi(args[1]);
+	if (queues < 0 || queues > RING_WAIT_QUEUES) {
+		memprintf(err, "'%s' expects a number between 0 and %d but got '%s'.", args[0], RING_WAIT_QUEUES, args[1]);
+		return -1;
+	}
+
+	global.tune.ring_queues = queues;
+	return 0;
+}
+
+/* config keyword parsers */
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.ring.queues", cfg_parse_tune_ring_queues },
+	{ 0, NULL, NULL }
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
 /*
  * Local variables:
  *  c-indent-level: 8
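The bounds check above is easy to exercise outside HAProxy; this hypothetical driver replays the same validation with an assumed cap of 16 (the error text is the one built by memprintf() in the parser):

#include <stdio.h>
#include <stdlib.h>

#define RING_WAIT_QUEUES 16 /* assumed build-time cap, as in defaults.h */

/* Hypothetical standalone replay of the parser's bounds check. */
static int check_ring_queues(const char *arg)
{
	int queues = atoi(arg);

	if (queues < 0 || queues > RING_WAIT_QUEUES) {
		fprintf(stderr, "'tune.ring.queues' expects a number between 0"
		        " and %d but got '%s'.\n", RING_WAIT_QUEUES, arg);
		return -1;
	}
	return 0;
}

int main(void)
{
	check_ring_queues("6");  /* accepted */
	check_ring_queues("99"); /* rejected: out of bounds */
	return 0;
}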