From 6c1b29d06fc7824d69d14599fe41daf79178bec7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 14 Mar 2024 08:57:02 +0100 Subject: [PATCH] MINOR: ring: make the number of queues configurable Now the rings have one wait queue per group. This should limit the contention on systems such as EPYC CPUs where the performance drops dramatically when using more than one CCX. Tests were run with different numbers and it was shown that value 6 outperforms all other ones at 12, 24, 48, 64 and 80 threads on an EPYC, a Xeon and an Ampere CPU. Value 7 sometimes comes close and anything around these values degrades quickly. The value has been left tunable in the global section. This commit only introduces everything needed to set up the queue count so that it's easier to adjust it in the forthcoming patches, but it was initially added after the series, making it harder to compare. It was also shown that trying to group the threads in queues by their thread groups is counter-productive and that it was more efficient to do that by applying a modulo on the thread number. As surprising as it seems, it does have the benefit of well balancing any number of threads. 
--- doc/configuration.txt | 10 ++++++++++ include/haproxy/defaults.h | 20 ++++++++++++++++++++ include/haproxy/global-t.h | 1 + include/haproxy/ring-t.h | 8 ++++---- include/haproxy/tinfo-t.h | 2 +- src/haproxy.c | 12 ++++++++++++ src/ring.c | 31 ++++++++++++++++++++++++++++++- 7 files changed, 78 insertions(+), 6 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 980de0b92..7387f4b53 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -1413,6 +1413,7 @@ The following keywords are supported in the "global" section : - tune.rcvbuf.frontend - tune.rcvbuf.server - tune.recv_enough + - tune.ring.queues - tune.runqueue-depth - tune.sched.low-latency - tune.sndbuf.backend @@ -3769,6 +3770,15 @@ tune.recv_enough may be changed by this setting to better deal with workloads involving lots of short messages such as telnet or SSH sessions. +tune.ring.queues + Sets the number of write queues in front of ring buffers. This can have an + effect on the CPU usage of traces during debugging sessions, and both too + low or too large a value can have an important effect. The good value was + determined experimentally by developers and there should be no reason to + try to change it unless instructed to do so in order to try to address + specific issues. Such a setting should not be left in the configuration + across version upgrades because its optimal value may evolve over time. + tune.runqueue-depth Sets the maximum amount of task that can be processed at once when running tasks. The default value depends on the number of threads but sits between 35 diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h index 051ca8111..47db366d0 100644 --- a/include/haproxy/defaults.h +++ b/include/haproxy/defaults.h @@ -532,4 +532,24 @@ # endif #endif +/* number of ring wait queues depending on the number + * of threads. 
+ */ +#ifndef RING_WAIT_QUEUES +# if defined(USE_THREAD) && MAX_THREADS >= 32 +# define RING_WAIT_QUEUES 16 +# elif defined(USE_THREAD) +# define RING_WAIT_QUEUES ((MAX_THREADS + 1) / 2) +# else +# define RING_WAIT_QUEUES 1 +# endif +#endif + +/* it has been found that 6 queues was optimal on various archs at various + * thread counts, so let's use that by default. + */ +#ifndef RING_DFLT_QUEUES +# define RING_DFLT_QUEUES 6 +#endif + #endif /* _HAPROXY_DEFAULTS_H */ diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h index 25536df14..f26b13f21 100644 --- a/include/haproxy/global-t.h +++ b/include/haproxy/global-t.h @@ -190,6 +190,7 @@ struct global { int nb_stk_ctr; /* number of stick counters, defaults to MAX_SESS_STKCTR */ int default_shards; /* default shards for listeners, or -1 (by-thread) or -2 (by-group) */ uint max_checks_per_thread; /* if >0, no more than this concurrent checks per thread */ + uint ring_queues; /* if >0, #ring queues, otherwise equals #thread groups */ #ifdef USE_QUIC unsigned int quic_backend_max_idle_timeout; unsigned int quic_frontend_max_idle_timeout; diff --git a/include/haproxy/ring-t.h b/include/haproxy/ring-t.h index 3d699b503..4e091ee0a 100644 --- a/include/haproxy/ring-t.h +++ b/include/haproxy/ring-t.h @@ -148,10 +148,10 @@ struct ring { /* keep the queue in a separate cache line below */ THREAD_PAD(64 - 3*sizeof(void*) - 4*sizeof(int)); - struct ring_wait_cell *queue; // wait queue - - /* and leave a spacer after it to avoid false sharing */ - THREAD_PAD(64 - sizeof(void*)); + struct { + struct ring_wait_cell *ptr; + THREAD_PAD(64 - sizeof(void*)); + } queue[RING_WAIT_QUEUES + 1]; // wait queue + 1 spacer }; #endif /* _HAPROXY_RING_T_H */ diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h index 357c4c0aa..8e7638e2b 100644 --- a/include/haproxy/tinfo-t.h +++ b/include/haproxy/tinfo-t.h @@ -110,7 +110,7 @@ struct thread_info { uint tid, ltid; /* process-wide and group-wide thread ID (start at 0) 
*/ ulong ltid_bit; /* bit masks for the tid/ltid */ uint tgid; /* ID of the thread group this thread belongs to (starts at 1; 0=unset) */ - /* 32-bit hole here */ + uint ring_queue; /* queue number for the rings */ ullong pth_id; /* the pthread_t cast to a ullong */ void *stack_top; /* the top of the stack when entering the thread */ diff --git a/src/haproxy.c b/src/haproxy.c index 723335a6e..7b2a18a72 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -3150,6 +3150,18 @@ static void *run_thread_poll_loop(void *data) #endif ha_thread_info[tid].stack_top = __builtin_frame_address(0); + /* Assign the ring queue. Contrary to an intuitive thought, this does + * not benefit from locality and it's counter-productive to group + * threads from a same group or range number in the same queue. In some + * sense it arranges us because it means we can use a modulo and ensure + * that even small numbers of threads are well spread. + */ + ha_thread_info[tid].ring_queue = + (tid % MIN(global.nbthread, + (global.tune.ring_queues ? 
+ global.tune.ring_queues : + RING_DFLT_QUEUES))) % RING_WAIT_QUEUES; + /* thread is started, from now on it is not idle nor harmless */ thread_harmless_end(); thread_idle_end(); diff --git a/src/ring.c b/src/ring.c index d45bc245d..4118a645d 100644 --- a/src/ring.c +++ b/src/ring.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +52,7 @@ void ring_init(struct ring *ring, void *area, size_t size, int reset) ring->storage = area; ring->pending = 0; ring->waking = 0; - ring->queue = NULL; + memset(&ring->queue, 0, sizeof(ring->queue)); if (reset) { ring->storage->size = size - sizeof(*ring->storage); @@ -646,6 +647,34 @@ size_t ring_max_payload(const struct ring *ring) return max; } +/* config parser for global "tune.ring.queues", accepts a number from 0 to RING_WAIT_QUEUES */ +static int cfg_parse_tune_ring_queues(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int queues; + + if (too_many_args(1, args, err, NULL)) + return -1; + + queues = atoi(args[1]); + if (queues < 0 || queues > RING_WAIT_QUEUES) { + memprintf(err, "'%s' expects a number between 0 and %d but got '%s'.", args[0], RING_WAIT_QUEUES, args[1]); + return -1; + } + + global.tune.ring_queues = queues; + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.ring.queues", cfg_parse_tune_ring_queues }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + /* * Local variables: * c-indent-level: 8