MEDIUM: checks: implement a queue in order to limit concurrent checks

The progressive adoption of OpenSSL 3 and its abysmal handshake
performance has started to reveal situations where it simply isn't
possible anymore to successfully run health checks on many servers,
because between the moment all the checks are started and the moment
the handshake finally completes, the timeout has expired!

This also has consequences on production traffic, which gets
significantly delayed as well, simply because of the sheer number of
checks. While it's possible to increase the check delays, that doesn't
solve everything, as checks still take a huge amount of time to converge
in such conditions.

Here we take a different approach: the maximum number of concurrent
checks per thread can now be enforced, backed by an ordered queue.
Thanks to this, if a thread about to start a check has reached its
limit, it appends the check to the end of a queue, and the check is
processed once another check finishes. This proves to be extremely
efficient, with all checks completing in a reasonable amount of time
and not being disturbed by the traffic from other checks. They just
cycle more slowly, but at the speed the machine can handle.
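
To make the mechanism concrete, here is a minimal standalone sketch of
the same discipline (illustration only, not the HAProxy code: the limit
of 2, the plain linked list and the helper names start_or_queue_check()
and check_done() are invented for this example; the real implementation
below uses the per-thread queued_checks list and running_checks counter):

#include <stdio.h>

#define MAX_CHECKS_PER_THREAD 2   /* stand-in for tune.max-checks-per-thread */

struct check {
    int id;
    struct check *next;           /* simple FIFO link */
};

static struct check *queue_head;
static struct check **queue_tail = &queue_head;
static int running_checks;

/* a check wants to start on this thread: run it if a slot is free,
 * otherwise append it to the tail of the queue (ordered, FIFO)
 */
static void start_or_queue_check(struct check *chk)
{
    if (queue_head || running_checks >= MAX_CHECKS_PER_THREAD) {
        chk->next = NULL;
        *queue_tail = chk;
        queue_tail = &chk->next;
        printf("check %d queued\n", chk->id);
        return;
    }
    running_checks++;
    printf("check %d running\n", chk->id);
}

/* a check just finished: release its slot and wake up the oldest waiter */
static void check_done(struct check *chk)
{
    running_checks--;
    printf("check %d done\n", chk->id);
    if (queue_head) {
        struct check *next = queue_head;

        queue_head = next->next;
        if (!queue_head)
            queue_tail = &queue_head;
        running_checks++;
        printf("check %d dequeued and running\n", next->id);
    }
}

int main(void)
{
    struct check c[4] = { {1}, {2}, {3}, {4} };
    int i;

    for (i = 0; i < 4; i++)
        start_or_queue_check(&c[i]);  /* with a limit of 2, checks 3 and 4 wait */
    check_done(&c[0]);                /* frees a slot: check 3 starts */
    check_done(&c[1]);                /* frees a slot: check 4 starts */
    return 0;
}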

One must understand, however, that if some complex checks perform
multiple exchanges, they will hold a check slot for the whole duration
of the check. This is why the limit is not enforced by default.

Tests on SSL show that a limit of 5-50 checks per thread on local
servers already gives excellent results, so that range could be a good
starting point.
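
As a starting point, a hedged configuration sketch (the value of 20 is
only an illustration taken from the 5-50 range above; the backend name,
server names and addresses are made up):

global
    # cap the number of concurrently running health checks per thread;
    # 0 (the default) leaves the limit disabled
    tune.max-checks-per-thread 20

backend be_ssl
    server srv1 192.0.2.11:443 ssl verify none check inter 2s
    server srv2 192.0.2.12:443 ssl verify none check inter 2s

Leaving the directive at its default of 0 keeps the limit disabled.
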
Willy Tarreau 2023-08-31 16:54:22 +02:00
parent cfc0bceeb5
commit 844a3bc25b
4 changed files with 66 additions and 4 deletions


@@ -3283,14 +3283,17 @@ tune.lua.task-timeout <timeout>
 
 tune.max-checks-per-thread <number>
   Sets the number of active checks per thread above which a thread will
-  actively try to search a less loaded thread to run the health check. The
+  actively try to search a less loaded thread to run the health check, or
+  queue it until the number of active checks running on it diminishes. The
   default value is zero, meaning no such limit is set. It may be needed in
   certain environments running an extremely large number of expensive checks
   with many threads when the load appears unequal and may make health checks
   to randomly time out on startup, typically when using OpenSSL 3.0 which is
   about 20 times more CPU-intensive on health checks than older ones. This will
   have for result to try to level the health check work across all threads. The
-  vast majority of configurations do not need to touch this parameter.
+  vast majority of configurations do not need to touch this parameter. Please
+  note that too low values may significantly slow down the health checking if
+  checks are slow to execute.
 
 tune.maxaccept <number>
   Sets the maximum number of consecutive connections a process may accept in a
@@ -16149,7 +16152,14 @@ downinter <delay>
   are started with a small time offset between them. It is also possible to
   add some random noise in the agent and health checks interval using the
   global "spread-checks" keyword. This makes sense for instance when a lot
-  of backends use the same servers.
+  of backends use the same servers. The global "tune.max-checks-per-thread"
+  setting, if defined to a non-null value, will limit the number of concurrent
+  checks being performed at once on any given thread. In order to achieve this,
+  haproxy will put in a queue the checks that were about to start on a thread
+  that has reached this limit, until another check finishes. This will have for
+  effect to extend the effective check interval. In such a case, reducing the
+  "inter" setting will have a very limited effect as it will not be able to
+  reduce the time spent in the queue.
 
 log-proto <logproto>
   The "log-proto" specifies the protocol used to forward event messages to


@@ -62,7 +62,7 @@ enum chk_result {
 /* 4 possible states for CHK_ST_SLEEPING and CHK_ST_READY:
  *   SLP  RDY   State      Description
- *    0    0    -          (reserved)
+ *    0    0    QUEUED     Check is in queue due to concurrency limit
  *    0    1    RUNNING    Check is bound to current thread and running
  *    1    0    SLEEPING   Check is sleeping, not bound to a thread
  *    1    1    MIGRATING  Check is migrating to another thread
@@ -191,6 +191,7 @@ struct check {
     char *alpn_str;             /* ALPN to use for checks */
     int alpn_len;               /* ALPN string length */
     const struct mux_proto_list *mux_proto; /* the mux to use for all outgoing connections (specified by the "proto" keyword) */
+    struct list check_queue;    /* entry in the check queue. Not empty = in queue. */
     int via_socks4;             /* check the connection via socks4 proxy */
 };


@@ -136,6 +136,7 @@ struct thread_ctx {
     struct list streams;        /* list of streams attached to this thread */
     struct list quic_conns;     /* list of active quic-conns attached to this thread */
     struct list quic_conns_clo; /* list of closing quic-conns attached to this thread */
+    struct list queued_checks;  /* checks waiting for a connection slot */
 
     ALWAYS_ALIGN(2*sizeof(void*));
     struct list tasklets[TL_CLASSES]; /* tasklets (and/or tasks) to run, by class */


@@ -1238,6 +1238,25 @@ struct task *process_chk_conn(struct task *t, void *context, unsigned int state)
     task_set_thread(t, tid);
     check->state &= ~CHK_ST_SLEEPING;
 
+    /* if we just woke up and the thread is full of running, or
+     * already has others waiting, we might have to wait in queue
+     * (for health checks only). This means !SLEEPING && !READY.
+     */
+    if (check->server &&
+        (!LIST_ISEMPTY(&th_ctx->queued_checks) ||
+         (global.tune.max_checks_per_thread &&
+          _HA_ATOMIC_LOAD(&th_ctx->running_checks) >= global.tune.max_checks_per_thread))) {
+        TRACE_DEVEL("health-check queued", CHK_EV_TASK_WAKE, check);
+        t->expire = TICK_ETERNITY;
+        LIST_APPEND(&th_ctx->queued_checks, &check->check_queue);
+
+        /* reset fastinter flag (if set) so that srv_getinter()
+         * only returns fastinter if server health is degraded
+         */
+        check->state &= ~CHK_ST_FASTINTER;
+        goto out_leave;
+    }
+
     /* OK let's run, now we cannot roll back anymore */
     check->state |= CHK_ST_READY;
     activity[tid].check_started++;
@@ -1403,6 +1422,25 @@ struct task *process_chk_conn(struct task *t, void *context, unsigned int state)
     check->state |= CHK_ST_SLEEPING;
 
  update_timer:
+    /* when going to sleep, we need to check if other checks are waiting
+     * for a slot. If so we pick them out of the queue and wake them up.
+     */
+    if (check->server && (check->state & CHK_ST_SLEEPING)) {
+        if (!LIST_ISEMPTY(&th_ctx->queued_checks) &&
+            _HA_ATOMIC_LOAD(&th_ctx->running_checks) < global.tune.max_checks_per_thread) {
+            struct check *next_chk = LIST_ELEM(th_ctx->queued_checks.n, struct check *, check_queue);
+
+            /* wake up pending task */
+            LIST_DEL_INIT(&next_chk->check_queue);
+
+            activity[tid].check_started++;
+            _HA_ATOMIC_INC(&th_ctx->running_checks);
+            next_chk->state |= CHK_ST_READY;
+            /* now running */
+            task_wakeup(next_chk->task, TASK_WOKEN_RES);
+        }
+    }
+
     if (check->server) {
         rv = 0;
         if (global.spread_checks > 0) {
@@ -1428,6 +1466,7 @@ struct task *process_chk_conn(struct task *t, void *context, unsigned int state)
     if (check->server)
         HA_SPIN_UNLOCK(SERVER_LOCK, &check->server->lock);
 
+ out_leave:
     TRACE_LEAVE(CHK_EV_TASK_WAKE, check);
 
     /* Free the check if set to PURGE. After this, the check instance may be
@@ -1512,6 +1551,7 @@ const char *init_check(struct check *check, int type)
     check->bi = BUF_NULL;
     check->bo = BUF_NULL;
     LIST_INIT(&check->buf_wait.list);
+    LIST_INIT(&check->check_queue);
 
     return NULL;
 }
@@ -1950,6 +1990,16 @@ REGISTER_POST_CHECK(start_checks);
 REGISTER_SERVER_DEINIT(deinit_srv_check);
 REGISTER_SERVER_DEINIT(deinit_srv_agent_check);
 
+/* perform minimal initializations */
+static void init_checks()
+{
+    int i;
+
+    for (i = 0; i < MAX_THREADS; i++)
+        LIST_INIT(&ha_thread_ctx[i].queued_checks);
+}
+
+INITCALL0(STG_PREPARE, init_checks);
+
 /**************************************************************************/
 /************************** Check sample fetches **************************/