From 4e9df2737dde3f6d1d171fa17e2594c4c765c00c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 17 Feb 2021 15:20:19 +0100 Subject: [PATCH] BUG/MEDIUM: checks: don't needlessly take the server lock in health_adjust() The server lock was taken preventively for anything in health_adjust(), including the static config checks needed to detect that the lock was not needed, while the function is always called on the response path to update a server's status. This was responsible for huge contention causing a performance drop of about 17% on 16 threads. Let's move the lock only where it should be, i.e. inside the function around the critical sections only. By doing this, a 16-thread process jumped back from 575 to 675 krps. This should be backported to 2.3 as the situation degraded there, and maybe later to 2.2. --- include/haproxy/check.h | 6 +----- src/check.c | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/haproxy/check.h b/include/haproxy/check.h index ffeef4e22..09ae5d379 100644 --- a/include/haproxy/check.h +++ b/include/haproxy/check.h @@ -60,15 +60,11 @@ void set_srv_agent_port(struct server *srv, int port); */ static inline void health_adjust(struct server *s, short status) { - HA_SPIN_LOCK(SERVER_LOCK, &s->lock); /* return now if observing nor health check is not enabled */ - if (!s->observe || !s->check.task) { - HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + if (!s->observe || !s->check.task) return; - } __health_adjust(s, status); - HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); } #endif /* _HAPROXY_CHECKS_H */ diff --git a/src/check.c b/src/check.c index 06c86af79..a09c67a7d 100644 --- a/src/check.c +++ b/src/check.c @@ -404,7 +404,7 @@ void check_notify_stopping(struct check *check) } /* note: use health_adjust() only, which first checks that the observe mode is - * enabled. + * enabled. This will take the server lock if needed. */ void __health_adjust(struct server *s, short status) { @@ -444,6 +444,13 @@ void __health_adjust(struct server *s, short status) chunk_printf(&trash, "Detected %d consecutive errors, last one was: %s", s->consecutive_errors, get_analyze_status(status)); + if (s->check.fastinter) + expire = tick_add(now_ms, MS_TO_TICKS(s->check.fastinter)); + else + expire = TICK_ETERNITY; + + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + switch (s->onerror) { case HANA_ONERR_FASTINTER: /* force fastinter - nothing to do here as all modes force it */ @@ -476,16 +483,14 @@ void __health_adjust(struct server *s, short status) break; } + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + s->consecutive_errors = 0; _HA_ATOMIC_ADD(&s->counters.failed_hana, 1); - if (s->check.fastinter) { - expire = tick_add(now_ms, MS_TO_TICKS(s->check.fastinter)); - if (tick_is_lt(expire, s->check.task->expire)) { - s->check.task->expire = expire; - /* requeue check task with new expire */ - task_queue(s->check.task); - } + if (tick_is_lt(expire, s->check.task->expire)) { + /* requeue check task with new expire */ + task_schedule(s->check.task, expire); } }