From 0d587116c2b15a7d24ec93ac6069b0bdba57d7bd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 1 Jul 2020 15:04:38 +0200 Subject: [PATCH] BUG/MEDIUM: backend: always search in the safe list after failing on the idle one There's a tricky behavior that was lost when the idle connections were made sharable between thread in commit 566df309c ("MEDIUM: connections: Attempt to get idle connections from other threads."), it is the ability to retry from the safe list when looking for any type of idle connection and not finding one in the idle list. It is already important when dealing with long-lived connections since they ultimately all become safe, but that case is already covered by the fact that safe conns not being used end up closing and are not looked up anymore since connect_server() sees there are none. But it's even more important when using server-side connections which periodically close, because the new connections may spend half of their time in safe state and the other half in the idle state, and failing to grab one such connection from the right list results in establishing a new connection. This patch makes sure that a failure to find an idle connection results in a new attempt at finding one from the safe list if available. In order to avoid locking twice, connections are attempted alternatively from the idle then safe list when picking from siblings. Tests have shown a ~2% performance increase by avoiding to lock twice. A typical test with 10000 connections over 16 threads with 210 servers having a 1 millisecond response time and closing every 5 requests shows a degrading performance starting at 120k req/s down to 60-90k and an average reuse rate of 44%. After the fix, the reuse rate raises to 79% and the performance becomes stable at 254k req/s. Similarly the previous test with full keep-alive has now increased from 96% reuse rate to 99% and from 352k to 375k req/s. No backport is needed as this is 2.2-only. --- src/backend.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/backend.c b/src/backend.c index f54181ade..f540e321d 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1070,7 +1070,8 @@ static void assign_tproxy_address(struct stream *s) } /* Attempt to get a backend connection from the specified mt_list array - * (safe or idle connections). + * (safe or idle connections). The argument means what type of + * connection the caller wants. */ static struct connection *conn_backend_get(struct server *srv, int is_safe) { @@ -1086,6 +1087,17 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) i = tid; HA_SPIN_LOCK(OTHER_LOCK, &idle_conns[tid].toremove_lock); conn = MT_LIST_POP(&mt_list[tid], struct connection *, list); + + /* If we failed to pick a connection from the idle list, let's try again with + * the safe list. + */ + if (!conn && !is_safe && srv->curr_safe_nb > 0) { + conn = MT_LIST_POP(&srv->safe_conns[tid], struct connection *, list); + if (conn) { + is_safe = 1; + mt_list = srv->safe_conns; + } + } HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conns[tid].toremove_lock); /* If we found a connection in our own list, and we don't have to @@ -1104,7 +1116,7 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) goto done; /* Lookup all other threads for an idle connection, starting from tid + 1 */ - for (i = tid; !found && (i = ((i + 1 == global.nbthread) ? 0 : i + 1)) != tid;) { + while (!found && (i = ((i + 1 == global.nbthread) ? 0 : i + 1)) != tid) { struct mt_list *elt1, elt2; if (!srv->curr_idle_thr[i]) @@ -1119,6 +1131,19 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) break; } } + + if (!found && !is_safe && srv->curr_safe_nb > 0) { + mt_list_for_each_entry_safe(conn, &srv->safe_conns[i], list, elt1, elt2) { + if (conn->mux->takeover && conn->mux->takeover(conn) == 0) { + MT_LIST_DEL_SAFE(elt1); + _HA_ATOMIC_ADD(&activity[tid].fd_takeover, 1); + found = 1; + is_safe = 1; + mt_list = srv->safe_conns; + break; + } + } + } HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conns[i].toremove_lock); } @@ -1216,10 +1241,13 @@ int connect_server(struct stream *s) reuse = 1; } else if (!srv_conn && srv->curr_idle_conns > 0) { - if (srv->idle_conns && + if (srv->idle_conns && srv->safe_conns && ((s->be->options & PR_O_REUSE_MASK) != PR_O_REUSE_NEVR && s->txn && (s->txn->flags & TX_NOT_FIRST)) && - srv->curr_idle_nb > 0) { + srv->curr_idle_nb + srv->curr_safe_nb > 0) { + /* we're on the second column of the tables above, let's + * try idle then safe. + */ srv_conn = conn_backend_get(srv, 0); was_unused = 1; }