From beb859abcebf2dc1b025b7bb83e53a28192ea6e9 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 22 Nov 2018 18:07:59 +0100 Subject: [PATCH] MINOR: polling: add an option to support busy polling In some situations, especially when dealing with low latency on processors supporting a variable frequency or when running inside virtual machines, each time the process waits for an I/O using the poller, the processor goes back to sleep or is offered to another VM for a long time, and it causes excessively high latencies. The solution provided by this patch is to enable busy polling using a global option. When busy polling is enabled, the pollers never sleep and loop over themselves waiting for an I/O event to happen or for a timeout to occur. On multi-processor machines it can significantly overheat the processor but it usually results in much lower latencies. A typical test consisting of injecting traffic over a single connection at a time over the loopback shows a bump from 4640 to 8540 connections per second on forwarded connections, indicating a latency reduction of 98 microseconds for each connection, and a bump from 12500 to 21250 for locally terminated connections (redirects), indicating a reduction of 33 microseconds. It is only usable with epoll and kqueue because select() and poll()'s API is not convenient for such usages, and the performance levels at which they are typically used don't benefit from this anyway. The option, which obviously remains disabled by default, can be turned on using "busy-polling" in the global section, and turned off later using "no busy-polling". Its status is reported in "show info" to help troubleshoot suspicious CPU spikes. 
--- doc/configuration.txt | 19 ++++++++++++++++ include/types/global.h | 2 ++ include/types/stats.h | 1 + src/cfgparse-global.c | 8 +++++++ src/cfgparse.c | 4 ++-- src/ev_epoll.c | 19 ++++++++++++++-- src/ev_kqueue.c | 49 ++++++++++++++++++++++++++++-------------- src/stats.c | 2 ++ 8 files changed, 84 insertions(+), 20 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 2ce96167e..b34946485 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -1294,6 +1294,25 @@ wurfl-useragent-priority { plain | sideloaded_browser } 3.2. Performance tuning ----------------------- +busy-polling + In some situations, especially when dealing with low latency on processors + supporting a variable frequency or when running inside virtual machines, each + time the process waits for an I/O using the poller, the processor goes back + to sleep or is offered to another VM for a long time, and it causes + excessively high latencies. This option provides a solution preventing the + processor from sleeping by always using a null timeout on the pollers. This + results in a significant latency reduction (30 to 100 microseconds observed) + at the expense of a risk to overheat the processor. It may even be used with + threads, in which case improperly bound threads may heavily conflict, + resulting in a worse performance and high values for the CPU stolen fields + in "show info" output, indicating which threads are misconfigured. It is + important not to let the process run on the same processor as the network + interrupts when this option is used. It is also better to avoid using it on + multiple CPU threads sharing the same core. This option is disabled by + default. If it has been enabled, it may still be forcibly disabled by + prefixing it with the "no" keyword. It is ignored by the "select" and + "poll" pollers. 
+ max-spread-checks By default, haproxy tries to spread the start of health checks across the smallest health check interval of all the servers in a farm. The principle is diff --git a/include/types/global.h b/include/types/global.h index 5a3f338a4..24eeb0ce7 100644 --- a/include/types/global.h +++ b/include/types/global.h @@ -68,6 +68,8 @@ #define GTUNE_NOEXIT_ONFAILURE (1<<9) #define GTUNE_USE_SYSTEMD (1<<10) +#define GTUNE_BUSY_POLLING (1<<11) + /* Access level for a stats socket */ #define ACCESS_LVL_NONE 0 #define ACCESS_LVL_USER 1 diff --git a/include/types/stats.h b/include/types/stats.h index 85cc906da..a188667e8 100644 --- a/include/types/stats.h +++ b/include/types/stats.h @@ -295,6 +295,7 @@ enum info_field { INF_ACTIVE_PEERS, INF_CONNECTED_PEERS, INF_DROPPED_LOGS, + INF_BUSY_POLLING, /* must always be the last one */ INF_TOTAL_FIELDS diff --git a/src/cfgparse-global.c b/src/cfgparse-global.c index a00d96255..4303ef987 100644 --- a/src/cfgparse-global.c +++ b/src/cfgparse-global.c @@ -73,6 +73,14 @@ int cfg_parse_global(const char *file, int linenum, char **args, int kwm) goto out; global.tune.options &= ~GTUNE_USE_POLL; } + else if (!strcmp(args[0], "busy-polling")) { /* "no busy-polling" or "busy-polling" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_BUSY_POLLING; + else + global.tune.options |= GTUNE_BUSY_POLLING; + } else if (!strcmp(args[0], "nosplice")) { if (alertif_too_many_args(0, file, linenum, args, &err_code)) goto out; diff --git a/src/cfgparse.c b/src/cfgparse.c index 2c660ab0d..1e6566856 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -1871,8 +1871,8 @@ int readcfgfile(const char *file) } if (kwm != KWM_STD && strcmp(args[0], "option") != 0 && \ - strcmp(args[0], "log") != 0) { - ha_alert("parsing [%s:%d]: negation/default currently supported only for options and log.\n", file, linenum); + strcmp(args[0], "log") != 0 && strcmp(args[0], 
"busy-polling")) { + ha_alert("parsing [%s:%d]: negation/default currently supported only for options, log, and busy-polling.\n", file, linenum); err_code |= ERR_ALERT | ERR_FATAL; } diff --git a/src/ev_epoll.c b/src/ev_epoll.c index 272ded2b4..0bb8c9f89 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -27,6 +27,7 @@ #include #include +#include /* private data */ @@ -149,8 +150,22 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) wait_time = compute_poll_timeout(exp); tv_entering_poll(); activity_count_runtime(); - status = epoll_wait(epoll_fd[tid], epoll_events, global.tune.maxpollevents, wait_time); - tv_update_date(wait_time, status); + do { + int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time; + + status = epoll_wait(epoll_fd[tid], epoll_events, global.tune.maxpollevents, timeout); + tv_update_date(timeout, status); + + if (status) + break; + if (timeout || !wait_time) + break; + if (signal_queue_len) + break; + if (tick_isset(exp) && tick_is_expired(exp, now_ms)) + break; + } while (1); + tv_leaving_poll(wait_time, status); thread_harmless_end(); diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c index a894f6687..d09784770 100644 --- a/src/ev_kqueue.c +++ b/src/ev_kqueue.c @@ -28,6 +28,7 @@ #include #include +#include /* private data */ @@ -76,14 +77,14 @@ static int _update_fd(int fd, int start) REGPRM2 static void _do_poll(struct poller *p, int exp) { int status; - int count, fd, delta_ms; - struct timespec timeout; + int count, fd, wait_time; + struct timespec timeout_ts; int updt_idx; int changes = 0; int old_fd; - timeout.tv_sec = 0; - timeout.tv_nsec = 0; + timeout_ts.tv_sec = 0; + timeout_ts.tv_nsec = 0; /* first, scan the update list to find changes */ for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; @@ -126,25 +127,41 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) */ EV_SET(&kev[changes++], -1, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); #endif - kevent(kqueue_fd[tid], kev, changes, 
kev_out, changes, &timeout); + kevent(kqueue_fd[tid], kev, changes, kev_out, changes, &timeout_ts); } fd_nbupdt = 0; /* now let's wait for events */ - delta_ms = compute_poll_timeout(exp); - timeout.tv_sec = (delta_ms / 1000); - timeout.tv_nsec = (delta_ms % 1000) * 1000000; + wait_time = compute_poll_timeout(exp); fd = global.tune.maxpollevents; tv_entering_poll(); activity_count_runtime(); - status = kevent(kqueue_fd[tid], // int kq - NULL, // const struct kevent *changelist - 0, // int nchanges - kev, // struct kevent *eventlist - fd, // int nevents - &timeout); // const struct timespec *timeout - tv_update_date(delta_ms, status); - tv_leaving_poll(delta_ms, status); + + do { + int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time; + + timeout_ts.tv_sec = (timeout / 1000); + timeout_ts.tv_nsec = (timeout % 1000) * 1000000; + + status = kevent(kqueue_fd[tid], // int kq + NULL, // const struct kevent *changelist + 0, // int nchanges + kev, // struct kevent *eventlist + fd, // int nevents + &timeout_ts); // const struct timespec *timeout + tv_update_date(timeout, status); + + if (status) + break; + if (timeout || !wait_time) + break; + if (signal_queue_len) + break; + if (tick_isset(exp) && tick_is_expired(exp, now_ms)) + break; + } while (1); + + tv_leaving_poll(wait_time, status); thread_harmless_end(); diff --git a/src/stats.c b/src/stats.c index b082d810f..9b0800edd 100644 --- a/src/stats.c +++ b/src/stats.c @@ -139,6 +139,7 @@ const char *info_field_names[INF_TOTAL_FIELDS] = { [INF_ACTIVE_PEERS] = "ActivePeers", [INF_CONNECTED_PEERS] = "ConnectedPeers", [INF_DROPPED_LOGS] = "DroppedLogs", + [INF_BUSY_POLLING] = "BusyPolling", }; const char *stat_field_names[ST_F_TOTAL_FIELDS] = { @@ -3562,6 +3563,7 @@ int stats_fill_info(struct field *info, int len) info[INF_ACTIVE_PEERS] = mkf_u32(0, active_peers); info[INF_CONNECTED_PEERS] = mkf_u32(0, connected_peers); info[INF_DROPPED_LOGS] = mkf_u32(0, dropped_logs); + info[INF_BUSY_POLLING] = 
mkf_u32(0, !!(global.tune.options & GTUNE_BUSY_POLLING)); return 1; }