diff --git a/Makefile b/Makefile index fcd158121..8fde90b9e 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ # explicitly specified : # USE_EPOLL : enable epoll() on Linux 2.6. Automatic. # USE_KQUEUE : enable kqueue() on BSD. Automatic. +# USE_EVPORTS : enable event ports on SunOS systems. Automatic. # USE_MY_EPOLL : redefine epoll_* syscalls. Automatic. # USE_MY_SPLICE : redefine the splice syscall if build fails without. # USE_NETFILTER : enable netfilter on Linux. Automatic. @@ -285,7 +286,7 @@ use_opts = USE_EPOLL USE_KQUEUE USE_MY_EPOLL USE_MY_SPLICE USE_NETFILTER \ USE_GETADDRINFO USE_OPENSSL USE_LUA USE_FUTEX USE_ACCEPT4 \ USE_MY_ACCEPT4 USE_ZLIB USE_SLZ USE_CPU_AFFINITY USE_TFO USE_NS \ USE_DL USE_RT USE_DEVICEATLAS USE_51DEGREES USE_WURFL USE_SYSTEMD \ - USE_OBSOLETE_LINKER USE_PRCTL USE_THREAD_DUMP + USE_OBSOLETE_LINKER USE_PRCTL USE_THREAD_DUMP USE_EVPORTS #### Target system options # Depending on the target platform, some options are set, as well as some @@ -352,7 +353,7 @@ ifeq ($(TARGET),solaris) # We also enable getaddrinfo() which works since solaris 8. 
set_target_defaults = $(call default_opts, \ USE_POLL USE_TPROXY USE_LIBCRYPT USE_CRYPT_H USE_GETADDRINFO USE_THREAD \ - USE_OBSOLETE_LINKER) + USE_OBSOLETE_LINKER USE_EVPORTS) TARGET_CFLAGS = -DFD_SETSIZE=65536 -D_REENTRANT -D_XOPEN_SOURCE=500 -D__EXTENSIONS__ TARGET_LDFLAGS = -lnsl -lsocket endif @@ -515,6 +516,11 @@ OPTIONS_CFLAGS += -DENABLE_KQUEUE OPTIONS_OBJS += src/ev_kqueue.o endif +ifneq ($(USE_EVPORTS),) +OPTIONS_CFLAGS += -DENABLE_EVPORTS +OPTIONS_OBJS += src/ev_evports.o +endif + ifneq ($(USE_VSYSCALL),) OPTIONS_OBJS += src/i386-linux-vsys.o OPTIONS_CFLAGS += -DCONFIG_HAP_LINUX_VSYSCALL diff --git a/doc/configuration.txt b/doc/configuration.txt index 7543cc3fb..cc3e31ffb 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -616,6 +616,7 @@ The following keywords are supported in the "global" section : - maxzlibmem - noepoll - nokqueue + - noevports - nopoll - nosplice - nogetaddrinfo @@ -1460,11 +1461,18 @@ nokqueue equivalent to the command-line argument "-dk". The next polling system used will generally be "poll". See also "nopoll". +noevports + Disables the use of the event ports event polling system on SunOS systems + derived from Solaris 10 and later. It is equivalent to the command-line + argument "-dv". The next polling system used will generally be "poll". See + also "nopoll". + nopoll Disables the use of the "poll" event polling system. It is equivalent to the command-line argument "-dp". The next polling system used will be "select". It should never be needed to disable "poll" since it's available on all - platforms supported by HAProxy. See also "nokqueue" and "noepoll". + platforms supported by HAProxy. See also "nokqueue", "noepoll" and + "noevports". nosplice Disables the use of kernel tcp splicing between sockets on Linux. It is diff --git a/doc/haproxy.1 b/doc/haproxy.1 index 91f58a3b8..86d17cfac 100644 --- a/doc/haproxy.1 +++ b/doc/haproxy.1 @@ -104,6 +104,11 @@ in daemon mode. \fB\-dk\fP Disable use of \fBkqueue\fP(2). 
\fBkqueue\fP(2) is available only on BSD systems. +.TP +\fB\-dv\fP +Disable use of event ports. Event ports are available only on SunOS systems +derived from Solaris 10 and later (including illumos systems). + .TP \fB\-ds\fP Disable use of speculative \fBepoll\fP(7). \fBepoll\fP(7) is available only on diff --git a/include/types/global.h b/include/types/global.h index a52b258b0..62652f5ac 100644 --- a/include/types/global.h +++ b/include/types/global.h @@ -73,6 +73,8 @@ #define GTUNE_LISTENER_MQ (1<<12) #define GTUNE_SET_DUMPABLE (1<<13) +#define GTUNE_USE_EVPORTS (1<<14) + /* Access level for a stats socket */ #define ACCESS_LVL_NONE 0 #define ACCESS_LVL_USER 1 diff --git a/src/cfgparse-global.c b/src/cfgparse-global.c index f4037c2e7..8355a8f1b 100644 --- a/src/cfgparse-global.c +++ b/src/cfgparse-global.c @@ -68,6 +68,11 @@ int cfg_parse_global(const char *file, int linenum, char **args, int kwm) goto out; global.tune.options &= ~GTUNE_USE_KQUEUE; } + else if (!strcmp(args[0], "noevports")) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_EVPORTS; + } else if (!strcmp(args[0], "nopoll")) { if (alertif_too_many_args(0, file, linenum, args, &err_code)) goto out; diff --git a/src/ev_evports.c b/src/ev_evports.c new file mode 100644 index 000000000..dbad7b790 --- /dev/null +++ b/src/ev_evports.c @@ -0,0 +1,418 @@ +/* + * FD polling functions for SunOS event ports. + * + * Copyright 2018 Joyent, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+/*
+ * NOTE(review): the header names on the #include lines below were missing
+ * from this copy of the patch and have been restored; confirm the list
+ * against the tree before applying.
+ */
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <poll.h>
+#include <port.h>
+#include <errno.h>
+#include <syslog.h>
+
+#include <common/compat.h>
+#include <common/config.h>
+#include <common/hathreads.h>
+#include <common/ticks.h>
+#include <common/time.h>
+#include <common/tools.h>
+
+#include <types/global.h>
+
+#include <proto/activity.h>
+#include <proto/fd.h>
+#include <proto/log.h>
+#include <proto/signal.h>
+
+/*
+ * Private data:
+ *   evports_fd         - one event port descriptor per thread, indexed by tid
+ *   evports_evlist     - per-thread buffer handed to port_getn()
+ *   evports_evlist_max - number of entries in evports_evlist
+ */
+static int evports_fd[MAX_THREADS]; // per-thread evports_fd
+static THREAD_LOCAL port_event_t *evports_evlist = NULL;
+static THREAD_LOCAL int evports_evlist_max = 0;
+
+/*
+ * Convert the "state" member of "fdtab" into an event ports event mask:
+ * FD_EV_POLLED_R maps to POLLIN and FD_EV_POLLED_W maps to POLLOUT. Returns
+ * 0 when neither bit is set in <state>.
+ */
+static inline int evports_state_to_events(int state)
+{
+	int events = 0;
+
+	if (state & FD_EV_POLLED_W)
+		events |= POLLOUT;
+	if (state & FD_EV_POLLED_R)
+		events |= POLLIN;
+
+	return (events);
+}
+
+/*
+ * Associate or dissociate this file descriptor with the event port, using the
+ * specified event mask. An empty mask (<events> == 0) dissociates <fd> from
+ * the calling thread's port; any other mask (re)associates it with that mask
+ * and a NULL user pointer. The result of the port call is not checked.
+ */
+static inline void evports_resync_fd(int fd, int events)
+{
+	if (events == 0)
+		port_dissociate(evports_fd[tid], PORT_SOURCE_FD, fd);
+	else
+		port_associate(evports_fd[tid], PORT_SOURCE_FD, fd, events, NULL);
+}
+
+/*
+ * Bring the port association of <fd> in line with its current "fdtab" state
+ * for the calling thread. The thread's bit in polled_mask[fd] records whether
+ * this thread currently watches the fd:
+ *   - fd not assigned to this thread, or no polled direction left: if it was
+ *     watched, clear the bit and dissociate; otherwise do nothing;
+ *   - fd must be polled: set the bit and associate with the mask derived
+ *     from the state.
+ */
+static void _update_fd(int fd)
+{
+	int en;
+	int events;
+
+	en = fdtab[fd].state;
+
+	if (!(fdtab[fd].thread_mask & tid_bit) || !(en & FD_EV_POLLED_RW)) {
+		if (!(polled_mask[fd] & tid_bit)) {
+			/* fd was not watched, it's still not */
+			return;
+		}
+		/* fd totally removed from poll list */
+		events = 0;
+		_HA_ATOMIC_AND(&polled_mask[fd], ~tid_bit);
+	}
+	else {
+		/* OK fd has to be monitored, it was either added or changed */
+		events = evports_state_to_events(en);
+		_HA_ATOMIC_OR(&polled_mask[fd], tid_bit);
+	}
+	evports_resync_fd(fd, events);
+}
+
+/*
+ * Event Ports poller. This routine interacts with the file descriptor
+ * management data structures and routines; see the large block comment in
+ * "src/fd.c" for more information.
+ */
+
+REGPRM2 static void _do_poll(struct poller *p, int exp)
+{
+	int i;
+	int wait_time;
+	struct timespec timeout_ts;
+	unsigned int nevlist;
+	int fd, old_fd;
+	int status;
+
+	/*
+	 * Scan the list of file descriptors with an updated status:
+	 */
+	for (i = 0; i < fd_nbupdt; i++) {
+		fd = fd_updt[i];
+
+		_HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit);
+		if (fdtab[fd].owner == NULL) {
+			/* no owner any more: nothing to (dis)associate */
+			activity[tid].poll_drop++;
+			continue;
+		}
+
+		_update_fd(fd);
+	}
+	fd_nbupdt = 0;
+	/* Scan the global update list */
+	for (old_fd = fd = update_list.first; fd != -1; fd = fdtab[fd].update.next) {
+		/*
+		 * NOTE(review): values below -1 look like special encodings of
+		 * list entries (presumably described in "src/fd.c"); confirm.
+		 */
+		if (fd == -2) {
+			fd = old_fd;
+			continue;
+		}
+		else if (fd <= -3)
+			fd = -fd -4;
+		if (fd == -1)
+			break;
+		if (fdtab[fd].update_mask & tid_bit)
+			done_update_polling(fd);
+		else
+			continue;
+		if (!fdtab[fd].owner)
+			continue;
+		_update_fd(fd);
+	}
+
+	thread_harmless_now();
+
+	/*
+	 * Determine how long to wait for events to materialise on the port.
+	 */
+	wait_time = compute_poll_timeout(exp);
+	tv_entering_poll();
+	activity_count_runtime();
+
+	/*
+	 * The loop below only iterates more than once in busy-polling mode,
+	 * i.e. when port_getn() is called with a zero timeout although
+	 * wait_time is non-zero. It stops as soon as events were retrieved,
+	 * port_getn() failed with something other than ETIME, a signal is
+	 * queued, or <exp> has expired.
+	 */
+	do {
+		int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time;
+		int interrupted = 0;
+		nevlist = 1; /* desired number of events to be retrieved */
+		timeout_ts.tv_sec  = (timeout / 1000);
+		timeout_ts.tv_nsec = (timeout % 1000) * 1000000;
+
+		status = port_getn(evports_fd[tid],
+				   evports_evlist,
+				   evports_evlist_max,
+				   &nevlist, /* updated to the number of events retrieved */
+				   &timeout_ts);
+		if (status != 0) {
+			int e = errno;
+			switch (e) {
+			case ETIME:
+				/*
+				 * Though the manual page has not historically made it
+				 * clear, port_getn() can return -1 with an errno of
+				 * ETIME and still have returned some number of events.
+				 */
+				/* nevlist >= 0 */
+				break;
+			default:
+				nevlist = 0;
+				interrupted = 1;
+				break;
+			}
+		}
+		tv_update_date(timeout, nevlist);
+
+		if (nevlist || interrupted)
+			break;
+		if (timeout || !wait_time)
+			break;
+		if (signal_queue_len)
+			break;
+		if (tick_isset(exp) && tick_is_expired(exp, now_ms))
+			break;
+	} while(1);
+
+	tv_leaving_poll(wait_time, nevlist);
+
+	thread_harmless_end();
+
+	for (i = 0; i < nevlist; i++) {
+		unsigned int n = 0;
+		int events, rebind_events;
+		fd = evports_evlist[i].portev_object;
+		events = evports_evlist[i].portev_events;
+
+		if (fdtab[fd].owner == NULL) {
+			activity[tid].poll_dead++;
+			continue;
+		}
+
+		if (!(fdtab[fd].thread_mask & tid_bit)) {
+			activity[tid].poll_skip++;
+			continue;
+		}
+
+		/*
+		 * By virtue of receiving an event for this file descriptor, it
+		 * is no longer associated with the port in question. Store
+		 * the previous event mask so that we may reassociate after
+		 * processing is complete.
+		 */
+		rebind_events = evports_state_to_events(fdtab[fd].state);
+		/* rebind_events != 0 */
+
+		/*
+		 * Set bits based on the events we received from the port:
+		 */
+		if (events & POLLIN)
+			n |= FD_POLL_IN;
+		if (events & POLLOUT)
+			n |= FD_POLL_OUT;
+		if (events & POLLERR)
+			n |= FD_POLL_ERR;
+		if (events & POLLHUP)
+			n |= FD_POLL_HUP;
+
+		/*
+		 * Call connection processing callbacks. Note that it's
+		 * possible for this processing to alter the required event
+		 * port association; i.e., the "state" member of the "fdtab"
+		 * entry. If it changes, the fd will be placed on the updated
+		 * list for processing the next time we are called.
+		 */
+		fd_update_events(fd, n);
+
+		/*
+		 * This file descriptor was closed during the processing of
+		 * polled events. No need to reassociate.
+		 */
+		if (fdtab[fd].owner == NULL)
+			continue;
+
+		/*
+		 * Reassociate with the port, using the same event mask as
+		 * before. This call will not result in a dissociation as we
+		 * asserted that _some_ events needed to be rebound above.
+		 *
+		 * Reassociating with the same mask allows us to mimic the
+		 * level-triggered behaviour of poll(2). In the event that we
+		 * are interested in the same events on the next turn of the
+		 * loop, this represents no extra work.
+		 *
+		 * If this additional port_associate(3C) call becomes a
+		 * performance problem, we would need to verify that we can
+		 * correctly interact with the file descriptor cache and update
+		 * list (see "src/fd.c") to avoid reassociating here, or to use
+		 * a different events mask.
+		 */
+		evports_resync_fd(fd, rebind_events);
+	}
+}
+
+/*
+ * Per-thread initialisation: allocates the event buffer handed to
+ * port_getn() (global.tune.maxpollevents entries) and, for every thread
+ * other than the first one (tid != 0), creates a dedicated event port.
+ * Returns 1 on success. On failure returns 0 with the buffer released and
+ * both evports_evlist and evports_evlist_max reset.
+ */
+static int init_evports_per_thread(void)
+{
+	int fd;
+
+	evports_evlist_max = global.tune.maxpollevents;
+	evports_evlist = calloc(evports_evlist_max, sizeof(*evports_evlist));
+	if (evports_evlist == NULL)
+		goto fail;
+
+	if (MAX_THREADS > 1 && tid) {
+		if ((evports_fd[tid] = port_create()) == -1)
+			goto fail;
+	}
+
+	/* we may have to unregister some events initially registered on the
+	 * original fd when it was alone, and/or to register events on the new
+	 * fd for this thread. Let's just mark them as updated, the poller will
+	 * do the rest.
+	 */
+	for (fd = 0; fd < global.maxsock; fd++)
+		updt_fd_polling(fd);
+
+	return 1;
+
+ fail:
+	/* free(NULL) is a no-op, so a single exit path serves both failures
+	 * and never leaves a non-zero size next to a NULL buffer.
+	 */
+	free(evports_evlist);
+	evports_evlist = NULL;
+	evports_evlist_max = 0;
+	return 0;
+}
+
+/*
+ * Per-thread cleanup: closes the event port created by
+ * init_evports_per_thread() for threads other than the first one, and
+ * releases the calling thread's event buffer.
+ */
+static void deinit_evports_per_thread(void)
+{
+	if (MAX_THREADS > 1 && tid) {
+		close(evports_fd[tid]);
+		/* do not leave a stale descriptor behind, as in _do_term() */
+		evports_fd[tid] = -1;
+	}
+
+	free(evports_evlist);
+	evports_evlist = NULL;
+	evports_evlist_max = 0;
+}
+
+/*
+ * Initialisation of the event ports poller: creates the event port of the
+ * calling thread and registers the per-thread init/deinit callbacks.
+ * Returns 0 in case of failure, non-zero in case of success. On failure the
+ * poller's preference is set to 0.
+ */
+REGPRM1 static int _do_init(struct poller *p)
+{
+	p->private = NULL;
+
+	if ((evports_fd[tid] = port_create()) == -1) {
+		goto fail;
+	}
+
+	hap_register_per_thread_init(init_evports_per_thread);
+	hap_register_per_thread_deinit(deinit_evports_per_thread);
+
+	return 1;
+
+fail:
+	p->pref = 0;
+	return 0;
+}
+
+/*
+ * Termination of the event ports poller.
+ * All resources are released and the poller is marked as inoperative.
+ */
+REGPRM1 static void _do_term(struct poller *p)
+{
+	int *port = &evports_fd[tid];
+
+	/* give back the calling thread's port if it still holds one */
+	if (*port != -1) {
+		close(*port);
+		*port = -1;
+	}
+
+	/* release the event buffer */
+	free(evports_evlist);
+	evports_evlist = NULL;
+	evports_evlist_max = 0;
+
+	/* a zero preference marks the poller as unusable */
+	p->pref = 0;
+	p->private = NULL;
+}
+
+/*
+ * Run-time probe: verifies that an event port can actually be created.
+ * The probe port is closed again right away.
+ * Returns 1 on success, otherwise 0.
+ */
+REGPRM1 static int _do_test(struct poller *p)
+{
+	int probe = port_create();
+
+	if (probe == -1)
+		return 0;
+
+	close(probe);
+	return 1;
+}
+
+/*
+ * Replace the event port after fork(): the port currently held by the calling
+ * thread, if any, is closed and a new one is created in its place. Returns 1
+ * on success, otherwise 0. If this function fails, "_do_term()" must be
+ * called to clean up the poller.
+ */
+REGPRM1 static int _do_fork(struct poller *p)
+{
+	int *port = &evports_fd[tid];
+
+	if (*port != -1)
+		close(*port);
+
+	*port = port_create();
+	return (*port == -1) ? 0 : 1;
+}
+
+/*
+ * This constructor must be called before main() to register the event ports
+ * poller.
+ */
+__attribute__((constructor))
+static void _do_register(void)
+{
+	struct poller *slot;
+	int t;
+
+	/* pollers table already full: leave without registering anything */
+	if (nbpollers >= MAX_POLLERS)
+		return;
+
+	/* no thread holds an event port yet */
+	t = 0;
+	while (t < MAX_THREADS)
+		evports_fd[t++] = -1;
+
+	/* claim the next free entry of the pollers table */
+	slot = &pollers[nbpollers];
+	nbpollers++;
+
+	slot->name    = "evports";
+	slot->pref    = 300;
+	slot->flags   = 0;
+	slot->private = NULL;
+
+	/* callbacks; this poller provides no "clo" handler */
+	slot->test = _do_test;
+	slot->init = _do_init;
+	slot->poll = _do_poll;
+	slot->fork = _do_fork;
+	slot->term = _do_term;
+	slot->clo  = NULL;
+}
diff --git a/src/haproxy.c b/src/haproxy.c index 76d2aa0bf..f85f15fb4 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -460,6 +460,9 @@ static void usage(char *name) #if defined(ENABLE_KQUEUE) " -dk disables kqueue() usage even when available\n" #endif +#if defined(ENABLE_EVPORTS) + " -dv disables event ports usage even when available\n" +#endif #if defined(ENABLE_POLL) " -dp disables poll() usage even when available\n" #endif @@ -1352,6 +1355,9 @@ static void init(int argc, char **argv) #if defined(ENABLE_KQUEUE) global.tune.options |= GTUNE_USE_KQUEUE; #endif +#if defined(ENABLE_EVPORTS) + global.tune.options |= GTUNE_USE_EVPORTS; +#endif #if defined(CONFIG_HAP_LINUX_SPLICE) global.tune.options |= GTUNE_USE_SPLICE; #endif @@ -1396,6 +1402,10 @@ static void init(int argc, char **argv) else if (*flag == 'd' && flag[1] == 'k') global.tune.options &= ~GTUNE_USE_KQUEUE; #endif +#if defined(ENABLE_EVPORTS) + else if (*flag == 'd' && flag[1] == 'v') + global.tune.options &= ~GTUNE_USE_EVPORTS; +#endif #if defined(CONFIG_HAP_LINUX_SPLICE) else if (*flag == 'd' && flag[1] == 'S') global.tune.options &= ~GTUNE_USE_SPLICE; @@ -2025,6 +2035,9 @@ static void init(int argc, char **argv) if (!(global.tune.options & GTUNE_USE_KQUEUE)) disable_poller("kqueue"); + if (!(global.tune.options & GTUNE_USE_EVPORTS)) + disable_poller("evports"); + if (!(global.tune.options & GTUNE_USE_EPOLL)) disable_poller("epoll");