When trying to speculatively send data to a server being connected to,
we see the following pattern :

    connect() = EINPROGRESS
    send() = EAGAIN
    epoll_ctl(add, W)
    epoll_wait() = EPOLLOUT
    send() = success
  > epoll_ctl(del, W)
  > recv() = EAGAIN
  > epoll_ctl(add, R)
    recv() = success
    epoll_ctl(del, R)

The reason for the failed recv() call is that the read was only marked
as speculative while we already had a polled I/O there. So by the time
we remove the write polling, we already know that a read is pending.
Thus, let's improve this by merging speculative I/O into polled I/O
when the polled state changes. The result is now the following, as
expected :

    connect() = EINPROGRESS
    send() = EAGAIN
    epoll_ctl(add, W)
    epoll_wait() = EPOLLOUT
    send() = success
    epoll_ctl(mod, R)
    recv() = success
    epoll_ctl(del, R)

This is specific to epoll(); it doesn't make much sense at the moment
to do the same for other pollers, because the cost of updating them is
very small.

The average performance gain on small requests is 1.6% in TCP mode,
which is easily explained by the syscall stats below for 10000
forwarded connections.

Before :

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 91.02    0.024608           0     60000         1 epoll_wait
  2.19    0.000593           0     20000           shutdown
  1.52    0.000412           0     10000     10000 connect
  1.36    0.000367           0     29998      9998 sendto
  1.09    0.000294           0     49993           epoll_ctl
  0.93    0.000252           0     50004     20002 recvfrom
  0.79    0.000214           0     20005           close
  0.62    0.000167           0     20001     10001 accept4
  0.25    0.000067           0     20002           setsockopt
  0.13    0.000035           0     10001           socket
  0.10    0.000028           0     10001           fcntl

After :

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 87.59    0.024269           0     50012         1 epoll_wait
  3.19    0.000884           0     20000           shutdown
  2.33    0.000646           0     29996      9996 sendto
  2.02    0.000560           0     10005     10003 connect
  1.40    0.000387           0     40013     10013 recvfrom
  1.35    0.000374           0     40000           epoll_ctl
  0.64    0.000178           0     20001     10001 accept4
  0.55    0.000152           0     20005           close
  0.45    0.000124           0     20002           setsockopt
  0.31    0.000086           0     10001           fcntl
  0.17    0.000047           0     10001           socket

Overall : -16.6% epoll_wait, -20% recvfrom, -20% epoll_ctl

On HTTP, the gain is even better.

Before :

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 80.43    0.015386           0     60006         1 epoll_wait
  4.61    0.000882           0     30000     10000 sendto
  3.74    0.000715           0     20001     10001 accept4
  3.35    0.000640           0     10000     10000 connect
  2.66    0.000508           0     20005           close
  1.34    0.000257           0     30002     10002 recvfrom
  1.27    0.000242           0     30005           epoll_ctl
  1.20    0.000230           0     10000           shutdown
  0.62    0.000119           0     20003           setsockopt
  0.40    0.000077           0     10001           socket
  0.39    0.000074           0     10001           fcntl

After :

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 83.47    0.020301           0     50008         1 epoll_wait
  4.26    0.001036           0     20005           close
  3.30    0.000803           0     30000     10000 sendto
  2.55    0.000621           0     20001     10001 accept4
  1.76    0.000428           0     10000     10000 connect
  1.20    0.000292           0     10000           shutdown
  1.14    0.000278           0     20001         1 recvfrom
  0.86    0.000210           0     20003           epoll_ctl
  0.71    0.000173           0     20003           setsockopt
  0.49    0.000120           0     10001           socket
  0.25    0.000060           0     10001           fcntl

Overall : -16.6% epoll_wait, -33% recvfrom, -33% epoll_ctl
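To make the traced sequence concrete, here is a minimal client-side
sketch (illustrative only, not part of the patch: speculative_send() is
a hypothetical helper, and the epoll instance ep is assumed to have
been created by the caller):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <errno.h>

/* Optimistically send on a socket whose connect() just returned
 * EINPROGRESS. On EAGAIN, register for write readiness: this is the
 * "epoll_ctl(add, W)" step of the trace above. Returns the byte count
 * on success, 0 if the send was deferred, -1 on error.
 */
static ssize_t speculative_send(int ep, int fd, const void *buf, size_t len)
{
	ssize_t ret = send(fd, buf, len, MSG_NOSIGNAL);

	if (ret >= 0)
		return ret;

	if (errno == EAGAIN) {
		struct epoll_event e;

		e.events = EPOLLOUT;
		e.data.fd = fd;
		/* wait for EPOLLOUT; the caller retries send() after
		 * epoll_wait() reports the connection as established.
		 */
		if (epoll_ctl(ep, EPOLL_CTL_ADD, fd, &e) < 0)
			return -1;
		return 0;
	}
	return -1;
}

With the old poller, the EPOLLOUT wakeup then led to del(W), a failed
recv() and add(R); with this patch the pending speculative read is
folded into a single epoll_ctl(mod, R).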
361 lines
8.7 KiB
C
/*
 * FD polling functions for Linux epoll
 *
 * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

#include <common/compat.h>
#include <common/config.h>
#include <common/debug.h>
#include <common/epoll.h>
#include <common/standard.h>
#include <common/ticks.h>
#include <common/time.h>
#include <common/tools.h>

#include <types/global.h>

#include <proto/fd.h>
#include <proto/signal.h>
#include <proto/task.h>


static int absmaxevents = 0;    // absolute maximum amounts of polled events

/* private data */
static struct epoll_event *epoll_events;
static int epoll_fd;

/* This structure may be used for any purpose. Warning! do not use it in
 * recursive functions !
 */
static struct epoll_event ev;

#ifndef EPOLLRDHUP
/* EPOLLRDHUP was defined late in libc, and it appeared in kernel 2.6.17 */
#define EPOLLRDHUP 0x2000
#endif

/*
 * speculative epoll() poller
 */
REGPRM2 static void _do_poll(struct poller *p, int exp)
{
	int status, eo, en;
	int fd, opcode;
	int count;
	int updt_idx;
	int wait_time;

	/* first, scan the update list to find changes */
	for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
		fd = fd_updt[updt_idx];
		en = fdtab[fd].spec_e & 15;  /* new events */
		eo = fdtab[fd].spec_e >> 4;  /* previous events */

		if (fdtab[fd].owner && (eo ^ en)) {
			if ((eo ^ en) & FD_EV_POLLED_RW) {
				/* poll status changed. We'll have to run some syscalls
				 * for this, so let's merge any pending speculative events
				 * into them in order to avoid possible future failed calls
				 * (typically recv()). In practice on a slow connection
				 * establishment, this saves one epoll_ctl() and one recv().
				 */
				en = (en & FD_EV_POLLED_RW) | ((en & FD_EV_ACTIVE_RW) * FD_EV_POLLED / FD_EV_ACTIVE);
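				/* note: the multiplication/division above relies on
				 * FD_EV_POLLED being a fixed multiple of FD_EV_ACTIVE,
				 * so each ACTIVE_{R,W} bit is promoted onto its matching
				 * POLLED_{R,W} bit in a single operation.
				 */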

				if ((en & FD_EV_POLLED_RW) == 0) {
					/* fd removed from poll list */
					opcode = EPOLL_CTL_DEL;
				}
				else if ((eo & FD_EV_POLLED_RW) == 0) {
					/* new fd in the poll list */
					opcode = EPOLL_CTL_ADD;
				}
				else {
					/* fd status changed */
					opcode = EPOLL_CTL_MOD;
				}

				/* construct the epoll events based on new state */
				ev.events = 0;
				if (en & FD_EV_POLLED_R)
					ev.events |= EPOLLIN | EPOLLRDHUP;

				if (en & FD_EV_POLLED_W)
					ev.events |= EPOLLOUT;

				ev.data.fd = fd;
				epoll_ctl(epoll_fd, opcode, fd, &ev);
			}

			fdtab[fd].spec_e = (en << 4) + en;  /* save new events */
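			/* both nibbles of spec_e now carry the state that was just
			 * applied, so the next update computes its eo/en diff
			 * against what the kernel already knows.
			 */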

			if (!(en & FD_EV_ACTIVE_RW)) {
				/* This fd doesn't use any active entry anymore, we can
				 * kill its entry.
				 */
				release_spec_entry(fd);
			}
			else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
				/* we need a new spec entry now */
				alloc_spec_entry(fd);
			}

		}
		fdtab[fd].updated = 0;
		fdtab[fd].new = 0;
	}
	fd_nbupdt = 0;

	/* compute the epoll_wait() timeout */

	if (fd_nbspec || run_queue || signal_queue_len) {
		/* Maybe we still have events in the spec list, or there are
		 * some tasks left pending in the run_queue, so we must not
		 * wait in epoll() otherwise we would delay their delivery by
		 * the next timeout.
		 */
		wait_time = 0;
	}
	else {
		if (!exp)
			wait_time = MAX_DELAY_MS;
		else if (tick_is_expired(exp, now_ms))
			wait_time = 0;
		else {
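			/* round the conversion up (+1) so that the truncation
			 * in TICKS_TO_MS() cannot make us wake up before the
			 * deadline
			 */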
			wait_time = TICKS_TO_MS(tick_remain(now_ms, exp)) + 1;
			if (wait_time > MAX_DELAY_MS)
				wait_time = MAX_DELAY_MS;
		}
	}

	/* now let's wait for polled events */

	gettimeofday(&before_poll, NULL);
	status = epoll_wait(epoll_fd, epoll_events, global.tune.maxpollevents, wait_time);
	tv_update_date(wait_time, status);
	measure_idle();

	/* process polled events */

	for (count = 0; count < status; count++) {
		unsigned int n;
		unsigned int e = epoll_events[count].events;
		fd = epoll_events[count].data.fd;

		if (!fdtab[fd].owner)
			continue;

		/* it looks complicated but gcc can optimize it away when constants
		 * have same values... In fact it depends on gcc :-(
		 */
		fdtab[fd].ev &= FD_POLL_STICKY;
		if (EPOLLIN == FD_POLL_IN && EPOLLOUT == FD_POLL_OUT &&
		    EPOLLPRI == FD_POLL_PRI && EPOLLERR == FD_POLL_ERR &&
		    EPOLLHUP == FD_POLL_HUP) {
			n = e & (EPOLLIN|EPOLLOUT|EPOLLPRI|EPOLLERR|EPOLLHUP);
		}
		else {
			n = ((e & EPOLLIN ) ? FD_POLL_IN  : 0) |
			    ((e & EPOLLPRI) ? FD_POLL_PRI : 0) |
			    ((e & EPOLLOUT) ? FD_POLL_OUT : 0) |
			    ((e & EPOLLERR) ? FD_POLL_ERR : 0) |
			    ((e & EPOLLHUP) ? FD_POLL_HUP : 0);
		}

		/* always remap RDHUP to HUP as they're used similarly */
		if (e & EPOLLRDHUP)
			n |= FD_POLL_HUP;

		if (!n)
			continue;

		fdtab[fd].ev |= n;

		if (fdtab[fd].iocb) {
			int new_updt, old_updt;

			/* Mark the events as speculative before processing
			 * them so that if nothing can be done we don't need
			 * to poll again.
			 */
			if (fdtab[fd].ev & FD_POLL_IN)
				fd_ev_set(fd, DIR_RD);

			if (fdtab[fd].ev & FD_POLL_OUT)
				fd_ev_set(fd, DIR_WR);

			if (fdtab[fd].spec_p) {
				/* This fd was already scheduled for being called as a speculative I/O */
				continue;
			}

			/* Save number of updates to detect creation of new FDs. */
			old_updt = fd_nbupdt;
			fdtab[fd].iocb(fd);

			/* One or more fd might have been created during the iocb().
			 * This mainly happens with new incoming connections that have
			 * just been accepted, so we'd like to process them immediately
			 * for better efficiency. Second benefit, if at the end the fds
			 * are disabled again, we can safely destroy their update entry
			 * to reduce the scope of later scans. This is the reason we
			 * scan the new entries backwards.
			 */

			for (new_updt = fd_nbupdt; new_updt > old_updt; new_updt--) {
				fd = fd_updt[new_updt - 1];
				if (!fdtab[fd].new)
					continue;

				fdtab[fd].new = 0;
				fdtab[fd].ev &= FD_POLL_STICKY;

				if ((fdtab[fd].spec_e & FD_EV_STATUS_R) == FD_EV_ACTIVE_R)
					fdtab[fd].ev |= FD_POLL_IN;

				if ((fdtab[fd].spec_e & FD_EV_STATUS_W) == FD_EV_ACTIVE_W)
					fdtab[fd].ev |= FD_POLL_OUT;

				if (fdtab[fd].ev && fdtab[fd].iocb && fdtab[fd].owner)
					fdtab[fd].iocb(fd);

				/* we can remove this update entry if it's the last one and is
				 * unused, otherwise we don't touch anything.
				 */
				if (new_updt == fd_nbupdt && fdtab[fd].spec_e == 0) {
					fdtab[fd].updated = 0;
					fd_nbupdt--;
				}
			}
		}
	}

	/* the caller will take care of speculative events */
}

/*
 * Initialization of the speculative epoll() poller.
 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
 * disables the poller by setting its pref to 0.
 */
REGPRM1 static int _do_init(struct poller *p)
{
	p->private = NULL;

	epoll_fd = epoll_create(global.maxsock + 1);
	if (epoll_fd < 0)
		goto fail_fd;

	/* See comments at the top of the file about this formula. */
	absmaxevents = MAX(global.tune.maxpollevents, global.maxsock);
	epoll_events = (struct epoll_event*)
		calloc(1, sizeof(struct epoll_event) * absmaxevents);

	if (epoll_events == NULL)
		goto fail_ee;

	return 1;

 fail_ee:
	close(epoll_fd);
	epoll_fd = -1;
 fail_fd:
	p->pref = 0;
	return 0;
}

/*
 * Termination of the speculative epoll() poller.
 * Memory is released and the poller is marked as unselectable.
 */
REGPRM1 static void _do_term(struct poller *p)
{
	free(epoll_events);

	if (epoll_fd >= 0) {
		close(epoll_fd);
		epoll_fd = -1;
	}

	epoll_events = NULL;
	p->private = NULL;
	p->pref = 0;
}

/*
 * Check that the poller works.
 * Returns 1 if OK, otherwise 0.
 */
REGPRM1 static int _do_test(struct poller *p)
{
	int fd;

	fd = epoll_create(global.maxsock + 1);
	if (fd < 0)
		return 0;
	close(fd);
	return 1;
}

/*
 * Recreate the epoll file descriptor after a fork(). Returns 1 if OK,
 * otherwise 0. It will ensure that all processes will not share their
 * epoll_fd. Some side effects were encountered because of this, such
 * as epoll_wait() returning an FD which was previously deleted.
 */
REGPRM1 static int _do_fork(struct poller *p)
{
	if (epoll_fd >= 0)
		close(epoll_fd);
	epoll_fd = epoll_create(global.maxsock + 1);
	if (epoll_fd < 0)
		return 0;
	return 1;
}

/*
 * It is a constructor, which means that it will automatically be called before
 * main(). This is GCC-specific but it works at least since 2.95.
 * Special care must be taken so that it does not need any uninitialized data.
 */
__attribute__((constructor))
static void _do_register(void)
{
	struct poller *p;

	if (nbpollers >= MAX_POLLERS)
		return;

	epoll_fd = -1;
	p = &pollers[nbpollers++];

	p->name = "epoll";
	p->pref = 300;
	p->private = NULL;

	p->clo = NULL;
	p->test = _do_test;
	p->init = _do_init;
	p->term = _do_term;
	p->poll = _do_poll;
	p->fork = _do_fork;
}


/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */