[OPTIM] linux: add support for bypassing libc to force using vsyscalls

Some distros' libc are built for CPUs earlier than i686 and as such do
not offer support for the Linux kernel's faster vsyscalls. This code adds
a new build option USE_VSYSCALL to bypass libc for the most commonly used
system calls. A net gain of about 10% can be observed with this change
alone.

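By default it only works when /proc/sys/abi/vsyscall32 equals exactly 2. When
it's set to 1, the VDSO address is randomized and cannot be found by the
default fixed-address detection (the optional USE_VSYSCALL_DLSYM lookup does
not rely on the fixed address).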
This commit is contained in:
Willy Tarreau 2010-11-14 17:09:33 +01:00
parent b695a6e5fa
commit 48d84c10b5
3 changed files with 216 additions and 1 deletions

View File

@ -22,6 +22,7 @@
# USE_LINUX_SPLICE : enable kernel 2.6 splicing (broken on old kernels) # USE_LINUX_SPLICE : enable kernel 2.6 splicing (broken on old kernels)
# USE_LIBCRYPT : enable crypted passwords using -lcrypt # USE_LIBCRYPT : enable crypted passwords using -lcrypt
# USE_CRYPT_H : set it if your system requires including crypt.h # USE_CRYPT_H : set it if your system requires including crypt.h
# USE_VSYSCALL : enable vsyscall on Linux x86, bypassing libc
# #
# Options can be forced by specifying "USE_xxx=1" or can be disabled by using # Options can be forced by specifying "USE_xxx=1" or can be disabled by using
# "USE_xxx=" (empty string). # "USE_xxx=" (empty string).
@ -374,6 +375,12 @@ OPTIONS_OBJS += src/ev_kqueue.o
BUILD_OPTIONS += $(call ignore_implicit,USE_KQUEUE) BUILD_OPTIONS += $(call ignore_implicit,USE_KQUEUE)
endif endif
# USE_VSYSCALL: when set to a non-empty value, link the libc-bypassing syscall
# wrappers (src/i386-linux-vsys.o) and define CONFIG_HAP_LINUX_VSYSCALL so the
# rest of the code knows these wrappers are present.
ifneq ($(USE_VSYSCALL),)
OPTIONS_OBJS += src/i386-linux-vsys.o
OPTIONS_CFLAGS += -DCONFIG_HAP_LINUX_VSYSCALL
BUILD_OPTIONS += $(call ignore_implicit,USE_VSYSCALL)
endif
ifneq ($(USE_NETFILTER),) ifneq ($(USE_NETFILTER),)
OPTIONS_CFLAGS += -DNETFILTER OPTIONS_CFLAGS += -DNETFILTER
BUILD_OPTIONS += $(call ignore_implicit,USE_NETFILTER) BUILD_OPTIONS += $(call ignore_implicit,USE_NETFILTER)

204
src/i386-linux-vsys.c Normal file
View File

@ -0,0 +1,204 @@
/*
* Fast system call support for x86 on Linux
*
* Copyright 2010 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Recent kernels support a faster syscall ABI on x86 using the VDSO page, but
* some libcs that are built for CPUs earlier than i686 do not implement it.
* This code bypasses the libc when the VDSO is detected. It should only be
* used when it is certain that the libc really does not support the VDSO;
* fixing the libc remains the preferred option. Using the VDSO can improve the
* overall performance by about 10%.
*/
#if defined(__linux__) && defined(__i386__)
/* Silently ignore other platforms to be friendly with distro packagers */
#include <dlfcn.h>
#include <sys/mman.h>
/* Fallback syscall entry point: a bare "int $0x80; ret" sequence whose label
 * is emitted by the assembler code further down in this file. Only its
 * address is used from C.
 */
void int80(void); /* declared in the assembler code */
/* Target of the indirect "call *vsyscall" performed in do_syscall. It starts
 * out pointing to int80 and is only replaced by the constructor at the end of
 * this file when a VDSO entry point has been found.
 */
static void *vsyscall = &int80; /* initialize vsyscall to use int80 by default */
/* Save slot for the caller's %ebx, which every wrapper overwrites with the
 * first syscall argument. It is referenced by name from the assembler code
 * only, hence the "used" attribute. Being a single global slot, it makes the
 * wrappers non-reentrant.
 */
static __attribute__((used)) unsigned int back_ebx;
/* Now we redefine some frequently used syscalls as global symbols carrying the
 * same names as the libc functions. Epoll_create is defined too in order to
 * replace old disabled implementations.
 *
 * All wrappers follow the same pattern:
 *   - load the syscall number into %eax ;
 *   - save the caller's %ebx into back_ebx (it is restored in do_syscall) ;
 *   - copy the arguments from the stack into %ebx, %ecx, %edx, %esi, %edi and
 *     %ebp, in that order, as many as the call needs ;
 *   - reach do_syscall, which issues the call through the "vsyscall" pointer
 *     and converts an error return into errno + -1.
 *
 * Wrappers which do not push anything reach do_syscall with "jmp", so that
 * its final "ret" returns straight to the C caller. Wrappers which must save
 * %esi/%edi/%ebp use "call" instead so that they can pop these registers
 * before returning; their stack offsets are shifted by 4 bytes per push.
 *
 * The socket functions only load their sub-call number into %eax and jump to
 * socketcall, which passes it in %ebx together with a pointer to the caller's
 * arguments on the stack in %ecx.
 *
 * Since back_ebx is a single global slot, these wrappers are not reentrant.
 */
asm
(
/* one argument, no extra register to save: tail-jump to do_syscall */
"epoll_create: .GLOBL epoll_create\n"
" mov $0xfe, %eax\n"
" mov %ebx, back_ebx\n"
" mov 4(%esp), %ebx\n"
" jmp do_syscall\n"
/* four arguments: %esi is saved on the stack, so the first argument is found
 * at 8(%esp) instead of 4(%esp).
 */
"epoll_ctl: .GLOBL epoll_ctl\n"
" push %esi\n"
" mov $0xff, %eax\n"
" mov %ebx, back_ebx\n"
" mov 20(%esp), %esi\n"
" mov 16(%esp), %edx\n"
" mov 12(%esp), %ecx\n"
" mov 8(%esp), %ebx\n"
" call do_syscall\n"
" pop %esi\n"
" ret\n"
/* four arguments, same layout as epoll_ctl */
"epoll_wait: .GLOBL epoll_wait\n"
" push %esi\n"
" mov $0x100, %eax\n"
" mov %ebx, back_ebx\n"
" mov 20(%esp), %esi\n"
" mov 16(%esp), %edx\n"
" mov 12(%esp), %ecx\n"
" mov 8(%esp), %ebx\n"
" call do_syscall\n"
" pop %esi\n"
" ret\n"
/* six arguments: %ebp, %edi and %esi are saved on the stack, so the first
 * argument is found at 16(%esp).
 */
"splice: .GLOBL splice\n"
" push %ebp\n"
" push %edi\n"
" push %esi\n"
" mov $0x139, %eax\n"
" mov %ebx, back_ebx\n"
" mov 36(%esp), %ebp\n"
" mov 32(%esp), %edi\n"
" mov 28(%esp), %esi\n"
" mov 24(%esp), %edx\n"
" mov 20(%esp), %ecx\n"
" mov 16(%esp), %ebx\n"
" call do_syscall\n"
" pop %esi\n"
" pop %edi\n"
" pop %ebp\n"
" ret\n"
/* one to three arguments, nothing pushed: tail-jump to do_syscall */
"close: .GLOBL close\n"
" mov $0x06, %eax\n"
" mov %ebx, back_ebx\n"
" mov 4(%esp), %ebx\n"
" jmp do_syscall\n"
"gettimeofday: .GLOBL gettimeofday\n"
" mov $0x4e, %eax\n"
" mov %ebx, back_ebx\n"
" mov 8(%esp), %ecx\n"
" mov 4(%esp), %ebx\n"
" jmp do_syscall\n"
/* always loads three arguments, whether or not the caller passed a third one */
"fcntl: .GLOBL fcntl\n"
" mov $0xdd, %eax\n"
" mov %ebx, back_ebx\n"
" mov 12(%esp), %edx\n"
" mov 8(%esp), %ecx\n"
" mov 4(%esp), %ebx\n"
" jmp do_syscall\n"
/* socket functions: %eax only carries the sub-call number expected by
 * socketcall below, the arguments are left untouched on the stack.
 */
"socket: .GLOBL socket\n"
" mov $0x01, %eax\n"
" jmp socketcall\n"
"bind: .GLOBL bind\n"
" mov $0x02, %eax\n"
" jmp socketcall\n"
"connect: .GLOBL connect\n"
" mov $0x03, %eax\n"
" jmp socketcall\n"
"listen: .GLOBL listen\n"
" mov $0x04, %eax\n"
" jmp socketcall\n"
"accept: .GLOBL accept\n"
" mov $0x05, %eax\n"
" jmp socketcall\n"
"getsockname: .GLOBL getsockname\n"
" mov $0x06, %eax\n"
" jmp socketcall\n"
"send: .GLOBL send\n"
" mov $0x09, %eax\n"
" jmp socketcall\n"
"recv: .GLOBL recv\n"
" mov $0x0a, %eax\n"
" jmp socketcall\n"
"shutdown: .GLOBL shutdown\n"
" mov $0x0d, %eax\n"
" jmp socketcall\n"
"setsockopt: .GLOBL setsockopt\n"
" mov $0x0e, %eax\n"
" jmp socketcall\n"
"getsockopt: .GLOBL getsockopt\n"
" mov $0x0f, %eax\n"
" jmp socketcall\n"
/* common part of the socket functions: the sub-call number moves to %ebx
 * (first argument), %ecx points to the caller's first argument on the stack
 * (second argument), and %eax receives syscall number 0x66.
 */
"socketcall:\n"
" mov %ebx, back_ebx\n"
" mov %eax, %ebx\n"
" mov $0x66, %eax\n"
" lea 4(%esp), %ecx\n"
/* fall through */
/* common exit path: perform the call, restore the caller's %ebx, then either
 * return the result as-is or turn it into errno + -1.
 * NOTE(review): the kernel normally reports errors as -4095..-1; the compare
 * below also treats -4096 as an error, which looks harmless for the calls
 * wrapped here -- to be confirmed.
 */
"do_syscall:\n"
" call *vsyscall\n" // always valid, may be int80 or vsyscall
" mov back_ebx, %ebx\n"
" cmpl $0xfffff000, %eax\n" // consider -4096..-1 for errno
" jae 0f\n"
" ret\n"
"0:\n" // error handling
" neg %eax\n" // get errno value
" push %eax\n" // save it
" call __errno_location\n"
" popl (%eax)\n" // store the pushed errno into the proper location
" mov $-1, %eax\n" // and return -1
" ret\n"
"int80:\n" // default compatible calling convention
" int $0x80\n"
" ret\n"
);
__attribute__((constructor))
static void __i386_linux_vsyscall_init(void)
{
	/* Runs as a constructor and tries to point "vsyscall" at the kernel's
	 * fast system call entry. When nothing usable is found, "vsyscall" is
	 * left untouched.
	 *
	 * Two detection methods exist:
	 *   - USE_VSYSCALL_DLSYM: resolve the __kernel_vsyscall symbol from
	 *     the "linux-gate.so.1" virtual shared object. This requires
	 *     libdl but also works when /proc/sys/abi/vsyscall32 is 1, which
	 *     randomizes the VDSO address.
	 *   - default: rely on the fact that the vsyscall pointer is always
	 *     located at 0xFFFFE018 when /proc/sys/abi/vsyscall32 contains
	 *     the default value 2, after checking that this address can be
	 *     accessed without faulting.
	 */
#ifdef USE_VSYSCALL_DLSYM
	void *entry;
	void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);

	if (!vdso)
		return;

	entry = dlsym(vdso, "__kernel_vsyscall");
	dlclose(vdso);
	if (entry)
		vsyscall = entry;
#else
	const unsigned long vdso_page = 0xffffe000UL;
	unsigned long entry;

	/* Heuristic: mprotect() on the VDSO area only succeeds if the area
	 * is mapped, in which case it is safe to read from it.
	 */
	if (mprotect((void *)vdso_page, 4096, PROT_READ|PROT_EXEC) != 0)
		return;

	/* VDSO is mapped: fetch the entry pointer stored at 0xFFFFE018 */
	entry = *(unsigned long *)(vdso_page + 0x18);

	/* only trust a pointer which has all the bits of the base address set */
	if ((entry & vdso_page) != vdso_page)
		return;

	vsyscall = (void *)entry;
#endif
}
#endif /* defined(__linux__) && defined(__i386__) */

View File

@ -77,8 +77,12 @@
#define __NR_splice 313 #define __NR_splice 313
#endif /* $arch */ #endif /* $arch */
#if defined(CONFIG_HAP_LINUX_VSYSCALL) && defined(__linux__) && defined(__i386__)
/* The syscall is redefined somewhere else: when the vsyscall wrappers are
 * enabled on Linux/i386, splice() is provided by the assembler code in
 * src/i386-linux-vsys.c, so it is only declared here instead of being
 * defined through _syscall6().
 */
extern int splice(int fdin, loff_t *off_in, int fdout, loff_t *off_out, size_t len, unsigned long flags);
#else
_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags) _syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)
#endif
#endif /* __NR_splice */ #endif /* __NR_splice */
/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes /* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes