MINOR: thread: provide an alternative to pthread's rwlock

Since version 1.1.0, OpenSSL's libcrypto ignores the provided locking
mechanism and uses pthread's rwlocks instead. The problem is that for
some code paths (e.g. async engines) this results in a huge amount of
syscalls on systems facing a bit of contention, to the point where more
than 80% of the CPU can be spent in the system dealing with spinlocks
just for futex_wake().

This patch provides an alternative by redefining the relevant pthread
rwlocks from the low-overhead version of the progressive rw locks. This
way there will be no more syscalls in case of contention, and CPU will
be burnt in userland. Doing this saves massive amounts of CPU, where
the locks only take 12-15% vs 80% before, which allows SSL to work much
faster on large thread counts (e.g. 24 or more).

The tryrdlock and trywrlock variants have been implemented using a CAS
since their goal is only to succeed on no contention and never to wait.
The pthread_rwlock API is complete except that the timed versions of
the rdlock and wrlock do not wait and simply fall back to trylock
versions.

Since the gains have only been observed with async engines for now,
this option remains disabled by default. It can be enabled at build
time using USE_PTHREAD_EMULATION=1.
This commit is contained in:
Willy Tarreau 2022-07-10 10:58:57 +02:00
parent 688709d814
commit 87aff021db
2 changed files with 71 additions and 1 deletions

View File

@ -56,6 +56,7 @@
# USE_OT : enable the OpenTracing filter
# USE_MEMORY_PROFILING : enable the memory profiler. Linux-glibc only.
# USE_LIBATOMIC : force to link with/without libatomic. Automatic.
# USE_PTHREAD_EMULATION: replace pthread's rwlocks with ours
#
# Options can be forced by specifying "USE_xxx=1" or can be disabled by using
# "USE_xxx=" (empty string). The list of enabled and disabled options for a
@ -337,7 +338,7 @@ LDFLAGS = $(ARCH_FLAGS) -g
# the reported build options.
use_opts = USE_EPOLL USE_KQUEUE USE_NETFILTER \
USE_PCRE USE_PCRE_JIT USE_PCRE2 USE_PCRE2_JIT USE_POLL \
USE_THREAD USE_BACKTRACE \
USE_THREAD USE_PTHREAD_EMULATION USE_BACKTRACE \
USE_STATIC_PCRE USE_STATIC_PCRE2 USE_TPROXY USE_LINUX_TPROXY \
USE_LINUX_SPLICE USE_LIBCRYPT USE_CRYPT_H USE_ENGINE \
USE_GETADDRINFO USE_OPENSSL USE_LUA USE_ACCEPT4 \

View File

@ -968,6 +968,75 @@ void __spin_unlock(enum lock_label lbl, struct ha_spinlock *l,
#endif // defined(DEBUG_THREAD) || defined(DEBUG_FULL)
#if defined(USE_PTHREAD_EMULATION)
/* pthread rwlock emulation using plocks (to avoid expensive futexes).
* these are a direct mapping on Progressive Locks, with the exception that
* since there's a common unlock operation in pthreads, we need to know if
* we need to unlock for reads or writes, so we set the topmost bit to 1 when
* a write lock is acquired to indicate that a write unlock needs to be
* performed. It's not a problem since this bit will never be used given that
* haproxy won't support as many threads as the plocks.
*
* The storage is the pthread_rwlock_t cast as a ulong.
*/
/* Emulated pthread_rwlock_init(): clears the lock word overlaid on the start
 * of <rwlock> by writing zero to it. The <attr> argument is not consulted.
 * Always returns 0.
 */
int pthread_rwlock_init(pthread_rwlock_t *restrict rwlock, const pthread_rwlockattr_t *restrict attr)
{
	(void)attr; /* attributes are ignored by the emulation */

	*(ulong *)rwlock = 0;
	return 0;
}
/* Emulated pthread_rwlock_destroy(): there is nothing to release, the lock
 * word overlaid on <rwlock> is simply written back to zero. Always returns 0.
 */
int pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
{
	ulong *word = (ulong *)rwlock;

	word[0] = 0;
	return 0;
}
/* Emulated pthread_rwlock_rdlock(): takes the lock for reading by handing the
 * lock word overlaid on <rwlock> to pl_lorw_rdlock(). Always returns 0.
 */
int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
{
	unsigned long *word = (unsigned long *)rwlock;

	pl_lorw_rdlock(word);
	return 0;
}
/* Emulated pthread_rwlock_tryrdlock(): performs a single compare-and-swap
 * of the lock word from 0 to PLOCK_LORW_SHR_BASE and never waits. Returns 0
 * when pl_cmpxchg() yields zero, otherwise 1.
 *
 * NOTE(review): POSIX specifies EBUSY as the failure code; callers that only
 * test for a non-zero result are unaffected, but the literal 1 is not EBUSY.
 */
int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
{
	if (pl_cmpxchg((unsigned long *)rwlock, 0, PLOCK_LORW_SHR_BASE))
		return 1;

	return 0;
}
/* Emulated pthread_rwlock_timedrdlock(): does not wait at all. The deadline
 * in <abstime> is ignored and the call is forwarded to
 * pthread_rwlock_tryrdlock(), whose result is returned unchanged.
 */
int pthread_rwlock_timedrdlock(pthread_rwlock_t *restrict rwlock, const struct timespec *restrict abstime)
{
	int ret;

	(void)abstime; /* no timed wait is implemented */

	ret = pthread_rwlock_tryrdlock(rwlock);
	return ret;
}
/* Emulated pthread_rwlock_wrlock(): takes the lock for writing by handing the
 * lock word overlaid on <rwlock> to pl_lorw_wrlock(). Always returns 0.
 */
int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
{
	unsigned long *word = (unsigned long *)rwlock;

	pl_lorw_wrlock(word);
	return 0;
}
/* Emulated pthread_rwlock_trywrlock(): performs a single compare-and-swap
 * of the lock word from 0 to PLOCK_LORW_EXC_BASE and never waits. Returns 0
 * when pl_cmpxchg() yields zero, otherwise 1.
 *
 * NOTE(review): POSIX specifies EBUSY as the failure code; callers that only
 * test for a non-zero result are unaffected, but the literal 1 is not EBUSY.
 */
int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
{
	if (pl_cmpxchg((unsigned long *)rwlock, 0, PLOCK_LORW_EXC_BASE))
		return 1;

	return 0;
}
/* Emulated pthread_rwlock_timedwrlock(): does not wait at all. The deadline
 * in <abstime> is ignored and the call is forwarded to
 * pthread_rwlock_trywrlock(), whose result is returned unchanged.
 */
int pthread_rwlock_timedwrlock(pthread_rwlock_t *restrict rwlock, const struct timespec *restrict abstime)
{
	int ret;

	(void)abstime; /* no timed wait is implemented */

	ret = pthread_rwlock_trywrlock(rwlock);
	return ret;
}
/* Emulated pthread_rwlock_unlock(): single release entry point for the lock,
 * delegating to pl_lorw_unlock() on the lock word overlaid on <rwlock>.
 * Always returns 0.
 */
int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
{
	unsigned long *word = (unsigned long *)rwlock;

	pl_lorw_unlock(word);
	return 0;
}
#endif // defined(USE_PTHREAD_EMULATION)
/* Depending on the platform and how libpthread was built, pthread_exit() may
* involve some code in libgcc_s that would be loaded on exit for the first
* time, causing aborts if the process is chrooted. It's harmless but very