From 92c059c2ac13a61960ffb4c18dacce258043a7f7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 6 Apr 2021 14:24:51 +0200 Subject: [PATCH] MINOR: atomic: implement native BTS/BTR for x86 The current BTS/BTR operations on x86 are ugly because they rely on a CAS, so they may be unfair and take time to converge. Fortunately, where they are currently used (mostly FDs) the contention is expected to be rare (mostly listeners). But this also limits their use to such few low-load cases. On x86 there is a set of BTS/BTR instructions which help for this, but before the FD's state migrated to 32 bits there was little use of them since they do not exist in 8 bits. Now at least it makes sense to use them, at the very least in order to significantly reduce the code size (one BTS instead of a CMPXCHG loop). The implementation relies on modern gcc's ability to return condition flags and limit code inflation and register spilling. The fall back is retained on the old implementation for all other situations (inappropriate target size or non-capable compiler). The code shrank by 1.6 kB on the fast path. As expected, for now on up to 4 threads there is no measurable difference of performance. --- include/haproxy/atomic.h | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/include/haproxy/atomic.h b/include/haproxy/atomic.h index bb99e8db3e..9f566908b3 100644 --- a/include/haproxy/atomic.h +++ b/include/haproxy/atomic.h @@ -327,6 +327,59 @@ #define HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST) #define HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_SEQ_CST) +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__)) +#define HA_ATOMIC_BTS(val, bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btsq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btsl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btsw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_or((val), __b_bts, __ATOMIC_SEQ_CST) & __b_bts); \ + } \ + __ret; \ + }) + +#define HA_ATOMIC_BTR(val, bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btrq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btrl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btrw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_and((val), ~__b_bts, __ATOMIC_SEQ_CST) & __b_bts); \ + } \ + __ret; \ + }) + +#else // not x86 or !__GCC_ASM_FLAG_OUTPUTS__ + #define HA_ATOMIC_BTS(val, bit) \ ({ \ typeof(*(val)) __b_bts = (1UL << (bit)); \ @@ -339,6 +392,8 @@ __atomic_fetch_and((val), ~__b_btr, __ATOMIC_SEQ_CST) & __b_btr; \ }) +#endif // x86 || __GCC_ASM_FLAG_OUTPUTS__ + #define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) /* warning, n is a pointer to the double value for dwcas */ @@ -394,6 +449,7 @@ #define _HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_RELAXED) #define _HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_RELAXED) +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__)) #define _HA_ATOMIC_BTS(val, bit) \ ({ \ typeof(*(val)) __b_bts = (1UL << (bit)); \ @@ -405,6 +461,7 @@ typeof(*(val)) __b_btr = (1UL << (bit)); \ __atomic_fetch_and((val), ~__b_btr, __ATOMIC_RELAXED) & __b_btr; \ }) +#endif #define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED) /* warning, n is a pointer to the double value for dwcas */