MINOR: atomic: implement native BTS/BTR for x86

The current BTS/BTR operations on x86 are ugly because they rely on a
CAS, so they may be unfair and take time to converge. Fortunately,
where they are currently used (mostly FDs) the contention is expected
to be rare (mostly listeners). But this also limits their use to these
few low-load cases.

On x86 there is a set of BTS/BTR instructions which help for this,
but before the FD's state migrated to 32 bits there was little use
for them since they do not exist in an 8-bit form.

Now it makes sense to use them, at the very least in order to
significantly reduce the code size (one BTS instead of a CMPXCHG
loop). The implementation relies on modern gcc's ability to return
condition flags from asm statements, which limits code inflation and
register spilling. The fallback to the old implementation is retained
for all other situations (inappropriate operand size or non-capable
compiler). The code shrank by 1.6 kB on the fast path.

As expected, there is no measurable performance difference on up to
4 threads for now.
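
To make the "unfair and may take time to converge" point concrete, below is a
minimal sketch (illustrative only, not HAProxy code; the function name and
types are made up) of what a CAS-based test-and-set of a bit amounts to on a
32-bit word. The new macros in the diff replace this whole retry loop with a
single "lock bts":

#include <stdatomic.h>

/* illustrative CAS-based test-and-set of a bit in a 32-bit word */
static inline int cas_bts(_Atomic unsigned int *word, unsigned int bit)
{
	unsigned int mask = 1U << bit;
	unsigned int old = atomic_load_explicit(word, memory_order_relaxed);

	/* retry until our compare-exchange wins; other threads may keep
	 * winning ahead of us, hence the possible unfairness and the
	 * unbounded convergence time under contention.
	 */
	while (!atomic_compare_exchange_weak(word, &old, old | mask))
		;
	return !!(old & mask); /* previous value of the bit */
}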
Willy Tarreau 2021-04-06 14:24:51 +02:00
parent fa68d2641b
commit 92c059c2ac
1 changed file with 57 additions and 0 deletions


@@ -327,6 +327,59 @@
#define HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST)
#define HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_SEQ_CST)
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__))
#define HA_ATOMIC_BTS(val, bit) \
({ \
unsigned char __ret; \
if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \
asm volatile("lock btsq %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned long)(bit)) \
: "cc"); \
} else if (sizeof(*(val)) == 4) { \
asm volatile("lock btsl %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned int)(bit)) \
: "cc"); \
} else if (sizeof(*(val)) == 2) { \
asm volatile("lock btsw %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned short)(bit)) \
: "cc"); \
} else { \
typeof(*(val)) __b_bts = (1UL << (bit)); \
__ret = !!(__atomic_fetch_or((val), __b_bts, __ATOMIC_SEQ_CST) & __b_bts); \
} \
__ret; \
})
#define HA_ATOMIC_BTR(val, bit) \
({ \
unsigned char __ret; \
if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \
asm volatile("lock btrq %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned long)(bit)) \
: "cc"); \
} else if (sizeof(*(val)) == 4) { \
asm volatile("lock btrl %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned int)(bit)) \
: "cc"); \
} else if (sizeof(*(val)) == 2) { \
asm volatile("lock btrw %2, %0\n" \
: "+m" (*(val)), "=@ccc"(__ret) \
: "Ir" ((unsigned short)(bit)) \
: "cc"); \
} else { \
typeof(*(val)) __b_bts = (1UL << (bit)); \
__ret = !!(__atomic_fetch_and((val), ~__b_bts, __ATOMIC_SEQ_CST) & __b_bts); \
} \
__ret; \
})
#else // not x86 or !__GCC_ASM_FLAG_OUTPUTS__
#define HA_ATOMIC_BTS(val, bit) \
({ \
typeof(*(val)) __b_bts = (1UL << (bit)); \
@@ -339,6 +392,8 @@
__atomic_fetch_and((val), ~__b_btr, __ATOMIC_SEQ_CST) & __b_btr; \
})
#endif // x86 || __GCC_ASM_FLAG_OUTPUTS__
#define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
/* warning, n is a pointer to the double value for dwcas */
@@ -394,6 +449,7 @@
#define _HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_RELAXED)
#define _HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_RELAXED)
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__))
#define _HA_ATOMIC_BTS(val, bit) \
({ \
typeof(*(val)) __b_bts = (1UL << (bit)); \
@@ -405,6 +461,7 @@
typeof(*(val)) __b_btr = (1UL << (bit)); \
__atomic_fetch_and((val), ~__b_btr, __ATOMIC_RELAXED) & __b_btr; \
})
#endif
#define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)
/* warning, n is a pointer to the double value for dwcas */
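
For illustration, a hypothetical caller of the new macros might look like the
sketch below, assuming haproxy/atomic.h is included; the "fd_state" variable
and the bit number are made up for the example and are not HAProxy's actual
definitions:

/* hypothetical usage sketch; "fd_state" and bit 3 are illustrative only */
static unsigned int fd_state;

static void example_fd_event(void)
{
	/* atomically set bit 3; returns non-zero if it was already set */
	if (!HA_ATOMIC_BTS(&fd_state, 3)) {
		/* this thread is the one that turned the bit on */
	}

	/* atomically clear bit 3; returns non-zero if it was set before */
	HA_ATOMIC_BTR(&fd_state, 3);
}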