Mirror of http://git.haproxy.org/git/haproxy.git/ (synced 2024-12-17 17:04:35 +00:00)
MINOR: atomic: add armv8.1-a atomics variant for cas-dw
This variant uses the CASP instruction available on armv8.1-a CPU cores, which is detected when __ARM_FEATURE_ATOMICS is set (gcc-linaro >= 7, mainline >= 9). It was tested on cortex-A55 (S905D3) and on AWS' Graviton2 CPUs.

The instruction performs way better at high thread counts since it guarantees some forward progress when facing extreme contention, while the original LL/SC approach is light at low thread counts but doesn't guarantee progress.

The implementation is not the most optimal possible. In particular, since the instruction requires working on register pairs and there doesn't seem to be a way to force gcc to emit register pairs, we have to force the use of the pair (x0,x1) to store the old value and (x2,x3) to store the new one, which necessarily involves some extra moves. But at least it does improve the situation with 16 threads and more. See issue #958 for more context.

Note: a first implementation of this function made use of an input/output constraint passed using "+Q"(*(void**)target), which resulted in smaller overall code than passing "target" as an input register only. It turned out that the cause was directly related to whether the function was inlined or not, hence the "forceinline" attribute. Any changes to this code should still pay attention to this important factor.
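As a rough illustration of how a double-word CAS primitive like this is typically consumed (this sketch is not part of the patch; struct dw_pair, dw_update() and the ABA counter are hypothetical names introduced here), a caller would retry in a loop, relying on __ha_cas_dw() refreshing the expected value on failure:

#include <stdint.h>

/* Hypothetical pointer/counter pair, updated atomically as one 128-bit unit.
 * CASP requires a 16-byte aligned operand, hence the alignment attribute.
 */
struct dw_pair {
        void *ptr;    /* payload pointer */
        void *serial; /* pointer-sized counter used to defeat ABA */
} __attribute__((aligned(16)));

static inline void dw_update(struct dw_pair *target, void *new_ptr)
{
        struct dw_pair old = *target; /* snapshot of the current contents */
        struct dw_pair new;

        do {
                new.ptr    = new_ptr;
                new.serial = (void *)((uintptr_t)old.serial + 1);
                /* on failure, __ha_cas_dw() rewrites <old> with the value
                 * currently in memory, so the next iteration starts fresh */
        } while (!__ha_cas_dw(target, &old, &new));
}

On ARMv8.1-A the CASP-based variant keeps such retry loops short under heavy contention because the instruction guarantees forward progress, whereas the LL/SC fallback may have to retry repeatedly.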
This commit is contained in:
parent 168fc5332c
commit 46cca86900
@@ -510,6 +510,49 @@ __ha_barrier_atomic_full(void)
 */
#define __ha_cpu_relax() ({ asm volatile("isb" ::: "memory"); 1; })

#if defined(__ARM_FEATURE_ATOMICS) // ARMv8.1-A atomics

/* returns 0 on failure, non-zero on success */
static forceinline int __ha_cas_dw(void *target, void *compare, const void *set)
{
        /* There's no status set by the CASP instruction so we need to keep a
         * copy of the original registers and compare them afterwards to detect
         * if we could apply the change. The problem is that we need to force
         * register pairs but there appears to be no gcc constraint to enforce
         * these. So instead we declare them as local variables of type
         * register. This causes extra moves in the code but remains better
         * than the LL/SC versions on many cores.
         */
        void *back0 = ((void **)compare)[0];
        void *back1 = ((void **)compare)[1];
        register void *cmp0 asm("x0") = back0;
        register void *cmp1 asm("x1") = back1;
        register const void *set0 asm("x2") = ((void * const *)set)[0];
        register const void *set1 asm("x3") = ((void * const *)set)[1];
        long ret;

        __asm__ __volatile__("casp %0, %1, %3, %4, [%2]\n"
                             : "+r" (cmp0),   // %0
                               "+r" (cmp1)    // %1
                             : "r" (target),  // %2
                               "r" (set0),    // %3
                               "r" (set1)     // %4
                             : "memory");

        /* if the old value is still the same unchanged, we won, otherwise we
         * store the refreshed old value. This construct remains more efficient
         * than an or(xor,xor).
         */
        ret = cmp0 == back0 && cmp1 == back1;
        if (!ret) {
                ((void **)compare)[0] = cmp0;
                ((void **)compare)[1] = cmp1;
        }
        return ret;
}

#else // no ARMv8.1-A atomics

static __inline int __ha_cas_dw(void *target, void *compare, void *set)
{
        void *value[2];
@@ -535,6 +578,7 @@ static __inline int __ha_cas_dw(void *target, void *compare, void *set)
        memcpy(compare, &value, sizeof(value));
        return (tmp1);
}
#endif // ARMv8.1-A atomics

#else /* unknown / unhandled architecture, fall back to generic barriers */