IMPORT: ebtree: rework the fls macros to better deal with arch-specific ones
The definitions were a bit of a mess and there wasn't even a fallback to __builtin_clz() on compilers supporting it. Now we instead define a macro for each implementation, set on an arch-dependent, case-by-case basis, and add the fallback ones only when not defined. This also allows flsnz8() to automatically fall back to the 32-bit arch-specific version if available. This shows a consistent 33% speedup on arm for strings. This is cbtree commit c6075742e8d0a6924e7183d44bd93dec20ca8049. This is ebtree commit f452d0f83eca72f6c3484ccb138d341ed6fd27ed.
This commit is contained in:
parent
fc421e5b3d
commit
1c46a07460
|
@ -250,46 +250,83 @@
|
|||
#include <import/ebtree-t.h>
|
||||
#include <haproxy/api.h>
|
||||
|
||||
static inline int flsnz8_generic(unsigned int x)
|
||||
/* returns clz from 7 to 0 for 0x01 to 0xFF. Returns 7 for 0 as well. */
|
||||
/* Counts the leading zero bits of the 8-bit value <c> with a single nibble
 * test followed by a lookup in a constant holding 2 bits per entry.
 */
static inline unsigned int clz8(unsigned char c)
|
||||
{
|
||||
/* start by assuming the high nibble is empty, which accounts for 4 zeros */
unsigned int r = 4;
|
||||
|
||||
if (c & 0xf0) {
|
||||
/* the high nibble holds a bit: restart the count and look at it instead */
r = 0;
|
||||
c >>= 4;
|
||||
}
|
||||
/* 0x55af is indexed by (c * 2) and yields 3 for 0 and 1, 2 for 2..3,
 * 1 for 4..7, and 0 for 8..15 (shifts of 16 bits or more leave 0).
 */
return r + ((0x000055afU >> (c * 2)) & 0x3);
|
||||
}
|
||||
|
||||
/* FLSNZ: find last set bit for non-zero value. "Last" here means the highest
|
||||
* one. It returns a value from 1 to 32 for 1<<0 to 1<<31.
|
||||
*/
|
||||
|
||||
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__atom__)
|
||||
/* DO NOT USE ON ATOM! The instruction is emulated and is several times slower
|
||||
* than doing the math by hand.
|
||||
*/
|
||||
/* x86 version of flsnz32(): returns the 1-based position of the highest set
 * bit of <x>. <x> must not be zero since the BSR result is then undefined.
 */
static inline int flsnz32(unsigned int x)
|
||||
{
|
||||
int r;
|
||||
/* bsrl writes the zero-based index of the highest set bit of <x> into <r> */
__asm__("bsrl %1,%0\n"
|
||||
: "=r" (r) : "rm" (x));
|
||||
/* turn the zero-based index into the 1..32 range */
return r + 1;
|
||||
}
|
||||
/* self-referencing macro: makes this implementation visible to #ifndef and
 * defined() tests without changing what the name expands to.
 */
#define flsnz32(x) flsnz32(x)
|
||||
|
||||
# if defined(__x86_64__)
|
||||
/* x86_64 version of flsnz64(): returns the 1-based position of the highest
 * set bit of the 64-bit value <x>. <x> must not be zero since the BSR result
 * is then undefined.
 */
static inline int flsnz64(unsigned long long x)
|
||||
{
|
||||
/* 64-bit destination so that it matches the operand size of bsrq */
unsigned long long r;
|
||||
/* bsrq writes the zero-based index of the highest set bit of <x> into <r> */
__asm__("bsrq %1,%0\n"
|
||||
: "=r" (r) : "rm" (x));
|
||||
/* turn the zero-based index into a 1-based position; narrowed to int */
return r + 1;
|
||||
}
|
||||
/* self-referencing macro: makes this implementation visible to #ifndef and
 * defined() tests without changing what the name expands to.
 */
# define flsnz64(x) flsnz64(x)
|
||||
# endif
|
||||
|
||||
#elif !defined(__atom__) && defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2)))
|
||||
/* gcc >= 4.2 brings __builtin_clz() and __builtin_clzl(), usable for non-x86 */
|
||||
|
||||
/* Compiler-builtin version of flsnz32(): returns the 1-based position of the
 * highest set bit of <x>. <x> must not be zero, as __builtin_clz() is
 * undefined for a zero argument.
 */
static inline int flsnz32(unsigned int x)
|
||||
{
|
||||
/* 32 minus the number of leading zeros is the 1-based highest bit position */
return 32 - __builtin_clz(x);
|
||||
}
|
||||
/* self-referencing macro: makes this implementation visible to #ifndef and
 * defined() tests without changing what the name expands to.
 */
# define flsnz32(x) flsnz32(x)
|
||||
|
||||
# if defined(__SIZEOF_LONG__) && (__SIZEOF_LONG__ > 4)
|
||||
/* Compiler-builtin version of flsnz64(), only built when long is wider than
 * 4 bytes: returns the 1-based position of the highest set bit of <x>. <x>
 * must not be zero, as __builtin_clzl() is undefined for a zero argument.
 */
static inline int flsnz64(unsigned long x)
|
||||
{
|
||||
/* width of long in bits minus the leading zeros gives the 1-based position */
return (__SIZEOF_LONG__ * 8) - __builtin_clzl(x);
|
||||
}
|
||||
/* self-referencing macro: makes this implementation visible to #ifndef and
 * defined() tests without changing what the name expands to.
 */
# define flsnz64(x) flsnz64(x)
|
||||
# endif
|
||||
|
||||
#endif /* end of arch-specific implementations */
|
||||
|
||||
/*** Fallback versions below ***/
|
||||
|
||||
#ifndef flsnz8
|
||||
# if defined(flsnz32)
|
||||
/* 8-bit flsnz built on the available 32-bit implementation. The argument is
 * parenthesized so that the cast to unsigned char applies to the whole
 * expression passed by the caller and not only to its first operand.
 */
# define flsnz8(x) flsnz32((unsigned char)(x))
|
||||
# else
|
||||
/* Generic flsnz8(): returns the 1-based position (1 to 8) of the highest set
 * bit of <x>. <x> must fit in 8 bits, otherwise the table lookup below would
 * shift by 32 bits or more.
 */
static inline int flsnz8(unsigned int x)
|
||||
{
|
||||
int ret = 0;
|
||||
/* when the high nibble holds a bit, work on it and account for the 4 low bits */
if (x >> 4) { x >>= 4; ret += 4; }
|
||||
/* 0xFFFFAA50 is indexed by (x << 1) and yields 0 for 0 and 1, 1 for 2..3,
 * 2 for 4..7, and 3 for 8..15; the final +1 makes the result 1-based.
 */
return ret + ((0xFFFFAA50U >> (x << 1)) & 3) + 1;
|
||||
}
|
||||
/* self-referencing macro: makes this implementation visible to #ifndef and
 * defined() tests without changing what the name expands to.
 */
# define flsnz8(x) flsnz8(x)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Note: we never need to run fls on null keys, so we can optimize the fls
|
||||
* function by removing a conditional jump.
|
||||
*/
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
/* this code is similar on 32 and 64 bit */
|
||||
/* Returns the 1-based position of the highest set bit of the 32-bit value
 * <x>, which must not be zero since the BSR result is then undefined.
 */
static inline int flsnz(int x)
|
||||
{
|
||||
int r;
|
||||
/* bsrl writes the zero-based index of the highest set bit of <x> into <r> */
__asm__("bsrl %1,%0\n"
|
||||
: "=r" (r) : "rm" (x));
|
||||
/* turn the zero-based index into a 1-based position */
return r+1;
|
||||
}
|
||||
|
||||
/* Returns the 1-based position of the highest set bit of the byte <x>, which
 * must not be zero since the BSR result is then undefined.
 */
static inline int flsnz8(unsigned char x)
|
||||
{
|
||||
int r;
|
||||
/* <x> is passed in the "a" register: %al is first zero-extended to %eax,
 * then the 32-bit scan is run on %eax.
 * NOTE(review): the template writes %eax, yet it is only declared as an
 * input operand; confirm whether it should be an in/out operand.
 */
__asm__("movzbl %%al, %%eax\n"
|
||||
"bsrl %%eax,%0\n"
|
||||
: "=r" (r) : "a" (x));
|
||||
/* turn the zero-based index into a 1-based position */
return r+1;
|
||||
}
|
||||
|
||||
/* Returns the 1-based position of the highest set bit of the unsigned long
 * <x>, which must not be zero since the BSR result is then undefined.
 */
static inline long flsnz_long(unsigned long x)
|
||||
{
|
||||
long r;
|
||||
/* no size suffix on the mnemonic here; presumably the assembler derives
 * the operand size from the long-sized register operand.
 */
__asm__("bsr %1,%0\n"
|
||||
: "=r" (r) : "rm" (x));
|
||||
/* turn the zero-based index into a 1-based position */
return r + 1;
|
||||
}
|
||||
|
||||
#else
|
||||
// returns 1 to 32 for 1<<0 to 1<<31. Undefined for 0.
|
||||
#define flsnz(___a) ({ \
|
||||
#ifndef flsnz32
|
||||
# define flsnz32(___a) ({ \
|
||||
register int ___x, ___bits = 0; \
|
||||
___x = (___a); \
|
||||
if (___x & 0xffff0000) { ___x &= 0xffff0000; ___bits += 16;} \
|
||||
|
@ -299,17 +336,10 @@ static inline long flsnz_long(unsigned long x)
|
|||
if (___x & 0xaaaaaaaa) { ___x &= 0xaaaaaaaa; ___bits += 1;} \
|
||||
___bits + 1; \
|
||||
})
|
||||
|
||||
/* Thin wrapper: flsnz8() simply forwards <x> to flsnz8_generic() and returns
 * its result.
 */
static inline int flsnz8(unsigned int x)
|
||||
{
|
||||
return flsnz8_generic(x);
|
||||
}
|
||||
|
||||
#define flsnz_long(x) ((sizeof(long) > 4) ? flsnz64(x) : flsnz32(x))
|
||||
|
||||
#endif
|
||||
|
||||
static inline int fls64(unsigned long long x)
|
||||
#ifndef flsnz64
|
||||
static inline int flsnz64(unsigned long long x)
|
||||
{
|
||||
unsigned int h;
|
||||
unsigned int bits = 32;
|
||||
|
@ -319,10 +349,21 @@ static inline int fls64(unsigned long long x)
|
|||
h = x;
|
||||
bits = 0;
|
||||
}
|
||||
return flsnz(h) + bits;
|
||||
return flsnz32(h) + bits;
|
||||
}
|
||||
# define flsnz64(x) flsnz64(x)
|
||||
#endif
|
||||
|
||||
#define fls_auto(x) ((sizeof(x) > 4) ? fls64(x) : flsnz(x))
|
||||
/* Default flsnz_long(), only defined when no other definition exists: picks
 * flsnz64() when long is wider than 4 bytes and flsnz32() otherwise.
 */
#ifndef flsnz_long
|
||||
# define flsnz_long(x) ((sizeof(long) > 4) ? flsnz64(x) : flsnz32(x))
|
||||
#endif
|
||||
|
||||
/* Default size-dispatching flsnz(), only defined when no other definition
 * exists: arguments wider than 4 bytes go to flsnz64(), those wider than
 * 1 byte go to flsnz32(), and single-byte ones go to flsnz8().
 */
#ifndef flsnz
|
||||
# define flsnz(x) ((sizeof(x) > 4) ? flsnz64(x) : (sizeof(x) > 1) ? flsnz32(x) : flsnz8(x))
|
||||
#endif
|
||||
|
||||
/* fls64() is an alias of flsnz64(). */
#define fls64(x) flsnz64(x)
|
||||
/* fls_auto() accepts a zero argument and returns 0 for it, otherwise it
 * returns flsnz(x). <x> appears more than once in the expansion, so an
 * argument with side effects would be evaluated more than once.
 */
#define fls_auto(x) ((x) ? flsnz(x) : 0)
|
||||
|
||||
/* Linux-like "container_of". It returns a pointer to the structure of type
|
||||
* <type> which has its member <name> stored at address <ptr>.
|
||||
|
|
Loading…
Reference in New Issue