From 1c46a0746038416a6c098a71d512a6e1a65b9184 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 30 Dec 2023 15:37:17 +0100 Subject: [PATCH] IMPORT: ebtree: rework the fls macros to better deal with arch-specific ones The definitions were a bit of a mess and there wasn't even a fallback to __builtin_clz() on compilers supporting it. Now we instead define a macro for each implementation, set case by case depending on the architecture, and add the fallback ones only when not already defined. This also allows flsnz8() to automatically fall back to the 32-bit arch-specific version if available. This shows a consistent 33% speedup on arm for strings. This is cbtree commit c6075742e8d0a6924e7183d44bd93dec20ca8049. This is ebtree commit f452d0f83eca72f6c3484ccb138d341ed6fd27ed. --- include/import/ebtree.h | 131 ++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 45 deletions(-) diff --git a/include/import/ebtree.h b/include/import/ebtree.h index 10583590dd..25eb85cfbe 100644 --- a/include/import/ebtree.h +++ b/include/import/ebtree.h @@ -250,46 +250,83 @@ #include #include -static inline int flsnz8_generic(unsigned int x) +/* returns clz from 7 to 0 for 0x01 to 0xFF. Returns 7 for 0 as well. */ +static inline unsigned int clz8(unsigned char c) +{ + unsigned int r = 4; + + if (c & 0xf0) { + r = 0; + c >>= 4; + } + return r + ((0x000055afU >> (c * 2)) & 0x3); +} + +/* FLSNZ: find last set bit for non-zero value. "Last" here means the highest + * one. It returns a value from 1 to 32 for 1<<0 to 1<<31. + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__atom__) +/* DO NOT USE ON ATOM! The instruction is emulated and is several times slower + * than doing the math by hand. 
+ */ +static inline int flsnz32(unsigned int x) +{ + int r; + __asm__("bsrl %1,%0\n" + : "=r" (r) : "rm" (x)); + return r + 1; +} +#define flsnz32(x) flsnz32(x) + +# if defined(__x86_64__) +static inline int flsnz64(unsigned long long x) +{ + unsigned long long r; + __asm__("bsrq %1,%0\n" + : "=r" (r) : "rm" (x)); + return r + 1; +} +# define flsnz64(x) flsnz64(x) +# endif + +#elif !defined(__atom__) && defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2))) +/* gcc >= 4.2 brings __builtin_clz() and __builtin_clzl(), usable for non-x86 */ + +static inline int flsnz32(unsigned int x) +{ + return 32 - __builtin_clz(x); +} +# define flsnz32(x) flsnz32(x) + +# if defined(__SIZEOF_LONG__) && (__SIZEOF_LONG__ > 4) +static inline int flsnz64(unsigned long x) +{ + return (__SIZEOF_LONG__ * 8) - __builtin_clzl(x); +} +# define flsnz64(x) flsnz64(x) +# endif + +#endif /* end of arch-specific implementations */ + +/*** Fallback versions below ***/ + +#ifndef flsnz8 +# if defined(flsnz32) +# define flsnz8(x) flsnz32((unsigned char)(x)) /* (x) parenthesized so the cast covers the whole argument expression */ +# else +static inline int flsnz8(unsigned int x) { int ret = 0; if (x >> 4) { x >>= 4; ret += 4; } return ret + ((0xFFFFAA50U >> (x << 1)) & 3) + 1; } +# define flsnz8(x) flsnz8(x) +# endif +#endif -/* Note: we never need to run fls on null keys, so we can optimize the fls - * function by removing a conditional jump. - */ -#if defined(__i386__) || defined(__x86_64__) -/* this code is similar on 32 and 64 bit */ -static inline int flsnz(int x) -{ - int r; - __asm__("bsrl %1,%0\n" - : "=r" (r) : "rm" (x)); - return r+1; -} - -static inline int flsnz8(unsigned char x) -{ - int r; - __asm__("movzbl %%al, %%eax\n" - "bsrl %%eax,%0\n" - : "=r" (r) : "a" (x)); - return r+1; -} - -static inline long flsnz_long(unsigned long x) -{ - long r; - __asm__("bsr %1,%0\n" - : "=r" (r) : "rm" (x)); - return r + 1; -} - -#else -// returns 1 to 32 for 1<<0 to 1<<31. Undefined for 0. 
-#define flsnz(___a) ({ \ +#ifndef flsnz32 +# define flsnz32(___a) ({ \ register int ___x, ___bits = 0; \ ___x = (___a); \ if (___x & 0xffff0000) { ___x &= 0xffff0000; ___bits += 16;} \ @@ -299,17 +336,10 @@ static inline long flsnz_long(unsigned long x) if (___x & 0xaaaaaaaa) { ___x &= 0xaaaaaaaa; ___bits += 1;} \ ___bits + 1; \ }) - -static inline int flsnz8(unsigned int x) -{ - return flsnz8_generic(x); -} - -#define flsnz_long(x) ((sizeof(long) > 4) ? flsnz64(x) : flsnz32(x)) - #endif -static inline int fls64(unsigned long long x) +#ifndef flsnz64 +static inline int flsnz64(unsigned long long x) { unsigned int h; unsigned int bits = 32; @@ -319,10 +349,21 @@ static inline int fls64(unsigned long long x) h = x; bits = 0; } - return flsnz(h) + bits; + return flsnz32(h) + bits; } +# define flsnz64(x) flsnz64(x) +#endif -#define fls_auto(x) ((sizeof(x) > 4) ? fls64(x) : flsnz(x)) +#ifndef flsnz_long +# define flsnz_long(x) ((sizeof(long) > 4) ? flsnz64(x) : flsnz32(x)) +#endif + +#ifndef flsnz +# define flsnz(x) ((sizeof(x) > 4) ? flsnz64(x) : (sizeof(x) > 1) ? flsnz32(x) : flsnz8(x)) +#endif + +#define fls64(x) flsnz64(x) +#define fls_auto(x) ((x) ? flsnz(x) : 0) /* Linux-like "container_of". It returns a pointer to the structure of type * <type> which has its member <name> stored at address <ptr>.