mirror of
http://git.haproxy.org/git/haproxy.git/
synced 2025-01-05 19:52:14 +00:00
a532324128
added methods to provide a better hash with small input sets
203 lines
5.6 KiB
C
203 lines
5.6 KiB
C
/*
|
|
* Integer hashing tests. These functions work with 32-bit integers, so are
|
|
* perfectly suited for IPv4 addresses. A few tests show that they may also
|
|
* be chained for larger keys (eg: IPv6), this way :
|
|
* f(x[0-3]) = f(f(f(f(x[0])^x[1])^x[2])^x[3])
|
|
*
|
|
* See also bob jenkin's site for more info on hashing, and check perfect
|
|
* hashing for constants (eg: header names).
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <arpa/inet.h>
|
|
#include <math.h>
|
|
|
|
#define NSERV 8
|
|
#define MAXLINE 1000
|
|
|
|
|
|
int counts_id[NSERV][NSERV];
|
|
uint32_t hash_id( uint32_t a)
|
|
{
|
|
return a;
|
|
}
|
|
|
|
/* Full-avalanche integer hashing function from Thomas Wang, suitable for use
|
|
* with a modulo. See below, worth a read !
|
|
* http://www.concentric.net/~Ttwang/tech/inthash.htm
|
|
*
|
|
* See also tests performed by Bob Jenkins (says it's faster than his) :
|
|
* http://burtleburtle.net/bob/hash/integer.html
|
|
*
|
|
* This function is small and fast. It does not seem as smooth as bj6 though.
|
|
* About 0x40 bytes, 6 shifts.
|
|
*/
|
|
int counts_tw1[NSERV][NSERV];
|
|
uint32_t hash_tw1(uint32_t a)
|
|
{
|
|
a += ~(a<<15);
|
|
a ^= (a>>10);
|
|
a += (a<<3);
|
|
a ^= (a>>6);
|
|
a += ~(a<<11);
|
|
a ^= (a>>16);
|
|
return a;
|
|
}
|
|
|
|
/* Thomas Wang's mix function. The multiply is optimized away by the compiler
|
|
* on most platforms.
|
|
* It is about equivalent to the one above.
|
|
*/
|
|
int counts_tw2[NSERV][NSERV];
|
|
uint32_t hash_tw2(uint32_t a)
|
|
{
|
|
a = ~a + (a << 15);
|
|
a = a ^ (a >> 12);
|
|
a = a + (a << 2);
|
|
a = a ^ (a >> 4);
|
|
a = a * 2057;
|
|
a = a ^ (a >> 16);
|
|
return a;
|
|
}
|
|
|
|
/* Thomas Wang's multiplicative hash function. About 0x30 bytes, and it is
|
|
* extremely fast on recent processors with a fast multiply. However, it
|
|
* must not be used on low bits only, as multiples of 0x00100010 only return
|
|
* even values !
|
|
*/
|
|
int counts_tw3[NSERV][NSERV];
|
|
uint32_t hash_tw3(uint32_t a)
|
|
{
|
|
a = (a ^ 61) ^ (a >> 16);
|
|
a = a + (a << 3);
|
|
a = a ^ (a >> 4);
|
|
a = a * 0x27d4eb2d;
|
|
a = a ^ (a >> 15);
|
|
return a;
|
|
}
|
|
|
|
|
|
/* Full-avalanche integer hashing function from Bob Jenkins, suitable for use
|
|
* with a modulo. It has a very smooth distribution.
|
|
* http://burtleburtle.net/bob/hash/integer.html
|
|
* About 0x50 bytes, 6 shifts.
|
|
*/
|
|
int counts_bj6[NSERV][NSERV];
|
|
int counts_bj6x[NSERV][NSERV];
|
|
uint32_t hash_bj6(uint32_t a)
|
|
{
|
|
a = (a+0x7ed55d16) + (a<<12);
|
|
a = (a^0xc761c23c) ^ (a>>19);
|
|
a = (a+0x165667b1) + (a<<5);
|
|
a = (a+0xd3a2646c) ^ (a<<9);
|
|
a = (a+0xfd7046c5) + (a<<3);
|
|
a = (a^0xb55a4f09) ^ (a>>16);
|
|
return a;
|
|
}
|
|
|
|
/* Similar function with one more shift and no magic number. It is slightly
|
|
* slower but provides the overall smoothest distribution.
|
|
* About 0x40 bytes, 7 shifts.
|
|
*/
|
|
int counts_bj7[NSERV][NSERV];
|
|
int counts_bj7x[NSERV][NSERV];
|
|
uint32_t hash_bj7(uint32_t a)
|
|
{
|
|
a -= (a<<6);
|
|
a ^= (a>>17);
|
|
a -= (a<<9);
|
|
a ^= (a<<4);
|
|
a -= (a<<3);
|
|
a ^= (a<<10);
|
|
a ^= (a>>15);
|
|
return a;
|
|
}
|
|
|
|
|
|
void count_hash_results(unsigned long hash, int counts[NSERV][NSERV]) {
|
|
int srv, nsrv;
|
|
|
|
for (nsrv = 0; nsrv < NSERV; nsrv++) {
|
|
srv = hash % (nsrv + 1);
|
|
counts[nsrv][srv]++;
|
|
}
|
|
}
|
|
|
|
void dump_hash_results(char *name, int counts[NSERV][NSERV]) {
|
|
int srv, nsrv;
|
|
double err, total_err, max_err;
|
|
|
|
printf("%s:\n", name);
|
|
for (nsrv = 0; nsrv < NSERV; nsrv++) {
|
|
total_err = 0.0;
|
|
max_err = 0.0;
|
|
printf("%02d srv: ", nsrv+1);
|
|
for (srv = 0; srv <= nsrv; srv++) {
|
|
err = 100.0*(counts[nsrv][srv] - (double)counts[0][0]/(nsrv+1)) / (double)counts[0][0];
|
|
//printf("%6d ", counts[nsrv][srv]);
|
|
printf("% 3.1f%%%c ", err,
|
|
counts[nsrv][srv]?' ':'*'); /* display '*' when a server is never selected */
|
|
err = fabs(err);
|
|
total_err += err;
|
|
if (err > max_err)
|
|
max_err = err;
|
|
}
|
|
total_err /= (double)(nsrv+1);
|
|
for (srv = nsrv+1; srv < NSERV; srv++)
|
|
printf(" ");
|
|
printf(" avg_err=%3.1f, max_err=%3.1f\n", total_err, max_err);
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
int main() {
|
|
int nr;
|
|
unsigned int address = 0;
|
|
unsigned int mask = ~0;
|
|
|
|
memset(counts_id, 0, sizeof(counts_id));
|
|
memset(counts_tw1, 0, sizeof(counts_tw1));
|
|
memset(counts_tw2, 0, sizeof(counts_tw2));
|
|
memset(counts_tw3, 0, sizeof(counts_tw3));
|
|
memset(counts_bj6, 0, sizeof(counts_bj6));
|
|
memset(counts_bj7, 0, sizeof(counts_bj7));
|
|
|
|
address = 0x10000000;
|
|
mask = 0xffffff00; // user mask to apply to addresses
|
|
for (nr = 0; nr < 0x10; nr++) {
|
|
//address += ~nr; // semi-random addresses.
|
|
//address += 1;
|
|
address += 0x00000100;
|
|
//address += 0x11111111;
|
|
//address += 7;
|
|
//address += 8;
|
|
//address += 256;
|
|
//address += 65536;
|
|
//address += 131072;
|
|
//address += 0x00100010; // this increment kills tw3 !
|
|
count_hash_results(hash_id (address & mask), counts_id); // 0.69s / 100M
|
|
count_hash_results(hash_tw1(address & mask), counts_tw1); // 1.04s / 100M
|
|
count_hash_results(hash_tw2(address & mask), counts_tw2); // 1.13s / 100M
|
|
count_hash_results(hash_tw3(address & mask), counts_tw3); // 1.01s / 100M
|
|
count_hash_results(hash_bj6(address & mask), counts_bj6); // 1.07s / 100M
|
|
count_hash_results(hash_bj7(address & mask), counts_bj7); // 1.20s / 100M
|
|
/* adding the original address after the hash reduces the error
|
|
* rate in in presence of very small data sets (eg: 16 source
|
|
* addresses for 8 servers). In this case, bj7 is very good.
|
|
*/
|
|
count_hash_results(hash_bj6(address & mask)+(address&mask), counts_bj6x); // 1.07s / 100M
|
|
count_hash_results(hash_bj7(address & mask)+(address&mask), counts_bj7x); // 1.20s / 100M
|
|
}
|
|
|
|
dump_hash_results("hash_id", counts_id);
|
|
dump_hash_results("hash_tw1", counts_tw1);
|
|
dump_hash_results("hash_tw2", counts_tw2);
|
|
dump_hash_results("hash_tw3", counts_tw3);
|
|
dump_hash_results("hash_bj6", counts_bj6);
|
|
dump_hash_results("hash_bj6x", counts_bj6x);
|
|
dump_hash_results("hash_bj7", counts_bj7);
|
|
dump_hash_results("hash_bj7x", counts_bj7x);
|
|
return 0;
|
|
}
|