diff --git a/doc/configuration.txt b/doc/configuration.txt index 777d4f775..dd2b7ff58 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -4896,6 +4896,8 @@ error-log-format X X X - force-persist - - X X filter - X X X fullconn X - X X +hash-balance-factor X - X X +hash-key X - X X hash-type X - X X http-after-response X (!) X X X http-check comment X - X X @@ -6691,6 +6693,29 @@ hash-balance-factor See also : "balance" and "hash-type". +hash-key + Specify how "hash-type consistent" node keys are computed + + Arguments : + may be one of the following : + + id The node keys will be derived from the server's numeric + identifier as set from "id" or which defaults to its position + in the server list. + + addr The node keys will be derived from the server's address, when + available, or else fall back on "id". + + addr-port The node keys will be derived from the server's address and + port, when available, or else fall back on "id". + + The "addr" and "addr-port" options may be useful in scenarios where multiple + HAProxy processes are balancing traffic to the same set of servers. If the + server order of each process is different (because, for example, DNS records + were resolved in different orders) then this will allow each independent + HAProxy processes to agree on routing decisions. + + hash-type Specify a method to use for mapping hashes to servers diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h index bf4d4c9b6..9dd77b12f 100644 --- a/include/haproxy/server-t.h +++ b/include/haproxy/server-t.h @@ -223,6 +223,13 @@ struct pid_list { int exited; }; +/* srv methods of computing chash keys */ +enum srv_hash_key { + SRV_HASH_KEY_ID = 0, /* derived from server puid */ + SRV_HASH_KEY_ADDR, /* derived from server address */ + SRV_HASH_KEY_ADDR_PORT /* derived from server address and port */ +}; + /* A tree occurrence is a descriptor of a place in a tree, with a pointer back * to the server itself. */ @@ -367,6 +374,8 @@ struct server { struct tree_occ *lb_nodes; /* lb_nodes_tot * struct tree_occ */ unsigned lb_nodes_tot; /* number of allocated lb_nodes (C-HASH) */ unsigned lb_nodes_now; /* number of lb_nodes placed in the tree (C-HASH) */ + enum srv_hash_key hash_key; /* method to compute node hash (C-HASH) */ + unsigned lb_server_key; /* hash of the values indicated by "hash_key" (C-HASH) */ const struct netns_entry *netns; /* contains network namespace name or NULL. Network namespace comes from configuration */ struct xprt_ops *xprt; /* transport-layer operations */ diff --git a/src/lb_chash.c b/src/lb_chash.c index 4e8fb1536..b3e472ef1 100644 --- a/src/lb_chash.c +++ b/src/lb_chash.c @@ -21,8 +21,9 @@ #include #include #include -#include +#include #include +#include /* Return next tree node after which must still be in the tree, or be * NULL. Lookup wraps around the end to the beginning. If the next node is the @@ -58,6 +59,77 @@ static inline void chash_dequeue_srv(struct server *s) } } +/* Compute a key that can be used to insert a node into the CHASH tree. Servers + * have a base key, which can be computed in several ways (see + * chash_compute_server_key) and this function uses that seed to generate hash + * keys for however many nodes need to be inserted into the tree. + */ +static inline u32 chash_compute_node_key(struct server *s, unsigned node_index) +{ + return full_hash(s->lb_server_key + node_index); +} + +/* Compute the base server key that will be used to compute node keys. Servers + * may be configured to determine their hashes either from their ID, address, or + * address+port; the latter options allow independent HAProxy instances to agree + * on routing decisions, regardless of their order in the server list (which may + * be arbitrary, since it could depend on factors such as the order of entries + * in a DNS SRV record). If an address is not known or if the server is + * configured with `hash-key id` (the default) then the key will be determined + * from the server's puid. + */ +static inline u32 chash_compute_server_key(struct server *s) +{ + enum srv_hash_key hash_key = s->hash_key; + struct server_inetaddr srv_addr; + u32 key; + + /* If hash-key is addr or addr-port then we need the address, but if we + * can't determine the address then we fall back on hashing the puid. + */ + switch (hash_key) { + case SRV_HASH_KEY_ADDR: + case SRV_HASH_KEY_ADDR_PORT: + server_get_inetaddr(s, &srv_addr); + if (srv_addr.family != AF_INET && srv_addr.family != AF_INET6) { + hash_key = SRV_HASH_KEY_ID; + } + break; + + default: + break; + } + + if (hash_key == SRV_HASH_KEY_ADDR_PORT) { + key = full_hash(srv_addr.port.svc); + } else { + key = 0; + } + + switch (hash_key) { + case SRV_HASH_KEY_ADDR_PORT: + case SRV_HASH_KEY_ADDR: + switch (srv_addr.family) { + case AF_INET: + key = full_hash(key + srv_addr.addr.v4.s_addr); + break; + case AF_INET6: + key = XXH32(srv_addr.addr.v6.s6_addr, 16, key); + break; + default: + break; + } + break; + + case SRV_HASH_KEY_ID: + default: + key = full_hash(s->puid); + break; + } + + return key; +} + /* Adjust the number of entries of a server in its tree. The server must appear * as many times as its weight indicates it. If it's there too often, we remove * the last occurrences. If it's not there enough, we add more occurrences. To @@ -67,6 +139,15 @@ static inline void chash_dequeue_srv(struct server *s) */ static inline void chash_queue_dequeue_srv(struct server *s) { + u32 server_key = chash_compute_server_key(s); + + /* If the server key changed then we must rehash all the nodes. */ + if (server_key != s->lb_server_key) { + chash_dequeue_srv(s); + s->lb_nodes_tot = 0; + s->lb_server_key = server_key; + } + while (s->lb_nodes_now > s->next_eweight) { if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway s->lb_nodes_now = s->lb_nodes_tot; @@ -95,7 +176,7 @@ static inline void chash_queue_dequeue_srv(struct server *s) (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes)); for (j = s->lb_nodes_tot; j < s->next_eweight; j++) { s->lb_nodes[j].server = s; - s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j); + s->lb_nodes[j].node.key = chash_compute_node_key(s, j); } s->lb_nodes_tot = s->next_eweight; } @@ -238,9 +319,6 @@ static void chash_update_server_weight(struct server *srv) int old_state, new_state; struct proxy *p = srv->proxy; - if (!srv_lb_status_changed(srv)) - return; - /* If changing the server's weight changes its state, we simply apply * the procedures we already have for status change. If the state * remains down, the server is not in any tree, so it's as easy as @@ -505,9 +583,10 @@ int chash_init_server_tree(struct proxy *p) ha_alert("failed to allocate lb_nodes for server %s.\n", srv->id); return -1; } + srv->lb_server_key = chash_compute_server_key(srv); for (node = 0; node < srv->lb_nodes_tot; node++) { srv->lb_nodes[node].server = srv; - srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node); + srv->lb_nodes[node].node.key = chash_compute_node_key(srv, node); } if (srv_currently_usable(srv)) diff --git a/src/server.c b/src/server.c index ea93e1ff4..52b715bfa 100644 --- a/src/server.c +++ b/src/server.c @@ -184,6 +184,9 @@ static void _srv_set_inetaddr_port(struct server *srv, else srv->flags &= ~SRV_F_MAPPORTS; + /* balancers (chash in particular) may use the addr in their routing decisions */ + srv->proxy->lbprm.update_server_eweight(srv); + if (srv->log_target && srv->log_target->type == LOG_TARGET_DGRAM) { /* server is used as a log target, manually update log target addr for DGRAM */ ipcpy(addr, srv->log_target->addr); @@ -948,6 +951,32 @@ static int srv_parse_ws(char **args, int *cur_arg, return 0; } +/* Parse the "hash-key" server keyword */ +static int srv_parse_hash_key(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (!args[*cur_arg + 1]) { + memprintf(err, "'%s expects 'id', 'addr', or 'addr-port' value", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[*cur_arg + 1], "id") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ID; + } + else if (strcmp(args[*cur_arg + 1], "addr") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ADDR; + } + else if (strcmp(args[*cur_arg + 1], "addr-port") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ADDR_PORT; + } + else { + memprintf(err, "'%s' has to be 'id', 'addr', or 'addr-port'", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + /* Parse the "init-addr" server keyword */ static int srv_parse_init_addr(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) @@ -2221,6 +2250,7 @@ static struct srv_kw_list srv_kws = { "ALL", { }, { { "enabled", srv_parse_enabled, 0, 1, 0 }, /* Start the server in 'enabled' state */ { "error-limit", srv_parse_error_limit, 1, 1, 1 }, /* Configure the consecutive count of check failures to consider a server on error */ { "ws", srv_parse_ws, 1, 1, 1 }, /* websocket protocol */ + { "hash-key", srv_parse_hash_key, 1, 1, 1 }, /* Configure how chash keys are computed */ { "id", srv_parse_id, 1, 0, 1 }, /* set id# of server */ { "init-addr", srv_parse_init_addr, 1, 1, 0 }, /* */ { "log-bufsize", srv_parse_log_bufsize, 1, 1, 0 }, /* Set the ring bufsize for log server (only for log backends) */ @@ -2705,6 +2735,7 @@ void srv_settings_cpy(struct server *srv, const struct server *src, int srv_tmpl srv->minconn = src->minconn; srv->maxconn = src->maxconn; srv->slowstart = src->slowstart; + srv->hash_key = src->hash_key; srv->observe = src->observe; srv->onerror = src->onerror; srv->onmarkeddown = src->onmarkeddown;