mirror of
http://git.haproxy.org/git/haproxy.git/
synced 2025-02-25 15:11:10 +00:00
[MEDIUM] Spread health checks even more
When one server appears at the same position in multiple backends, it receives all the checks from all the backends at exactly the same time, because the health checks are only spread within a backend and not globally. The attached patch implements the per-server start delay in a different way. Checks are now spread globally - not locally to one backend. It also makes them start faster - IMHO there is no need to add a 'server->inter' when calculating the first execution. The calculations were moved from cfgparse.c to checks.c. There is a new function, start_checks(), and it is not called when haproxy is started in MODE_CHECK. With this patch it is also possible to set a global 'spread-checks' parameter. It takes a percentage value (1..50, probably something near 5..10 is a good idea), so haproxy adds or removes up to that many percent to the original interval after each check. My test shows that with 18 backends, 54 servers total and 10000ms/5% it takes about 45m to mix them completely. I decided to use the rand/srand pseudo-random number generator. I am aware it is not recommended for good randomness, but a) we do not need a good random generator here and b) it is probably the most portable one.
This commit is contained in:
parent
87ea548313
commit
b304dc7fd7
@ -26,6 +26,7 @@
|
||||
#include <common/config.h>
|
||||
|
||||
void process_chk(struct task *t, struct timeval *next);
|
||||
int start_checks();
|
||||
|
||||
#endif /* _PROTO_CHECKS_H */
|
||||
|
||||
|
@ -55,6 +55,7 @@ struct global {
|
||||
int rlimit_memmax; /* default ulimit-d in megs value : 0=unset */
|
||||
int mode;
|
||||
int last_checks;
|
||||
int spread_checks;
|
||||
char *chroot;
|
||||
char *pidfile;
|
||||
int logfac1, logfac2;
|
||||
|
@ -451,7 +451,21 @@ int cfg_parse_global(const char *file, int linenum, char **args)
|
||||
Alert("parsing [%s:%d] : too many syslog servers\n", file, linenum);
|
||||
return -1;
|
||||
}
|
||||
|
||||
}
|
||||
else if (!strcmp(args[0], "spread-checks")) { /* random time between checks (0-50) */
|
||||
if (global.spread_checks != 0) {
|
||||
Alert("parsing [%s:%d]: spread-checks already specified. Continuing.\n", file, linenum);
|
||||
return 0;
|
||||
}
|
||||
if (*(args[1]) == 0) {
|
||||
Alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]);
|
||||
return -1;
|
||||
}
|
||||
global.spread_checks = atol(args[1]);
|
||||
if (global.spread_checks < 0 || global.spread_checks > 50) {
|
||||
Alert("parsing [%s:%d]: 'spread-checks' needs a positive value in range 0..50.\n", file, linenum);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
Alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "global");
|
||||
@ -2261,7 +2275,6 @@ int readcfgfile(const char *file)
|
||||
char *args[MAX_LINE_ARGS + 1];
|
||||
int arg;
|
||||
int cfgerr = 0;
|
||||
int nbchk, mininter;
|
||||
int confsect = CFG_NONE;
|
||||
|
||||
struct proxy *curproxy = NULL;
|
||||
@ -2708,56 +2721,6 @@ int readcfgfile(const char *file)
|
||||
newsrv = newsrv->next;
|
||||
}
|
||||
|
||||
/* now we'll start this proxy's health checks if any */
|
||||
/* 1- count the checkers to run simultaneously */
|
||||
nbchk = 0;
|
||||
mininter = 0;
|
||||
newsrv = curproxy->srv;
|
||||
while (newsrv != NULL) {
|
||||
if (newsrv->state & SRV_CHECKED) {
|
||||
if (!mininter || mininter > newsrv->inter)
|
||||
mininter = newsrv->inter;
|
||||
nbchk++;
|
||||
}
|
||||
newsrv = newsrv->next;
|
||||
}
|
||||
|
||||
/* 2- start them as far as possible from each others while respecting
|
||||
* their own intervals. For this, we will start them after their own
|
||||
* interval added to the min interval divided by the number of servers,
|
||||
* weighted by the server's position in the list.
|
||||
*/
|
||||
if (nbchk > 0) {
|
||||
struct task *t;
|
||||
int srvpos;
|
||||
|
||||
newsrv = curproxy->srv;
|
||||
srvpos = 0;
|
||||
while (newsrv != NULL) {
|
||||
/* should this server be checked ? */
|
||||
if (newsrv->state & SRV_CHECKED) {
|
||||
if ((t = pool_alloc2(pool2_task)) == NULL) {
|
||||
Alert("parsing [%s:%d] : out of memory.\n", file, linenum);
|
||||
return -1;
|
||||
}
|
||||
|
||||
t->wq = NULL;
|
||||
t->qlist.p = NULL;
|
||||
t->state = TASK_IDLE;
|
||||
t->process = process_chk;
|
||||
t->context = newsrv;
|
||||
|
||||
/* check this every ms */
|
||||
tv_ms_add(&t->expire, &now,
|
||||
newsrv->inter + mininter * srvpos / nbchk);
|
||||
task_queue(t);
|
||||
//task_wakeup(&rq, t);
|
||||
srvpos++;
|
||||
}
|
||||
newsrv = newsrv->next;
|
||||
}
|
||||
}
|
||||
|
||||
curproxy = curproxy->next;
|
||||
}
|
||||
if (cfgerr > 0) {
|
||||
|
71
src/checks.c
71
src/checks.c
@ -13,7 +13,9 @@
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
@ -281,6 +283,7 @@ void process_chk(struct task *t, struct timeval *next)
|
||||
struct server *s = t->context;
|
||||
struct sockaddr_in sa;
|
||||
int fd;
|
||||
int rv;
|
||||
|
||||
//fprintf(stderr, "process_chk: task=%p\n", t);
|
||||
|
||||
@ -503,8 +506,15 @@ void process_chk(struct task *t, struct timeval *next)
|
||||
set_server_down(s);
|
||||
s->curfd = -1;
|
||||
fd_delete(fd);
|
||||
|
||||
rv = 0;
|
||||
if (global.spread_checks > 0) {
|
||||
rv = s->inter * global.spread_checks / 100;
|
||||
rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
|
||||
//fprintf(stderr, "process_chk: (%d+/-%d%%) random=%d\n", s->inter, global.spread_checks, rv);
|
||||
}
|
||||
while (tv_isle(&t->expire, &now))
|
||||
tv_ms_add(&t->expire, &t->expire, s->inter);
|
||||
tv_ms_add(&t->expire, &t->expire, s->inter + rv);
|
||||
goto new_chk;
|
||||
}
|
||||
/* if result is 0 and there's no timeout, we have to wait again */
|
||||
@ -517,6 +527,65 @@ void process_chk(struct task *t, struct timeval *next)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start health-check.
|
||||
* Returns 0 if OK, -1 if error, and prints the error in this case.
|
||||
*/
|
||||
int start_checks() {
|
||||
|
||||
struct proxy *px;
|
||||
struct server *s;
|
||||
struct task *t;
|
||||
int nbchk=0, mininter=0, srvpos=0;
|
||||
|
||||
/* 1- count the checkers to run simultaneously */
|
||||
for (px = proxy; px; px = px->next) {
|
||||
for (s = px->srv; s; s = s->next) {
|
||||
if (!(s->state & SRV_CHECKED))
|
||||
continue;
|
||||
|
||||
if (!mininter || mininter > s->inter)
|
||||
mininter = s->inter;
|
||||
|
||||
nbchk++;
|
||||
}
|
||||
}
|
||||
|
||||
if (!nbchk)
|
||||
return 0;
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
/*
|
||||
* 2- start them as far as possible from each others. For this, we will
|
||||
* start them after their interval set to the min interval divided by
|
||||
* the number of servers, weighted by the server's position in the list.
|
||||
*/
|
||||
for (px = proxy; px; px = px->next) {
|
||||
for (s = px->srv; s; s = s->next) {
|
||||
if (!(s->state & SRV_CHECKED))
|
||||
continue;
|
||||
|
||||
if ((t = pool_alloc2(pool2_task)) == NULL) {
|
||||
Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
|
||||
return -1;
|
||||
}
|
||||
|
||||
t->wq = NULL;
|
||||
t->qlist.p = NULL;
|
||||
t->state = TASK_IDLE;
|
||||
t->process = process_chk;
|
||||
t->context = s;
|
||||
|
||||
/* check this every ms */
|
||||
tv_ms_add(&t->expire, &now, mininter * srvpos / nbchk);
|
||||
task_queue(t);
|
||||
|
||||
srvpos++;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local variables:
|
||||
|
@ -81,6 +81,7 @@
|
||||
#include <proto/acl.h>
|
||||
#include <proto/backend.h>
|
||||
#include <proto/buffers.h>
|
||||
#include <proto/checks.h>
|
||||
#include <proto/client.h>
|
||||
#include <proto/fd.h>
|
||||
#include <proto/log.h>
|
||||
@ -506,6 +507,7 @@ void init(int argc, char **argv)
|
||||
Alert("Error reading configuration file : %s\n", cfg_cfgfile);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (have_appsession)
|
||||
appsession_init();
|
||||
|
||||
@ -514,6 +516,9 @@ void init(int argc, char **argv)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (start_checks() < 0)
|
||||
exit(1);
|
||||
|
||||
if (cfg_maxconn > 0)
|
||||
global.maxconn = cfg_maxconn;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user