BUG/MEDIUM: checks: fix slowstart behaviour when server tracking is in use

Ludovic Levesque reported and diagnosed an annoying bug. When a server is
configured to track another one and has a slowstart interval set, it's
assigned a minimal weight when the tracked server goes back up but keeps
this weight forever.

This is because the throttling during the warmup phase is only computed
in the health checking function.

After several attempts to resolve the issue, the only real solution is to
split the check processing task in two tasks, one for the checks and one
for the warmup. Each server with a slowstart setting has a warmum task
which is responsible for updating the server's weight after a down to up
transition. The task does not run in othe situations.

In the end, the fix is neither complex nor long and should be backported
to 1.4 since the issue was detected there first.
This commit is contained in:
Willy Tarreau 2011-10-31 11:53:20 +01:00
parent 4426770013
commit 2e99390faf
3 changed files with 65 additions and 30 deletions

View File

@ -105,6 +105,7 @@ struct server {
struct list pendconns; /* pending connections */
struct list actconns; /* active connections */
struct task *check; /* the task associated to the health check processing */
struct task *warmup; /* the task dedicated to the warmup when slowstart is set */
int iface_len; /* bind interface name length */
char *iface_name; /* bind interface name or NULL */

View File

@ -472,6 +472,7 @@ void set_server_up(struct server *s) {
if (s->proxy->lbprm.update_server_eweight)
s->proxy->lbprm.update_server_eweight(s);
}
task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20))));
}
s->proxy->lbprm.set_server_status_up(s);
@ -1219,6 +1220,49 @@ static int event_srv_chk_r(int fd)
return 0;
}
/*
* updates the server's weight during a warmup stage. Once the final weight is
* reached, the task automatically stops. Note that any server status change
* must have updated s->last_change accordingly.
*/
static struct task *server_warmup(struct task *t)
{
struct server *s = t->context;
/* by default, plan on stopping the task */
t->expire = TICK_ETERNITY;
if ((s->state & (SRV_RUNNING|SRV_WARMINGUP|SRV_MAINTAIN)) != (SRV_RUNNING|SRV_WARMINGUP))
return t;
if (now.tv_sec < s->last_change || now.tv_sec >= s->last_change + s->slowstart) {
/* go to full throttle if the slowstart interval is reached */
s->state &= ~SRV_WARMINGUP;
if (s->proxy->lbprm.algo & BE_LB_PROP_DYN)
s->eweight = s->uweight * BE_WEIGHT_SCALE;
if (s->proxy->lbprm.update_server_eweight)
s->proxy->lbprm.update_server_eweight(s);
}
else if (s->proxy->lbprm.algo & BE_LB_PROP_DYN) {
/* for dynamic algorithms, let's slowly update the weight */
s->eweight = (BE_WEIGHT_SCALE * (now.tv_sec - s->last_change) +
s->slowstart - 1) / s->slowstart;
s->eweight *= s->uweight;
if (s->proxy->lbprm.update_server_eweight)
s->proxy->lbprm.update_server_eweight(s);
}
/* Note that static algorithms are already running at full throttle */
/* probably that we can refill this server with a bit more connections */
check_for_pending(s);
/* get back there in 1 second or 1/20th of the slowstart interval,
* whichever is greater, resulting in small 5% steps.
*/
if (s->state & SRV_WARMINGUP)
t->expire = tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20)));
return t;
}
/*
* manages a server health-check. Returns
* the time the task accepts to wait, or TIME_ETERNITY for infinity.
@ -1231,8 +1275,6 @@ struct task *process_chk(struct task *t)
int fd;
int rv;
//fprintf(stderr, "process_chk: task=%p\n", t);
new_chk:
if (attempts++ > 0) {
/* we always fail to create a server, let's stop insisting... */
@ -1242,7 +1284,6 @@ struct task *process_chk(struct task *t)
}
fd = s->curfd;
if (fd < 0) { /* no check currently running */
//fprintf(stderr, "process_chk: 2\n");
if (!tick_is_expired(t->expire, now_ms)) /* woke up too early */
return t;
@ -1472,31 +1513,8 @@ struct task *process_chk(struct task *t)
goto new_chk;
}
else {
//fprintf(stderr, "process_chk: 8\n");
/* there was a test running */
if ((s->result & (SRV_CHK_ERROR|SRV_CHK_RUNNING)) == SRV_CHK_RUNNING) { /* good server detected */
//fprintf(stderr, "process_chk: 9\n");
if (s->state & SRV_WARMINGUP) {
if (now.tv_sec < s->last_change || now.tv_sec >= s->last_change + s->slowstart) {
s->state &= ~SRV_WARMINGUP;
if (s->proxy->lbprm.algo & BE_LB_PROP_DYN)
s->eweight = s->uweight * BE_WEIGHT_SCALE;
if (s->proxy->lbprm.update_server_eweight)
s->proxy->lbprm.update_server_eweight(s);
}
else if (s->proxy->lbprm.algo & BE_LB_PROP_DYN) {
/* for dynamic algorithms, let's update the weight */
s->eweight = (BE_WEIGHT_SCALE * (now.tv_sec - s->last_change) +
s->slowstart - 1) / s->slowstart;
s->eweight *= s->uweight;
if (s->proxy->lbprm.update_server_eweight)
s->proxy->lbprm.update_server_eweight(s);
}
/* probably that we can refill this server with a bit more connections */
check_for_pending(s);
}
/* we may have to add/remove this server from the LB group */
if ((s->state & SRV_RUNNING) && (s->proxy->options & PR_O_DISABLE404)) {
if ((s->state & SRV_GOINGDOWN) &&
@ -1520,7 +1538,6 @@ struct task *process_chk(struct task *t)
if (global.spread_checks > 0) {
rv = srv_getinter(s) * global.spread_checks / 100;
rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
//fprintf(stderr, "process_chk(%p): (%d+/-%d%%) random=%d\n", s, srv_getinter(s), global.spread_checks, rv);
}
t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
goto new_chk;
@ -1537,7 +1554,6 @@ struct task *process_chk(struct task *t)
}
}
//fprintf(stderr, "process_chk: 10\n");
/* failure or timeout detected */
if (s->health > s->rise) {
s->health--; /* still good */
@ -1552,14 +1568,12 @@ struct task *process_chk(struct task *t)
if (global.spread_checks > 0) {
rv = srv_getinter(s) * global.spread_checks / 100;
rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
//fprintf(stderr, "process_chk(%p): (%d+/-%d%%) random=%d\n", s, srv_getinter(s), global.spread_checks, rv);
}
t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
goto new_chk;
}
/* if result is unknown and there's no timeout, we have to wait again */
}
//fprintf(stderr, "process_chk: 11\n");
s->result = SRV_CHK_UNKNOWN;
return t;
}
@ -1607,9 +1621,24 @@ int start_checks() {
*/
for (px = proxy; px; px = px->next) {
for (s = px->srv; s; s = s->next) {
if (s->slowstart) {
if ((t = task_new()) == NULL) {
Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
return -1;
}
/* We need a warmup task that will be called when the server
* state switches from down to up.
*/
s->warmup = t;
t->process = server_warmup;
t->context = s;
t->expire = TICK_ETERNITY;
}
if (!(s->state & SRV_CHECKED))
continue;
/* one task for the checks */
if ((t = task_new()) == NULL) {
Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
return -1;

View File

@ -928,6 +928,11 @@ void deinit(void)
task_free(s->check);
}
if (s->warmup) {
task_delete(s->warmup);
task_free(s->warmup);
}
free(s->id);
free(s->cookie);
free(s->check_data);