MEDIUM: Set rise and fall of agent checks to 1

This is achieved by moving rise and fall from struct server to struct check.

After this move, the behaviour of the primary check, server->check, is
unchanged. However, the secondary agent check, server->agent, now has
independent rise and fall values, each of which is set to 1.

The result is that receiving "fail", "stopped" or "down" just once from the
agent will mark the server as down. Likewise, receiving a weight just once
will allow the server to be marked up if its primary check is in good health.

This opens up the possibility of making the rise and fall values of the
agent check configurable; however, this has not been implemented at this
stage.

Signed-off-by: Simon Horman <horms@verge.net.au>
This commit is contained in:
Simon Horman 2013-11-25 10:46:38 +09:00 committed by Willy Tarreau
parent 2f1f955c8c
commit 58c32978b2
7 changed files with 47 additions and 43 deletions

View File

@ -127,6 +127,8 @@
#define DEF_CHKINTR 2000
#define DEF_FALLTIME 3
#define DEF_RISETIME 2
#define DEF_AGENT_FALLTIME 1
#define DEF_AGENT_RISETIME 1
#define DEF_CHECK_REQ "OPTIONS / HTTP/1.0\r\n"
#define DEF_SMTP_CHECK_REQ "HELO localhost\r\n"
#define DEF_LDAP_CHECK_REQ "\x30\x0c\x02\x01\x01\x60\x07\x02\x01\x03\x04\x00\x80\x00"

View File

@ -123,8 +123,9 @@ struct check {
int inter, fastinter, downinter; /* checks: time in milliseconds */
int result; /* health-check result : SRV_CHK_* */
int state; /* health-check result : CHK_* */
int health; /* 0 to server->rise-1 = bad;
* rise to server->rise+server->fall-1 = good */
int health; /* 0 to rise-1 = bad;
* rise to rise+fall-1 = good */
int rise, fall; /* time in iterations */
int type; /* Check type, one of PR_O2_*_CHK */
struct server *server; /* back-pointer to server */
};
@ -157,7 +158,6 @@ struct server {
struct server *tracknext, *track; /* next server in a tracking list, tracked server */
char *trackit; /* temporary variable to make assignment deferrable */
int consecutive_errors; /* current number of consecutive errors */
int rise, fall; /* time in iterations */
int consecutive_errors_limit; /* number of consecutive errors that triggers an event */
short observe, onerror; /* observing mode: one of HANA_OBS_*; what to do on error: on of ANA_ONERR_* */
short onmarkeddown; /* what to do when marked down: one of HANA_ONMARKEDDOWN_* */

View File

@ -1328,8 +1328,10 @@ void init_default_instance()
defproxy.defsrv.agent.inter = DEF_CHKINTR;
defproxy.defsrv.agent.fastinter = 0;
defproxy.defsrv.agent.downinter = 0;
defproxy.defsrv.rise = DEF_RISETIME;
defproxy.defsrv.fall = DEF_FALLTIME;
defproxy.defsrv.check.rise = DEF_RISETIME;
defproxy.defsrv.check.fall = DEF_FALLTIME;
defproxy.defsrv.agent.rise = DEF_AGENT_RISETIME;
defproxy.defsrv.agent.fall = DEF_AGENT_FALLTIME;
defproxy.defsrv.check.port = 0;
defproxy.defsrv.agent.port = 0;
defproxy.defsrv.maxqueue = 0;
@ -4287,8 +4289,6 @@ stats_error_parsing:
newsrv->agent.inter = curproxy->defsrv.agent.inter;
newsrv->agent.fastinter = curproxy->defsrv.agent.fastinter;
newsrv->agent.downinter = curproxy->defsrv.agent.downinter;
newsrv->rise = curproxy->defsrv.rise;
newsrv->fall = curproxy->defsrv.fall;
newsrv->maxqueue = curproxy->defsrv.maxqueue;
newsrv->minconn = curproxy->defsrv.minconn;
newsrv->maxconn = curproxy->defsrv.maxconn;
@ -4303,11 +4303,15 @@ stats_error_parsing:
= curproxy->defsrv.iweight;
newsrv->check.status = HCHK_STATUS_INI;
newsrv->check.health = newsrv->rise; /* up, but will fall down at first failure */
newsrv->check.rise = curproxy->defsrv.check.rise;
newsrv->check.fall = curproxy->defsrv.check.fall;
newsrv->check.health = newsrv->check.rise; /* up, but will fall down at first failure */
newsrv->check.server = newsrv;
newsrv->agent.status = HCHK_STATUS_INI;
newsrv->agent.health = newsrv->rise; /* up, but will fall down at first failure */
newsrv->agent.rise = curproxy->defsrv.agent.rise;
newsrv->agent.fall = curproxy->defsrv.agent.fall;
newsrv->agent.health = newsrv->agent.rise; /* up, but will fall down at first failure */
newsrv->agent.server = newsrv;
cur_arg = 3;
@ -4361,8 +4365,8 @@ stats_error_parsing:
goto out;
}
newsrv->rise = atol(args[cur_arg + 1]);
if (newsrv->rise <= 0) {
newsrv->check.rise = atol(args[cur_arg + 1]);
if (newsrv->check.rise <= 0) {
Alert("parsing [%s:%d]: '%s' has to be > 0.\n",
file, linenum, args[cur_arg]);
err_code |= ERR_ALERT | ERR_FATAL;
@ -4370,13 +4374,11 @@ stats_error_parsing:
}
if (newsrv->check.health)
newsrv->check.health = newsrv->rise;
if (newsrv->agent.health)
newsrv->agent.health = newsrv->rise;
newsrv->check.health = newsrv->check.rise;
cur_arg += 2;
}
else if (!strcmp(args[cur_arg], "fall")) {
newsrv->fall = atol(args[cur_arg + 1]);
newsrv->check.fall = atol(args[cur_arg + 1]);
if (!*args[cur_arg + 1]) {
Alert("parsing [%s:%d]: '%s' expects an integer argument.\n",
@ -4385,7 +4387,7 @@ stats_error_parsing:
goto out;
}
if (newsrv->fall <= 0) {
if (newsrv->check.fall <= 0) {
Alert("parsing [%s:%d]: '%s' has to be > 0.\n",
file, linenum, args[cur_arg]);
err_code |= ERR_ALERT | ERR_FATAL;

View File

@ -236,7 +236,7 @@ static void set_server_check_status(struct check *check, short status, const cha
if (s->proxy->options2 & PR_O2_LOGHCHKS &&
(((check->health != 0) && (check->result & SRV_CHK_FAILED)) ||
((check->health != s->rise + s->fall - 1) && (check->result & SRV_CHK_PASSED)) ||
((check->health != check->rise + check->fall - 1) && (check->result & SRV_CHK_PASSED)) ||
((s->state & SRV_GOINGDOWN) && !(check->result & SRV_CHK_DISABLE)) ||
(!(s->state & SRV_GOINGDOWN) && (check->result & SRV_CHK_DISABLE)))) {
@ -246,8 +246,8 @@ static void set_server_check_status(struct check *check, short status, const cha
/* FIXME begin: calculate local version of the health/rise/fall/state */
health = check->health;
rise = s->rise;
fall = s->fall;
rise = check->rise;
fall = check->fall;
state = s->state;
if (check->result & SRV_CHK_FAILED) {
@ -401,10 +401,10 @@ void set_server_down(struct check *check)
int xferred;
if (s->state & SRV_MAINTAIN) {
check->health = s->rise;
check->health = check->rise;
}
if ((s->state & SRV_RUNNING && check->health == s->rise) || s->track) {
if ((s->state & SRV_RUNNING && check->health == check->rise) || s->track) {
int srv_was_paused = s->state & SRV_GOINGDOWN;
int prev_srv_count = s->proxy->srv_bck + s->proxy->srv_act;
@ -468,11 +468,11 @@ void set_server_up(struct check *check) {
unsigned int old_state = s->state;
if (s->state & SRV_MAINTAIN) {
check->health = s->rise;
check->health = check->rise;
}
if ((s->check.health >= s->rise && s->agent.health >= s->rise &&
check->health == s->rise) || s->track) {
if ((s->check.health >= s->check.rise && s->agent.health >= s->agent.rise &&
check->health == check->rise) || s->track) {
if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
if (s->proxy->last_change < now.tv_sec) // ignore negative times
s->proxy->down_time += now.tv_sec - s->proxy->last_change;
@ -533,8 +533,8 @@ void set_server_up(struct check *check) {
set_server_up(check);
}
if (check->health >= s->rise)
check->health = s->rise + s->fall - 1; /* OK now */
if (check->health >= check->rise)
check->health = check->rise + check->fall - 1; /* OK now */
}
@ -623,7 +623,7 @@ static void check_failed(struct check *check)
if (check == &s->agent && check->status != HCHK_STATUS_L7STS)
return;
if (check->health > s->rise) {
if (check->health > check->rise) {
check->health--; /* still good */
s->counters.failed_checks++;
}
@ -680,8 +680,8 @@ void health_adjust(struct server *s, short status)
case HANA_ONERR_SUDDTH:
/* simulate a pre-fatal failed health check */
if (s->check.health > s->rise)
s->check.health = s->rise + 1;
if (s->check.health > s->check.rise)
s->check.health = s->check.rise + 1;
/* no break - fall through */
@ -694,7 +694,7 @@ void health_adjust(struct server *s, short status)
case HANA_ONERR_MARKDWN:
/* mark server down */
s->check.health = s->rise;
s->check.health = s->check.rise;
set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str);
set_server_down(&s->check);
@ -734,7 +734,7 @@ static int httpchk_build_status_header(struct server *s, char *buffer)
if (!(s->state & SRV_CHECKED))
sv_state = 6; /* should obviously never happen */
else if (s->state & SRV_RUNNING) {
if (s->check.health == s->rise + s->fall - 1)
if (s->check.health == s->check.rise + s->check.fall - 1)
sv_state = 3; /* UP */
else
sv_state = 2; /* going down */
@ -750,8 +750,8 @@ static int httpchk_build_status_header(struct server *s, char *buffer)
hlen += sprintf(buffer + hlen,
srv_hlt_st[sv_state],
(s->state & SRV_RUNNING) ? (s->check.health - s->rise + 1) : (s->check.health),
(s->state & SRV_RUNNING) ? (s->fall) : (s->rise));
(s->state & SRV_RUNNING) ? (s->check.health - s->check.rise + 1) : (s->check.health),
(s->state & SRV_RUNNING) ? (s->check.fall) : (s->check.rise));
hlen += sprintf(buffer + hlen, "; name=%s/%s; node=%s; weight=%d/%d; scur=%d/%d; qcur=%d",
s->proxy->id, s->id,
@ -1498,7 +1498,7 @@ static struct task *process_chk(struct task *t)
set_server_disabled(check);
}
if (check->health < s->rise + s->fall - 1) {
if (check->health < check->rise + check->fall - 1) {
check->health++; /* was bad, stays for a while */
set_server_up(check);
}

View File

@ -1309,14 +1309,14 @@ static int stats_sock_parse_request(struct stream_interface *si, char *line)
*/
if (sv->track->state & SRV_RUNNING) {
set_server_up(&sv->check);
sv->check.health = sv->rise; /* up, but will fall down at first failure */
sv->check.health = sv->check.rise; /* up, but will fall down at first failure */
} else {
sv->state &= ~SRV_MAINTAIN;
set_server_down(&sv->check);
}
} else {
set_server_up(&sv->check);
sv->check.health = sv->rise; /* up, but will fall down at first failure */
sv->check.health = sv->check.rise; /* up, but will fall down at first failure */
}
}
@ -2266,8 +2266,8 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
chunk_appendf(&trash, "%s ", human_time(now.tv_sec - ref->last_change, 1));
chunk_appendf(&trash,
srv_hlt_st[state],
(ref->state & SRV_RUNNING) ? (ref->check.health - ref->rise + 1) : (ref->check.health),
(ref->state & SRV_RUNNING) ? (ref->fall) : (ref->rise));
(ref->state & SRV_RUNNING) ? (ref->check.health - ref->check.rise + 1) : (ref->check.health),
(ref->state & SRV_RUNNING) ? (ref->check.fall) : (ref->check.rise));
}
if (sv->state & SRV_CHECKED) {
@ -2374,8 +2374,8 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
else
chunk_appendf(&trash,
srv_hlt_st[state],
(ref->state & SRV_RUNNING) ? (ref->check.health - ref->rise + 1) : (ref->check.health),
(ref->state & SRV_RUNNING) ? (ref->fall) : (ref->rise));
(ref->state & SRV_RUNNING) ? (ref->check.health - ref->check.rise + 1) : (ref->check.health),
(ref->state & SRV_RUNNING) ? (ref->check.fall) : (ref->check.rise));
chunk_appendf(&trash,
/* weight, active, backup */
@ -2944,7 +2944,7 @@ static int stats_dump_proxy_to_buffer(struct stream_interface *si, struct proxy
if (!(svs->state & SRV_CHECKED))
sv_state = 6;
else if (svs->state & SRV_RUNNING) {
if (svs->check.health == svs->rise + svs->fall - 1)
if (svs->check.health == svs->check.rise + svs->check.fall - 1)
sv_state = 3; /* UP */
else
sv_state = 2; /* going down */

View File

@ -2920,7 +2920,7 @@ int http_process_req_stat_post(struct stream_interface *si, struct http_txn *txn
if ((px->state != PR_STSTOPPED) && (sv->state & SRV_MAINTAIN)) {
/* Already in maintenance, we can change the server state */
set_server_up(&sv->check);
sv->check.health = sv->rise; /* up, but will fall down at first failure */
sv->check.health = sv->check.rise; /* up, but will fall down at first failure */
altered_servers++;
total_servers++;
}

View File

@ -34,7 +34,7 @@ int srv_getinter(const struct check *check)
{
const struct server *s = check->server;
if ((s->state & SRV_CHECKED) && (check->health == s->rise + s->fall - 1))
if ((s->state & SRV_CHECKED) && (check->health == check->rise + check->fall - 1))
return check->inter;
if (!(s->state & SRV_RUNNING) && check->health == 0)