[MEDIUM] Decrease server health based on http responses / events, version 3

Implement decreasing health based on observing communication between
HAProxy and servers.

Changes in this version 2:
 - documentation
 - close race between a started check and health analysis event
 - don't force fastinter if it is not set
 - better names for options
 - layer4 support

Changes in this version 3:
 - add stats
 - port to the current 1.4 tree
This commit is contained in:
Krzysztof Piotr Oledzki 2009-12-15 22:31:24 +01:00 committed by Willy Tarreau
parent f864533c05
commit 97f07b832f
12 changed files with 326 additions and 18 deletions

View File

@ -4694,6 +4694,13 @@ cookie <value>
the same cookie value, and it is in fact somewhat common between normal and
backup servers. See also the "cookie" keyword in backend section.
error-limit <count>
If health observing is enabled, the "error-limit" parameter specifies the
number of consecutive errors that triggers the event selected by the
"on-error" option. By default it is set to 10 consecutive errors.
See also the "check", "observe" and "on-error".
fall <count>
The "fall" parameter states that a server will be considered as dead after
<count> consecutive unsuccessful health checks. This value defaults to 3 if
@ -4761,7 +4768,29 @@ minconn <minconn>
server during normal loads, but push it further for important loads without
overloading the server during exceptional loads. See also the "maxconn"
and "maxqueue" parameters, as well as the "fullconn" backend keyword.
observe <mode>
This option enables health adjusting based on observing communication with
the server. By default this functionality is disabled, and enabling it also
requires health checks to be enabled. There are two supported modes: "layer4"
and "layer7". In layer4 mode, only successful/unsuccessful tcp connections are
significant. In layer7 mode, which is only allowed for http proxies, responses
received from the server are verified, like valid/wrong http code, unparsable
headers, a timeout, etc.
See also the "check", "on-error" and "error-limit".
on-error <mode>
Select what should happen when enough consecutive errors are detected.
Currently, four modes are available:
- fastinter: force fastinter
- fail-check: simulate a failed check, also forces fastinter (default)
- sudden-death: simulate a pre-fatal failed health check, one more failed
check will mark a server down, forces fastinter
- mark-down: mark the server immediately down and force fastinter
See also the "check", "observe" and "error-limit".
port <port>
Using the "port" parameter, it becomes possible to use a different port to
send health-checks. On some servers, it may be desirable to dedicate a port

View File

@ -120,6 +120,9 @@
#define DEF_CHECK_REQ "OPTIONS / HTTP/1.0\r\n\r\n"
#define DEF_SMTP_CHECK_REQ "HELO localhost\r\n"
#define DEF_HANA_ONERR HANA_ONERR_FAILCHK
#define DEF_HANA_ERRLIMIT 10
// X-Forwarded-For header default
#define DEF_XFORWARDFOR_HDR "X-Forwarded-For"

View File

@ -29,6 +29,7 @@ const char *get_check_status_description(short check_status);
const char *get_check_status_info(short check_status);
struct task *process_chk(struct task *t);
int start_checks();
void health_adjust(struct server *s, short status);
#endif /* _PROTO_CHECKS_H */

View File

@ -18,6 +18,9 @@ enum {
/* Below we have finished checks */
HCHK_STATUS_CHECKED, /* DUMMY STATUS */
HCHK_STATUS_HANA, /* Healt analyze detected enough consecutive errors */
HCHK_STATUS_SOCKERR, /* Socket error */
HCHK_STATUS_L4OK, /* L4 check passed, for example tcp connect */
@ -41,8 +44,51 @@ enum {
HCHK_STATUS_SIZE
};
/*
 * Health analysis status: the outcome of one observed exchange with a server,
 * as passed to health_adjust(). The values index analyze_statuses[], so
 * HANA_STATUS_SIZE must remain the last entry.
 */
enum {
	HANA_STATUS_UNKNOWN           = 0,

	/* L4 successful connection */
	HANA_STATUS_L4_OK             = 1,
	/* L4 unsuccessful connection */
	HANA_STATUS_L4_ERR            = 2,

	/* Correct http response */
	HANA_STATUS_HTTP_OK           = 3,
	/* Wrong http response, for example HTTP 5xx */
	HANA_STATUS_HTTP_STS          = 4,
	/* Invalid http response (headers) */
	HANA_STATUS_HTTP_HDRRSP       = 5,
	/* Invalid http response */
	HANA_STATUS_HTTP_RSP          = 6,

	/* Read error */
	HANA_STATUS_HTTP_READ_ERROR   = 7,
	/* Read timeout */
	HANA_STATUS_HTTP_READ_TIMEOUT = 8,
	/* Unexpected close from server */
	HANA_STATUS_HTTP_BROKEN_PIPE  = 9,

	/* number of statuses, not a status itself */
	HANA_STATUS_SIZE              = 10
};
/*
 * Action taken on a server once enough consecutive errors were observed
 * (the "on-error" server keyword).
 */
enum {
	HANA_ONERR_UNKNOWN   = 0,

	/* Force fastinter */
	HANA_ONERR_FASTINTER = 1,
	/* Simulate a failed check */
	HANA_ONERR_FAILCHK   = 2,
	/* Enter sudden death: one more failed check will mark this server down */
	HANA_ONERR_SUDDTH    = 3,
	/* Mark this server down, now! */
	HANA_ONERR_MARKDWN   = 4
};
/*
 * Observation mode of a server (the "observe" server keyword).
 * HANA_OBS_SIZE counts the modes and must remain the last entry.
 */
enum {
	/* observing disabled */
	HANA_OBS_NONE   = 0,

	/* Observe L4 - for example tcp */
	HANA_OBS_LAYER4 = 1,
	/* Observe L7 - for example http */
	HANA_OBS_LAYER7 = 2,

	HANA_OBS_SIZE   = 3
};
struct check_status {
short result; /* one of SRV_CHK_* */
char *info; /* human readable short info */
char *desc; /* long description */
};
/* Static properties of one health analysis status; analyze_statuses[] holds
 * one such entry per HANA_STATUS_* code.
 */
struct analyze_status {
char *desc; /* description */
unsigned char lr[HANA_OBS_SIZE]; /* verdict per observe mode, indexed by (HANA_OBS_* - 1): 0 = ignore, 1 = error, 2 = OK */
};

View File

@ -81,7 +81,8 @@ struct srvcounters {
} http;
} p;
long long failed_checks, down_trans; /* failed checks and up->down transitions */
long long failed_checks, failed_hana; /* failed health checks and health analyses */
long long down_trans; /* up->down transitions */
};
#endif /* _TYPES_COUNTERS_H */

View File

@ -115,7 +115,10 @@ struct server {
struct sockaddr_in check_addr; /* the address to check, if different from <addr> */
short check_port; /* the port to use for the health checks */
int health; /* 0->rise-1 = bad; rise->rise+fall-1 = good */
int consecutive_errors; /* current number of consecutive errors */
int rise, fall; /* time in iterations */
int consecutive_errors_limit; /* number of consecutive errors that triggers an event */
short observe, onerror; /* observing mode: one of HANA_OBS_*; what to do on error: on of ANA_ONERR_* */
int inter, fastinter, downinter; /* checks: time in milliseconds */
int slowstart; /* slowstart time in seconds (ms in the conf) */
int result; /* health-check result : SRV_CHK_* */
@ -137,9 +140,9 @@ struct server {
unsigned down_time; /* total time the server was down */
time_t last_change; /* last time, when the state was changed */
struct timeval check_start; /* last health check start time */
unsigned long check_duration; /* time in ms took to finish last health check */
long check_duration; /* time in ms took to finish last health check */
short check_status, check_code; /* check result, check code */
char check_desc[HCHK_DESC_LEN]; /* healt check descritpion */
char check_desc[HCHK_DESC_LEN]; /* health check descritpion */
struct freq_ctr sess_per_sec; /* sessions per second on this server */
int puid; /* proxy-unique server ID, used for SNMP */

View File

@ -2618,6 +2618,8 @@ int cfg_parse_listen(const char *file, int linenum, char **args, int kwm)
newsrv->uweight = newsrv->iweight = 1;
newsrv->maxqueue = 0;
newsrv->slowstart = 0;
newsrv->onerror = DEF_HANA_ONERR;
newsrv->consecutive_errors_limit = DEF_HANA_ERRLIMIT;
cur_arg = 3;
while (*args[cur_arg]) {
@ -2823,6 +2825,65 @@ int cfg_parse_listen(const char *file, int linenum, char **args, int kwm)
do_check = 1;
cur_arg += 1;
}
else if (!strcmp(args[cur_arg], "observe")) {
	/* "observe <mode>": select which traffic events adjust the server's health */
	if (!strcmp(args[cur_arg + 1], "none"))
		newsrv->observe = HANA_OBS_NONE;
	else if (!strcmp(args[cur_arg + 1], "layer4"))
		newsrv->observe = HANA_OBS_LAYER4;
	else if (!strcmp(args[cur_arg + 1], "layer7")) {
		/* reported as a non-fatal alert: the mode is still recorded */
		if (curproxy->mode != PR_MODE_HTTP) {
			Alert("parsing [%s:%d]: '%s' can only be used in http proxies.\n",
			      file, linenum, args[cur_arg + 1]);
			err_code |= ERR_ALERT;
		}
		newsrv->observe = HANA_OBS_LAYER7;
	}
	else {
		/* list the keywords that are really accepted above */
		Alert("parsing [%s:%d]: '%s' expects one of 'none', "
		      "'layer4', 'layer7' but get '%s'\n",
		      file, linenum, args[cur_arg], args[cur_arg + 1]);
		err_code |= ERR_ALERT | ERR_FATAL;
		goto out;
	}

	cur_arg += 2;
}
else if (!strcmp(args[cur_arg], "on-error")) {
	/* "on-error <mode>": action taken once error-limit is reached */
	if (!strcmp(args[cur_arg + 1], "fastinter"))
		newsrv->onerror = HANA_ONERR_FASTINTER;
	else if (!strcmp(args[cur_arg + 1], "fail-check"))
		newsrv->onerror = HANA_ONERR_FAILCHK;
	else if (!strcmp(args[cur_arg + 1], "sudden-death"))
		newsrv->onerror = HANA_ONERR_SUDDTH;
	else if (!strcmp(args[cur_arg + 1], "mark-down"))
		newsrv->onerror = HANA_ONERR_MARKDWN;
	else {
		Alert("parsing [%s:%d]: '%s' expects one of 'fastinter', "
		      "'fail-check', 'sudden-death' or 'mark-down' but get '%s'\n",
		      file, linenum, args[cur_arg], args[cur_arg + 1]);
		err_code |= ERR_ALERT | ERR_FATAL;
		goto out;
	}

	cur_arg += 2;
}
else if (!strcmp(args[cur_arg], "error-limit")) {
	/* "error-limit <count>": number of consecutive errors triggering on-error */
	if (!*args[cur_arg + 1]) {
		Alert("parsing [%s:%d]: '%s' expects an integer argument.\n",
		      file, linenum, args[cur_arg]);
		err_code |= ERR_ALERT | ERR_FATAL;
		goto out;
	}

	newsrv->consecutive_errors_limit = atoi(args[cur_arg + 1]);

	/* atoi() yields 0 on non-numeric input, which is rejected here as well */
	if (newsrv->consecutive_errors_limit <= 0) {
		Alert("parsing [%s:%d]: %s has to be > 0.\n",
		      file, linenum, args[cur_arg]);
		err_code |= ERR_ALERT | ERR_FATAL;
		goto out;
	}

	/* skip the keyword and its value; without this the enclosing
	 * while (*args[cur_arg]) loop would parse "error-limit" forever.
	 */
	cur_arg += 2;
}
else if (!strcmp(args[cur_arg], "source")) { /* address to which we bind when connecting */
int port_low, port_high;
if (!*args[cur_arg + 1]) {

View File

@ -52,6 +52,8 @@ const struct check_status check_statuses[HCHK_STATUS_SIZE] = {
[HCHK_STATUS_INI] = { SRV_CHK_UNKNOWN, "INI", "Initializing" },
[HCHK_STATUS_START] = { /* SPECIAL STATUS*/ },
[HCHK_STATUS_HANA] = { SRV_CHK_ERROR, "HANA", "Health analyze" },
[HCHK_STATUS_SOCKERR] = { SRV_CHK_ERROR, "SOCKERR", "Socket error" },
[HCHK_STATUS_L4OK] = { SRV_CHK_RUNNING, "L4OK", "Layer4 check passed" },
@ -72,6 +74,22 @@ const struct check_status check_statuses[HCHK_STATUS_SIZE] = {
[HCHK_STATUS_L7STS] = { SRV_CHK_ERROR, "L7STS", "Layer7 wrong status" },
};
/*
 * Description and per-layer verdict of every HANA_STATUS_* code.
 * lr[0] is the verdict when observing layer4, lr[1] when observing layer7:
 * 0: ignore, 1: error, 2: OK
 */
const struct analyze_status analyze_statuses[HANA_STATUS_SIZE] = {
	[HANA_STATUS_UNKNOWN] = {
		.desc = "Unknown",
		.lr   = { 0, 0 },
	},

	[HANA_STATUS_L4_OK] = {
		.desc = "L4 successful connection",
		.lr   = { 2, 0 },
	},
	[HANA_STATUS_L4_ERR] = {
		.desc = "L4 unsuccessful connection",
		.lr   = { 1, 1 },
	},

	[HANA_STATUS_HTTP_OK] = {
		.desc = "Correct http response",
		.lr   = { 0, 2 },
	},
	[HANA_STATUS_HTTP_STS] = {
		.desc = "Wrong http response",
		.lr   = { 0, 1 },
	},
	[HANA_STATUS_HTTP_HDRRSP] = {
		.desc = "Invalid http response (headers)",
		.lr   = { 0, 1 },
	},
	[HANA_STATUS_HTTP_RSP] = {
		.desc = "Invalid http response",
		.lr   = { 0, 1 },
	},

	[HANA_STATUS_HTTP_READ_ERROR] = {
		.desc = "Read error (http)",
		.lr   = { 0, 1 },
	},
	[HANA_STATUS_HTTP_READ_TIMEOUT] = {
		.desc = "Read timeout (http)",
		.lr   = { 0, 1 },
	},
	[HANA_STATUS_HTTP_BROKEN_PIPE] = {
		.desc = "Close from server (http)",
		.lr   = { 0, 1 },
	},
};
/*
* Convert check_status code to description
*/
@ -108,6 +126,21 @@ const char *get_check_status_info(short check_status) {
return check_statuses[HCHK_STATUS_UNKNOWN].info;
}
/*
 * Convert a health analysis status code (HANA_STATUS_*) to its description.
 * Codes outside [0, HANA_STATUS_SIZE) and entries without a description
 * yield the description of HANA_STATUS_UNKNOWN.
 */
const char *get_analyze_status(short analyze_status) {

	const char *desc = NULL;

	/* <analyze_status> is signed: reject negative values too, they would
	 * index before the start of analyze_statuses[].
	 */
	if (analyze_status >= 0 && analyze_status < HANA_STATUS_SIZE)
		desc = analyze_statuses[analyze_status].desc;

	if (desc && *desc)
		return desc;

	return analyze_statuses[HANA_STATUS_UNKNOWN].desc;
}
#define SSP_O_VIA 0x0001
#define SSP_O_HCHK 0x0002
#define SSP_O_STATUS 0x0004
@ -136,7 +169,8 @@ static void server_status_printf(struct chunk *msg, struct server *s, unsigned o
chunk_printf(msg, "\"");
}
chunk_printf(msg, ", check duration: %lums", s->check_duration);
if (s->check_duration >= 0)
chunk_printf(msg, ", check duration: %ldms", s->check_duration);
}
if (options & SSP_O_STATUS) {
@ -184,9 +218,11 @@ static void set_server_check_status(struct server *s, short status, char *desc)
s->check_status = status;
if (check_statuses[status].result)
s->result |= check_statuses[status].result;
s->result = check_statuses[status].result;
if (!tv_iszero(&s->check_start)) {
if (status == HCHK_STATUS_HANA)
s->check_duration = -1;
else if (!tv_iszero(&s->check_start)) {
/* set_server_check_status() may be called more than once */
s->check_duration = tv_ms_elapsed(&s->check_start, &now);
tv_zero(&s->check_start);
@ -229,6 +265,10 @@ static void set_server_check_status(struct server *s, short status, char *desc)
if (health >= rise)
health = rise + fall - 1; /* OK now */
}
/* clear consecutive_errors if observing is enabled */
if (s->onerror)
s->consecutive_errors = 0;
}
/* FIXME end: calculate local version of the health/rise/fall/state */
@ -505,6 +545,96 @@ static void set_server_enabled(struct server *s) {
set_server_enabled(srv);
}
void health_adjust(struct server *s, short status) {
int failed;
int expire;
/* return now if observing nor health check is not enabled */
if (!s->observe || !s->check)
return;
if (s->observe >= HANA_OBS_SIZE)
return;
if (status >= HCHK_STATUS_SIZE || !analyze_statuses[status].desc)
return;
switch (analyze_statuses[status].lr[s->observe - 1]) {
case 1:
failed = 1;
break;
case 2:
failed = 0;
break;
default:
return;
}
if (!failed) {
/* good: clear consecutive_errors */
s->consecutive_errors = 0;
return;
}
s->consecutive_errors++;
if (s->consecutive_errors < s->consecutive_errors_limit)
return;
sprintf(trash, "Detected %d consecutive errors, last one was: %s",
s->consecutive_errors, get_analyze_status(status));
switch (s->onerror) {
case HANA_ONERR_FASTINTER:
/* force fastinter - nothing to do here as all modes force it */
break;
case HANA_ONERR_SUDDTH:
/* simulate a pre-fatal failed health check */
if (s->health > s->rise)
s->health = s->rise + 1;
/* no break - fall through */
case HANA_ONERR_FAILCHK:
/* simulate a failed health check */
set_server_check_status(s, HCHK_STATUS_HANA, trash);
if (s->health > s->rise) {
s->health--; /* still good */
s->counters.failed_checks++;
}
else
set_server_down(s);
break;
case HANA_ONERR_MARKDWN:
/* mark server down */
s->health = s->rise;
set_server_check_status(s, HCHK_STATUS_HANA, trash);
set_server_down(s);
break;
default:
/* write a warning? */
break;
}
s->consecutive_errors = 0;
s->counters.failed_hana++;
if (s->fastinter) {
expire = tick_add(now_ms, MS_TO_TICKS(s->fastinter));
if (s->check->expire > expire)
s->check->expire = expire;
}
}
/*
* This function is used only for server health-checks. It handles
* the connection acknowledgement. If the proxy requires L7 health-checks,

View File

@ -244,7 +244,7 @@ int print_csv_header(struct chunk *msg)
"pid,iid,sid,throttle,lbtot,tracked,type,"
"rate,rate_lim,rate_max,"
"check_status,check_code,check_duration,"
"hrsp_1xx,hrsp_2xx,hrsp_3xx,hrsp_4xx,hrsp_5xx,hrsp_other,"
"hrsp_1xx,hrsp_2xx,hrsp_3xx,hrsp_4xx,hrsp_5xx,hrsp_other,hanafail,"
"\n");
}
@ -1370,6 +1370,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
chunk_printf(&msg, ",,,,,,");
}
/* failed health analyses */
chunk_printf(&msg, ",");
/* finish with EOL */
chunk_printf(&msg, "\n");
}
@ -1457,6 +1460,8 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
",,,"
/* http response: 1xx, 2xx, 3xx, 4xx, 5xx, other */
",,,,,,"
/* failed health analyses */
","
"\n",
px->id, l->name,
l->nbconn, l->counters->conn_max,
@ -1610,7 +1615,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
if (sv->check_status >= HCHK_STATUS_L57DATA)
chunk_printf(&msg, "/%d", sv->check_code);
if (sv->check_status >= HCHK_STATUS_CHECKED)
if (sv->check_status >= HCHK_STATUS_CHECKED && sv->check_duration >= 0)
chunk_printf(&msg, " in %lums", sv->check_duration);
} else {
chunk_printf(&msg, "</td><td>");
@ -1629,11 +1634,11 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
/* check failures: unique, fatal, down time */
if (sv->state & SRV_CHECKED)
chunk_printf(&msg,
"<td>%lld</td><td>%lld</td>"
"<td>%s</td>"
"<td title=\"Failed Health Checks/Health Analyses\">%lld/%lld</td>"
"<td>%lld</td><td>%s</td>"
"",
svs->counters.failed_checks, svs->counters.down_trans,
human_time(srv_downtime(sv), 1));
svs->counters.failed_checks, svs->counters.failed_hana,
svs->counters.down_trans, human_time(srv_downtime(sv), 1));
else if (sv != svs)
chunk_printf(&msg,
"<td class=ac colspan=3>via %s/%s</td>", svs->proxy->id, svs->id);
@ -1772,6 +1777,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
chunk_printf(&msg, ",,,,,,");
}
/* failed health analyses */
chunk_printf(&msg, "%lld,", sv->counters.failed_hana);
/* finish with EOL */
chunk_printf(&msg, "\n");
}
@ -1919,6 +1927,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri)
chunk_printf(&msg, ",,,,,,");
}
/* failed health analyses */
chunk_printf(&msg, ",");
/* finish with EOL */
chunk_printf(&msg, "\n");

View File

@ -41,6 +41,7 @@
#include <proto/acl.h>
#include <proto/backend.h>
#include <proto/buffers.h>
#include <proto/checks.h>
#include <proto/client.h>
#include <proto/dumpstats.h>
#include <proto/fd.h>
@ -2948,8 +2949,10 @@ int http_wait_for_response(struct session *s, struct buffer *rep, int an_bit)
http_capture_bad_message(&s->be->invalid_rep, s, rep, msg, s->fe);
s->be->counters.failed_resp++;
if (s->srv)
if (s->srv) {
s->srv->counters.failed_resp++;
health_adjust(s->srv, HANA_STATUS_HTTP_HDRRSP);
}
rep->analysers = 0;
txn->status = 502;
@ -2974,8 +2977,10 @@ int http_wait_for_response(struct session *s, struct buffer *rep, int an_bit)
http_capture_bad_message(&s->be->invalid_rep, s, rep, msg, s->fe);
s->be->counters.failed_resp++;
if (s->srv)
if (s->srv) {
s->srv->counters.failed_resp++;
health_adjust(s->srv, HANA_STATUS_HTTP_READ_ERROR);
}
rep->analysers = 0;
txn->status = 502;
@ -2994,8 +2999,10 @@ int http_wait_for_response(struct session *s, struct buffer *rep, int an_bit)
http_capture_bad_message(&s->be->invalid_rep, s, rep, msg, s->fe);
s->be->counters.failed_resp++;
if (s->srv)
if (s->srv) {
s->srv->counters.failed_resp++;
health_adjust(s->srv, HANA_STATUS_HTTP_READ_TIMEOUT);
}
rep->analysers = 0;
txn->status = 504;
@ -3014,8 +3021,10 @@ int http_wait_for_response(struct session *s, struct buffer *rep, int an_bit)
http_capture_bad_message(&s->be->invalid_rep, s, rep, msg, s->fe);
s->be->counters.failed_resp++;
if (s->srv)
if (s->srv) {
s->srv->counters.failed_resp++;
health_adjust(s->srv, HANA_STATUS_HTTP_BROKEN_PIPE);
}
rep->analysers = 0;
txn->status = 502;
@ -3070,6 +3079,11 @@ int http_wait_for_response(struct session *s, struct buffer *rep, int an_bit)
txn->status = strl2ui(rep->data + msg->sl.st.c, msg->sl.st.c_l);
if (txn->status >= 100 && txn->status < 500)
health_adjust(s->srv, HANA_STATUS_HTTP_OK);
else
health_adjust(s->srv, HANA_STATUS_HTTP_STS);
/*
* 2: check for cacheability.
*/
@ -3257,8 +3271,10 @@ int http_process_res_common(struct session *t, struct buffer *rep, int an_bit, s
if (rule_set->rsp_exp != NULL) {
if (apply_filters_to_response(t, rep, rule_set->rsp_exp) < 0) {
return_bad_resp:
if (t->srv)
if (t->srv) {
t->srv->counters.failed_resp++;
health_adjust(t->srv, HANA_STATUS_HTTP_RSP);
}
cur_proxy->counters.failed_resp++;
return_srv_prx_502:
rep->analysers = 0;

View File

@ -43,6 +43,7 @@
#include <proto/acl.h>
#include <proto/backend.h>
#include <proto/buffers.h>
#include <proto/checks.h>
#include <proto/fd.h>
#include <proto/log.h>
#include <proto/port_range.h>

View File

@ -22,6 +22,7 @@
#include <proto/acl.h>
#include <proto/backend.h>
#include <proto/buffers.h>
#include <proto/checks.h>
#include <proto/dumpstats.h>
#include <proto/hdr_idx.h>
#include <proto/log.h>
@ -249,6 +250,8 @@ int sess_update_st_cer(struct session *s, struct stream_interface *si)
{
/* we probably have to release last session from the server */
if (s->srv) {
health_adjust(s->srv, HANA_STATUS_L4_ERR);
if (s->flags & SN_CURR_SESS) {
s->flags &= ~SN_CURR_SESS;
s->srv->cur_sess--;
@ -327,6 +330,9 @@ void sess_establish(struct session *s, struct stream_interface *si)
struct buffer *req = si->ob;
struct buffer *rep = si->ib;
if (s->srv)
health_adjust(s->srv, HANA_STATUS_L4_OK);
if (s->be->mode == PR_MODE_TCP) { /* let's allow immediate data connection in this case */
buffer_set_rlim(rep, rep->size); /* no rewrite needed */