BUG/MINOR: checks: refine which errno values are really errors.

Two regtest regularly fail in a random fashion depending on the machine's
load (one could really wonder if it's really worth keeping such
unreproducible tests) :
  - tcp-check_multiple_ports.vtc
  - 4be_1srv_smtpchk_httpchk_layer47errors.vtc

It happens that one of the reason is the time it takes to connect to
the local socket (hence the load-dependent aspect): if connect() on the
loopback returns EINPROGRESS then this status is reported instead of a
real error. Normally such a test is expected to see the error cleaned
by tcp_connect_probe() but it really depends on the timing and instead
we may very well send() first and see this error. The problem is that
everything is collected based on errno, hoping it won't get molested
in the way from the last unsuccesful syscall to wake_srv_chk(), which
obviously is hard to guarantee.

This patch at least makes sure that a few non-errors are reported as
zero just like EAGAIN. It doesn't fix the root cause but makes it less
likely to report incorrect failures.

This fix could be backported as far as 1.9.
This commit is contained in:
Willy Tarreau 2019-12-27 12:03:27 +01:00
parent 7aed6ef8e3
commit c8dc20a825

View File

@ -137,6 +137,17 @@ static const struct analyze_status analyze_statuses[HANA_STATUS_SIZE] = { /* 0:
[HANA_STATUS_HTTP_BROKEN_PIPE] = { "Close from server (http)", { 0, 1 }},
};
/* checks if <err> is a real error for errno or one that can be ignored, and
* return 0 for these ones or <err> for real ones.
*/
static inline int unclean_errno(int err)
{
if (err == EAGAIN || err == EINPROGRESS ||
err == EISCONN || err == EALREADY)
return 0;
return err;
}
/*
* Convert check_status code to description
*/
@ -548,7 +559,7 @@ static int retrieve_errno_from_socket(struct connection *conn)
int skerr;
socklen_t lskerr = sizeof(skerr);
if (conn->flags & CO_FL_ERROR && ((errno && errno != EAGAIN) || !conn->ctrl))
if (conn->flags & CO_FL_ERROR && (unclean_errno(errno) || !conn->ctrl))
return 1;
if (!conn_ctrl_ready(conn))
@ -557,8 +568,7 @@ static int retrieve_errno_from_socket(struct connection *conn)
if (getsockopt(conn->handle.fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) == 0)
errno = skerr;
if (errno == EAGAIN)
errno = 0;
errno = unclean_errno(errno);
if (!errno) {
/* we could not retrieve an error, that does not mean there is
@ -599,8 +609,8 @@ static void chk_report_conn_err(struct check *check, int errno_bck, int expired)
if (check->result != CHK_RES_UNKNOWN)
return;
errno = errno_bck;
if (conn && (!errno || errno == EAGAIN))
errno = unclean_errno(errno_bck);
if (conn && errno)
retrieve_errno_from_socket(conn);
if (conn && !(conn->flags & CO_FL_ERROR) &&
@ -644,7 +654,7 @@ static void chk_report_conn_err(struct check *check, int errno_bck, int expired)
}
if (conn && conn->err_code) {
if (errno && errno != EAGAIN)
if (unclean_errno(errno))
chunk_printf(&trash, "%s (%s)%s", conn_err_code_str(conn), strerror(errno),
chk->area);
else
@ -653,7 +663,7 @@ static void chk_report_conn_err(struct check *check, int errno_bck, int expired)
err_msg = trash.area;
}
else {
if (errno && errno != EAGAIN) {
if (unclean_errno(errno)) {
chunk_printf(&trash, "%s%s", strerror(errno),
chk->area);
err_msg = trash.area;