MINOR: contrib/prometheus-exporter: Add the last healthcheck duration metric

ST_F_CHECK_DURATION is now part of exported server metrics, named
haproxy_server_check_duration_seconds and expressed in seconds. For a given
server, this value is exported only if the healthcheck is finished (the status
is greater than or equal to HCHK_STATUS_CHECKED).

This patch fixes issue #519. It may be backported as far as 2.0.
This commit is contained in:
Christopher Faulet 2020-02-27 16:12:07 +01:00
parent 81725b867c
commit 2711e51016
2 changed files with 11 additions and 4 deletions

View File

@ -272,6 +272,7 @@ Exported metrics
| haproxy_server_weight | Service weight. | | haproxy_server_weight | Service weight. |
| haproxy_server_check_status | Status of last health check, if enabled. (see below for the mapping) | | haproxy_server_check_status | Status of last health check, if enabled. (see below for the mapping) |
| haproxy_server_check_code | layer5-7 code, if available of the last health check. | | haproxy_server_check_code | layer5-7 code, if available of the last health check. |
| haproxy_server_check_duration_seconds | Total duration of the latest server health check, in seconds. |
| haproxy_server_check_failures_total | Total number of failed check (Only when the server is up). | | haproxy_server_check_failures_total | Total number of failed check (Only when the server is up). |
| haproxy_server_check_up_down_total | Total number of UP->DOWN transitions. | | haproxy_server_check_up_down_total | Total number of UP->DOWN transitions. |
| haproxy_server_downtime_seconds_total | Total downtime (in seconds) for the service. | | haproxy_server_downtime_seconds_total | Total downtime (in seconds) for the service. |

View File

@ -388,8 +388,8 @@ const int promex_srv_metrics[ST_F_TOTAL_FIELDS] = {
[ST_F_RATE_LIM] = 0, [ST_F_RATE_LIM] = 0,
[ST_F_RATE_MAX] = ST_F_LASTSESS, [ST_F_RATE_MAX] = ST_F_LASTSESS,
[ST_F_CHECK_STATUS] = ST_F_CHECK_CODE, [ST_F_CHECK_STATUS] = ST_F_CHECK_CODE,
[ST_F_CHECK_CODE] = ST_F_CHKFAIL, [ST_F_CHECK_CODE] = ST_F_CHECK_DURATION,
[ST_F_CHECK_DURATION] = 0, [ST_F_CHECK_DURATION] = ST_F_CHKFAIL,
[ST_F_HRSP_1XX] = ST_F_HRSP_2XX, [ST_F_HRSP_1XX] = ST_F_HRSP_2XX,
[ST_F_HRSP_2XX] = ST_F_HRSP_3XX, [ST_F_HRSP_2XX] = ST_F_HRSP_3XX,
[ST_F_HRSP_3XX] = ST_F_HRSP_4XX, [ST_F_HRSP_3XX] = ST_F_HRSP_4XX,
@ -552,7 +552,7 @@ const struct ist promex_st_metric_names[ST_F_TOTAL_FIELDS] = {
[ST_F_RATE_MAX] = IST("max_session_rate"), [ST_F_RATE_MAX] = IST("max_session_rate"),
[ST_F_CHECK_STATUS] = IST("check_status"), [ST_F_CHECK_STATUS] = IST("check_status"),
[ST_F_CHECK_CODE] = IST("check_code"), [ST_F_CHECK_CODE] = IST("check_code"),
[ST_F_CHECK_DURATION] = IST("check_duration_milliseconds"), [ST_F_CHECK_DURATION] = IST("check_duration_seconds"),
[ST_F_HRSP_1XX] = IST("http_responses_total"), [ST_F_HRSP_1XX] = IST("http_responses_total"),
[ST_F_HRSP_2XX] = IST("http_responses_total"), [ST_F_HRSP_2XX] = IST("http_responses_total"),
[ST_F_HRSP_3XX] = IST("http_responses_total"), [ST_F_HRSP_3XX] = IST("http_responses_total"),
@ -715,7 +715,7 @@ const struct ist promex_st_metric_desc[ST_F_TOTAL_FIELDS] = {
[ST_F_RATE_MAX] = IST("Maximum observed number of sessions per second."), [ST_F_RATE_MAX] = IST("Maximum observed number of sessions per second."),
[ST_F_CHECK_STATUS] = IST("Status of last health check (HCHK_STATUS_* values)."), [ST_F_CHECK_STATUS] = IST("Status of last health check (HCHK_STATUS_* values)."),
[ST_F_CHECK_CODE] = IST("layer5-7 code, if available of the last health check."), [ST_F_CHECK_CODE] = IST("layer5-7 code, if available of the last health check."),
[ST_F_CHECK_DURATION] = IST("Time in ms took to finish last health check."), [ST_F_CHECK_DURATION] = IST("Total duration of the latest server health check, in seconds."),
[ST_F_HRSP_1XX] = IST("Total number of HTTP responses."), [ST_F_HRSP_1XX] = IST("Total number of HTTP responses."),
[ST_F_HRSP_2XX] = IST("Total number of HTTP responses."), [ST_F_HRSP_2XX] = IST("Total number of HTTP responses."),
[ST_F_HRSP_3XX] = IST("Total number of HTTP responses."), [ST_F_HRSP_3XX] = IST("Total number of HTTP responses."),
@ -2037,6 +2037,12 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
goto next_sv; goto next_sv;
metric = mkf_u32(FN_OUTPUT, (sv->check.status < HCHK_STATUS_L57DATA) ? 0 : sv->check.code); metric = mkf_u32(FN_OUTPUT, (sv->check.status < HCHK_STATUS_L57DATA) ? 0 : sv->check.code);
break; break;
case ST_F_CHECK_DURATION:
if (sv->check.status < HCHK_STATUS_CHECKED)
goto next_sv;
secs = (double)sv->check.duration / 1000.0;
metric = mkf_flt(FN_DURATION, secs);
break;
case ST_F_CHKFAIL: case ST_F_CHKFAIL:
metric = mkf_u64(FN_COUNTER, sv->counters.failed_checks); metric = mkf_u64(FN_COUNTER, sv->counters.failed_checks);
break; break;