BUG/MINOR: server: don't miss server stats update on server state transitions

s->last_change and s->down_time updates were manually updated for each
effective server state change within srv_update_status().

This is rather error-prone, and as a result there were still some state
transitions that were not handled properly since at least 1.8.

ie:
- when transitionning from DRAIN to READY: downtime was updated
  (which is wrong since a server in DRAIN state should not be
   considered as DOWN)
- when transitionning from MAINT to READY: downtime was not updated
  (this can be easily seen in the html stats page)

To fix these all at once, and prevent similar bugs from being introduced,
we centralize the server last_change and down_time stats logic at the end
of srv_update_status():

If the server state changed during the call, then it means that
last_change must be updated, with a special case when changing from
STOPPED state which means the server was previously DOWN and thus
downtime should be updated.

This patch depends on:

- "MINOR: server: explicitly commit state change in srv_update_status()"

This could be backported to every stable versions.
This commit is contained in:
Aurelien DARRAGON 2023-04-18 13:52:27 +02:00 committed by Christopher Faulet
parent e80ddb18a8
commit 9f5853fa38

View File

@ -5287,6 +5287,7 @@ static void srv_update_status(struct server *s)
struct proxy *px = s->proxy;
int prev_srv_count = s->proxy->srv_bck + s->proxy->srv_act;
int srv_was_stopping = (s->cur_state == SRV_ST_STOPPING) || (s->cur_admin & SRV_ADMF_DRAIN);
enum srv_state srv_prev_state = s->cur_state;
int log_level;
struct buffer *tmptrash = NULL;
@ -5301,7 +5302,6 @@ static void srv_update_status(struct server *s)
s->next_admin = s->cur_admin;
if ((s->cur_state != SRV_ST_STOPPED) && (s->next_state == SRV_ST_STOPPED)) {
s->last_change = now.tv_sec;
if (s->proxy->lbprm.set_server_status_down)
s->proxy->lbprm.set_server_status_down(s);
@ -5338,7 +5338,6 @@ static void srv_update_status(struct server *s)
s->counters.down_trans++;
}
else if ((s->cur_state != SRV_ST_STOPPING) && (s->next_state == SRV_ST_STOPPING)) {
s->last_change = now.tv_sec;
if (s->proxy->lbprm.set_server_status_down)
s->proxy->lbprm.set_server_status_down(s);
@ -5366,10 +5365,6 @@ static void srv_update_status(struct server *s)
else if (((s->cur_state != SRV_ST_RUNNING) && (s->next_state == SRV_ST_RUNNING))
|| ((s->cur_state != SRV_ST_STARTING) && (s->next_state == SRV_ST_STARTING))) {
if (s->cur_state == SRV_ST_STOPPED && s->last_change < now.tv_sec) // ignore negative times
s->down_time += now.tv_sec - s->last_change;
s->last_change = now.tv_sec;
if (s->next_state == SRV_ST_STARTING && s->warmup)
task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20))));
@ -5471,7 +5466,6 @@ static void srv_update_status(struct server *s)
}
else { /* server was still running */
check->health = 0; /* failure */
s->last_change = now.tv_sec;
s->next_state = SRV_ST_STOPPED;
if (s->proxy->lbprm.set_server_status_down)
@ -5540,7 +5534,6 @@ static void srv_update_status(struct server *s)
s->next_state = SRV_ST_STOPPING;
}
else {
s->last_change = now.tv_sec;
s->next_state = SRV_ST_STARTING;
if (s->slowstart > 0) {
if (s->warmup)
@ -5679,7 +5672,6 @@ static void srv_update_status(struct server *s)
if (!(s->cur_admin & SRV_ADMF_DRAIN) && (s->next_admin & SRV_ADMF_DRAIN)) {
/* drain state is applied only if not yet in maint */
s->last_change = now.tv_sec;
if (px->lbprm.set_server_status_down)
px->lbprm.set_server_status_down(s);
@ -5710,10 +5702,6 @@ static void srv_update_status(struct server *s)
}
else if ((s->cur_admin & SRV_ADMF_DRAIN) && !(s->next_admin & SRV_ADMF_DRAIN)) {
/* OK completely leaving drain mode */
if (s->last_change < now.tv_sec) // ignore negative times
s->down_time += now.tv_sec - s->last_change;
s->last_change = now.tv_sec;
server_recalc_eweight(s, 0);
tmptrash = alloc_trash_chunk();
@ -5793,6 +5781,16 @@ static void srv_update_status(struct server *s)
*/
srv_lb_commit_status(s);
/* check if server stats must be updated due the the server state change */
if (srv_prev_state != s->cur_state) {
if (srv_prev_state == SRV_ST_STOPPED) {
/* server was down and no longer is */
if (s->last_change < now.tv_sec) // ignore negative times
s->down_time += now.tv_sec - s->last_change;
}
s->last_change = now.tv_sec;
}
/* check if backend stats must be updated due to the server state change */
if (prev_srv_count && s->proxy->srv_bck == 0 && s->proxy->srv_act == 0)
set_backend_down(s->proxy); /* backend going down */