From c10186eeea1edbe466f8738fc8fcc8871a334f82 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 3 May 2024 14:23:46 +0100 Subject: [PATCH] BUGFIX: Mark the rule's restoration process as completed always (#14048) * BUGFIX: Mark the rule's restoration process as completed always In https://github.com/prometheus/prometheus/pull/13980 I introduced a change to reduce the number of queries executed when we restore alert statuses. With this, the querying semantics changed as we now need to go through all series before we enter the alert restoration loop and I missed the fact that exiting early when there are no rules to restore would lead to an incomplete restoration. An alert being restored is used as a proxy for "we're now ready to write `ALERTS/ALERTS_FOR_SERIES` metrics" so as a result we weren't writing the series if we didn't restore anything the first time around. --------- Signed-off-by: gotjosh --- rules/group.go | 6 +++++- rules/manager_test.go | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/rules/group.go b/rules/group.go index 3a97c9564..1f4757de3 100644 --- a/rules/group.go +++ b/rules/group.go @@ -672,6 +672,9 @@ func (g *Group) RestoreForState(ts time.Time) { "stage", "Select", "err", err, ) + // Even if we failed to query the `ALERT_FOR_STATE` series, we currently have no way to retry the restore process. + // So the best we can do is mark the rule as restored and let it eventually fire. + alertRule.SetRestored(true) continue } @@ -683,7 +686,8 @@ func (g *Group) RestoreForState(ts time.Time) { // No results for this alert rule. if len(seriesByLabels) == 0 { - level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) + level.Debug(g.logger).Log("msg", "No series found to restore the 'for' state of the alert rule", labels.AlertName, alertRule.Name()) + alertRule.SetRestored(true) continue } diff --git a/rules/manager_test.go b/rules/manager_test.go index 07159145f..c45569bef 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -482,6 +482,9 @@ func TestForStateRestore(t *testing.T) { return labels.Compare(got[i].Labels, got[j].Labels) < 0 }) + // In all cases, we expect the restoration process to have completed. + require.Truef(t, newRule.Restored(), "expected the rule restoration process to have completed") + // Checking if we have restored it correctly. switch { case tt.noRestore: