diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5cc5b8f40..6a4d478ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## unreleased
 
 * [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633
+* [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within the alert rule) when restoring alerts. #13980
 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. #13974
 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991
 
diff --git a/rules/alerting.go b/rules/alerting.go
index edcdfe5e0..2d2b19efe 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -246,13 +246,16 @@ func (r *AlertingRule) sample(alert *Alert, ts time.Time) promql.Sample {
 	return s
 }
 
-// forStateSample returns the sample for ALERTS_FOR_STATE.
+// forStateSample returns a promql.Sample with the rule labels, `ALERTS_FOR_STATE` as the metric name, and the rule name as the `alertname` label.
+// Optionally, if an alert is provided, its labels are copied into the sample labels.
 func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) promql.Sample {
 	lb := labels.NewBuilder(r.labels)
 
-	alert.Labels.Range(func(l labels.Label) {
-		lb.Set(l.Name, l.Value)
-	})
+	if alert != nil {
+		alert.Labels.Range(func(l labels.Label) {
+			lb.Set(l.Name, l.Value)
+		})
+	}
 
 	lb.Set(labels.MetricName, alertForStateMetricName)
 	lb.Set(labels.AlertName, r.name)
@@ -265,9 +268,11 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro
 	return s
 }
 
-// QueryforStateSeries returns the series for ALERTS_FOR_STATE.
-func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q storage.Querier) (storage.Series, error) {
-	smpl := r.forStateSample(alert, time.Now(), 0)
+// QueryForStateSeries returns the series for ALERTS_FOR_STATE of the alert rule.
+func (r *AlertingRule) QueryForStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) {
+	// We use a sample to ease the building of matchers.
+	// Don't provide an alert, as we want matchers that match all series for the alert rule.
+	smpl := r.forStateSample(nil, time.Now(), 0)
 	var matchers []*labels.Matcher
 	smpl.Metric.Range(func(l labels.Label) {
 		mt, err := labels.NewMatcher(labels.MatchEqual, l.Name, l.Value)
@@ -276,20 +281,9 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q
 		}
 		matchers = append(matchers, mt)
 	})
+
 	sset := q.Select(ctx, false, nil, matchers...)
-
-	var s storage.Series
-	for sset.Next() {
-		// Query assures that smpl.Metric is included in sset.At().Labels(),
-		// hence just checking the length would act like equality.
-		// (This is faster than calling labels.Compare again as we already have some info).
-		if sset.At().Labels().Len() == len(matchers) {
-			s = sset.At()
-			break
-		}
-	}
-
-	return s, sset.Err()
+	return sset, sset.Err()
 }
 
 // SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation.
@@ -557,6 +551,14 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) {
 	}
 }
 
+// ActiveAlertsCount returns the number of alerts currently active for the rule.
+func (r *AlertingRule) ActiveAlertsCount() int {
+	r.activeMtx.Lock()
+	defer r.activeMtx.Unlock()
+
+	return len(r.active)
+}
+
 func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay, interval time.Duration, notifyFunc NotifyFunc) {
 	alerts := []*Alert{}
 	r.ForEachActiveAlert(func(alert *Alert) {
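Note on the API change above: `QueryforStateSeries` used to run one Select per alert and return the single matching series; the renamed `QueryForStateSeries` returns the whole `ALERTS_FOR_STATE` series set for the rule and leaves filtering to the caller. A minimal sketch of the intended call pattern, mirroring the `rules/group.go` hunk further down; the package name, wrapper function, and error handling here are illustrative, not part of the patch:

```go
package rulesdoc // hypothetical package for this sketch

import (
	"context"

	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage"
)

// collectForStateSeries issues one Select for the whole rule and buckets the
// returned series by label set, keyed the same way RestoreForState does.
func collectForStateSeries(ctx context.Context, rule *rules.AlertingRule, q storage.Querier) (map[string]storage.Series, error) {
	sset, err := rule.QueryForStateSeries(ctx, q)
	if err != nil {
		return nil, err
	}
	// Pre-size with the active-alert count; close enough to the series count.
	byLabels := make(map[string]storage.Series, rule.ActiveAlertsCount())
	for sset.Next() {
		s := sset.At()
		// Drop __name__ (ALERTS_FOR_STATE) so keys line up with alert label sets.
		byLabels[s.Labels().DropMetricName().String()] = s
	}
	return byLabels, sset.Err()
}
```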
diff --git a/rules/alerting_test.go b/rules/alerting_test.go
index ddfe345ef..5fae3edd1 100644
--- a/rules/alerting_test.go
+++ b/rules/alerting_test.go
@@ -710,19 +710,17 @@ func TestQueryForStateSeries(t *testing.T) {
 			labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
 		)
 
-		alert := &Alert{
-			State:       0,
-			Labels:      labels.EmptyLabels(),
-			Annotations: labels.EmptyLabels(),
-			Value:       0,
-			ActiveAt:    time.Time{},
-			FiredAt:     time.Time{},
-			ResolvedAt:  time.Time{},
-			LastSentAt:  time.Time{},
-			ValidUntil:  time.Time{},
-		}
+		sample := rule.forStateSample(nil, time.Time{}, 0)
 
-		series, err := rule.QueryforStateSeries(context.Background(), alert, querier)
+		seriesSet, err := rule.QueryForStateSeries(context.Background(), querier)
+
+		var series storage.Series
+		for seriesSet.Next() {
+			if seriesSet.At().Labels().Len() == sample.Metric.Len() {
+				series = seriesSet.At()
+				break
+			}
+		}
 
 		require.Equal(t, tst.expectedSeries, series)
 		require.Equal(t, tst.expectedError, err)
@@ -1025,3 +1023,24 @@ func TestAlertingRule_SetNoDependencyRules(t *testing.T) {
 	rule.SetNoDependencyRules(true)
 	require.True(t, rule.NoDependencyRules())
 }
+
+func TestAlertingRule_ActiveAlertsCount(t *testing.T) {
+	rule := NewAlertingRule(
+		"TestRule",
+		nil,
+		time.Minute,
+		0,
+		labels.FromStrings("severity", "critical"),
+		labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
+	)
+
+	require.Equal(t, 0, rule.ActiveAlertsCount())
+
+	// Set an active alert.
+	lbls := labels.FromStrings("a1", "1")
+	h := lbls.Hash()
+	al := &Alert{State: StateFiring, Labels: lbls, ActiveAt: time.Now()}
+	rule.active[h] = al
+
+	require.Equal(t, 1, rule.ActiveAlertsCount())
+}
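The updated test now filters the series set itself, leaning on the same fact the deleted loop in `rules/alerting.go` documented: the query is built entirely from equality matchers, so every returned series contains the queried labels as a subset, and an equal label count then implies the label sets are equal. A standalone sketch of that reasoning, with made-up label values:

```go
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	// The matchers are all equality matchers built from `queried`, so any
	// series the querier returns must include `queried` as a subset.
	queried := labels.FromStrings("__name__", "ALERTS_FOR_STATE", "alertname", "HTTPRequestRateLow")
	returned := labels.FromStrings("__name__", "ALERTS_FOR_STATE", "alertname", "HTTPRequestRateLow", "severity", "critical")

	// Subset plus equal length would imply equality; here the extra label
	// means this is not the exact series being looked for.
	fmt.Println(returned.Len() == queried.Len()) // false
}
```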
diff --git a/rules/group.go b/rules/group.go
index 987136a00..3a97c9564 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -664,25 +664,36 @@ func (g *Group) RestoreForState(ts time.Time) {
 			continue
 		}
 
+		sset, err := alertRule.QueryForStateSeries(g.opts.Context, q)
+		if err != nil {
+			level.Error(g.logger).Log(
+				"msg", "Failed to restore 'for' state",
+				labels.AlertName, alertRule.Name(),
+				"stage", "Select",
+				"err", err,
+			)
+			continue
+		}
+
+		// While not technically the same number of series we expect, it's as good an approximation as any.
+		seriesByLabels := make(map[string]storage.Series, alertRule.ActiveAlertsCount())
+		for sset.Next() {
+			seriesByLabels[sset.At().Labels().DropMetricName().String()] = sset.At()
+		}
+
+		// No results for this alert rule.
+		if len(seriesByLabels) == 0 {
+			level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name())
+			continue
+		}
+
 		alertRule.ForEachActiveAlert(func(a *Alert) {
 			var s storage.Series
 
-			s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q)
-			if err != nil {
-				// Querier Warnings are ignored. We do not care unless we have an error.
-				level.Error(g.logger).Log(
-					"msg", "Failed to restore 'for' state",
-					labels.AlertName, alertRule.Name(),
-					"stage", "Select",
-					"err", err,
-				)
+			s, ok := seriesByLabels[a.Labels.String()]
+			if !ok {
 				return
 			}
-
-			if s == nil {
-				return
-			}
-
 			// Series found for the 'for' state.
 			var t int64
 			var v float64
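The `rules/group.go` hunk above is the heart of the change: one Select per rule, then a map lookup per alert instead of a query per alert. The lookup is sound because `forStateSample` writes each series as the alert's labels plus the `ALERTS_FOR_STATE` metric name, so dropping `__name__` recovers exactly the alert's label set. A standalone sketch of that invariant, with illustrative label values:

```go
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	// Labels of an ALERTS_FOR_STATE series as forStateSample writes them:
	// the alert's labels plus the metric name.
	series := labels.FromStrings(
		"__name__", "ALERTS_FOR_STATE",
		"alertname", "HTTPRequestRateLow",
		"severity", "critical",
	)
	// Labels of the corresponding active alert.
	alert := labels.FromStrings(
		"alertname", "HTTPRequestRateLow",
		"severity", "critical",
	)

	// Dropping __name__ makes the map key equal to the alert's own key,
	// so seriesByLabels[a.Labels.String()] finds the right series.
	fmt.Println(series.DropMetricName().String() == alert.String()) // true
}
```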
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 50ab6b861..07159145f 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -397,122 +397,120 @@ func TestForStateRestore(t *testing.T) {
 		group.Eval(context.TODO(), evalTime)
 	}
 
-	exp := rule.ActiveAlerts()
-	for _, aa := range exp {
-		require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels)
-	}
-	sort.Slice(exp, func(i, j int) bool {
-		return labels.Compare(exp[i].Labels, exp[j].Labels) < 0
-	})
-
 	// Prometheus goes down here. We create new rules and groups.
 	type testInput struct {
+		name            string
 		restoreDuration time.Duration
-		alerts          []*Alert
+		expectedAlerts  []*Alert
 		num             int
 		noRestore       bool
 		gracePeriod     bool
 		downDuration    time.Duration
+		before          func()
 	}
 
 	tests := []testInput{
 		{
-			// Normal restore (alerts were not firing).
+			name:            "normal restore (alerts were not firing)",
 			restoreDuration: 15 * time.Minute,
-			alerts:          rule.ActiveAlerts(),
+			expectedAlerts:  rule.ActiveAlerts(),
 			downDuration:    10 * time.Minute,
 		},
 		{
-			// Testing Outage Tolerance.
+			name:            "outage tolerance",
 			restoreDuration: 40 * time.Minute,
 			noRestore:       true,
 			num:             2,
 		},
 		{
-			// No active alerts.
+			name:            "no active alerts",
 			restoreDuration: 50 * time.Minute,
-			alerts:          []*Alert{},
+			expectedAlerts:  []*Alert{},
+		},
+		{
+			name:            "test the grace period",
+			restoreDuration: 25 * time.Minute,
+			expectedAlerts:  []*Alert{},
+			gracePeriod:     true,
+			before: func() {
+				for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} {
+					evalTime := baseTime.Add(duration)
+					group.Eval(context.TODO(), evalTime)
+				}
+			},
+			num: 2,
 		},
 	}
 
-	testFunc := func(tst testInput) {
-		newRule := NewAlertingRule(
-			"HTTPRequestRateLow",
-			expr,
-			alertForDuration,
-			0,
-			labels.FromStrings("severity", "critical"),
-			labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil,
-		)
-		newGroup := NewGroup(GroupOptions{
-			Name:          "default",
-			Interval:      time.Second,
-			Rules:         []Rule{newRule},
-			ShouldRestore: true,
-			Opts:          opts,
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.before != nil {
+				tt.before()
+			}
+
+			newRule := NewAlertingRule(
+				"HTTPRequestRateLow",
+				expr,
+				alertForDuration,
+				0,
+				labels.FromStrings("severity", "critical"),
+				labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil,
+			)
+			newGroup := NewGroup(GroupOptions{
+				Name:          "default",
+				Interval:      time.Second,
+				Rules:         []Rule{newRule},
+				ShouldRestore: true,
+				Opts:          opts,
+			})
+
+			newGroups := make(map[string]*Group)
+			newGroups["default;"] = newGroup
+
+			restoreTime := baseTime.Add(tt.restoreDuration)
+			// First eval before restoration.
+			newGroup.Eval(context.TODO(), restoreTime)
+			// Restore happens here.
+			newGroup.RestoreForState(restoreTime)
+
+			got := newRule.ActiveAlerts()
+			for _, aa := range got {
+				require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels)
+			}
+			sort.Slice(got, func(i, j int) bool {
+				return labels.Compare(got[i].Labels, got[j].Labels) < 0
+			})
+
+			// Checking if we have restored it correctly.
+			switch {
+			case tt.noRestore:
+				require.Len(t, got, tt.num)
+				for _, e := range got {
+					require.Equal(t, e.ActiveAt, restoreTime)
+				}
+			case tt.gracePeriod:
+
+				require.Len(t, got, tt.num)
+				for _, e := range got {
+					require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime))
+				}
+			default:
+				exp := tt.expectedAlerts
+				require.Equal(t, len(exp), len(got))
+				sortAlerts(exp)
+				sortAlerts(got)
+				for i, e := range exp {
+					require.Equal(t, e.Labels, got[i].Labels)
+
+					// Difference in time should be within 1e6 ns, i.e. 1ms
+					// (due to conversion between ns & ms, float64 & int64).
+					activeAtDiff := float64(e.ActiveAt.Unix() + int64(tt.downDuration/time.Second) - got[i].ActiveAt.Unix())
+					require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong")
+				}
+			}
 		})
-
-		newGroups := make(map[string]*Group)
-		newGroups["default;"] = newGroup
-
-		restoreTime := baseTime.Add(tst.restoreDuration)
-		// First eval before restoration.
-		newGroup.Eval(context.TODO(), restoreTime)
-		// Restore happens here.
-		newGroup.RestoreForState(restoreTime)
-
-		got := newRule.ActiveAlerts()
-		for _, aa := range got {
-			require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels)
-		}
-		sort.Slice(got, func(i, j int) bool {
-			return labels.Compare(got[i].Labels, got[j].Labels) < 0
-		})
-
-		// Checking if we have restored it correctly.
-		switch {
-		case tst.noRestore:
-			require.Len(t, got, tst.num)
-			for _, e := range got {
-				require.Equal(t, e.ActiveAt, restoreTime)
-			}
-		case tst.gracePeriod:
-			require.Len(t, got, tst.num)
-			for _, e := range got {
-				require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime))
-			}
-		default:
-			exp := tst.alerts
-			require.Equal(t, len(exp), len(got))
-			sortAlerts(exp)
-			sortAlerts(got)
-			for i, e := range exp {
-				require.Equal(t, e.Labels, got[i].Labels)
-
-				// Difference in time should be within 1e6 ns, i.e. 1ms
-				// (due to conversion between ns & ms, float64 & int64).
-				activeAtDiff := float64(e.ActiveAt.Unix() + int64(tst.downDuration/time.Second) - got[i].ActiveAt.Unix())
-				require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong")
-			}
-		}
 	}
-
-	for _, tst := range tests {
-		testFunc(tst)
-	}
-
-	// Testing the grace period.
-	for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} {
-		evalTime := baseTime.Add(duration)
-		group.Eval(context.TODO(), evalTime)
-	}
-	testFunc(testInput{
-		restoreDuration: 25 * time.Minute,
-		alerts:          []*Alert{},
-		gracePeriod:     true,
-		num:             2,
-	})
 }
 
 func TestStaleness(t *testing.T) {
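For reference, the relation asserted by the `gracePeriod` branch above: when the outage is too long to restore the original `ActiveAt` exactly, `RestoreForState` moves `ActiveAt` forward so the restored alert must still hold for the grace period after the restart. A standalone sketch of that arithmetic; the concrete durations are illustrative (10m also happens to be the default of Prometheus's `--rules.alert.for-grace-period` flag):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Illustrative values; in the test these come from opts.ForGracePeriod
	// and the rule's `for` clause (alertForDuration).
	var (
		restoreTime      = time.Date(2024, 4, 1, 12, 0, 0, 0, time.UTC)
		forGracePeriod   = 10 * time.Minute
		alertForDuration = 25 * time.Minute
	)

	// RestoreForState picks ActiveAt so that
	// ActiveAt + alertForDuration == restoreTime + forGracePeriod.
	activeAt := restoreTime.Add(forGracePeriod - alertForDuration)

	// This is exactly what the gracePeriod branch of the test asserts.
	fmt.Println(activeAt.Add(alertForDuration).Sub(restoreTime) == forGracePeriod) // true
}
```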