From 87f1dad16da760aa48f8a41089c0d1a20b56689c Mon Sep 17 00:00:00 2001
From: Chris Marchbanks
Date: Mon, 27 Aug 2018 10:41:42 -0600
Subject: [PATCH] throttle resends of alerts to 1 minute by default (#4538)

Signed-off-by: Chris Marchbanks
---
 cmd/prometheus/main.go | 10 +++++-----
 rules/alerting.go      | 26 ++++++++++++++++++++++++++
 rules/manager.go       |  3 ++-
 rules/manager_test.go  | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go
index d090464a3..cf25622c9 100644
--- a/cmd/prometheus/main.go
+++ b/cmd/prometheus/main.go
@@ -89,6 +89,7 @@ func main() {
 		notifierTimeout model.Duration
 		forGracePeriod  model.Duration
 		outageTolerance model.Duration
+		resendDelay     model.Duration
 		web             web.Options
 		tsdb            tsdb.Options
 		lookbackDelta   model.Duration
@@ -173,6 +174,9 @@ func main() {
 	a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored 'for' state. This is maintained only for alerts with configured 'for' time greater than grace period.").
 		Default("10m").SetValue(&cfg.forGracePeriod)
 
+	a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager. Must be lower than resolve_timeout in Alertmanager").
+		Default("1m").SetValue(&cfg.resendDelay)
+
 	a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
 		Default("10000").IntVar(&cfg.notifier.QueueCapacity)
 
@@ -272,6 +276,7 @@ func main() {
 			Logger:          log.With(logger, "component", "rule manager"),
 			OutageTolerance: time.Duration(cfg.outageTolerance),
 			ForGracePeriod:  time.Duration(cfg.forGracePeriod),
+			ResendDelay:     time.Duration(cfg.resendDelay),
 		})
 	)
 
@@ -682,16 +687,11 @@ func computeExternalURL(u, listenAddr string) (*url.URL, error) {
 }
 
 // sendAlerts implements the rules.NotifyFunc for a Notifier.
-// It filters any non-firing alerts from the input.
 func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc {
 	return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
 		var res []*notifier.Alert
 
 		for _, alert := range alerts {
-			// Only send actually firing alerts.
-			if alert.State == rules.StatePending {
-				continue
-			}
 			a := &notifier.Alert{
 				StartsAt: alert.FiredAt,
 				Labels:   alert.Labels,
diff --git a/rules/alerting.go b/rules/alerting.go
index d5c533d17..a542f89e7 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -88,6 +88,20 @@ type Alert struct {
 	ActiveAt   time.Time
 	FiredAt    time.Time
 	ResolvedAt time.Time
+	LastSentAt time.Time
+}
+
+func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
+	if a.State == StatePending {
+		return false
+	}
+
+	// if an alert has been resolved since the last send, resend it
+	if a.ResolvedAt.After(a.LastSentAt) {
+		return true
+	}
+
+	return a.LastSentAt.Add(resendDelay).Before(ts)
 }
 
 // An AlertingRule generates alerts from its vector expression.
@@ -426,6 +440,18 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) {
 	}
 }
 
+func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, notifyFunc NotifyFunc) {
+	alerts := make([]*Alert, 0)
+	r.ForEachActiveAlert(func(alert *Alert) {
+		if alert.needsSending(ts, resendDelay) {
+			alert.LastSentAt = ts
+			anew := *alert
+			alerts = append(alerts, &anew)
+		}
+	})
+	notifyFunc(ctx, r.vector.String(), alerts...)
+}
+
 func (r *AlertingRule) String() string {
 	ar := rulefmt.Rule{
 		Alert: r.name,
diff --git a/rules/manager.go b/rules/manager.go
index eb49a9c4b..2a9ac3397 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -393,7 +393,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 			}
 
 			if ar, ok := rule.(*AlertingRule); ok {
-				g.opts.NotifyFunc(ctx, ar.vector.String(), ar.currentAlerts()...)
+				ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.opts.NotifyFunc)
 			}
 			var (
 				numOutOfOrder = 0
@@ -607,6 +607,7 @@ type ManagerOptions struct {
 	Registerer      prometheus.Registerer
 	OutageTolerance time.Duration
 	ForGracePeriod  time.Duration
+	ResendDelay     time.Duration
 }
 
 // NewManager returns an implementation of Manager, ready to be started
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 109b0bd3a..66c33fe64 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -651,3 +651,54 @@ func TestUpdate(t *testing.T) {
 		}
 	}
 }
+
+func TestNotify(t *testing.T) {
+	storage := testutil.NewStorage(t)
+	defer storage.Close()
+	engine := promql.NewEngine(nil, nil, 10, 10*time.Second)
+	var lastNotified []*Alert
+	notifyFunc := func(ctx context.Context, expr string, alerts ...*Alert) {
+		lastNotified = alerts
+	}
+	opts := &ManagerOptions{
+		QueryFunc:   EngineQueryFunc(engine, storage),
+		Appendable:  storage,
+		TSDB:        storage,
+		Context:     context.Background(),
+		Logger:      log.NewNopLogger(),
+		NotifyFunc:  notifyFunc,
+		ResendDelay: 2 * time.Second,
+	}
+
+	expr, err := promql.ParseExpr("a > 1")
+	testutil.Ok(t, err)
+	rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, true, log.NewNopLogger())
+	group := NewGroup("alert", "", time.Second, []Rule{rule}, true, opts)
+
+	app, _ := storage.Appender()
+	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
+	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 2000, 3)
+	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 5000, 3)
+	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 6000, 0)
+
+	err = app.Commit()
+	testutil.Ok(t, err)
+
+	ctx := context.Background()
+
+	// Alert sent right away
+	group.Eval(ctx, time.Unix(1, 0))
+	testutil.Equals(t, 1, len(lastNotified))
+
+	// Alert is not sent 1s later
+	group.Eval(ctx, time.Unix(2, 0))
+	testutil.Equals(t, 0, len(lastNotified))
+
+	// Alert is resent at t=5s
+	group.Eval(ctx, time.Unix(5, 0))
+	testutil.Equals(t, 1, len(lastNotified))
+
+	// Resolution alert sent right away
+	group.Eval(ctx, time.Unix(6, 0))
+	testutil.Equals(t, 1, len(lastNotified))
+}
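
As a quick illustration of the throttling rule this patch introduces, here is a small standalone Go sketch that mirrors the `needsSending` logic from `rules/alerting.go`. The `Alert` and `AlertState` types below are trimmed stand-ins defined only for the example, not the commit's code: pending alerts are never sent, an alert resolved since the last send goes out immediately, and a firing alert is resent only once the configured resend delay (default `1m` via `--rules.alert.resend-delay`) has elapsed since the last send.

```go
package main

import (
	"fmt"
	"time"
)

// AlertState and Alert are trimmed stand-ins for the types in Prometheus'
// rules package; only the fields needsSending inspects are kept.
type AlertState int

const (
	StatePending AlertState = iota
	StateFiring
)

type Alert struct {
	State      AlertState
	ResolvedAt time.Time
	LastSentAt time.Time
}

// needsSending mirrors the rule added in rules/alerting.go: never send pending
// alerts, resend immediately after a resolution, otherwise wait out resendDelay.
func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
	if a.State == StatePending {
		return false
	}
	// An alert resolved since the last send is resent right away.
	if a.ResolvedAt.After(a.LastSentAt) {
		return true
	}
	return a.LastSentAt.Add(resendDelay).Before(ts)
}

func main() {
	now := time.Now()

	firing := &Alert{State: StateFiring, LastSentAt: now}
	fmt.Println(firing.needsSending(now.Add(30*time.Second), time.Minute)) // false: still inside the 1m delay
	fmt.Println(firing.needsSending(now.Add(90*time.Second), time.Minute)) // true: delay has elapsed

	resolved := &Alert{State: StateFiring, LastSentAt: now, ResolvedAt: now.Add(5 * time.Second)}
	fmt.Println(resolved.needsSending(now.Add(6*time.Second), time.Minute)) // true: resolutions bypass the delay
}
```

This is the same timeline TestNotify exercises with a 2s ResendDelay: the alert is sent at t=1s, suppressed at t=2s, resent at t=5s, and the resolution at t=6s is sent immediately.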