Simplify and fix notification grouping.

This commit changes the notification grouping behavior
to simply send all alerts of a group as soon as a single
one of them needs updating.

This fixes a critical bug which caused erroneous resolved
notifications to be sent.
This commit is contained in:
Fabian Reinartz 2016-01-08 15:15:14 +01:00
parent 3de6b062a2
commit 11fae2a719
3 changed files with 188 additions and 139 deletions

View File

@ -213,6 +213,29 @@ func Dedup(notifies provider.Notifies, n Notifier) *DedupingNotifier {
return &DedupingNotifier{notifies: notifies, notifier: n} return &DedupingNotifier{notifies: notifies, notifier: n}
} }
// hasUpdate reports whether a notification should be sent for alert, given
// the last notification recorded for it. last is nil when no notification
// has been recorded.
//
// It returns false in three situations:
//   - the alert is resolved and no previous notification exists,
//   - the alert is resolved and the last notification was a resolved one,
//   - the alert is not resolved, the last notification was not a resolved
//     one either, and now is not after last.Timestamp plus interval.
//
// In every other situation it returns true.
func (n *DedupingNotifier) hasUpdate(alert *types.Alert, last *types.NotifyInfo, now time.Time, interval time.Duration) bool {
	resolved := alert.Resolved()

	// Never notified before: only an unresolved alert is worth sending.
	if last == nil {
		return !resolved
	}
	// Resolved now: send only if the last notification was not a resolved one.
	if resolved {
		return !last.Resolved
	}
	// Not resolved now, but the last notification was a resolved one.
	if last.Resolved {
		return true
	}
	// Same unresolved state as last time: repeat only once the interval
	// has elapsed since the last notification.
	return now.After(last.Timestamp.Add(interval))
}
// Notify implements the Notifier interface. // Notify implements the Notifier interface.
func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error { func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
name, ok := Receiver(ctx) name, ok := Receiver(ctx)
@ -235,57 +258,27 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) e
fps = append(fps, a.Fingerprint()) fps = append(fps, a.Fingerprint())
} }
notifies, err := n.notifies.Get(name, fps...) notifyInfo, err := n.notifies.Get(name, fps...)
if err != nil { if err != nil {
return err return err
} }
var ( // If we have to notify about any of the alerts, we send a notification
doResend bool // for the entire batch.
resendQueue []*types.Alert var send bool
filtered []*types.Alert for i, alert := range alerts {
) if n.hasUpdate(alert, notifyInfo[i], now, repeatInterval) {
for i, a := range alerts { send = true
last := notifies[i] break
if last != nil {
if a.Resolved() {
if last.Resolved {
continue
}
} else if !last.Resolved {
// Do not send again if last was delivered unless
// the repeat interval has already passed.
if !now.After(last.Timestamp.Add(repeatInterval)) {
// To not repeat initial batch fragmentation after the repeat interval
// has passed, store them and send them anyway if on of the other
// alerts has already passed the repeat interval.
// This way we unify batches again.
resendQueue = append(resendQueue, a)
continue
} else {
doResend = true
}
}
} else if a.Resolved() {
// If the alert is resolved but we never notified about it firing,
// there is nothing to do.
continue
} }
filtered = append(filtered, a)
} }
if !send {
// As we are resending an alert anyway, resend all of them even if their return nil
// repeat interval has not yet passed.
if doResend {
filtered = append(filtered, resendQueue...)
} }
var newNotifies []*types.NotifyInfo var newNotifies []*types.NotifyInfo
for _, a := range filtered { for _, a := range alerts {
newNotifies = append(newNotifies, &types.NotifyInfo{ newNotifies = append(newNotifies, &types.NotifyInfo{
Alert: a.Fingerprint(), Alert: a.Fingerprint(),
Receiver: name, Receiver: name,
@ -294,7 +287,7 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) e
}) })
} }
if err := n.notifier.Notify(ctx, filtered...); err != nil { if err := n.notifier.Notify(ctx, alerts...); err != nil {
return err return err
} }

View File

@ -43,6 +43,129 @@ func (n *failNotifier) Notify(ctx context.Context, as ...*types.Alert) error {
return fmt.Errorf("some error") return fmt.Errorf("some error")
} }
// TestDedupingNotifierHasUpdate runs DedupingNotifier.hasUpdate against a
// table of alerts and previous notification records and checks the returned
// decision for each combination.
func TestDedupingNotifierHasUpdate(t *testing.T) {
	var (
		n        = &DedupingNotifier{}
		now      = time.Now()
		interval = 100 * time.Minute
		// Every case uses the same label set, so the fingerprint stored in
		// the previous notification records is computed once.
		fp = model.LabelSet{"alertname": "a"}.Fingerprint()
	)
	cases := []struct {
		inAlert      *types.Alert
		inNotifyInfo *types.NotifyInfo
		result       bool
	}{
		// A new alert about which there's no previous notification information.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: nil,
			result:       true,
		},
		// A new alert about which there's no previous notification information.
		// It is already resolved, so there's no use in sending a notification.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: nil,
			result:       false,
		},
		// An alert that has been firing is now resolved for the first time.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     fp,
				Resolved:  false,
				Timestamp: now.Add(-time.Minute),
			},
			result: true,
		},
		// A resolved alert for which we have already sent a resolved notification.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     fp,
				Resolved:  true,
				Timestamp: now.Add(-time.Minute),
			},
			result: false,
		},
		// An alert that was resolved last time but is now firing again.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-3 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     fp,
				Resolved:  true,
				Timestamp: now.Add(-4 * time.Minute),
			},
			result: true,
		},
		// A firing alert about which we already notified. The last notification
		// is less than the repeat interval ago.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     fp,
				Resolved:  false,
				Timestamp: now.Add(-15 * time.Minute),
			},
			result: false,
		},
		// A firing alert about which we already notified. The last notification
		// is more than the repeat interval ago.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     fp,
				Resolved:  false,
				Timestamp: now.Add(-115 * time.Minute),
			},
			result: true,
		},
	}

	for i, c := range cases {
		// Report both values so a failing case can be diagnosed from the
		// test output alone.
		got := n.hasUpdate(c.inAlert, c.inNotifyInfo, now, interval)
		if got != c.result {
			t.Errorf("case %d: hasUpdate returned %v, expected %v", i, got, c.result)
		}
	}
}
func TestDedupingNotifier(t *testing.T) { func TestDedupingNotifier(t *testing.T) {
var ( var (
record = &recordNotifier{} record = &recordNotifier{}
@ -66,69 +189,19 @@ func TestDedupingNotifier(t *testing.T) {
Labels: model.LabelSet{"alertname": "1"}, Labels: model.LabelSet{"alertname": "1"},
EndsAt: now.Add(-5 * time.Minute), EndsAt: now.Add(-5 * time.Minute),
}, },
}, {
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "2"},
EndsAt: now.Add(-9 * time.Minute),
},
}, {
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "3"},
EndsAt: now.Add(-10 * time.Minute),
},
}, {
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "4"},
},
}, {
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "5"},
},
}, },
} }
var fps []model.Fingerprint // Set an initial NotifyInfo to ensure that on notification failure
for _, a := range alerts { // nothing changes.
fps = append(fps, a.Fingerprint())
}
nsBefore := []*types.NotifyInfo{ nsBefore := []*types.NotifyInfo{
// The first a new alert starting now.
nil, nil,
// The second alert was not previously notified about and
// is already resolved.
nil,
// The third alert is an attempt to resolve a previously
// firing alert.
{ {
Alert: fps[2], Alert: alerts[1].Fingerprint(),
Receiver: "name", Receiver: "name",
Resolved: false, Resolved: false,
Timestamp: now.Add(-10 * time.Minute), Timestamp: now.Add(-10 * time.Minute),
}, },
// The fourth alert is an attempt to resolve an alert again
// even though the previous notification succeeded.
{
Alert: fps[3],
Receiver: "name",
Resolved: true,
Timestamp: now.Add(-10 * time.Minute),
},
// The fifth alert resends a previously successful notification
// that was longer than ago than the repeat interval.
{
Alert: fps[4],
Receiver: "name",
Resolved: false,
Timestamp: now.Add(-110 * time.Minute),
},
// The sixth alert is a firing again after being resolved before.
{
Alert: fps[5],
Receiver: "name",
Resolved: true,
Timestamp: now.Add(3 * time.Minute),
},
} }
if err := notifies.Set(nsBefore...); err != nil { if err := notifies.Set(nsBefore...); err != nil {
@ -140,7 +213,7 @@ func TestDedupingNotifier(t *testing.T) {
t.Fatalf("Fail notifier did not fail") t.Fatalf("Fail notifier did not fail")
} }
// After a failing notify the notifies data must be unchanged. // After a failing notify the notifies data must be unchanged.
nsCur, err := notifies.Get("name", fps...) nsCur, err := notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
if err != nil { if err != nil {
t.Fatalf("Error getting notify info: %s", err) t.Fatalf("Error getting notify info: %s", err)
} }
@ -153,46 +226,29 @@ func TestDedupingNotifier(t *testing.T) {
t.Fatalf("Notify failed: %s", err) t.Fatalf("Notify failed: %s", err)
} }
alertsExp := []*types.Alert{ if !reflect.DeepEqual(record.alerts, alerts) {
alerts[0], t.Fatalf("Expected alerts %v, got %v", alerts, record.alerts)
alerts[2], }
alerts[4], nsCur, err = notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
alerts[5], if err != nil {
t.Fatalf("Error getting notifies: %s", err)
} }
nsAfter := []*types.NotifyInfo{ nsAfter := []*types.NotifyInfo{
{ {
Alert: fps[0], Alert: alerts[0].Fingerprint(),
Receiver: "name", Receiver: "name",
Resolved: false, Resolved: false,
}, Timestamp: now,
nil,
{
Alert: fps[2],
Receiver: "name",
Resolved: true,
},
nsBefore[3],
{
Alert: fps[4],
Receiver: "name",
Resolved: false,
}, },
{ {
Alert: fps[5], Alert: alerts[1].Fingerprint(),
Receiver: "name", Receiver: "name",
Resolved: false, Resolved: true,
Timestamp: now,
}, },
} }
if !reflect.DeepEqual(record.alerts, alertsExp) {
t.Fatalf("Expected alerts %v, got %v", alertsExp, record.alerts)
}
nsCur, err = notifies.Get("name", fps...)
if err != nil {
t.Fatalf("Error getting notifies: %s", err)
}
for i, after := range nsAfter { for i, after := range nsAfter {
cur := nsCur[i] cur := nsCur[i]

View File

@ -229,19 +229,22 @@ receivers:
am := at.Alertmanager(fmt.Sprintf(conf, wh.Address())) am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))
am.Push(At(1.1), Alert("alertname", "test1").Active(1)) am.Push(At(1.1), Alert("alertname", "test1").Active(1))
am.Push(At(1.9), Alert("alertname", "test5").Active(1)) am.Push(At(1.7), Alert("alertname", "test5").Active(1))
am.Push(At(2.3),
Alert("alertname", "test2").Active(1.5),
Alert("alertname", "test3").Active(1.5),
Alert("alertname", "test4").Active(1.6),
)
co.Want(Between(2.0, 2.5), co.Want(Between(2.0, 2.5),
Alert("alertname", "test1").Active(1), Alert("alertname", "test1").Active(1),
Alert("alertname", "test5").Active(1), Alert("alertname", "test5").Active(1),
) )
// Only expect the new ones with the next group interval.
co.Want(Between(3, 3.5), am.Push(At(3.3),
Alert("alertname", "test2").Active(1.5),
Alert("alertname", "test3").Active(1.5),
Alert("alertname", "test4").Active(1.6),
)
co.Want(Between(4.1, 4.5),
Alert("alertname", "test1").Active(1),
Alert("alertname", "test5").Active(1),
Alert("alertname", "test2").Active(1.5), Alert("alertname", "test2").Active(1.5),
Alert("alertname", "test3").Active(1.5), Alert("alertname", "test3").Active(1.5),
Alert("alertname", "test4").Active(1.6), Alert("alertname", "test4").Active(1.6),
@ -250,10 +253,7 @@ receivers:
// While no changes happen expect no additional notifications // While no changes happen expect no additional notifications
// until the 5s repeat interval has ended. // until the 5s repeat interval has ended.
// The last three notifications should sent with the first two even co.Want(Between(9.1, 9.5),
// though their repeat interval has not yet passed. This way fragmented
// batches are unified and notification noise reduced.
co.Want(Between(7, 7.5),
Alert("alertname", "test1").Active(1), Alert("alertname", "test1").Active(1),
Alert("alertname", "test5").Active(1), Alert("alertname", "test5").Active(1),
Alert("alertname", "test2").Active(1.5), Alert("alertname", "test2").Active(1.5),