diff --git a/dispatch.go b/dispatch.go
index cb18f6e4..a425951f 100644
--- a/dispatch.go
+++ b/dispatch.go
@@ -167,8 +167,6 @@ func (d *Dispatcher) processAlert(alert *types.Alert, opts *RouteOpts) {
 		ag = newAggrGroup(d.ctx, group, opts)
 		groups[fp] = ag
 
-		ag.log = log.With("aggrGroup", ag)
-
 		go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
 			err := d.notifier.Notify(ctx, alerts...)
 			if err != nil {
@@ -209,6 +207,8 @@ func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *
 	}
 	ag.ctx, ag.cancel = context.WithCancel(ctx)
 
+	ag.log = log.With("aggrGroup", ag)
+
 	// Set an initial one-time wait before flushing
 	// the first batch of notifications.
 	ag.next = time.NewTimer(ag.opts.GroupWait)
@@ -231,6 +231,7 @@ func (ag *aggrGroup) run(nf notifyFunc) {
 	if timeout < notify.MinTimeout {
 		timeout = notify.MinTimeout
 	}
+	fmt.Println("starting at", time.Now())
 
 	for {
 		select {
@@ -255,6 +256,7 @@ func (ag *aggrGroup) run(nf notifyFunc) {
 			// Wait the configured interval before calling flush again.
 			ag.next.Reset(ag.opts.GroupInterval)
 
+			fmt.Println("flushing at", now)
 			ag.flush(func(alerts ...*types.Alert) bool {
 				return nf(ctx, alerts...)
 			})
@@ -288,7 +290,8 @@ func (ag *aggrGroup) insert(alert *types.Alert) {
 
 	// Immediately trigger a flush if the wait duration for this
 	// alert is already over.
-	if !ag.hasSent && alert.UpdatedAt.Add(ag.opts.GroupWait).Before(time.Now()) {
+	if !ag.hasSent && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
+		fmt.Println("early")
 		ag.next.Reset(0)
 	}
 }
diff --git a/dispatch_test.go b/dispatch_test.go
new file mode 100644
index 00000000..e8d40969
--- /dev/null
+++ b/dispatch_test.go
@@ -0,0 +1,220 @@
+package main
+
+import (
+	"reflect"
+	"sort"
+	"testing"
+	"time"
+
+	"github.com/prometheus/common/model"
+	"golang.org/x/net/context"
+
+	"github.com/prometheus/alertmanager/notify"
+	"github.com/prometheus/alertmanager/types"
+)
+
+func TestAggrGroup(t *testing.T) {
+	lset := model.LabelSet{
+		"a": "v1",
+		"b": "v2",
+	}
+	opts := &RouteOpts{
+		SendTo:         "n1",
+		SendResolved:   true,
+		GroupBy:        map[model.LabelName]struct{}{},
+		GroupWait:      1 * time.Second,
+		GroupInterval:  300 * time.Millisecond,
+		RepeatInterval: 1 * time.Hour,
+	}
+
+	var (
+		a1 = &types.Alert{
+			Alert: model.Alert{
+				Labels: model.LabelSet{
+					"a": "v1",
+					"b": "v2",
+					"c": "v3",
+				},
+				StartsAt: time.Now().Add(time.Minute),
+				EndsAt:   time.Now().Add(time.Hour),
+			},
+			UpdatedAt: time.Now(),
+		}
+		a2 = &types.Alert{
+			Alert: model.Alert{
+				Labels: model.LabelSet{
+					"a": "v1",
+					"b": "v2",
+					"c": "v4",
+				},
+				StartsAt: time.Now().Add(-time.Hour),
+				EndsAt:   time.Now().Add(2 * time.Hour),
+			},
+			UpdatedAt: time.Now(),
+		}
+		a3 = &types.Alert{
+			Alert: model.Alert{
+				Labels: model.LabelSet{
+					"a": "v1",
+					"b": "v2",
+					"c": "v5",
+				},
+				StartsAt: time.Now().Add(time.Minute),
+				EndsAt:   time.Now().Add(5 * time.Minute),
+			},
+			UpdatedAt: time.Now(),
+		}
+	)
+
+	var (
+		last     = time.Now()
+		current  = time.Now()
+		alertsCh = make(chan types.AlertSlice)
+	)
+
+	ntfy := func(ctx context.Context, alerts ...*types.Alert) bool {
+		// Validate that the context is properly populated.
+		if _, ok := notify.Now(ctx); !ok {
+			t.Errorf("now missing")
+		}
+		if _, ok := notify.GroupKey(ctx); !ok {
+			t.Errorf("group key missing")
+		}
+		if lbls, ok := notify.GroupLabels(ctx); !ok || !reflect.DeepEqual(lbls, lset) {
+			t.Errorf("wrong group labels: %q", lbls)
+		}
+		if dest, ok := notify.Destination(ctx); !ok || dest != opts.SendTo {
+			t.Errorf("wrong destination: %q", dest)
+		}
+		if ri, ok := notify.RepeatInterval(ctx); !ok || ri != opts.RepeatInterval {
+			t.Errorf("wrong repeat interval: %q", ri)
+		}
+		if sr, ok := notify.SendResolved(ctx); !ok || sr != opts.SendResolved {
+			t.Errorf("wrong send_resolved: %v", sr)
+		}
+
+		last = current
+		current = time.Now()
+
+		alertsCh <- types.AlertSlice(alerts)
+
+		return true
+	}
+
+	// Test the regular case where we wait for group_wait before sending out alerts.
+	ag := newAggrGroup(context.Background(), lset, opts)
+	go ag.run(ntfy)
+
+	ag.insert(a1)
+
+	select {
+	case <-time.After(2 * opts.GroupWait):
+		t.Fatalf("expected initial batch after group_wait")
+
+	case batch := <-alertsCh:
+		if s := time.Since(last); s < opts.GroupWait {
+			t.Fatalf("received batch too early after %v", s)
+		}
+		exp := types.AlertSlice{a1}
+		sort.Sort(batch)
+
+		if !reflect.DeepEqual(batch, exp) {
+			t.Fatalf("expected alerts %v but got %v", exp, batch)
+		}
+	}
+
+	for i := 0; i < 3; i++ {
+		// A new alert should arrive after the group interval.
+		ag.insert(a3)
+
+		select {
+		case <-time.After(2 * opts.GroupInterval):
+			t.Fatalf("expected new batch after group interval but received none")
+
+		case batch := <-alertsCh:
+			if s := time.Since(last); s < opts.GroupInterval {
+				t.Fatalf("received batch too early after %v", s)
+			}
+			exp := types.AlertSlice{a1, a3}
+			sort.Sort(batch)
+
+			if !reflect.DeepEqual(batch, exp) {
+				t.Fatalf("expected alerts %v but got %v", exp, batch)
+			}
+		}
+	}
+
+	ag.stop()
+
+	// Add an alert that started more than group_wait in the past. We expect
+	// immediate flushing.
+	// Finally, set all alerts to be resolved. After a successful notify the
+	// aggregation group should empty itself.
+	ag = newAggrGroup(context.Background(), lset, opts)
+	go ag.run(ntfy)
+
+	ag.insert(a1)
+	ag.insert(a2)
+
+	// a2 lies way in the past so the initial group_wait should be skipped.
+	select {
+	case <-time.After(opts.GroupWait / 2):
+		t.Fatalf("expected immediate alert but received none")
+
+	case batch := <-alertsCh:
+		exp := types.AlertSlice{a1, a2}
+		sort.Sort(batch)
+
+		if !reflect.DeepEqual(batch, exp) {
+			t.Fatalf("expected alerts %v but got %v", exp, batch)
+		}
+	}
+
+	for i := 0; i < 3; i++ {
+		// A new alert should arrive after the group interval.
+		ag.insert(a3)
+
+		select {
+		case <-time.After(2 * opts.GroupInterval):
+			t.Fatalf("expected new batch after group interval but received none")
+
+		case batch := <-alertsCh:
+			if s := time.Since(last); s < opts.GroupInterval {
+				t.Fatalf("received batch too early after %v", s)
+			}
+			exp := types.AlertSlice{a1, a2, a3}
+			sort.Sort(batch)
+
+			if !reflect.DeepEqual(batch, exp) {
+				t.Fatalf("expected alerts %v but got %v", exp, batch)
+			}
+		}
+	}
+
+	// Resolve all alerts; they should be removed after the next batch was sent.
+	a1.EndsAt = time.Now()
+	a2.EndsAt = time.Now()
+	a3.EndsAt = time.Now()
+
+	select {
+	case <-time.After(2 * opts.GroupInterval):
+		t.Fatalf("expected new batch after group interval but received none")
+
+	case batch := <-alertsCh:
+		if s := time.Since(last); s < opts.GroupInterval {
+			t.Fatalf("received batch too early after %v", s)
+		}
+		exp := types.AlertSlice{a1, a2, a3}
+		sort.Sort(batch)
+
+		if !reflect.DeepEqual(batch, exp) {
+			t.Fatalf("expected alerts %v but got %v", exp, batch)
+		}
+
+		if !ag.empty() {
+			t.Fatalf("expected aggregation group to be empty after resolving alerts")
+		}
+	}
+
+	ag.stop()
+}
diff --git a/types/types.go b/types/types.go
index 6e8cc940..531a51b0 100644
--- a/types/types.go
+++ b/types/types.go
@@ -44,6 +44,12 @@ type Alert struct {
 	Timeout bool `json:"-"`
 }
 
+type AlertSlice []*Alert
+
+func (as AlertSlice) Less(i, j int) bool { return as[i].UpdatedAt.Before(as[j].UpdatedAt) }
+func (as AlertSlice) Swap(i, j int)      { as[i], as[j] = as[j], as[i] }
+func (as AlertSlice) Len() int           { return len(as) }
+
 // Alerts turns a sequence of internal alerts into a list of
 // exposable model.Alert structures.
 func Alerts(alerts ...*Alert) model.Alerts {