*: consider mesh wait in notification timeouts

This adds the peer wait duration to the standard timeout to avoid
terminating a notification prematurely while being in failover
wait status.
This commit is contained in:
Fabian Reinartz 2016-09-05 13:19:14 +02:00
parent c01c1989a7
commit e9fbe62e0f
3 changed files with 42 additions and 27 deletions

View File

@ -183,6 +183,14 @@ func main() {
log.Fatal(err)
}
waitFunc := meshWait(mrouter, 5*time.Second)
timeoutFunc := func(d time.Duration) time.Duration {
if d < notify.MinTimeout {
d = notify.MinTimeout
}
return d + waitFunc()
}
reload := func() (err error) {
log.With("file", *configFile).Infof("Loading configuration file")
defer func() {
@ -215,13 +223,13 @@ func main() {
pipeline = notify.BuildPipeline(
conf.Receivers,
tmpl,
meshWait(mrouter, 5*time.Second),
waitFunc,
inhibitor,
silences,
notificationLog,
marker,
)
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker)
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc)
go disp.Run()
go inhibitor.Run()

View File

@ -22,7 +22,8 @@ type Dispatcher struct {
alerts provider.Alerts
stage notify.Stage
marker types.Marker
marker types.Marker
timeout func(time.Duration) time.Duration
aggrGroups map[*Route]map[model.Fingerprint]*aggrGroup
mtx sync.RWMutex
@ -35,13 +36,20 @@ type Dispatcher struct {
}
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(ap provider.Alerts, r *Route, s notify.Stage, mk types.Marker) *Dispatcher {
func NewDispatcher(
ap provider.Alerts,
r *Route,
s notify.Stage,
mk types.Marker,
to func(time.Duration) time.Duration,
) *Dispatcher {
disp := &Dispatcher{
alerts: ap,
stage: s,
route: r,
marker: mk,
log: log.With("component", "dispatcher"),
alerts: ap,
stage: s,
route: r,
marker: mk,
timeout: to,
log: log.With("component", "dispatcher"),
}
return disp
}
@ -229,7 +237,7 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
// If the group does not exist, create it.
ag, ok := groups[fp]
if !ok {
ag = newAggrGroup(d.ctx, group, &route.RouteOpts)
ag = newAggrGroup(d.ctx, group, &route.RouteOpts, d.timeout)
groups[fp] = ag
go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
@ -253,10 +261,11 @@ type aggrGroup struct {
routeFP model.Fingerprint
log log.Logger
ctx context.Context
cancel func()
done chan struct{}
next *time.Timer
ctx context.Context
cancel func()
done chan struct{}
next *time.Timer
timeout func(time.Duration) time.Duration
mtx sync.RWMutex
alerts map[model.Fingerprint]*types.Alert
@ -264,11 +273,15 @@ type aggrGroup struct {
}
// newAggrGroup returns a new aggregation group.
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts, to func(time.Duration) time.Duration) *aggrGroup {
if to == nil {
to = func(d time.Duration) time.Duration { return d }
}
ag := &aggrGroup{
labels: labels,
opts: opts,
alerts: map[model.Fingerprint]*types.Alert{},
labels: labels,
opts: opts,
timeout: to,
alerts: map[model.Fingerprint]*types.Alert{},
}
ag.ctx, ag.cancel = context.WithCancel(ctx)
@ -302,18 +315,12 @@ func (ag *aggrGroup) run(nf notifyFunc) {
defer close(ag.done)
defer ag.next.Stop()
timeout := ag.opts.GroupInterval
if timeout < notify.MinTimeout {
timeout = notify.MinTimeout
}
for {
select {
case now := <-ag.next.C:
// Give the notifcations time until the next flush to
// finish before terminating them.
ctx, cancel := context.WithTimeout(ag.ctx, timeout)
ctx, cancel := context.WithTimeout(ag.ctx, ag.timeout(ag.opts.GroupInterval))
// The now time we retrieve from the ticker is the only reliable
// point of time reference for the subsequent notification pipeline.

View File

@ -98,7 +98,7 @@ func TestAggrGroup(t *testing.T) {
}
// Test regular situation where we wait for group_wait to send out alerts.
ag := newAggrGroup(context.Background(), lset, opts)
ag := newAggrGroup(context.Background(), lset, opts, nil)
go ag.run(ntfy)
ag.insert(a1)
@ -146,7 +146,7 @@ func TestAggrGroup(t *testing.T) {
// immediate flushing.
// Finally, set all alerts to be resolved. After successful notify the aggregation group
// should empty itself.
ag = newAggrGroup(context.Background(), lset, opts)
ag = newAggrGroup(context.Background(), lset, opts, nil)
go ag.run(ntfy)
ag.insert(a1)