alertmanager/dispatch/dispatch.go

350 lines
8.2 KiB
Go
Raw Normal View History

// Copyright 2018 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dispatch
import (
2015-09-26 09:12:47 +00:00
"fmt"
"sort"
"sync"
"time"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/common/model"
"golang.org/x/net/context"
2015-09-25 11:12:51 +00:00
2015-09-29 13:12:31 +00:00
"github.com/prometheus/alertmanager/notify"
2015-09-25 11:12:51 +00:00
"github.com/prometheus/alertmanager/provider"
2015-09-25 12:38:57 +00:00
"github.com/prometheus/alertmanager/types"
)
// Dispatcher sorts incoming alerts into aggregation groups and
// assigns the correct notifiers to each.
type Dispatcher struct {
2016-08-12 17:18:26 +00:00
route *Route
alerts provider.Alerts
stage notify.Stage
marker types.Marker
timeout func(time.Duration) time.Duration
aggrGroups map[*Route]map[model.Fingerprint]*aggrGroup
mtx sync.RWMutex
done chan struct{}
ctx context.Context
cancel func()
2015-09-29 09:58:30 +00:00
logger log.Logger
}
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(
ap provider.Alerts,
r *Route,
s notify.Stage,
mk types.Marker,
to func(time.Duration) time.Duration,
l log.Logger,
) *Dispatcher {
disp := &Dispatcher{
alerts: ap,
stage: s,
route: r,
marker: mk,
timeout: to,
logger: log.With(l, "component", "dispatcher"),
}
return disp
}
// Run starts dispatching alerts incoming via the updates channel.
func (d *Dispatcher) Run() {
d.done = make(chan struct{})
d.mtx.Lock()
d.aggrGroups = map[*Route]map[model.Fingerprint]*aggrGroup{}
d.mtx.Unlock()
d.ctx, d.cancel = context.WithCancel(context.Background())
d.run(d.alerts.Subscribe())
close(d.done)
}
func (d *Dispatcher) run(it provider.AlertIterator) {
cleanup := time.NewTicker(30 * time.Second)
defer cleanup.Stop()
defer it.Close()
for {
select {
case alert, ok := <-it.Next():
if !ok {
// Iterator exhausted for some reason.
if err := it.Err(); err != nil {
level.Error(d.logger).Log("msg", "Error on alert update", "err", err)
}
return
}
level.Debug(d.logger).Log("msg", "Received alert", "alert", alert)
2015-09-29 09:58:30 +00:00
// Log errors but keep trying.
if err := it.Err(); err != nil {
level.Error(d.logger).Log("msg", "Error on alert update", "err", err)
continue
}
2015-10-19 14:17:15 +00:00
for _, r := range d.route.Match(alert.Labels) {
d.processAlert(alert, r)
}
case <-cleanup.C:
2015-12-04 13:55:42 +00:00
d.mtx.Lock()
for _, groups := range d.aggrGroups {
for _, ag := range groups {
if ag.empty() {
ag.stop()
delete(groups, ag.fingerprint())
}
}
}
2015-12-04 13:55:42 +00:00
d.mtx.Unlock()
case <-d.ctx.Done():
return
}
}
}
// Stop the dispatcher.
func (d *Dispatcher) Stop() {
if d == nil || d.cancel == nil {
return
}
d.cancel()
2015-09-25 16:14:46 +00:00
d.cancel = nil
2015-09-26 09:12:47 +00:00
<-d.done
}
// notifyFunc is a function that performs notifcation for the alert
// with the given fingerprint. It aborts on context cancelation.
2015-09-26 16:12:56 +00:00
// Returns false iff notifying failed.
type notifyFunc func(context.Context, ...*types.Alert) bool
2016-06-06 11:18:55 +00:00
// processAlert determines in which aggregation group the alert falls
// and inserts it.
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
groupLabels := model.LabelSet{}
for ln, lv := range alert.Labels {
if _, ok := route.RouteOpts.GroupBy[ln]; ok {
groupLabels[ln] = lv
}
}
fp := groupLabels.Fingerprint()
d.mtx.Lock()
defer d.mtx.Unlock()
group, ok := d.aggrGroups[route]
if !ok {
group = map[model.Fingerprint]*aggrGroup{}
d.aggrGroups[route] = group
}
// If the group does not exist, create it.
ag, ok := group[fp]
if !ok {
ag = newAggrGroup(d.ctx, groupLabels, route, d.timeout, d.logger)
group[fp] = ag
go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
_, _, err := d.stage.Exec(ctx, d.logger, alerts...)
if err != nil {
level.Error(d.logger).Log("msg", "Notify for alerts failed", "num_alerts", len(alerts), "err", err)
}
return err == nil
})
}
ag.insert(alert)
}
// aggrGroup aggregates alert fingerprints into groups to which a
// common set of routing options applies.
// It emits notifications in the specified intervals.
type aggrGroup struct {
labels model.LabelSet
opts *RouteOpts
logger log.Logger
routeKey string
ctx context.Context
cancel func()
done chan struct{}
next *time.Timer
timeout func(time.Duration) time.Duration
mtx sync.RWMutex
alerts map[model.Fingerprint]*types.Alert
hasFlushed bool
}
// newAggrGroup returns a new aggregation group.
func newAggrGroup(ctx context.Context, labels model.LabelSet, r *Route, to func(time.Duration) time.Duration, logger log.Logger) *aggrGroup {
if to == nil {
to = func(d time.Duration) time.Duration { return d }
}
ag := &aggrGroup{
labels: labels,
routeKey: r.Key(),
opts: &r.RouteOpts,
timeout: to,
alerts: map[model.Fingerprint]*types.Alert{},
}
ag.ctx, ag.cancel = context.WithCancel(ctx)
ag.logger = log.With(logger, "aggrGroup", ag)
2015-10-27 17:24:09 +00:00
2015-10-07 14:18:55 +00:00
// Set an initial one-time wait before flushing
// the first batch of notifications.
ag.next = time.NewTimer(ag.opts.GroupWait)
return ag
}
func (ag *aggrGroup) fingerprint() model.Fingerprint {
return ag.labels.Fingerprint()
}
func (ag *aggrGroup) GroupKey() string {
return fmt.Sprintf("%s:%s", ag.routeKey, ag.labels)
}
func (ag *aggrGroup) String() string {
return ag.GroupKey()
}
2015-09-29 13:12:31 +00:00
func (ag *aggrGroup) run(nf notifyFunc) {
ag.done = make(chan struct{})
defer close(ag.done)
defer ag.next.Stop()
for {
select {
case now := <-ag.next.C:
// Give the notifications time until the next flush to
// finish before terminating them.
ctx, cancel := context.WithTimeout(ag.ctx, ag.timeout(ag.opts.GroupInterval))
// The now time we retrieve from the ticker is the only reliable
// point of time reference for the subsequent notification pipeline.
// Calculating the current time directly is prone to flaky behavior,
// which usually only becomes apparent in tests.
ctx = notify.WithNow(ctx, now)
// Populate context with information needed along the pipeline.
ctx = notify.WithGroupKey(ctx, ag.GroupKey())
ctx = notify.WithGroupLabels(ctx, ag.labels)
ctx = notify.WithReceiverName(ctx, ag.opts.Receiver)
ctx = notify.WithRepeatInterval(ctx, ag.opts.RepeatInterval)
// Wait the configured interval before calling flush again.
ag.mtx.Lock()
ag.next.Reset(ag.opts.GroupInterval)
ag.hasFlushed = true
ag.mtx.Unlock()
ag.flush(func(alerts ...*types.Alert) bool {
2015-09-29 13:12:31 +00:00
return nf(ctx, alerts...)
})
cancel()
case <-ag.ctx.Done():
return
}
}
}
func (ag *aggrGroup) stop() {
// Calling cancel will terminate all in-process notifications
// and the run() loop.
ag.cancel()
<-ag.done
}
// insert inserts the alert into the aggregation group.
2015-09-25 16:14:46 +00:00
func (ag *aggrGroup) insert(alert *types.Alert) {
ag.mtx.Lock()
defer ag.mtx.Unlock()
2015-09-25 16:14:46 +00:00
ag.alerts[alert.Fingerprint()] = alert
// Immediately trigger a flush if the wait duration for this
// alert is already over.
if !ag.hasFlushed && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
ag.next.Reset(0)
}
}
func (ag *aggrGroup) empty() bool {
ag.mtx.RLock()
defer ag.mtx.RUnlock()
return len(ag.alerts) == 0
}
// flush sends notifications for all new alerts.
func (ag *aggrGroup) flush(notify func(...*types.Alert) bool) {
2015-09-27 17:50:41 +00:00
if ag.empty() {
return
}
ag.mtx.Lock()
var (
alerts = make(map[model.Fingerprint]*types.Alert, len(ag.alerts))
alertsSlice = make(types.AlertSlice, 0, len(ag.alerts))
)
2015-09-25 16:14:46 +00:00
for fp, alert := range ag.alerts {
alerts[fp] = alert
alertsSlice = append(alertsSlice, alert)
}
sort.Stable(alertsSlice)
ag.mtx.Unlock()
level.Debug(ag.logger).Log("msg", "Flushing", "alerts", fmt.Sprintf("%v", alertsSlice))
if notify(alertsSlice...) {
ag.mtx.Lock()
for fp, a := range alerts {
// Only delete if the fingerprint has not been inserted
// again since we notified about it.
if a.Resolved() && ag.alerts[fp] == a {
delete(ag.alerts, fp)
}
}
ag.mtx.Unlock()
2015-07-10 17:25:56 +00:00
}
}