2015-07-02 16:38:05 +00:00
|
|
|
package manager
|
|
|
|
|
|
|
|
import (
|
2015-07-04 12:05:04 +00:00
|
|
|
"fmt"
|
2015-07-02 16:38:05 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/prometheus/common/model"
|
|
|
|
"github.com/prometheus/log"
|
2015-09-24 22:15:27 +00:00
|
|
|
"golang.org/x/net/context"
|
2015-09-25 11:12:51 +00:00
|
|
|
|
|
|
|
"github.com/prometheus/alertmanager/provider"
|
2015-09-25 12:38:57 +00:00
|
|
|
"github.com/prometheus/alertmanager/types"
|
2015-07-02 16:38:05 +00:00
|
|
|
)
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// ResolveTimeout is a fixed duration of 30 seconds.
// NOTE(review): it is not referenced anywhere in this file; presumably it is
// the time after which an alert without updates is treated as resolved —
// confirm against its users.
const ResolveTimeout = 30 * time.Second
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Dispatcher sorts incoming alerts into aggregation groups and
|
|
|
|
// assigns the correct notifiers to each.
|
2015-07-02 16:38:05 +00:00
|
|
|
type Dispatcher struct {
|
2015-09-25 11:44:00 +00:00
|
|
|
routes Routes
|
|
|
|
alerts provider.Alerts
|
2015-07-02 16:38:05 +00:00
|
|
|
|
|
|
|
aggrGroups map[model.Fingerprint]*aggrGroup
|
2015-07-04 12:41:10 +00:00
|
|
|
notifiers map[string]Notifier
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
mtx sync.RWMutex
|
|
|
|
done chan struct{}
|
|
|
|
ctx context.Context
|
|
|
|
cancel func()
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// NewDispatcher returns a new Dispatcher.
|
2015-09-25 11:44:00 +00:00
|
|
|
func NewDispatcher(ap provider.Alerts) *Dispatcher {
|
|
|
|
return &Dispatcher{alerts: ap}
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// ApplyConfig updates the dispatcher to match the new configuration.
|
|
|
|
func (d *Dispatcher) ApplyConfig(conf *Config) {
|
|
|
|
d.mtx.Lock()
|
|
|
|
defer d.mtx.Unlock()
|
2015-07-09 13:01:38 +00:00
|
|
|
|
2015-09-25 11:44:00 +00:00
|
|
|
d.Stop()
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
d.routes = conf.Routes
|
|
|
|
d.notifiers = map[string]Notifier{}
|
2015-07-09 13:01:38 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// TODO(fabxc): build correct notifiers from new conf.NotificationConfigs.
|
|
|
|
for _, ncfg := range conf.NotificationConfigs {
|
|
|
|
d.notifiers[ncfg.Name] = &LogNotifier{ncfg.Name}
|
2015-07-09 13:01:38 +00:00
|
|
|
}
|
2015-09-25 11:44:00 +00:00
|
|
|
|
|
|
|
go d.Run()
|
2015-07-09 13:01:38 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Run starts dispatching alerts incoming via the updates channel.
|
2015-09-25 11:44:00 +00:00
|
|
|
func (d *Dispatcher) Run() {
|
2015-09-24 22:15:27 +00:00
|
|
|
d.done = make(chan struct{})
|
2015-09-25 11:44:00 +00:00
|
|
|
d.aggrGroups = map[model.Fingerprint]*aggrGroup{}
|
|
|
|
|
|
|
|
d.ctx, d.cancel = context.WithCancel(context.Background())
|
|
|
|
|
|
|
|
updates := d.alertProvider.IterActive()
|
2015-07-09 13:01:38 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
defer close(d.done)
|
|
|
|
defer close(updates)
|
2015-07-04 12:41:10 +00:00
|
|
|
|
2015-09-25 11:44:00 +00:00
|
|
|
d.run(updates)
|
|
|
|
}
|
|
|
|
|
2015-09-25 12:38:57 +00:00
|
|
|
func (d *Dispatcher) run(updates <-chan *types.Alert) {
|
2015-09-24 22:15:27 +00:00
|
|
|
cleanup := time.Tick(30 * time.Second)
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
for {
|
|
|
|
select {
|
2015-09-24 22:15:27 +00:00
|
|
|
case alert := <-updates:
|
|
|
|
d.mtx.RLock()
|
|
|
|
routes := d.routes.Match(alert.Labels)
|
|
|
|
d.mtx.RUnlock()
|
|
|
|
|
|
|
|
for _, r := range routes {
|
|
|
|
d.processAlert(alert, r)
|
|
|
|
}
|
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
case <-cleanup:
|
|
|
|
for _, ag := range d.aggrGroups {
|
|
|
|
if ag.empty() {
|
|
|
|
ag.stop()
|
|
|
|
delete(d.aggrGroups, ag.fingerprint())
|
|
|
|
}
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
case <-d.ctx.Done():
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Stop the dispatcher.
|
|
|
|
func (d *Dispatcher) Stop() {
|
|
|
|
d.cancel()
|
|
|
|
<-d.done
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// notifyFunc is a function that performs notifcation for the alert
|
|
|
|
// with the given fingerprint. It aborts on context cancelation.
|
|
|
|
// It returns whether the alert has successfully been communiated as
|
|
|
|
// resolved.
|
|
|
|
type notifyFunc func(context.Context, model.Fingerprint) bool
|
|
|
|
|
|
|
|
// notifyFunc returns a function which performs a notification
|
|
|
|
// as required by the routing options.
|
|
|
|
func (d *Dispatcher) notifyFunc(dest string) notifyFunc {
|
|
|
|
d.mtx.Lock()
|
|
|
|
defer d.mtx.Unlock()
|
|
|
|
|
|
|
|
notifier := d.notifiers[dest]
|
|
|
|
|
|
|
|
return func(ctx context.Context, fp model.Fingerprint) bool {
|
2015-09-25 11:44:00 +00:00
|
|
|
alert := d.alerts.Get(fp)
|
2015-09-24 22:15:27 +00:00
|
|
|
|
|
|
|
if err := notifier.Notify(ctx, alert); err != nil {
|
|
|
|
log.Errorf("Notify for %v failed: %s", alert, err)
|
|
|
|
return false
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
return alert.Resolved()
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// processAlert determins in which aggregation group the alert falls
|
|
|
|
// and insert it.
|
2015-09-25 12:38:57 +00:00
|
|
|
func (d *Dispatcher) processAlert(alert *types.Alert, opts *RouteOpts) {
|
2015-07-02 16:38:05 +00:00
|
|
|
group := model.LabelSet{}
|
|
|
|
|
|
|
|
for ln, lv := range alert.Labels {
|
|
|
|
if _, ok := opts.GroupBy[ln]; ok {
|
|
|
|
group[ln] = lv
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fp := group.Fingerprint()
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// If the group does not exist, create it.
|
2015-07-02 16:38:05 +00:00
|
|
|
ag, ok := d.aggrGroups[fp]
|
|
|
|
if !ok {
|
2015-07-04 10:52:53 +00:00
|
|
|
ag = newAggrGroup(d, group, opts)
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.run(ag.notifyFunc(opts.SendTo))
|
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
d.aggrGroups[fp] = ag
|
|
|
|
}
|
|
|
|
|
|
|
|
ag.insert(alert)
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 18:48:21 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// aggrGroup aggregates alert fingerprints into groups to which a
|
|
|
|
// common set of routing options applies.
|
|
|
|
// It emits notifications in the specified intervals.
|
|
|
|
type aggrGroup struct {
|
2015-07-04 10:52:53 +00:00
|
|
|
labels model.LabelSet
|
|
|
|
opts *RouteOpts
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ctx context.Context
|
|
|
|
cancel func()
|
|
|
|
done chan struct{}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
mtx sync.RWMutex
|
|
|
|
alerts map[model.Fingerprint]struct{}
|
|
|
|
hasSent bool
|
2015-09-24 22:15:27 +00:00
|
|
|
curRev int
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// newAggrGroup returns a new aggregation group.
|
|
|
|
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
|
2015-07-02 18:48:21 +00:00
|
|
|
ag := &aggrGroup{
|
2015-07-04 10:52:53 +00:00
|
|
|
labels: labels,
|
|
|
|
opts: opts,
|
|
|
|
alerts: map[model.Fingerprint]struct{}{},
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.ctx, ag.cancel = context.WithCancel(ctx)
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
return ag
|
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
func (ag *aggrGroup) run(notify notifyFunc) {
|
|
|
|
ag.done = make(chan struct{})
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Set an initial one-time wait before flushing
|
|
|
|
// the first batch of notifications.
|
|
|
|
next := time.NewTimer(opts.GroupWait)
|
|
|
|
|
|
|
|
defer close(ag.done)
|
2015-07-04 10:52:53 +00:00
|
|
|
defer ag.next.Stop()
|
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
for {
|
|
|
|
select {
|
2015-07-04 10:52:53 +00:00
|
|
|
case <-ag.next.C:
|
2015-09-24 22:15:27 +00:00
|
|
|
// Give the notifcations 2/3 the time of the repeat interval
|
|
|
|
// to finish before terminating them.
|
|
|
|
ctx, _ := context.WithTimeout(ag.ctx, ag.opts.RepeatInterval*2/3)
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
// Wait the configured interval before calling flush again.
|
2015-09-24 22:15:27 +00:00
|
|
|
next.Reset(ag.opts.RepeatInterval)
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.flush(func(fp model.Fingerprint) bool {
|
|
|
|
notify(ctx, fp)
|
|
|
|
})
|
|
|
|
|
|
|
|
case <-ag.ctx.Done():
|
2015-07-02 16:38:05 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) stop() {
|
2015-09-24 22:15:27 +00:00
|
|
|
// Calling cancel will terminate all in-process notifications
|
|
|
|
// and the run() loop.
|
|
|
|
ag.cancel()
|
|
|
|
<-ag.done
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) fingerprint() model.Fingerprint {
|
|
|
|
return ag.labels.Fingerprint()
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// insert the alert into the aggregation group. If the aggregation group
|
|
|
|
// is empty afterwards, true is returned.
|
2015-09-24 22:15:27 +00:00
|
|
|
func (ag *aggrGroup) insert(fp model.Fingerprint) {
|
2015-07-04 10:52:53 +00:00
|
|
|
ag.mtx.Lock()
|
2015-09-24 22:15:27 +00:00
|
|
|
defer ag.mtx.Unlock()
|
|
|
|
|
|
|
|
ag.curRev++
|
|
|
|
ag.alerts[fp] = ag.curRev
|
2015-07-02 18:48:21 +00:00
|
|
|
|
|
|
|
// Immediately trigger a flush if the wait duration for this
|
|
|
|
// alert is already over.
|
2015-07-04 10:52:53 +00:00
|
|
|
if !ag.hasSent && alert.Timestamp.Add(ag.opts.GroupWait).Before(time.Now()) {
|
|
|
|
ag.next.Reset(0)
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
func (ag *aggrGroup) empty() bool {
|
|
|
|
ag.mtx.RLock()
|
|
|
|
defer ag.mtx.RUnlock()
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
return len(ag.alerts) == 0
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// flush sends notifications for all new alerts.
|
2015-09-24 22:15:27 +00:00
|
|
|
func (ag *aggrGroup) flush(notify func(model.Fingerprint) bool) {
|
2015-07-02 16:38:05 +00:00
|
|
|
ag.mtx.Lock()
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
alerts := make(map[model.Fingerprint]int, len(ag.alerts))
|
|
|
|
for fp, rev := range ag.alerts {
|
|
|
|
alerts[fp] = rev
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.mtx.Unlock()
|
|
|
|
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
wg.Add(len(alerts))
|
|
|
|
|
|
|
|
for fp, rev := range alerts {
|
|
|
|
go func(fp model.Fingerprint) {
|
|
|
|
// notify returns whether the alert can be deleted
|
|
|
|
// afterwards.
|
|
|
|
if notify(fp) {
|
|
|
|
ag.mtx.Lock()
|
|
|
|
// Only delete if the fingerprint has not been inserted
|
|
|
|
// again since we notified about it.
|
|
|
|
if ag.alerts[fp] == rev {
|
|
|
|
delete(alerts, fp)
|
|
|
|
}
|
|
|
|
ag.mtx.Unlock()
|
|
|
|
}
|
|
|
|
wg.Done()
|
|
|
|
}(fp)
|
2015-07-10 17:25:56 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
wg.Wait()
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.hasSent = true
|
|
|
|
}
|