2015-09-25 16:14:46 +00:00
|
|
|
package main
|
2015-07-02 16:38:05 +00:00
|
|
|
|
|
|
|
import (
|
2015-09-26 09:12:47 +00:00
|
|
|
"fmt"
|
2015-07-02 16:38:05 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2015-09-28 10:12:27 +00:00
|
|
|
"github.com/prometheus/common/log"
|
2015-07-02 16:38:05 +00:00
|
|
|
"github.com/prometheus/common/model"
|
2015-09-24 22:15:27 +00:00
|
|
|
"golang.org/x/net/context"
|
2015-09-25 11:12:51 +00:00
|
|
|
|
2015-09-29 13:12:31 +00:00
|
|
|
"github.com/prometheus/alertmanager/notify"
|
2015-09-25 11:12:51 +00:00
|
|
|
"github.com/prometheus/alertmanager/provider"
|
2015-09-25 12:38:57 +00:00
|
|
|
"github.com/prometheus/alertmanager/types"
|
2015-07-02 16:38:05 +00:00
|
|
|
)
|
|
|
|
|
2015-11-05 09:49:32 +00:00
|
|
|
// ResolveTimeout is the duration after which an alert that has not been
// updated is declared resolved.
const ResolveTimeout time.Duration = 5 * time.Minute
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Dispatcher sorts incoming alerts into aggregation groups and
|
|
|
|
// assigns the correct notifiers to each.
|
2015-07-02 16:38:05 +00:00
|
|
|
type Dispatcher struct {
|
2015-10-19 14:17:15 +00:00
|
|
|
route *Route
|
2015-09-27 11:09:02 +00:00
|
|
|
alerts provider.Alerts
|
2015-09-29 13:12:31 +00:00
|
|
|
notifier notify.Notifier
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-10-16 14:55:56 +00:00
|
|
|
aggrGroups map[*RouteOpts]map[model.Fingerprint]*aggrGroup
|
2015-07-04 12:41:10 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
done chan struct{}
|
|
|
|
ctx context.Context
|
|
|
|
cancel func()
|
2015-09-29 09:58:30 +00:00
|
|
|
|
|
|
|
log log.Logger
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// NewDispatcher returns a new Dispatcher.
|
2015-10-19 14:17:15 +00:00
|
|
|
func NewDispatcher(ap provider.Alerts, r *Route, n notify.Notifier) *Dispatcher {
|
2015-10-11 14:54:39 +00:00
|
|
|
disp := &Dispatcher{
|
2015-09-27 11:09:02 +00:00
|
|
|
alerts: ap,
|
|
|
|
notifier: n,
|
2015-10-19 14:17:15 +00:00
|
|
|
route: r,
|
2015-09-29 09:58:30 +00:00
|
|
|
log: log.With("component", "dispatcher"),
|
2015-09-27 11:09:02 +00:00
|
|
|
}
|
2015-10-11 14:54:39 +00:00
|
|
|
return disp
|
2015-07-09 13:01:38 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Run starts dispatching alerts incoming via the updates channel.
|
2015-09-25 11:44:00 +00:00
|
|
|
func (d *Dispatcher) Run() {
|
2015-09-24 22:15:27 +00:00
|
|
|
d.done = make(chan struct{})
|
2015-10-16 14:55:56 +00:00
|
|
|
d.aggrGroups = map[*RouteOpts]map[model.Fingerprint]*aggrGroup{}
|
2015-09-25 11:44:00 +00:00
|
|
|
|
|
|
|
d.ctx, d.cancel = context.WithCancel(context.Background())
|
|
|
|
|
2015-09-29 08:00:02 +00:00
|
|
|
d.run(d.alerts.Subscribe())
|
2015-10-11 14:54:39 +00:00
|
|
|
close(d.done)
|
2015-09-25 11:44:00 +00:00
|
|
|
}
|
|
|
|
|
2015-11-05 09:49:32 +00:00
|
|
|
// UIRoute is the data representation of the routing tree as provided
|
|
|
|
// by the API.
|
2015-10-21 14:34:56 +00:00
|
|
|
type UIRoute struct {
|
2015-10-22 07:49:16 +00:00
|
|
|
RouteOpts *RouteOpts `json:"routeOpts"`
|
|
|
|
Matchers types.Matchers `json:"matchers"`
|
|
|
|
Groups []*UIGroup `json:"groups"`
|
|
|
|
Routes []*UIRoute `json:"routes"`
|
2015-10-21 14:34:56 +00:00
|
|
|
}
|
|
|
|
|
2015-11-05 09:49:32 +00:00
|
|
|
// UIGroup is the representation of a group of alerts as provided by
|
|
|
|
// the API.
|
2015-10-21 14:34:56 +00:00
|
|
|
type UIGroup struct {
|
2015-10-22 07:49:16 +00:00
|
|
|
Labels model.LabelSet `json:"labels"`
|
|
|
|
Alerts model.Alerts `json:"alerts"`
|
2015-10-21 14:34:56 +00:00
|
|
|
}
|
|
|
|
|
2015-11-05 09:49:32 +00:00
|
|
|
// Populate writes the dispatcher's internal state into the given UIRoute.
|
2015-10-21 14:34:56 +00:00
|
|
|
func (d *Dispatcher) Populate(r *UIRoute) {
|
|
|
|
for _, sr := range r.Routes {
|
|
|
|
d.Populate(sr)
|
|
|
|
}
|
|
|
|
|
|
|
|
groups, ok := d.aggrGroups[r.RouteOpts]
|
|
|
|
if !ok {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, ag := range groups {
|
|
|
|
var as []*types.Alert
|
|
|
|
for _, a := range ag.alerts {
|
|
|
|
as = append(as, a)
|
|
|
|
}
|
|
|
|
g := &UIGroup{
|
|
|
|
Labels: ag.labels,
|
|
|
|
Alerts: types.Alerts(as...),
|
|
|
|
}
|
|
|
|
r.Groups = append(r.Groups, g)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-29 08:00:02 +00:00
|
|
|
func (d *Dispatcher) run(it provider.AlertIterator) {
|
2015-10-16 15:45:15 +00:00
|
|
|
cleanup := time.NewTicker(30 * time.Second)
|
2015-09-26 12:12:55 +00:00
|
|
|
defer cleanup.Stop()
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-29 08:00:02 +00:00
|
|
|
defer it.Close()
|
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
for {
|
|
|
|
select {
|
2015-09-29 08:00:02 +00:00
|
|
|
case alert := <-it.Next():
|
2015-09-29 09:58:30 +00:00
|
|
|
d.log.With("alert", alert).Debug("Received alert")
|
|
|
|
|
2015-10-11 14:54:39 +00:00
|
|
|
// Log errors but keep trying.
|
2015-09-29 08:00:02 +00:00
|
|
|
if err := it.Err(); err != nil {
|
|
|
|
log.Errorf("Error on alert update: %s", err)
|
|
|
|
continue
|
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
|
2015-10-19 14:17:15 +00:00
|
|
|
for _, r := range d.route.Match(alert.Labels) {
|
2015-09-24 22:15:27 +00:00
|
|
|
d.processAlert(alert, r)
|
|
|
|
}
|
|
|
|
|
2015-09-26 12:12:55 +00:00
|
|
|
case <-cleanup.C:
|
2015-10-16 14:55:56 +00:00
|
|
|
for _, groups := range d.aggrGroups {
|
|
|
|
for _, ag := range groups {
|
|
|
|
if ag.empty() {
|
|
|
|
ag.stop()
|
|
|
|
delete(groups, ag.fingerprint())
|
|
|
|
}
|
2015-07-04 12:59:52 +00:00
|
|
|
}
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
case <-d.ctx.Done():
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// Stop the dispatcher.
|
|
|
|
func (d *Dispatcher) Stop() {
|
2015-10-11 14:54:39 +00:00
|
|
|
if d == nil || d.cancel == nil {
|
|
|
|
return
|
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
d.cancel()
|
2015-09-25 16:14:46 +00:00
|
|
|
d.cancel = nil
|
2015-09-26 09:12:47 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
<-d.done
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// notifyFunc is a function that performs notifcation for the alert
|
|
|
|
// with the given fingerprint. It aborts on context cancelation.
|
2015-09-26 16:12:56 +00:00
|
|
|
// Returns false iff notifying failed.
|
2015-09-26 12:12:55 +00:00
|
|
|
type notifyFunc func(context.Context, ...*types.Alert) bool
|
2015-09-24 22:15:27 +00:00
|
|
|
|
|
|
|
// processAlert determins in which aggregation group the alert falls
|
|
|
|
// and insert it.
|
2015-09-25 12:38:57 +00:00
|
|
|
func (d *Dispatcher) processAlert(alert *types.Alert, opts *RouteOpts) {
|
2015-07-02 16:38:05 +00:00
|
|
|
group := model.LabelSet{}
|
|
|
|
|
|
|
|
for ln, lv := range alert.Labels {
|
|
|
|
if _, ok := opts.GroupBy[ln]; ok {
|
|
|
|
group[ln] = lv
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fp := group.Fingerprint()
|
|
|
|
|
2015-10-16 14:55:56 +00:00
|
|
|
groups, ok := d.aggrGroups[opts]
|
|
|
|
if !ok {
|
|
|
|
groups = map[model.Fingerprint]*aggrGroup{}
|
|
|
|
d.aggrGroups[opts] = groups
|
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// If the group does not exist, create it.
|
2015-10-16 14:55:56 +00:00
|
|
|
ag, ok := groups[fp]
|
2015-07-02 16:38:05 +00:00
|
|
|
if !ok {
|
2015-09-25 16:14:46 +00:00
|
|
|
ag = newAggrGroup(d.ctx, group, opts)
|
2015-10-16 14:55:56 +00:00
|
|
|
groups[fp] = ag
|
2015-09-27 11:09:02 +00:00
|
|
|
|
|
|
|
go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
|
2015-10-11 14:54:39 +00:00
|
|
|
err := d.notifier.Notify(ctx, alerts...)
|
|
|
|
if err != nil {
|
2015-10-20 05:12:28 +00:00
|
|
|
log.Errorf("Notify for %d alerts failed: %s", len(alerts), err)
|
2015-09-27 11:09:02 +00:00
|
|
|
}
|
2015-10-16 15:45:15 +00:00
|
|
|
return err == nil
|
2015-09-27 11:09:02 +00:00
|
|
|
})
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ag.insert(alert)
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 18:48:21 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// aggrGroup aggregates alert fingerprints into groups to which a
|
|
|
|
// common set of routing options applies.
|
|
|
|
// It emits notifications in the specified intervals.
|
|
|
|
type aggrGroup struct {
|
2015-10-21 11:08:53 +00:00
|
|
|
labels model.LabelSet
|
|
|
|
opts *RouteOpts
|
|
|
|
routeFP model.Fingerprint
|
|
|
|
log log.Logger
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ctx context.Context
|
|
|
|
cancel func()
|
|
|
|
done chan struct{}
|
2015-09-25 16:14:46 +00:00
|
|
|
next *time.Timer
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
mtx sync.RWMutex
|
2015-09-25 16:14:46 +00:00
|
|
|
alerts map[model.Fingerprint]*types.Alert
|
2015-07-04 10:52:53 +00:00
|
|
|
hasSent bool
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
// newAggrGroup returns a new aggregation group.
|
|
|
|
func newAggrGroup(ctx context.Context, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
|
2015-07-02 18:48:21 +00:00
|
|
|
ag := &aggrGroup{
|
2015-07-04 10:52:53 +00:00
|
|
|
labels: labels,
|
|
|
|
opts: opts,
|
2015-09-25 16:14:46 +00:00
|
|
|
alerts: map[model.Fingerprint]*types.Alert{},
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.ctx, ag.cancel = context.WithCancel(ctx)
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-10-27 17:24:09 +00:00
|
|
|
ag.log = log.With("aggrGroup", ag)
|
|
|
|
|
2015-10-07 14:18:55 +00:00
|
|
|
// Set an initial one-time wait before flushing
|
|
|
|
// the first batch of notifications.
|
|
|
|
ag.next = time.NewTimer(ag.opts.GroupWait)
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
return ag
|
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-26 15:54:49 +00:00
|
|
|
func (ag *aggrGroup) String() string {
|
2015-09-30 12:53:52 +00:00
|
|
|
return fmt.Sprintf("%v", ag.fingerprint())
|
2015-09-26 15:54:49 +00:00
|
|
|
}
|
|
|
|
|
2015-09-29 13:12:31 +00:00
|
|
|
func (ag *aggrGroup) run(nf notifyFunc) {
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.done = make(chan struct{})
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
defer close(ag.done)
|
2015-07-04 10:52:53 +00:00
|
|
|
defer ag.next.Stop()
|
|
|
|
|
2015-10-09 06:58:44 +00:00
|
|
|
timeout := ag.opts.GroupInterval
|
|
|
|
|
|
|
|
if timeout < notify.MinTimeout {
|
|
|
|
timeout = notify.MinTimeout
|
|
|
|
}
|
2015-10-27 17:24:09 +00:00
|
|
|
fmt.Println("starting at", time.Now())
|
2015-10-09 06:58:44 +00:00
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
for {
|
|
|
|
select {
|
2015-10-09 06:26:41 +00:00
|
|
|
case now := <-ag.next.C:
|
2015-09-26 16:03:54 +00:00
|
|
|
// Give the notifcations time until the next flush to
|
|
|
|
// finish before terminating them.
|
2015-10-09 06:58:44 +00:00
|
|
|
ctx, cancel := context.WithTimeout(ag.ctx, timeout)
|
2015-09-24 22:15:27 +00:00
|
|
|
|
2015-10-09 06:26:41 +00:00
|
|
|
// The now time we retrieve from the ticker is the only reliable
|
|
|
|
// point of time reference for the subsequent notification pipeline.
|
2015-10-09 06:58:44 +00:00
|
|
|
// Calculating the current time directly is prone to flaky behavior,
|
2015-10-09 06:26:41 +00:00
|
|
|
// which usually only becomes apparent in tests.
|
2015-10-09 06:43:39 +00:00
|
|
|
ctx = notify.WithNow(ctx, now)
|
2015-10-09 06:26:41 +00:00
|
|
|
|
2015-10-09 06:43:39 +00:00
|
|
|
// Populate context with information needed along the pipeline.
|
2015-10-21 11:08:53 +00:00
|
|
|
ctx = notify.WithGroupKey(ctx, ag.labels.Fingerprint()^ag.routeFP)
|
2015-10-16 14:55:56 +00:00
|
|
|
ctx = notify.WithGroupLabels(ctx, ag.labels)
|
2015-10-09 06:43:39 +00:00
|
|
|
ctx = notify.WithDestination(ctx, ag.opts.SendTo)
|
|
|
|
ctx = notify.WithRepeatInterval(ctx, ag.opts.RepeatInterval)
|
|
|
|
ctx = notify.WithSendResolved(ctx, ag.opts.SendResolved)
|
2015-10-08 08:50:37 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
// Wait the configured interval before calling flush again.
|
2015-09-26 16:03:54 +00:00
|
|
|
ag.next.Reset(ag.opts.GroupInterval)
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-10-27 17:24:09 +00:00
|
|
|
fmt.Println("flushing at", now)
|
2015-09-26 12:12:55 +00:00
|
|
|
ag.flush(func(alerts ...*types.Alert) bool {
|
2015-09-29 13:12:31 +00:00
|
|
|
return nf(ctx, alerts...)
|
2015-09-24 22:15:27 +00:00
|
|
|
})
|
|
|
|
|
2015-10-09 06:58:44 +00:00
|
|
|
cancel()
|
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
case <-ag.ctx.Done():
|
2015-07-02 16:38:05 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) stop() {
|
2015-09-24 22:15:27 +00:00
|
|
|
// Calling cancel will terminate all in-process notifications
|
|
|
|
// and the run() loop.
|
|
|
|
ag.cancel()
|
|
|
|
<-ag.done
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) fingerprint() model.Fingerprint {
|
|
|
|
return ag.labels.Fingerprint()
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// insert the alert into the aggregation group. If the aggregation group
|
|
|
|
// is empty afterwards, true is returned.
|
2015-09-25 16:14:46 +00:00
|
|
|
func (ag *aggrGroup) insert(alert *types.Alert) {
|
2015-07-04 10:52:53 +00:00
|
|
|
ag.mtx.Lock()
|
2015-09-24 22:15:27 +00:00
|
|
|
defer ag.mtx.Unlock()
|
|
|
|
|
2015-09-25 16:14:46 +00:00
|
|
|
ag.alerts[alert.Fingerprint()] = alert
|
2015-07-02 18:48:21 +00:00
|
|
|
|
|
|
|
// Immediately trigger a flush if the wait duration for this
|
|
|
|
// alert is already over.
|
2015-10-27 17:24:09 +00:00
|
|
|
if !ag.hasSent && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
|
2015-07-04 10:52:53 +00:00
|
|
|
ag.next.Reset(0)
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
func (ag *aggrGroup) empty() bool {
|
|
|
|
ag.mtx.RLock()
|
|
|
|
defer ag.mtx.RUnlock()
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
return len(ag.alerts) == 0
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// flush sends notifications for all new alerts.
|
2015-09-26 12:12:55 +00:00
|
|
|
func (ag *aggrGroup) flush(notify func(...*types.Alert) bool) {
|
2015-09-27 17:50:41 +00:00
|
|
|
if ag.empty() {
|
|
|
|
return
|
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
ag.mtx.Lock()
|
|
|
|
|
2015-09-26 12:12:55 +00:00
|
|
|
var (
|
|
|
|
alerts = make(map[model.Fingerprint]*types.Alert, len(ag.alerts))
|
|
|
|
alertsSlice = make([]*types.Alert, 0, len(ag.alerts))
|
|
|
|
)
|
2015-09-25 16:14:46 +00:00
|
|
|
for fp, alert := range ag.alerts {
|
|
|
|
alerts[fp] = alert
|
2015-09-26 12:12:55 +00:00
|
|
|
alertsSlice = append(alertsSlice, alert)
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-09-24 22:15:27 +00:00
|
|
|
ag.mtx.Unlock()
|
|
|
|
|
2015-09-30 12:53:52 +00:00
|
|
|
ag.log.Debugln("flushing", alertsSlice)
|
|
|
|
|
2015-09-26 12:12:55 +00:00
|
|
|
if notify(alertsSlice...) {
|
|
|
|
ag.mtx.Lock()
|
|
|
|
for fp, a := range alerts {
|
|
|
|
// Only delete if the fingerprint has not been inserted
|
|
|
|
// again since we notified about it.
|
|
|
|
if a.Resolved() && ag.alerts[fp] == a {
|
2015-09-26 15:54:49 +00:00
|
|
|
delete(ag.alerts, fp)
|
2015-09-24 22:15:27 +00:00
|
|
|
}
|
2015-09-26 12:12:55 +00:00
|
|
|
}
|
2015-11-04 16:17:37 +00:00
|
|
|
|
|
|
|
ag.hasSent = true
|
2015-09-26 12:12:55 +00:00
|
|
|
ag.mtx.Unlock()
|
2015-07-10 17:25:56 +00:00
|
|
|
}
|
2015-09-24 22:15:27 +00:00
|
|
|
}
|