2015-07-02 16:38:05 +00:00
|
|
|
package manager
|
|
|
|
|
|
|
|
import (
|
2015-07-04 12:05:04 +00:00
|
|
|
"fmt"
|
2015-07-02 16:38:05 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/prometheus/common/model"
|
|
|
|
"github.com/prometheus/log"
|
|
|
|
)
|
|
|
|
|
2015-07-09 13:01:38 +00:00
|
|
|
// ResolveTimeout is the duration added to an alert's creation time to
// derive its resolution time when the alert arrives without one.
const ResolveTimeout time.Duration = 35 * time.Second
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
// Dispatcher dispatches alerts. It is absed on the alert's data
|
|
|
|
// rather than the time they arrive. Thus it can recover it's state
|
|
|
|
// without persistence.
|
|
|
|
type Dispatcher struct {
|
|
|
|
state State
|
|
|
|
|
|
|
|
aggrGroups map[model.Fingerprint]*aggrGroup
|
2015-07-04 12:41:10 +00:00
|
|
|
notifiers map[string]Notifier
|
|
|
|
|
|
|
|
mtx sync.RWMutex
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-07-04 12:41:10 +00:00
|
|
|
func NewDispatcher(state State, notifiers []Notifier) *Dispatcher {
|
|
|
|
disp := &Dispatcher{
|
2015-07-02 16:38:05 +00:00
|
|
|
state: state,
|
|
|
|
aggrGroups: map[model.Fingerprint]*aggrGroup{},
|
2015-07-04 12:41:10 +00:00
|
|
|
notifiers: map[string]Notifier{},
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, n := range notifiers {
|
|
|
|
disp.notifiers[n.Name()] = n
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
2015-07-04 12:41:10 +00:00
|
|
|
|
|
|
|
return disp
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-07-09 13:01:38 +00:00
|
|
|
func (d *Dispatcher) filter(alerts ...*Alert) ([]*Alert, error) {
|
|
|
|
|
|
|
|
silences, err := d.state.Silence().List()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var sentAlerts []*Alert
|
|
|
|
|
|
|
|
for _, alert := range alerts {
|
|
|
|
add := true
|
|
|
|
// None of the existing silences must match the alert.
|
|
|
|
for _, sil := range silences {
|
|
|
|
if sil.Match(alert.Labels) {
|
|
|
|
add = false
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if !add {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Filter out alerts that have already been sent.
|
|
|
|
ni, err := d.state.Notify().Get(alert.Fingerprint())
|
|
|
|
// Always try to send on error as the safest option.
|
|
|
|
if err == nil && ni.LastSent.Before(alert.ResolvedAt) && ni.LastResolved == alert.Resolved() {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
sentAlerts = append(sentAlerts, alert)
|
|
|
|
}
|
|
|
|
|
|
|
|
return sentAlerts, nil
|
|
|
|
}
|
|
|
|
|
2015-07-04 12:05:04 +00:00
|
|
|
func (d *Dispatcher) notify(name string, alerts ...*Alert) error {
|
2015-07-09 13:01:38 +00:00
|
|
|
alerts, err := d.filter(alerts...)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2015-07-04 12:05:04 +00:00
|
|
|
if len(alerts) == 0 {
|
2015-07-04 12:41:10 +00:00
|
|
|
return nil
|
2015-07-04 12:05:04 +00:00
|
|
|
}
|
2015-07-04 12:41:10 +00:00
|
|
|
|
|
|
|
d.mtx.RLock()
|
|
|
|
notifier, ok := d.notifiers[name]
|
|
|
|
d.mtx.RUnlock()
|
|
|
|
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("notifier %q does not exist", name)
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
2015-07-04 12:41:10 +00:00
|
|
|
|
2015-07-09 13:01:38 +00:00
|
|
|
if err = notifier.Send(alerts...); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, alert := range alerts {
|
|
|
|
_ = d.state.Notify().Set(alert.Fingerprint(), &NotifyInfo{
|
|
|
|
LastSent: time.Now(),
|
|
|
|
LastResolved: alert.Resolved(),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Dispatcher) Run() {
|
2015-07-09 13:01:38 +00:00
|
|
|
var (
|
|
|
|
updates = d.state.Alert().Iter()
|
|
|
|
cleanup = time.Tick(30 * time.Second)
|
|
|
|
)
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-cleanup:
|
|
|
|
// Cleanup routine.
|
|
|
|
for _, ag := range d.aggrGroups {
|
|
|
|
if ag.empty() {
|
|
|
|
ag.stop()
|
|
|
|
delete(d.aggrGroups, ag.fingerprint())
|
|
|
|
}
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
case alert := <-updates:
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
conf, err := d.state.Config().Get()
|
|
|
|
if err != nil {
|
|
|
|
log.Error(err)
|
|
|
|
continue
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-04 12:59:52 +00:00
|
|
|
for _, m := range conf.Routes.Match(alert.Labels) {
|
|
|
|
d.processAlert(alert, m)
|
|
|
|
}
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-07 08:00:50 +00:00
|
|
|
if alert.ResolvedAt.IsZero() {
|
|
|
|
alert.ResolvedAt = alert.CreatedAt.Add(ResolveTimeout)
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-09 13:01:38 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Dispatcher) processAlert(alert *Alert, opts *RouteOpts) {
|
|
|
|
group := model.LabelSet{}
|
|
|
|
|
|
|
|
for ln, lv := range alert.Labels {
|
|
|
|
if _, ok := opts.GroupBy[ln]; ok {
|
|
|
|
group[ln] = lv
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fp := group.Fingerprint()
|
|
|
|
|
|
|
|
ag, ok := d.aggrGroups[fp]
|
|
|
|
if !ok {
|
2015-07-04 10:52:53 +00:00
|
|
|
ag = newAggrGroup(d, group, opts)
|
2015-07-02 16:38:05 +00:00
|
|
|
d.aggrGroups[fp] = ag
|
|
|
|
}
|
|
|
|
|
|
|
|
ag.insert(alert)
|
2015-07-04 10:52:53 +00:00
|
|
|
}
|
2015-07-02 18:48:21 +00:00
|
|
|
|
2015-07-09 13:01:38 +00:00
|
|
|
type Silence struct {
|
|
|
|
Matchers Matchers
|
|
|
|
|
|
|
|
// The numeric ID of the silence.
|
|
|
|
ID string
|
|
|
|
|
|
|
|
// Name/email of the silence creator.
|
|
|
|
CreatedBy string
|
|
|
|
// When the silence was first created (Unix timestamp).
|
|
|
|
CreatedAt, EndsAt time.Time
|
|
|
|
|
|
|
|
// Additional comment about the silence.
|
|
|
|
Comment string
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sil *Silence) Match(lset model.LabelSet) bool {
|
|
|
|
now := time.Now()
|
|
|
|
|
|
|
|
if now.Before(sil.CreatedAt) || now.After(sil.EndsAt) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
return sil.Matchers.Match(lset)
|
|
|
|
}
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
// Alert models an action triggered by Prometheus.
|
|
|
|
type Alert struct {
|
|
|
|
// Label value pairs for purpose of aggregation, matching, and disposition
|
|
|
|
// dispatching. This must minimally include an "alertname" label.
|
|
|
|
Labels model.LabelSet `json:"labels"`
|
|
|
|
|
|
|
|
// Extra key/value information which is not used for aggregation.
|
2015-07-04 12:05:04 +00:00
|
|
|
Payload map[string]string `json:"payload,omitempty"`
|
|
|
|
Summary string `json:"summary,omitempty"`
|
|
|
|
Description string `json:"description,omitempty"`
|
|
|
|
Runbook string `json:"runbook,omitempty"`
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-04 12:05:04 +00:00
|
|
|
CreatedAt time.Time `json:"created_at,omitempty"`
|
|
|
|
ResolvedAt time.Time `json:"resolved_at,omitempty"`
|
2015-07-04 10:52:53 +00:00
|
|
|
|
2015-07-04 12:05:04 +00:00
|
|
|
// The authoritative timestamp.
|
2015-07-04 10:52:53 +00:00
|
|
|
Timestamp time.Time `json:"timestamp"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// Name returns the name of the alert. It is equivalent to the "alertname" label.
|
|
|
|
func (a *Alert) Name() string {
|
|
|
|
return string(a.Labels[model.AlertNameLabel])
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fingerprint returns a unique hash for the alert. It is equivalent to
|
|
|
|
// the fingerprint of the alert's label set.
|
|
|
|
func (a *Alert) Fingerprint() model.Fingerprint {
|
|
|
|
return a.Labels.Fingerprint()
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-07-04 12:05:04 +00:00
|
|
|
func (a *Alert) String() string {
|
2015-07-07 08:00:50 +00:00
|
|
|
s := fmt.Sprintf("%s[%s]", a.Name(), a.Fingerprint())
|
2015-07-04 12:05:04 +00:00
|
|
|
if a.Resolved() {
|
|
|
|
return s + "[resolved]"
|
|
|
|
}
|
|
|
|
return s + "[active]"
|
|
|
|
}
|
|
|
|
|
|
|
|
func (a *Alert) Resolved() bool {
|
2015-07-07 08:00:50 +00:00
|
|
|
if a.ResolvedAt.IsZero() {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
return !a.ResolvedAt.After(time.Now())
|
2015-07-04 12:05:04 +00:00
|
|
|
}
|
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
// aggrGroup aggregates alerts into groups based on
|
|
|
|
// common values for a set of labels.
|
|
|
|
type aggrGroup struct {
|
|
|
|
dispatcher *Dispatcher
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
labels model.LabelSet
|
|
|
|
opts *RouteOpts
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
next *time.Timer
|
|
|
|
done chan struct{}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
mtx sync.RWMutex
|
|
|
|
alerts map[model.Fingerprint]struct{}
|
|
|
|
hasSent bool
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// newAggrGroup returns a new aggregation group and starts background processing
|
|
|
|
// that sends notifications about the contained alerts.
|
2015-07-04 10:52:53 +00:00
|
|
|
func newAggrGroup(d *Dispatcher, labels model.LabelSet, opts *RouteOpts) *aggrGroup {
|
2015-07-02 18:48:21 +00:00
|
|
|
ag := &aggrGroup{
|
|
|
|
dispatcher: d,
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
labels: labels,
|
|
|
|
opts: opts,
|
|
|
|
|
|
|
|
alerts: map[model.Fingerprint]struct{}{},
|
|
|
|
done: make(chan struct{}),
|
2015-07-04 12:41:10 +00:00
|
|
|
|
|
|
|
// Set an initial one-time wait before flushing
|
|
|
|
// the first batch of notifications.
|
|
|
|
next: time.NewTimer(opts.GroupWait),
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
go ag.run()
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
return ag
|
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
func (ag *aggrGroup) run() {
|
2015-07-04 10:52:53 +00:00
|
|
|
|
|
|
|
defer ag.next.Stop()
|
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
for {
|
|
|
|
select {
|
2015-07-04 10:52:53 +00:00
|
|
|
case <-ag.next.C:
|
2015-07-02 16:38:05 +00:00
|
|
|
ag.flush()
|
2015-07-04 10:52:53 +00:00
|
|
|
// Wait the configured interval before calling flush again.
|
|
|
|
ag.next.Reset(ag.opts.GroupInterval)
|
|
|
|
|
2015-07-02 16:38:05 +00:00
|
|
|
case <-ag.done:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) stop() {
|
|
|
|
close(ag.done)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ag *aggrGroup) fingerprint() model.Fingerprint {
|
|
|
|
return ag.labels.Fingerprint()
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// insert the alert into the aggregation group. If the aggregation group
|
|
|
|
// is empty afterwards, true is returned.
|
2015-07-02 16:38:05 +00:00
|
|
|
func (ag *aggrGroup) insert(alert *Alert) {
|
2015-07-04 10:52:53 +00:00
|
|
|
fp := alert.Fingerprint()
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
ag.mtx.Lock()
|
|
|
|
ag.alerts[fp] = struct{}{}
|
2015-07-02 18:48:21 +00:00
|
|
|
ag.mtx.Unlock()
|
|
|
|
|
|
|
|
// Immediately trigger a flush if the wait duration for this
|
|
|
|
// alert is already over.
|
2015-07-04 10:52:53 +00:00
|
|
|
if !ag.hasSent && alert.Timestamp.Add(ag.opts.GroupWait).Before(time.Now()) {
|
|
|
|
ag.next.Reset(0)
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
func (ag *aggrGroup) empty() bool {
|
|
|
|
ag.mtx.RLock()
|
|
|
|
defer ag.mtx.RUnlock()
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
return len(ag.alerts) == 0
|
2015-07-02 18:48:21 +00:00
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// flush sends notifications for all new alerts.
|
|
|
|
func (ag *aggrGroup) flush() {
|
2015-07-02 16:38:05 +00:00
|
|
|
ag.mtx.Lock()
|
|
|
|
defer ag.mtx.Unlock()
|
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
var alerts []*Alert
|
|
|
|
for fp := range ag.alerts {
|
|
|
|
a, err := ag.dispatcher.state.Alert().Get(fp)
|
|
|
|
if err != nil {
|
|
|
|
log.Error(err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// TODO(fabxc): only delete if notify successful.
|
2015-07-04 12:05:04 +00:00
|
|
|
if a.Resolved() {
|
2015-07-04 10:52:53 +00:00
|
|
|
delete(ag.alerts, fp)
|
|
|
|
}
|
|
|
|
alerts = append(alerts, a)
|
|
|
|
}
|
2015-07-02 16:38:05 +00:00
|
|
|
|
2015-07-04 10:52:53 +00:00
|
|
|
ag.dispatcher.notify(ag.opts.SendTo, alerts...)
|
|
|
|
ag.hasSent = true
|
2015-07-02 16:38:05 +00:00
|
|
|
}
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
// alertTimeline is a list of alerts sorted by their timestamp.
|
2015-07-02 16:38:05 +00:00
|
|
|
type alertTimeline []*Alert
|
|
|
|
|
2015-07-02 18:48:21 +00:00
|
|
|
func (at alertTimeline) Len() int { return len(at) }
|
|
|
|
func (at alertTimeline) Less(i, j int) bool { return at[i].Timestamp.Before(at[j].Timestamp) }
|
|
|
|
func (at alertTimeline) Swap(i, j int) { at[i], at[j] = at[j], at[i] }
|