2015-10-11 15:24:49 +00:00
|
|
|
// Copyright 2015 Prometheus Team
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2015-09-29 12:45:38 +00:00
|
|
|
package notify
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2017-03-13 12:44:36 +00:00
|
|
|
"sort"
|
2015-09-29 13:12:31 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
2015-09-29 12:45:38 +00:00
|
|
|
|
2015-10-09 07:37:32 +00:00
|
|
|
"github.com/cenkalti/backoff"
|
2017-03-13 12:44:36 +00:00
|
|
|
"github.com/cespare/xxhash"
|
2017-10-22 05:59:33 +00:00
|
|
|
"github.com/go-kit/kit/log"
|
|
|
|
"github.com/go-kit/kit/log/level"
|
2016-08-12 17:18:26 +00:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
2015-09-29 13:12:31 +00:00
|
|
|
"github.com/prometheus/common/model"
|
2015-09-29 12:45:38 +00:00
|
|
|
"golang.org/x/net/context"
|
|
|
|
|
2016-08-11 13:04:03 +00:00
|
|
|
"github.com/prometheus/alertmanager/config"
|
|
|
|
"github.com/prometheus/alertmanager/inhibit"
|
2016-08-16 12:09:06 +00:00
|
|
|
"github.com/prometheus/alertmanager/nflog"
|
2016-08-16 12:32:24 +00:00
|
|
|
"github.com/prometheus/alertmanager/nflog/nflogpb"
|
2016-08-30 09:58:27 +00:00
|
|
|
"github.com/prometheus/alertmanager/silence"
|
2016-08-11 13:04:03 +00:00
|
|
|
"github.com/prometheus/alertmanager/template"
|
2015-09-29 12:45:38 +00:00
|
|
|
"github.com/prometheus/alertmanager/types"
|
|
|
|
)
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
var (
|
|
|
|
numNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
|
|
Namespace: "alertmanager",
|
|
|
|
Name: "notifications_total",
|
|
|
|
Help: "The total number of attempted notifications.",
|
|
|
|
}, []string{"integration"})
|
|
|
|
|
|
|
|
numFailedNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
|
|
Namespace: "alertmanager",
|
|
|
|
Name: "notifications_failed_total",
|
|
|
|
Help: "The total number of failed notifications.",
|
|
|
|
}, []string{"integration"})
|
|
|
|
)
|
|
|
|
|
|
|
|
func init() {
|
2017-10-07 09:57:53 +00:00
|
|
|
numNotifications.WithLabelValues("email")
|
|
|
|
numNotifications.WithLabelValues("hipchat")
|
|
|
|
numNotifications.WithLabelValues("pagerduty")
|
|
|
|
numNotifications.WithLabelValues("pushover")
|
|
|
|
numNotifications.WithLabelValues("slack")
|
|
|
|
numNotifications.WithLabelValues("opsgenie")
|
|
|
|
numNotifications.WithLabelValues("webhook")
|
|
|
|
numNotifications.WithLabelValues("victorops")
|
|
|
|
numFailedNotifications.WithLabelValues("email")
|
|
|
|
numFailedNotifications.WithLabelValues("hipchat")
|
|
|
|
numFailedNotifications.WithLabelValues("pagerduty")
|
|
|
|
numFailedNotifications.WithLabelValues("pushover")
|
|
|
|
numFailedNotifications.WithLabelValues("slack")
|
|
|
|
numFailedNotifications.WithLabelValues("opsgenie")
|
|
|
|
numFailedNotifications.WithLabelValues("webhook")
|
|
|
|
numFailedNotifications.WithLabelValues("victorops")
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
prometheus.Register(numNotifications)
|
|
|
|
prometheus.Register(numFailedNotifications)
|
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// MinTimeout is the lower bound for the timeout that is set on the context of
// a call to a notification pipeline.
const MinTimeout time.Duration = 10 * time.Second
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// notifyKey is a dedicated type for the keys under which this package stores
// values in a context, so that they cannot accidentally collide with keys of
// other types.
type notifyKey int

// Context keys, numbered consecutively starting at zero.
const (
	keyReceiverName   notifyKey = iota // receiver name (string)
	keyRepeatInterval                  // repeat interval (time.Duration)
	keyGroupLabels                     // grouping labels (model.LabelSet)
	keyGroupKey                        // group key (string)
	keyFiringAlerts                    // firing alerts ([]uint64)
	keyResolvedAlerts                  // resolved alerts ([]uint64)
	keyNow                             // now timestamp (time.Time)
)
|
|
|
|
|
2016-08-16 12:22:47 +00:00
|
|
|
// WithReceiverName populates a context with a receiver name.
|
|
|
|
func WithReceiverName(ctx context.Context, rcv string) context.Context {
|
|
|
|
return context.WithValue(ctx, keyReceiverName, rcv)
|
2015-10-09 06:43:39 +00:00
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// WithGroupKey populates a context with a group key.
|
2017-04-21 09:43:12 +00:00
|
|
|
func WithGroupKey(ctx context.Context, s string) context.Context {
|
|
|
|
return context.WithValue(ctx, keyGroupKey, s)
|
2015-10-21 11:08:53 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
// WithFiringAlerts populates a context with a slice of firing alerts.
|
|
|
|
func WithFiringAlerts(ctx context.Context, alerts []uint64) context.Context {
|
|
|
|
return context.WithValue(ctx, keyFiringAlerts, alerts)
|
|
|
|
}
|
|
|
|
|
|
|
|
// WithResolvedAlerts populates a context with a slice of resolved alerts.
|
|
|
|
func WithResolvedAlerts(ctx context.Context, alerts []uint64) context.Context {
|
|
|
|
return context.WithValue(ctx, keyResolvedAlerts, alerts)
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// WithGroupLabels populates a context with grouping labels.
|
2015-10-16 14:55:56 +00:00
|
|
|
func WithGroupLabels(ctx context.Context, lset model.LabelSet) context.Context {
|
|
|
|
return context.WithValue(ctx, keyGroupLabels, lset)
|
2015-10-09 06:43:39 +00:00
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// WithNow populates a context with a now timestamp.
|
2015-10-09 06:43:39 +00:00
|
|
|
func WithNow(ctx context.Context, t time.Time) context.Context {
|
|
|
|
return context.WithValue(ctx, keyNow, t)
|
|
|
|
}
|
|
|
|
|
2016-08-16 12:09:06 +00:00
|
|
|
// WithRepeatInterval populates a context with a repeat interval.
|
|
|
|
func WithRepeatInterval(ctx context.Context, t time.Duration) context.Context {
|
|
|
|
return context.WithValue(ctx, keyRepeatInterval, t)
|
|
|
|
}
|
|
|
|
|
|
|
|
// RepeatInterval extracts a repeat interval from the context. Iff none exists, the
|
|
|
|
// second argument is false.
|
|
|
|
func RepeatInterval(ctx context.Context) (time.Duration, bool) {
|
|
|
|
v, ok := ctx.Value(keyRepeatInterval).(time.Duration)
|
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2016-08-16 12:32:24 +00:00
|
|
|
// ReceiverName extracts a receiver name from the context. Iff none exists, the
|
|
|
|
// second argument is false.
|
|
|
|
func ReceiverName(ctx context.Context) (string, bool) {
|
|
|
|
v, ok := ctx.Value(keyReceiverName).(string)
|
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2017-10-22 05:59:33 +00:00
|
|
|
func receiverName(ctx context.Context, l log.Logger) string {
|
2016-08-16 12:22:47 +00:00
|
|
|
recv, ok := ReceiverName(ctx)
|
2015-11-26 17:19:46 +00:00
|
|
|
if !ok {
|
2017-10-22 05:59:33 +00:00
|
|
|
level.Error(l).Log("msg", "Missing receiver")
|
2015-11-26 17:19:46 +00:00
|
|
|
}
|
|
|
|
return recv
|
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// GroupKey extracts a group key from the context. Iff none exists, the
|
|
|
|
// second argument is false.
|
2017-04-21 09:43:12 +00:00
|
|
|
func GroupKey(ctx context.Context) (string, bool) {
|
|
|
|
v, ok := ctx.Value(keyGroupKey).(string)
|
2015-10-21 11:08:53 +00:00
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2017-10-22 05:59:33 +00:00
|
|
|
func groupLabels(ctx context.Context, l log.Logger) model.LabelSet {
|
2015-11-25 14:49:26 +00:00
|
|
|
groupLabels, ok := GroupLabels(ctx)
|
|
|
|
if !ok {
|
2017-10-22 05:59:33 +00:00
|
|
|
level.Error(l).Log("msg", "Missing group labels")
|
2015-11-25 14:49:26 +00:00
|
|
|
}
|
|
|
|
return groupLabels
|
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// GroupLabels extracts grouping label set from the context. Iff none exists, the
|
|
|
|
// second argument is false.
|
2015-10-16 14:55:56 +00:00
|
|
|
func GroupLabels(ctx context.Context) (model.LabelSet, bool) {
|
|
|
|
v, ok := ctx.Value(keyGroupLabels).(model.LabelSet)
|
2015-10-09 06:43:39 +00:00
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2015-11-12 12:18:36 +00:00
|
|
|
// Now extracts a now timestamp from the context. Iff none exists, the
|
|
|
|
// second argument is false.
|
2015-10-09 06:43:39 +00:00
|
|
|
func Now(ctx context.Context) (time.Time, bool) {
|
|
|
|
v, ok := ctx.Value(keyNow).(time.Time)
|
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
// FiringAlerts extracts a slice of firing alerts from the context.
|
|
|
|
// Iff none exists, the second argument is false.
|
|
|
|
func FiringAlerts(ctx context.Context) ([]uint64, bool) {
|
|
|
|
v, ok := ctx.Value(keyFiringAlerts).([]uint64)
|
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
|
|
|
// ResolvedAlerts extracts a slice of firing alerts from the context.
|
|
|
|
// Iff none exists, the second argument is false.
|
|
|
|
func ResolvedAlerts(ctx context.Context) ([]uint64, bool) {
|
|
|
|
v, ok := ctx.Value(keyResolvedAlerts).([]uint64)
|
2016-08-16 12:09:06 +00:00
|
|
|
return v, ok
|
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// A Stage processes alerts under the constraints of the given context.
|
2016-08-12 13:22:17 +00:00
|
|
|
type Stage interface {
|
2017-10-22 05:59:33 +00:00
|
|
|
Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error)
|
2016-08-11 13:04:03 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// StageFunc wraps a function to represent a Stage.
|
2017-10-22 05:59:33 +00:00
|
|
|
type StageFunc func(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error)
|
2016-08-12 13:22:17 +00:00
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// Exec implements Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (f StageFunc) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
|
|
|
return f(ctx, l, alerts...)
|
2016-08-12 13:22:17 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// BuildPipeline builds a map of receivers to Stages.
|
|
|
|
func BuildPipeline(
|
|
|
|
confs []*config.Receiver,
|
|
|
|
tmpl *template.Template,
|
2016-08-16 12:32:24 +00:00
|
|
|
wait func() time.Duration,
|
2016-08-12 17:18:26 +00:00
|
|
|
inhibitor *inhibit.Inhibitor,
|
2016-08-30 09:58:27 +00:00
|
|
|
silences *silence.Silences,
|
2016-08-16 12:09:06 +00:00
|
|
|
notificationLog nflog.Log,
|
2016-08-12 17:18:26 +00:00
|
|
|
marker types.Marker,
|
2017-10-22 05:59:33 +00:00
|
|
|
logger log.Logger,
|
2016-08-12 17:18:26 +00:00
|
|
|
) RoutingStage {
|
|
|
|
rs := RoutingStage{}
|
|
|
|
|
2016-08-16 12:32:24 +00:00
|
|
|
is := NewInhibitStage(inhibitor, marker)
|
|
|
|
ss := NewSilenceStage(silences, marker)
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
for _, rc := range confs {
|
2017-10-22 05:59:33 +00:00
|
|
|
rs[rc.Name] = MultiStage{is, ss, createStage(rc, tmpl, wait, notificationLog, logger)}
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
return rs
|
|
|
|
}
|
|
|
|
|
|
|
|
// createStage creates a pipeline of stages for a receiver.
|
2017-10-22 05:59:33 +00:00
|
|
|
func createStage(rc *config.Receiver, tmpl *template.Template, wait func() time.Duration, notificationLog nflog.Log, logger log.Logger) Stage {
|
2016-08-16 12:32:24 +00:00
|
|
|
var fs FanoutStage
|
2017-10-22 05:59:33 +00:00
|
|
|
for _, i := range BuildReceiverIntegrations(rc, tmpl, logger) {
|
2016-08-16 12:32:24 +00:00
|
|
|
recv := &nflogpb.Receiver{
|
|
|
|
GroupName: rc.Name,
|
|
|
|
Integration: i.name,
|
|
|
|
Idx: uint32(i.idx),
|
|
|
|
}
|
2016-08-12 13:22:17 +00:00
|
|
|
var s MultiStage
|
2016-08-16 12:32:24 +00:00
|
|
|
s = append(s, NewWaitStage(wait))
|
2017-03-13 12:44:36 +00:00
|
|
|
s = append(s, NewDedupStage(notificationLog, recv, i.conf.SendResolved()))
|
2016-08-12 17:18:26 +00:00
|
|
|
s = append(s, NewRetryStage(i))
|
2016-08-16 12:09:06 +00:00
|
|
|
s = append(s, NewSetNotifiesStage(notificationLog, recv))
|
2016-08-12 13:22:17 +00:00
|
|
|
|
2016-08-16 12:32:24 +00:00
|
|
|
fs = append(fs, s)
|
|
|
|
}
|
|
|
|
return fs
|
2016-08-12 13:22:17 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// RoutingStage executes the inner stages based on the receiver specified in
|
|
|
|
// the context.
|
|
|
|
type RoutingStage map[string]Stage
|
2016-08-12 13:22:17 +00:00
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (rs RoutingStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-16 12:22:47 +00:00
|
|
|
receiver, ok := ReceiverName(ctx)
|
2016-08-12 17:18:26 +00:00
|
|
|
if !ok {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, fmt.Errorf("receiver missing")
|
2016-08-12 13:22:17 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
s, ok := rs[receiver]
|
2016-08-12 13:22:17 +00:00
|
|
|
if !ok {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, fmt.Errorf("stage for receiver missing")
|
2016-08-11 13:04:03 +00:00
|
|
|
}
|
|
|
|
|
2017-10-22 05:59:33 +00:00
|
|
|
return s.Exec(ctx, l, alerts...)
|
2016-08-11 13:04:03 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// A MultiStage executes a series of stages sequencially.
|
|
|
|
type MultiStage []Stage
|
2016-08-11 13:04:03 +00:00
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (ms MultiStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-12 17:18:26 +00:00
|
|
|
var err error
|
|
|
|
for _, s := range ms {
|
|
|
|
if len(alerts) == 0 {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
2016-08-11 13:04:03 +00:00
|
|
|
|
2017-10-22 05:59:33 +00:00
|
|
|
ctx, alerts, err = s.Exec(ctx, l, alerts...)
|
2016-08-11 13:04:03 +00:00
|
|
|
if err != nil {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, err
|
2016-08-11 13:04:03 +00:00
|
|
|
}
|
|
|
|
}
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, alerts, nil
|
2016-08-11 13:04:03 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// FanoutStage executes its stages concurrently
|
2016-08-16 12:32:24 +00:00
|
|
|
type FanoutStage []Stage
|
2016-08-12 17:18:26 +00:00
|
|
|
|
2016-08-16 12:32:24 +00:00
|
|
|
// Exec attempts to execute all stages concurrently and discards the results.
|
|
|
|
// It returns its input alerts and a types.MultiError if one or more stages fail.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (fs FanoutStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2015-10-11 10:40:43 +00:00
|
|
|
var (
|
|
|
|
wg sync.WaitGroup
|
|
|
|
me types.MultiError
|
|
|
|
)
|
2016-08-12 13:22:17 +00:00
|
|
|
wg.Add(len(fs))
|
2015-09-29 13:12:31 +00:00
|
|
|
|
2016-08-16 12:32:24 +00:00
|
|
|
for _, s := range fs {
|
2016-08-12 13:22:17 +00:00
|
|
|
go func(s Stage) {
|
2017-10-22 05:59:33 +00:00
|
|
|
if _, _, err := s.Exec(ctx, l, alerts...); err != nil {
|
2015-11-20 14:10:38 +00:00
|
|
|
me.Add(err)
|
2017-10-22 05:59:33 +00:00
|
|
|
level.Error(l).Log("msg", "Error on notify", "err", err)
|
2015-09-29 13:12:31 +00:00
|
|
|
}
|
|
|
|
wg.Done()
|
2016-08-12 13:22:17 +00:00
|
|
|
}(s)
|
2015-09-29 13:12:31 +00:00
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
|
2015-11-20 14:10:38 +00:00
|
|
|
if me.Len() > 0 {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, alerts, &me
|
2015-10-11 14:54:31 +00:00
|
|
|
}
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, alerts, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// InhibitStage filters alerts through an inhibition muter.
|
|
|
|
type InhibitStage struct {
|
|
|
|
muter types.Muter
|
|
|
|
marker types.Marker
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewInhibitStage return a new InhibitStage.
|
|
|
|
func NewInhibitStage(m types.Muter, mk types.Marker) *InhibitStage {
|
|
|
|
return &InhibitStage{
|
|
|
|
muter: m,
|
|
|
|
marker: mk,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (n *InhibitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-12 17:18:26 +00:00
|
|
|
var filtered []*types.Alert
|
|
|
|
for _, a := range alerts {
|
|
|
|
// TODO(fabxc): increment total alerts counter.
|
|
|
|
// Do not send the alert if the silencer mutes it.
|
|
|
|
if !n.muter.Mutes(a.Labels) {
|
|
|
|
// TODO(fabxc): increment muted alerts counter.
|
|
|
|
filtered = append(filtered, a)
|
2016-08-05 08:18:10 +00:00
|
|
|
}
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
2015-10-09 07:37:32 +00:00
|
|
|
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, filtered, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// SilenceStage filters alerts through a silence muter.
|
|
|
|
type SilenceStage struct {
|
2016-08-30 09:58:27 +00:00
|
|
|
silences *silence.Silences
|
|
|
|
marker types.Marker
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewSilenceStage returns a new SilenceStage.
|
2016-08-30 09:58:27 +00:00
|
|
|
func NewSilenceStage(s *silence.Silences, mk types.Marker) *SilenceStage {
|
2016-08-12 17:18:26 +00:00
|
|
|
return &SilenceStage{
|
2016-08-30 09:58:27 +00:00
|
|
|
silences: s,
|
|
|
|
marker: mk,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (n *SilenceStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-12 17:18:26 +00:00
|
|
|
var filtered []*types.Alert
|
|
|
|
for _, a := range alerts {
|
|
|
|
// TODO(fabxc): increment total alerts counter.
|
|
|
|
// Do not send the alert if the silencer mutes it.
|
2016-08-30 09:58:27 +00:00
|
|
|
sils, err := n.silences.Query(
|
|
|
|
silence.QState(silence.StateActive),
|
2016-09-08 14:02:27 +00:00
|
|
|
silence.QMatches(a.Labels),
|
2016-08-30 09:58:27 +00:00
|
|
|
)
|
|
|
|
if err != nil {
|
2017-10-22 05:59:33 +00:00
|
|
|
level.Error(l).Log("msg", "Querying silences failed", "err", err)
|
2016-08-30 09:58:27 +00:00
|
|
|
}
|
2017-04-27 12:18:52 +00:00
|
|
|
|
2016-08-30 09:58:27 +00:00
|
|
|
if len(sils) == 0 {
|
2016-08-12 17:18:26 +00:00
|
|
|
// TODO(fabxc): increment muted alerts counter.
|
|
|
|
filtered = append(filtered, a)
|
2016-08-30 09:58:27 +00:00
|
|
|
n.marker.SetSilenced(a.Labels.Fingerprint())
|
|
|
|
} else {
|
2017-04-27 12:18:52 +00:00
|
|
|
ids := make([]string, len(sils))
|
|
|
|
for i, s := range sils {
|
|
|
|
ids[i] = s.Id
|
|
|
|
}
|
|
|
|
n.marker.SetSilenced(a.Labels.Fingerprint(), ids...)
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, filtered, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// WaitStage delays the pipeline for the duration returned by its wait
// function, or until the context is done, whichever happens first.
type WaitStage struct {
	wait func() time.Duration
}

// NewWaitStage returns a new WaitStage that waits for the duration returned
// by wait.
func NewWaitStage(wait func() time.Duration) *WaitStage {
	stage := new(WaitStage)
	stage.wait = wait
	return stage
}
|
|
|
|
|
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (ws *WaitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-12 17:18:26 +00:00
|
|
|
select {
|
|
|
|
case <-time.After(ws.wait()):
|
|
|
|
case <-ctx.Done():
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, ctx.Err()
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, alerts, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// DedupStage filters alerts.
|
2016-08-16 12:09:06 +00:00
|
|
|
// Filtering happens based on a notification log.
|
2016-08-12 17:18:26 +00:00
|
|
|
type DedupStage struct {
|
2017-03-13 12:44:36 +00:00
|
|
|
nflog nflog.Log
|
|
|
|
recv *nflogpb.Receiver
|
|
|
|
sendResolved bool
|
2016-08-16 12:09:06 +00:00
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
now func() time.Time
|
|
|
|
hash func(*types.Alert) uint64
|
2015-09-29 12:45:38 +00:00
|
|
|
}
|
|
|
|
|
2016-08-16 12:09:06 +00:00
|
|
|
// NewDedupStage wraps a DedupStage that runs against the given notification log.
|
2017-03-13 12:44:36 +00:00
|
|
|
func NewDedupStage(l nflog.Log, recv *nflogpb.Receiver, sendResolved bool) *DedupStage {
|
2016-08-16 12:32:24 +00:00
|
|
|
return &DedupStage{
|
2017-03-13 12:44:36 +00:00
|
|
|
nflog: l,
|
|
|
|
recv: recv,
|
|
|
|
now: utcNow,
|
|
|
|
sendResolved: sendResolved,
|
|
|
|
hash: hashAlert,
|
2016-08-16 12:32:24 +00:00
|
|
|
}
|
2015-09-29 12:45:38 +00:00
|
|
|
}
|
|
|
|
|
2016-08-16 12:09:06 +00:00
|
|
|
// utcNow returns the current time with its location set to UTC.
func utcNow() time.Time {
	now := time.Now()
	return now.UTC()
}
|
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
// hashBuffers pools byte slices used as scratch space for hashing. An empty
// pool hands out fresh, empty slices with a capacity of 1024 bytes.
var hashBuffers = sync.Pool{
	New: func() interface{} {
		return make([]byte, 0, 1024)
	},
}

// getHashBuffer returns a byte slice from the pool.
func getHashBuffer() []byte {
	return hashBuffers.Get().([]byte)
}

// putHashBuffer truncates b to length zero and returns it to the pool.
func putHashBuffer(b []byte) {
	hashBuffers.Put(b[:0])
}
|
|
|
|
|
|
|
|
func hashAlert(a *types.Alert) uint64 {
|
|
|
|
const sep = '\xff'
|
2017-04-21 09:43:12 +00:00
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
b := getHashBuffer()
|
2017-04-21 09:43:12 +00:00
|
|
|
defer putHashBuffer(b)
|
|
|
|
|
|
|
|
names := make(model.LabelNames, 0, len(a.Labels))
|
2017-03-13 12:44:36 +00:00
|
|
|
|
2017-10-07 09:57:53 +00:00
|
|
|
for ln := range a.Labels {
|
2017-04-21 09:43:12 +00:00
|
|
|
names = append(names, ln)
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
2017-04-21 09:43:12 +00:00
|
|
|
sort.Sort(names)
|
2017-03-13 12:44:36 +00:00
|
|
|
|
2017-04-21 09:43:12 +00:00
|
|
|
for _, ln := range names {
|
|
|
|
b = append(b, string(ln)...)
|
2017-03-13 12:44:36 +00:00
|
|
|
b = append(b, sep)
|
2017-04-21 09:43:12 +00:00
|
|
|
b = append(b, string(a.Labels[ln])...)
|
2017-03-13 12:44:36 +00:00
|
|
|
b = append(b, sep)
|
|
|
|
}
|
|
|
|
|
|
|
|
hash := xxhash.Sum64(b)
|
|
|
|
|
|
|
|
return hash
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
func (n *DedupStage) needsUpdate(entry *nflogpb.Entry, firing, resolved map[uint64]struct{}, repeat time.Duration) (bool, error) {
|
2016-08-16 12:09:06 +00:00
|
|
|
// If we haven't notified about the alert group before, notify right away
|
|
|
|
// unless we only have resolved alerts.
|
|
|
|
if entry == nil {
|
2017-05-29 12:07:05 +00:00
|
|
|
return len(firing) > 0, nil
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
2017-03-13 12:44:36 +00:00
|
|
|
|
|
|
|
if !entry.IsFiringSubset(firing) {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if n.sendResolved && !entry.IsResolvedSubset(resolved) {
|
2016-08-16 12:09:06 +00:00
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Nothing changed, only notify if the repeat interval has passed.
|
2017-04-18 08:03:57 +00:00
|
|
|
return entry.Timestamp.Before(n.now().Add(-repeat)), nil
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-16 12:09:06 +00:00
|
|
|
gkey, ok := GroupKey(ctx)
|
2015-09-29 13:12:31 +00:00
|
|
|
if !ok {
|
2016-08-16 12:09:06 +00:00
|
|
|
return ctx, nil, fmt.Errorf("group key missing")
|
2015-09-29 12:45:38 +00:00
|
|
|
}
|
2015-09-29 13:12:31 +00:00
|
|
|
|
2016-08-16 12:09:06 +00:00
|
|
|
repeatInterval, ok := RepeatInterval(ctx)
|
2015-10-09 06:26:41 +00:00
|
|
|
if !ok {
|
2016-08-16 12:09:06 +00:00
|
|
|
return ctx, nil, fmt.Errorf("repeat interval missing")
|
2015-10-09 06:26:41 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
firingSet := map[uint64]struct{}{}
|
|
|
|
resolvedSet := map[uint64]struct{}{}
|
|
|
|
firing := []uint64{}
|
|
|
|
resolved := []uint64{}
|
|
|
|
|
|
|
|
var hash uint64
|
|
|
|
for _, a := range alerts {
|
|
|
|
hash = n.hash(a)
|
|
|
|
if a.Resolved() {
|
|
|
|
resolved = append(resolved, hash)
|
|
|
|
resolvedSet[hash] = struct{}{}
|
|
|
|
} else {
|
|
|
|
firing = append(firing, hash)
|
|
|
|
firingSet[hash] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
2015-09-29 12:45:38 +00:00
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
ctx = WithFiringAlerts(ctx, firing)
|
|
|
|
ctx = WithResolvedAlerts(ctx, resolved)
|
2016-08-16 12:09:06 +00:00
|
|
|
|
2017-04-21 09:43:12 +00:00
|
|
|
entries, err := n.nflog.Query(nflog.QGroupKey(gkey), nflog.QReceiver(n.recv))
|
2016-08-16 12:09:06 +00:00
|
|
|
|
|
|
|
if err != nil && err != nflog.ErrNotFound {
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, err
|
2015-09-29 12:45:38 +00:00
|
|
|
}
|
2016-08-16 12:09:06 +00:00
|
|
|
var entry *nflogpb.Entry
|
|
|
|
switch len(entries) {
|
|
|
|
case 0:
|
|
|
|
case 1:
|
|
|
|
entry = entries[0]
|
|
|
|
case 2:
|
|
|
|
return ctx, nil, fmt.Errorf("Unexpected entry result size %d", len(entries))
|
|
|
|
}
|
2017-03-13 12:44:36 +00:00
|
|
|
if ok, err := n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval); err != nil {
|
2016-08-16 12:09:06 +00:00
|
|
|
return ctx, nil, err
|
|
|
|
} else if ok {
|
|
|
|
return ctx, alerts, nil
|
2015-09-30 17:03:04 +00:00
|
|
|
}
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, nil
|
2016-06-02 17:29:52 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// RetryStage notifies via passed integration with exponential backoff until it
|
|
|
|
// succeeds. It aborts if the context is canceled or timed out.
|
|
|
|
type RetryStage struct {
|
|
|
|
integration Integration
|
2016-06-02 17:29:52 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// NewRetryStage returns a new instance of a RetryStage.
|
|
|
|
func NewRetryStage(i Integration) *RetryStage {
|
|
|
|
return &RetryStage{
|
|
|
|
integration: i,
|
2015-09-29 13:12:31 +00:00
|
|
|
}
|
2016-08-12 13:22:17 +00:00
|
|
|
}
|
2015-09-29 13:12:31 +00:00
|
|
|
|
2016-08-12 13:22:17 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-12 17:18:26 +00:00
|
|
|
var (
|
|
|
|
i = 0
|
|
|
|
b = backoff.NewExponentialBackOff()
|
|
|
|
tick = backoff.NewTicker(b)
|
2016-09-05 15:51:03 +00:00
|
|
|
iErr error
|
2016-08-12 17:18:26 +00:00
|
|
|
)
|
|
|
|
defer tick.Stop()
|
2015-10-11 14:54:31 +00:00
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
for {
|
|
|
|
i++
|
|
|
|
// Always check the context first to not notify again.
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
2016-09-05 15:51:03 +00:00
|
|
|
if iErr != nil {
|
|
|
|
return ctx, nil, iErr
|
|
|
|
}
|
|
|
|
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, ctx.Err()
|
2016-08-12 17:18:26 +00:00
|
|
|
default:
|
|
|
|
}
|
2015-09-29 13:12:31 +00:00
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
select {
|
|
|
|
case <-tick.C:
|
2016-09-05 15:51:03 +00:00
|
|
|
if retry, err := r.integration.Notify(ctx, alerts...); err != nil {
|
2016-08-12 17:18:26 +00:00
|
|
|
numFailedNotifications.WithLabelValues(r.integration.name).Inc()
|
2017-10-22 05:59:33 +00:00
|
|
|
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.name, "err", err)
|
2016-09-05 15:51:03 +00:00
|
|
|
if !retry {
|
2017-04-10 22:50:14 +00:00
|
|
|
return ctx, alerts, fmt.Errorf("Cancelling notify retry for %q due to unrecoverable error: %s", r.integration.name, err)
|
2016-09-05 15:51:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Save this error to be able to return the last seen error by an
|
|
|
|
// integration upon context timeout.
|
|
|
|
iErr = err
|
2016-08-12 17:18:26 +00:00
|
|
|
} else {
|
|
|
|
numNotifications.WithLabelValues(r.integration.name).Inc()
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, alerts, nil
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
|
|
|
case <-ctx.Done():
|
2016-09-05 15:51:03 +00:00
|
|
|
if iErr != nil {
|
|
|
|
return ctx, nil, iErr
|
|
|
|
}
|
|
|
|
|
2016-08-17 08:54:17 +00:00
|
|
|
return ctx, nil, ctx.Err()
|
2015-12-03 16:27:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// SetNotifiesStage sets the notification information about passed alerts. The
|
|
|
|
// passed alerts should have already been sent to the receivers.
|
|
|
|
type SetNotifiesStage struct {
|
2016-08-16 12:09:06 +00:00
|
|
|
nflog nflog.Log
|
|
|
|
recv *nflogpb.Receiver
|
2015-12-03 16:27:36 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 17:18:26 +00:00
|
|
|
// NewSetNotifiesStage returns a new instance of a SetNotifiesStage.
|
2016-08-16 12:09:06 +00:00
|
|
|
func NewSetNotifiesStage(l nflog.Log, recv *nflogpb.Receiver) *SetNotifiesStage {
|
2016-08-12 17:18:26 +00:00
|
|
|
return &SetNotifiesStage{
|
2017-03-13 12:44:36 +00:00
|
|
|
nflog: l,
|
|
|
|
recv: recv,
|
2015-12-03 16:27:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-12 13:22:17 +00:00
|
|
|
// Exec implements the Stage interface.
|
2017-10-22 05:59:33 +00:00
|
|
|
func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
2016-08-16 12:09:06 +00:00
|
|
|
gkey, ok := GroupKey(ctx)
|
|
|
|
if !ok {
|
|
|
|
return ctx, nil, fmt.Errorf("group key missing")
|
2016-08-12 17:18:26 +00:00
|
|
|
}
|
2016-08-12 13:22:17 +00:00
|
|
|
|
2017-03-13 12:44:36 +00:00
|
|
|
firing, ok := FiringAlerts(ctx)
|
|
|
|
if !ok {
|
|
|
|
return ctx, nil, fmt.Errorf("firing alerts missing")
|
2016-08-16 12:09:06 +00:00
|
|
|
}
|
2017-03-13 12:44:36 +00:00
|
|
|
|
|
|
|
resolved, ok := ResolvedAlerts(ctx)
|
|
|
|
if !ok {
|
|
|
|
return ctx, nil, fmt.Errorf("resolved alerts missing")
|
|
|
|
}
|
|
|
|
|
2017-04-21 09:43:12 +00:00
|
|
|
return ctx, alerts, n.nflog.Log(n.recv, gkey, firing, resolved)
|
2016-08-12 13:22:17 +00:00
|
|
|
}
|