notify: don't use the global metrics registry (#1977)
* notify: don't use the global metrics registry Signed-off-by: Simon Pasquier <spasquie@redhat.com> * Address Max's comment Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
40b3facdf6
commit
9f7f4ead46
|
@ -371,6 +371,7 @@ func run() int {
|
||||||
tmpl *template.Template
|
tmpl *template.Template
|
||||||
)
|
)
|
||||||
|
|
||||||
|
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer)
|
||||||
configCoordinator := config.NewCoordinator(
|
configCoordinator := config.NewCoordinator(
|
||||||
*configFile,
|
*configFile,
|
||||||
prometheus.DefaultRegisterer,
|
prometheus.DefaultRegisterer,
|
||||||
|
@ -399,7 +400,7 @@ func run() int {
|
||||||
|
|
||||||
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
|
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
|
||||||
silencer := silence.NewSilencer(silences, marker, logger)
|
silencer := silence.NewSilencer(silences, marker, logger)
|
||||||
pipeline := notify.BuildPipeline(
|
pipeline := pipelineBuilder.New(
|
||||||
receivers,
|
receivers,
|
||||||
waitFunc,
|
waitFunc,
|
||||||
inhibitor,
|
inhibitor,
|
||||||
|
|
135
notify/notify.go
135
notify/notify.go
|
@ -35,61 +35,6 @@ import (
|
||||||
"github.com/prometheus/alertmanager/types"
|
"github.com/prometheus/alertmanager/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
numNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Namespace: "alertmanager",
|
|
||||||
Name: "notifications_total",
|
|
||||||
Help: "The total number of attempted notifications.",
|
|
||||||
}, []string{"integration"})
|
|
||||||
|
|
||||||
numFailedNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Namespace: "alertmanager",
|
|
||||||
Name: "notifications_failed_total",
|
|
||||||
Help: "The total number of failed notifications.",
|
|
||||||
}, []string{"integration"})
|
|
||||||
|
|
||||||
notificationLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
||||||
Namespace: "alertmanager",
|
|
||||||
Name: "notification_latency_seconds",
|
|
||||||
Help: "The latency of notifications in seconds.",
|
|
||||||
Buckets: []float64{1, 5, 10, 15, 20},
|
|
||||||
}, []string{"integration"})
|
|
||||||
)
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
numNotifications.WithLabelValues("email")
|
|
||||||
numNotifications.WithLabelValues("hipchat")
|
|
||||||
numNotifications.WithLabelValues("pagerduty")
|
|
||||||
numNotifications.WithLabelValues("wechat")
|
|
||||||
numNotifications.WithLabelValues("pushover")
|
|
||||||
numNotifications.WithLabelValues("slack")
|
|
||||||
numNotifications.WithLabelValues("opsgenie")
|
|
||||||
numNotifications.WithLabelValues("webhook")
|
|
||||||
numNotifications.WithLabelValues("victorops")
|
|
||||||
numFailedNotifications.WithLabelValues("email")
|
|
||||||
numFailedNotifications.WithLabelValues("hipchat")
|
|
||||||
numFailedNotifications.WithLabelValues("pagerduty")
|
|
||||||
numFailedNotifications.WithLabelValues("wechat")
|
|
||||||
numFailedNotifications.WithLabelValues("pushover")
|
|
||||||
numFailedNotifications.WithLabelValues("slack")
|
|
||||||
numFailedNotifications.WithLabelValues("opsgenie")
|
|
||||||
numFailedNotifications.WithLabelValues("webhook")
|
|
||||||
numFailedNotifications.WithLabelValues("victorops")
|
|
||||||
notificationLatencySeconds.WithLabelValues("email")
|
|
||||||
notificationLatencySeconds.WithLabelValues("hipchat")
|
|
||||||
notificationLatencySeconds.WithLabelValues("pagerduty")
|
|
||||||
notificationLatencySeconds.WithLabelValues("wechat")
|
|
||||||
notificationLatencySeconds.WithLabelValues("pushover")
|
|
||||||
notificationLatencySeconds.WithLabelValues("slack")
|
|
||||||
notificationLatencySeconds.WithLabelValues("opsgenie")
|
|
||||||
notificationLatencySeconds.WithLabelValues("webhook")
|
|
||||||
notificationLatencySeconds.WithLabelValues("victorops")
|
|
||||||
|
|
||||||
prometheus.MustRegister(numNotifications)
|
|
||||||
prometheus.MustRegister(numFailedNotifications)
|
|
||||||
prometheus.MustRegister(notificationLatencySeconds)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ResolvedSender returns true if resolved notifications should be sent.
|
// ResolvedSender returns true if resolved notifications should be sent.
|
||||||
type ResolvedSender interface {
|
type ResolvedSender interface {
|
||||||
SendResolved() bool
|
SendResolved() bool
|
||||||
|
@ -261,8 +206,62 @@ type NotificationLog interface {
|
||||||
Query(params ...nflog.QueryParam) ([]*nflogpb.Entry, error)
|
Query(params ...nflog.QueryParam) ([]*nflogpb.Entry, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// BuildPipeline builds a map of receivers to Stages.
|
type metrics struct {
|
||||||
func BuildPipeline(
|
numNotifications *prometheus.CounterVec
|
||||||
|
numFailedNotifications *prometheus.CounterVec
|
||||||
|
notificationLatencySeconds *prometheus.HistogramVec
|
||||||
|
}
|
||||||
|
|
||||||
|
func newMetrics(r prometheus.Registerer) *metrics {
|
||||||
|
m := &metrics{
|
||||||
|
numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: "alertmanager",
|
||||||
|
Name: "notifications_total",
|
||||||
|
Help: "The total number of attempted notifications.",
|
||||||
|
}, []string{"integration"}),
|
||||||
|
numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: "alertmanager",
|
||||||
|
Name: "notifications_failed_total",
|
||||||
|
Help: "The total number of failed notifications.",
|
||||||
|
}, []string{"integration"}),
|
||||||
|
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: "alertmanager",
|
||||||
|
Name: "notification_latency_seconds",
|
||||||
|
Help: "The latency of notifications in seconds.",
|
||||||
|
Buckets: []float64{1, 5, 10, 15, 20},
|
||||||
|
}, []string{"integration"}),
|
||||||
|
}
|
||||||
|
for _, integration := range []string{
|
||||||
|
"email",
|
||||||
|
"hipchat",
|
||||||
|
"pagerduty",
|
||||||
|
"wechat",
|
||||||
|
"pushover",
|
||||||
|
"slack",
|
||||||
|
"opsgenie",
|
||||||
|
"webhook",
|
||||||
|
"victorops",
|
||||||
|
} {
|
||||||
|
m.numNotifications.WithLabelValues(integration)
|
||||||
|
m.numFailedNotifications.WithLabelValues(integration)
|
||||||
|
m.notificationLatencySeconds.WithLabelValues(integration)
|
||||||
|
}
|
||||||
|
r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds)
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
type PipelineBuilder struct {
|
||||||
|
metrics *metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewPipelineBuilder(r prometheus.Registerer) *PipelineBuilder {
|
||||||
|
return &PipelineBuilder{
|
||||||
|
metrics: newMetrics(r),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// New returns a map of receivers to Stages.
|
||||||
|
func (pb *PipelineBuilder) New(
|
||||||
receivers map[string][]Integration,
|
receivers map[string][]Integration,
|
||||||
wait func() time.Duration,
|
wait func() time.Duration,
|
||||||
inhibitor *inhibit.Inhibitor,
|
inhibitor *inhibit.Inhibitor,
|
||||||
|
@ -277,14 +276,20 @@ func BuildPipeline(
|
||||||
ss := NewMuteStage(silencer)
|
ss := NewMuteStage(silencer)
|
||||||
|
|
||||||
for name := range receivers {
|
for name := range receivers {
|
||||||
st := createReceiverStage(name, receivers[name], wait, notificationLog)
|
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
|
||||||
rs[name] = MultiStage{ms, is, ss, st}
|
rs[name] = MultiStage{ms, is, ss, st}
|
||||||
}
|
}
|
||||||
return rs
|
return rs
|
||||||
}
|
}
|
||||||
|
|
||||||
// createReceiverStage creates a pipeline of stages for a receiver.
|
// createReceiverStage creates a pipeline of stages for a receiver.
|
||||||
func createReceiverStage(name string, integrations []Integration, wait func() time.Duration, notificationLog NotificationLog) Stage {
|
func createReceiverStage(
|
||||||
|
name string,
|
||||||
|
integrations []Integration,
|
||||||
|
wait func() time.Duration,
|
||||||
|
notificationLog NotificationLog,
|
||||||
|
metrics *metrics,
|
||||||
|
) Stage {
|
||||||
var fs FanoutStage
|
var fs FanoutStage
|
||||||
for i := range integrations {
|
for i := range integrations {
|
||||||
recv := &nflogpb.Receiver{
|
recv := &nflogpb.Receiver{
|
||||||
|
@ -295,7 +300,7 @@ func createReceiverStage(name string, integrations []Integration, wait func() ti
|
||||||
var s MultiStage
|
var s MultiStage
|
||||||
s = append(s, NewWaitStage(wait))
|
s = append(s, NewWaitStage(wait))
|
||||||
s = append(s, NewDedupStage(&integrations[i], notificationLog, recv))
|
s = append(s, NewDedupStage(&integrations[i], notificationLog, recv))
|
||||||
s = append(s, NewRetryStage(integrations[i], name))
|
s = append(s, NewRetryStage(integrations[i], name, metrics))
|
||||||
s = append(s, NewSetNotifiesStage(notificationLog, recv))
|
s = append(s, NewSetNotifiesStage(notificationLog, recv))
|
||||||
|
|
||||||
fs = append(fs, s)
|
fs = append(fs, s)
|
||||||
|
@ -594,13 +599,15 @@ func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Al
|
||||||
type RetryStage struct {
|
type RetryStage struct {
|
||||||
integration Integration
|
integration Integration
|
||||||
groupName string
|
groupName string
|
||||||
|
metrics *metrics
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewRetryStage returns a new instance of a RetryStage.
|
// NewRetryStage returns a new instance of a RetryStage.
|
||||||
func NewRetryStage(i Integration, groupName string) *RetryStage {
|
func NewRetryStage(i Integration, groupName string, metrics *metrics) *RetryStage {
|
||||||
return &RetryStage{
|
return &RetryStage{
|
||||||
integration: i,
|
integration: i,
|
||||||
groupName: groupName,
|
groupName: groupName,
|
||||||
|
metrics: metrics,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -653,10 +660,10 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
|
||||||
case <-tick.C:
|
case <-tick.C:
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
retry, err := r.integration.Notify(ctx, sent...)
|
retry, err := r.integration.Notify(ctx, sent...)
|
||||||
notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
|
r.metrics.notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
|
||||||
numNotifications.WithLabelValues(r.integration.Name()).Inc()
|
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
|
r.metrics.numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||||
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.Name(), "receiver", r.groupName, "err", err)
|
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.Name(), "receiver", r.groupName, "err", err)
|
||||||
if !retry {
|
if !retry {
|
||||||
return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.Name(), err)
|
return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.Name(), err)
|
||||||
|
|
|
@ -389,6 +389,7 @@ func TestRetryStageWithError(t *testing.T) {
|
||||||
}
|
}
|
||||||
r := RetryStage{
|
r := RetryStage{
|
||||||
integration: i,
|
integration: i,
|
||||||
|
metrics: newMetrics(prometheus.NewRegistry()),
|
||||||
}
|
}
|
||||||
|
|
||||||
alerts := []*types.Alert{
|
alerts := []*types.Alert{
|
||||||
|
@ -429,6 +430,7 @@ func TestRetryStageNoResolved(t *testing.T) {
|
||||||
}
|
}
|
||||||
r := RetryStage{
|
r := RetryStage{
|
||||||
integration: i,
|
integration: i,
|
||||||
|
metrics: newMetrics(prometheus.NewRegistry()),
|
||||||
}
|
}
|
||||||
|
|
||||||
alerts := []*types.Alert{
|
alerts := []*types.Alert{
|
||||||
|
@ -482,6 +484,7 @@ func TestRetryStageSendResolved(t *testing.T) {
|
||||||
}
|
}
|
||||||
r := RetryStage{
|
r := RetryStage{
|
||||||
integration: i,
|
integration: i,
|
||||||
|
metrics: newMetrics(prometheus.NewRegistry()),
|
||||||
}
|
}
|
||||||
|
|
||||||
alerts := []*types.Alert{
|
alerts := []*types.Alert{
|
||||||
|
|
Loading…
Reference in New Issue