notify: don't use the global metrics registry (#1977)
* notify: don't use the global metrics registry Signed-off-by: Simon Pasquier <spasquie@redhat.com> * Address Max's comment Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
40b3facdf6
commit
9f7f4ead46
|
@ -371,6 +371,7 @@ func run() int {
|
|||
tmpl *template.Template
|
||||
)
|
||||
|
||||
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer)
|
||||
configCoordinator := config.NewCoordinator(
|
||||
*configFile,
|
||||
prometheus.DefaultRegisterer,
|
||||
|
@ -399,7 +400,7 @@ func run() int {
|
|||
|
||||
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
|
||||
silencer := silence.NewSilencer(silences, marker, logger)
|
||||
pipeline := notify.BuildPipeline(
|
||||
pipeline := pipelineBuilder.New(
|
||||
receivers,
|
||||
waitFunc,
|
||||
inhibitor,
|
||||
|
|
135
notify/notify.go
135
notify/notify.go
|
@ -35,61 +35,6 @@ import (
|
|||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
var (
|
||||
numNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notifications_total",
|
||||
Help: "The total number of attempted notifications.",
|
||||
}, []string{"integration"})
|
||||
|
||||
numFailedNotifications = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notifications_failed_total",
|
||||
Help: "The total number of failed notifications.",
|
||||
}, []string{"integration"})
|
||||
|
||||
notificationLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notification_latency_seconds",
|
||||
Help: "The latency of notifications in seconds.",
|
||||
Buckets: []float64{1, 5, 10, 15, 20},
|
||||
}, []string{"integration"})
|
||||
)
|
||||
|
||||
func init() {
|
||||
numNotifications.WithLabelValues("email")
|
||||
numNotifications.WithLabelValues("hipchat")
|
||||
numNotifications.WithLabelValues("pagerduty")
|
||||
numNotifications.WithLabelValues("wechat")
|
||||
numNotifications.WithLabelValues("pushover")
|
||||
numNotifications.WithLabelValues("slack")
|
||||
numNotifications.WithLabelValues("opsgenie")
|
||||
numNotifications.WithLabelValues("webhook")
|
||||
numNotifications.WithLabelValues("victorops")
|
||||
numFailedNotifications.WithLabelValues("email")
|
||||
numFailedNotifications.WithLabelValues("hipchat")
|
||||
numFailedNotifications.WithLabelValues("pagerduty")
|
||||
numFailedNotifications.WithLabelValues("wechat")
|
||||
numFailedNotifications.WithLabelValues("pushover")
|
||||
numFailedNotifications.WithLabelValues("slack")
|
||||
numFailedNotifications.WithLabelValues("opsgenie")
|
||||
numFailedNotifications.WithLabelValues("webhook")
|
||||
numFailedNotifications.WithLabelValues("victorops")
|
||||
notificationLatencySeconds.WithLabelValues("email")
|
||||
notificationLatencySeconds.WithLabelValues("hipchat")
|
||||
notificationLatencySeconds.WithLabelValues("pagerduty")
|
||||
notificationLatencySeconds.WithLabelValues("wechat")
|
||||
notificationLatencySeconds.WithLabelValues("pushover")
|
||||
notificationLatencySeconds.WithLabelValues("slack")
|
||||
notificationLatencySeconds.WithLabelValues("opsgenie")
|
||||
notificationLatencySeconds.WithLabelValues("webhook")
|
||||
notificationLatencySeconds.WithLabelValues("victorops")
|
||||
|
||||
prometheus.MustRegister(numNotifications)
|
||||
prometheus.MustRegister(numFailedNotifications)
|
||||
prometheus.MustRegister(notificationLatencySeconds)
|
||||
}
|
||||
|
||||
// ResolvedSender returns true if resolved notifications should be sent.
|
||||
type ResolvedSender interface {
|
||||
SendResolved() bool
|
||||
|
@ -261,8 +206,62 @@ type NotificationLog interface {
|
|||
Query(params ...nflog.QueryParam) ([]*nflogpb.Entry, error)
|
||||
}
|
||||
|
||||
// BuildPipeline builds a map of receivers to Stages.
|
||||
func BuildPipeline(
|
||||
type metrics struct {
|
||||
numNotifications *prometheus.CounterVec
|
||||
numFailedNotifications *prometheus.CounterVec
|
||||
notificationLatencySeconds *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
func newMetrics(r prometheus.Registerer) *metrics {
|
||||
m := &metrics{
|
||||
numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notifications_total",
|
||||
Help: "The total number of attempted notifications.",
|
||||
}, []string{"integration"}),
|
||||
numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notifications_failed_total",
|
||||
Help: "The total number of failed notifications.",
|
||||
}, []string{"integration"}),
|
||||
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "alertmanager",
|
||||
Name: "notification_latency_seconds",
|
||||
Help: "The latency of notifications in seconds.",
|
||||
Buckets: []float64{1, 5, 10, 15, 20},
|
||||
}, []string{"integration"}),
|
||||
}
|
||||
for _, integration := range []string{
|
||||
"email",
|
||||
"hipchat",
|
||||
"pagerduty",
|
||||
"wechat",
|
||||
"pushover",
|
||||
"slack",
|
||||
"opsgenie",
|
||||
"webhook",
|
||||
"victorops",
|
||||
} {
|
||||
m.numNotifications.WithLabelValues(integration)
|
||||
m.numFailedNotifications.WithLabelValues(integration)
|
||||
m.notificationLatencySeconds.WithLabelValues(integration)
|
||||
}
|
||||
r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds)
|
||||
return m
|
||||
}
|
||||
|
||||
type PipelineBuilder struct {
|
||||
metrics *metrics
|
||||
}
|
||||
|
||||
func NewPipelineBuilder(r prometheus.Registerer) *PipelineBuilder {
|
||||
return &PipelineBuilder{
|
||||
metrics: newMetrics(r),
|
||||
}
|
||||
}
|
||||
|
||||
// New returns a map of receivers to Stages.
|
||||
func (pb *PipelineBuilder) New(
|
||||
receivers map[string][]Integration,
|
||||
wait func() time.Duration,
|
||||
inhibitor *inhibit.Inhibitor,
|
||||
|
@ -277,14 +276,20 @@ func BuildPipeline(
|
|||
ss := NewMuteStage(silencer)
|
||||
|
||||
for name := range receivers {
|
||||
st := createReceiverStage(name, receivers[name], wait, notificationLog)
|
||||
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
|
||||
rs[name] = MultiStage{ms, is, ss, st}
|
||||
}
|
||||
return rs
|
||||
}
|
||||
|
||||
// createReceiverStage creates a pipeline of stages for a receiver.
|
||||
func createReceiverStage(name string, integrations []Integration, wait func() time.Duration, notificationLog NotificationLog) Stage {
|
||||
func createReceiverStage(
|
||||
name string,
|
||||
integrations []Integration,
|
||||
wait func() time.Duration,
|
||||
notificationLog NotificationLog,
|
||||
metrics *metrics,
|
||||
) Stage {
|
||||
var fs FanoutStage
|
||||
for i := range integrations {
|
||||
recv := &nflogpb.Receiver{
|
||||
|
@ -295,7 +300,7 @@ func createReceiverStage(name string, integrations []Integration, wait func() ti
|
|||
var s MultiStage
|
||||
s = append(s, NewWaitStage(wait))
|
||||
s = append(s, NewDedupStage(&integrations[i], notificationLog, recv))
|
||||
s = append(s, NewRetryStage(integrations[i], name))
|
||||
s = append(s, NewRetryStage(integrations[i], name, metrics))
|
||||
s = append(s, NewSetNotifiesStage(notificationLog, recv))
|
||||
|
||||
fs = append(fs, s)
|
||||
|
@ -594,13 +599,15 @@ func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Al
|
|||
type RetryStage struct {
|
||||
integration Integration
|
||||
groupName string
|
||||
metrics *metrics
|
||||
}
|
||||
|
||||
// NewRetryStage returns a new instance of a RetryStage.
|
||||
func NewRetryStage(i Integration, groupName string) *RetryStage {
|
||||
func NewRetryStage(i Integration, groupName string, metrics *metrics) *RetryStage {
|
||||
return &RetryStage{
|
||||
integration: i,
|
||||
groupName: groupName,
|
||||
metrics: metrics,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -653,10 +660,10 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
|
|||
case <-tick.C:
|
||||
now := time.Now()
|
||||
retry, err := r.integration.Notify(ctx, sent...)
|
||||
notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
|
||||
numNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||
r.metrics.notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
|
||||
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||
if err != nil {
|
||||
numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||
r.metrics.numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
|
||||
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.Name(), "receiver", r.groupName, "err", err)
|
||||
if !retry {
|
||||
return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.Name(), err)
|
||||
|
|
|
@ -389,6 +389,7 @@ func TestRetryStageWithError(t *testing.T) {
|
|||
}
|
||||
r := RetryStage{
|
||||
integration: i,
|
||||
metrics: newMetrics(prometheus.NewRegistry()),
|
||||
}
|
||||
|
||||
alerts := []*types.Alert{
|
||||
|
@ -429,6 +430,7 @@ func TestRetryStageNoResolved(t *testing.T) {
|
|||
}
|
||||
r := RetryStage{
|
||||
integration: i,
|
||||
metrics: newMetrics(prometheus.NewRegistry()),
|
||||
}
|
||||
|
||||
alerts := []*types.Alert{
|
||||
|
@ -482,6 +484,7 @@ func TestRetryStageSendResolved(t *testing.T) {
|
|||
}
|
||||
r := RetryStage{
|
||||
integration: i,
|
||||
metrics: newMetrics(prometheus.NewRegistry()),
|
||||
}
|
||||
|
||||
alerts := []*types.Alert{
|
||||
|
|
Loading…
Reference in New Issue