From 41da5c4ef26c9624e6c1f466b0989f652df05b97 Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni
Date: Thu, 2 Mar 2017 23:58:15 +0530
Subject: [PATCH] Better Metrics For Alerts

* Closes prometheus/prometheus#2429
* Moved metrics to top of file for easier access
* Initialised CounterVecs
---
 notifier/notifier.go | 127 +++++++++++++++++++++++--------------------
 1 file changed, 69 insertions(+), 58 deletions(-)

diff --git a/notifier/notifier.go b/notifier/notifier.go
index e9af00a4c..cc256ed4f 100644
--- a/notifier/notifier.go
+++ b/notifier/notifier.go
@@ -50,6 +50,49 @@ const (
 	alertmanagerLabel = "alertmanager"
 )
 
+var (
+	alertLatency = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+		Namespace: namespace,
+		Subsystem: subsystem,
+		Name:      "latency_seconds",
+		Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
+	},
+		[]string{alertmanagerLabel},
+	)
+
+	alertErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: namespace,
+		Subsystem: subsystem,
+		Name:      "errors_total",
+		Help:      "Total number of errors sending alert notifications.",
+	},
+		[]string{alertmanagerLabel},
+	)
+
+	alertSent = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: namespace,
+		Subsystem: subsystem,
+		Name:      "sent_total",
+		Help:      "Total number of alerts sent.",
+	},
+		[]string{alertmanagerLabel},
+	)
+
+	alertDropped = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: namespace,
+		Subsystem: subsystem,
+		Name:      "dropped_total",
+		Help:      "Total number of alerts dropped due to errors when sending to Alertmanager.",
+	})
+
+	alertQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: subsystem,
+		Name:      "queue_length",
+		Help:      "The number of alert notifications in the queue.",
+	})
+)
+
 // Notifier is responsible for dispatching alert notifications to an
 // alert manager service.
 type Notifier struct {
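Aside (not part of the patch): the var block above only declares the vectors. A CounterVec exposes no child series until a label combination is first touched, which is why the hunks below call WithLabelValues for every known Alertmanager URL. A minimal, self-contained sketch of that behaviour — the metric name and target URL here are made up for illustration:

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
	)

	func main() {
		errs := prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "example_errors_total",
			Help: "Errors per Alertmanager.",
		}, []string{"alertmanager"})
		prometheus.MustRegister(errs)

		// No example_errors_total series exists yet. Touching the label
		// combination exports it with value 0, so rate() and alerting
		// expressions see the series before the first real increment.
		errs.WithLabelValues("http://am.example:9093")

		mfs, _ := prometheus.DefaultGatherer.Gather()
		for _, mf := range mfs {
			if mf.GetName() == "example_errors_total" {
				fmt.Println(mf.GetName(), len(mf.GetMetric())) // example_errors_total 1
			}
		}
	}
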
@@ -61,11 +104,6 @@ type Notifier struct {
 	ctx    context.Context
 	cancel func()
 
-	latency       *prometheus.SummaryVec
-	errors        *prometheus.CounterVec
-	sent          *prometheus.CounterVec
-	dropped       prometheus.Counter
-	queueLength   prometheus.Gauge
 	queueCapacity prometheus.Metric
 
 	alertmanagers []*alertmanagerSet
@@ -96,42 +134,6 @@ func New(o *Options) *Notifier {
 		more: make(chan struct{}, 1),
 		opts: o,
 
-		latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
-			Namespace: namespace,
-			Subsystem: subsystem,
-			Name:      "latency_seconds",
-			Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
-		},
-			[]string{alertmanagerLabel},
-		),
-		errors: prometheus.NewCounterVec(prometheus.CounterOpts{
-			Namespace: namespace,
-			Subsystem: subsystem,
-			Name:      "errors_total",
-			Help:      "Total number of errors sending alert notifications.",
-		},
-			[]string{alertmanagerLabel},
-		),
-		sent: prometheus.NewCounterVec(prometheus.CounterOpts{
-			Namespace: namespace,
-			Subsystem: subsystem,
-			Name:      "sent_total",
-			Help:      "Total number of alerts successfully sent.",
-		},
-			[]string{alertmanagerLabel},
-		),
-		dropped: prometheus.NewCounter(prometheus.CounterOpts{
-			Namespace: namespace,
-			Subsystem: subsystem,
-			Name:      "dropped_total",
-			Help:      "Total number of alerts dropped due to errors when sending to Alertmanager.",
-		}),
-		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
-			Namespace: namespace,
-			Subsystem: subsystem,
-			Name:      "queue_length",
-			Help:      "The number of alert notifications in the queue.",
-		}),
 		queueCapacity: prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
@@ -160,6 +162,12 @@ func (n *Notifier) ApplyConfig(conf *config.Config) error {
 		if err != nil {
 			return err
 		}
+
+		for _, am := range ams.ams {
+			alertErrors.WithLabelValues(am.url())
+			alertSent.WithLabelValues(am.url())
+		}
+
 		amSets = append(amSets, ams)
 	}
 
@@ -216,7 +224,7 @@ func (n *Notifier) Run() {
 		alerts := n.nextBatch()
 
 		if !n.sendAll(alerts...) {
-			n.dropped.Add(float64(len(alerts)))
+			alertDropped.Add(float64(len(alerts)))
 		}
 		// If the queue still has items left, kick off the next iteration.
 		if n.queueLen() > 0 {
@@ -248,7 +256,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
 		alerts = alerts[d:]
 
 		log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
-		n.dropped.Add(float64(d))
+		alertDropped.Add(float64(d))
 	}
 
 	// If the queue is full, remove the oldest alerts in favor
@@ -257,7 +265,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
 		n.queue = n.queue[d:]
 
 		log.Warnf("Alert notification queue full, dropping %d alerts", d)
-		n.dropped.Add(float64(d))
+		alertDropped.Add(float64(d))
 	}
 
 	n.queue = append(n.queue, alerts...)
@@ -339,12 +347,12 @@ func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
 
 			if err := n.sendOne(ctx, ams.client, u, b); err != nil {
 				log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
-				n.errors.WithLabelValues(u).Inc()
+				alertErrors.WithLabelValues(u).Inc()
 			} else {
 				atomic.AddUint64(&numSuccess, 1)
 			}
-			n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
-			n.sent.WithLabelValues(u).Add(float64(len(alerts)))
+			alertLatency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
+			alertSent.WithLabelValues(u).Add(float64(len(alerts)))
 
 			wg.Done()
 		}(am)
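Aside (not part of the patch): the sendAll hunk above shows the per-target instrumentation pattern this change keeps: time each request, count an error on failure, and record latency and sent totals against the Alertmanager URL whether or not the request succeeded — which is also why dropped alerts are tracked by a separate counter. A sketch of that pattern, assuming the package-level metrics from this patch; instrumentedSend and its send callback are hypothetical, not functions in notifier.go:

	// instrumentedSend applies the same metric updates that sendAll
	// performs for each Alertmanager URL.
	func instrumentedSend(send func() error, url string, count int) {
		begin := time.Now()
		if err := send(); err != nil {
			alertErrors.WithLabelValues(url).Inc()
		}
		// Latency and sent totals are recorded for failures too; only
		// alerts that never reach an Alertmanager count as dropped.
		alertLatency.WithLabelValues(url).Observe(time.Since(begin).Seconds())
		alertSent.WithLabelValues(url).Add(float64(count))
	}
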
@@ -383,25 +391,25 @@ func (n *Notifier) Stop() {
 
 // Describe implements prometheus.Collector.
 func (n *Notifier) Describe(ch chan<- *prometheus.Desc) {
-	n.latency.Describe(ch)
-	n.errors.Describe(ch)
-	n.sent.Describe(ch)
+	alertLatency.Describe(ch)
+	alertErrors.Describe(ch)
+	alertSent.Describe(ch)
 
-	ch <- n.dropped.Desc()
-	ch <- n.queueLength.Desc()
+	ch <- alertDropped.Desc()
+	ch <- alertQueueLength.Desc()
 	ch <- n.queueCapacity.Desc()
 }
 
 // Collect implements prometheus.Collector.
 func (n *Notifier) Collect(ch chan<- prometheus.Metric) {
-	n.queueLength.Set(float64(n.queueLen()))
+	alertQueueLength.Set(float64(n.queueLen()))
 
-	n.latency.Collect(ch)
-	n.errors.Collect(ch)
-	n.sent.Collect(ch)
+	alertLatency.Collect(ch)
+	alertErrors.Collect(ch)
+	alertSent.Collect(ch)
 
-	ch <- n.dropped
-	ch <- n.queueLength
+	ch <- alertDropped
+	ch <- alertQueueLength
 	ch <- n.queueCapacity
 }
 
@@ -474,6 +482,9 @@ func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
 			continue
 		}
 
+		alertSent.WithLabelValues(us)
+		alertErrors.WithLabelValues(us)
+
 		seen[us] = struct{}{}
 		s.ams = append(s.ams, am)
 	}
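Aside (not part of the patch): with the vectors now package-level, the Notifier's prometheus.Collector implementation mainly remains for the values computed at scrape time — the queue-length gauge and the constant queue-capacity metric. A minimal sketch of that Collector pattern; queueCollector and its queueLen accessor are hypothetical stand-ins, not types from notifier.go:

	// queueCollector mirrors how Notifier.Describe/Collect expose the
	// queue metrics: refresh the gauge at scrape time, then emit it.
	type queueCollector struct {
		length   prometheus.Gauge
		capacity prometheus.Metric
		queueLen func() int
	}

	func (c *queueCollector) Describe(ch chan<- *prometheus.Desc) {
		ch <- c.length.Desc()
		ch <- c.capacity.Desc()
	}

	func (c *queueCollector) Collect(ch chan<- prometheus.Metric) {
		c.length.Set(float64(c.queueLen())) // refreshed on every scrape
		ch <- c.length
		ch <- c.capacity
	}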