Better Metrics For Alerts

* Closes prometheus/prometheus#2429
* Moved metrics to top of file for easier access
* Initialised CounterVecs with each discovered Alertmanager's URL, so the per-Alertmanager series are exported at zero from the start (see the sketch after the commit metadata below)
Goutham Veeramachaneni 2017-03-02 23:58:15 +05:30
parent 7e14533b32
commit 41da5c4ef2
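
Before the diff, a minimal standalone sketch of the pattern this commit adopts: metrics declared as package-level vars, with a child pre-created via WithLabelValues for every discovered Alertmanager URL so each series reads 0 before the first send is ever attempted. The namespace/subsystem values, the registry, the helper name, and the URL below are assumptions for illustration only; in the actual file the metrics remain exposed through the Notifier's Describe/Collect methods shown further down.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Stand-ins for the file's namespace/subsystem constants; the values are
// assumptions inferred from the exported metric names, not copied from the diff.
const (
	namespace = "prometheus"
	subsystem = "notifications"
)

var (
	// Mirrors alertSent in the diff: one child per Alertmanager URL.
	alertSentSketch = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_total",
			Help:      "Total number of alerts sent.",
		},
		[]string{"alertmanager"},
	)

	// Mirrors alertErrors in the diff.
	alertErrorsSketch = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "errors_total",
			Help:      "Total number of errors sending alert notifications.",
		},
		[]string{"alertmanager"},
	)
)

// preInitialise is a hypothetical helper doing what the ApplyConfig and Sync
// hunks do inline: touching WithLabelValues for a discovered Alertmanager URL
// creates the labelled child, so the series is exported at 0 before anything
// has been sent to that Alertmanager.
func preInitialise(url string) {
	alertSentSketch.WithLabelValues(url)
	alertErrorsSketch.WithLabelValues(url)
}

func main() {
	// A private registry just for this demo; the real code does not call
	// MustRegister but forwards Describe/Collect from the Notifier.
	reg := prometheus.NewRegistry()
	reg.MustRegister(alertSentSketch, alertErrorsSketch)

	preInitialise("http://am.example:9093/api/v1/alerts") // illustrative URL

	// Both families now contain one zero-valued series for that URL.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		fmt.Printf("%s: %d series\n", mf.GetName(), len(mf.GetMetric()))
	}
}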

@@ -50,6 +50,49 @@ const (
alertmanagerLabel = "alertmanager"
)
var (
alertLatency = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "latency_seconds",
Help: "Latency quantiles for sending alert notifications (not including dropped notifications).",
},
[]string{alertmanagerLabel},
)
alertErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "errors_total",
Help: "Total number of errors sending alert notifications.",
},
[]string{alertmanagerLabel},
)
alertSent = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sent_total",
Help: "Total number of alerts sent.",
},
[]string{alertmanagerLabel},
)
alertDropped = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dropped_total",
Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
})
alertQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "queue_length",
Help: "The number of alert notifications in the queue.",
})
)
// Notifier is responsible for dispatching alert notifications to an
// alert manager service.
type Notifier struct {
@@ -61,11 +104,6 @@ type Notifier struct {
ctx context.Context
cancel func()
latency *prometheus.SummaryVec
errors *prometheus.CounterVec
sent *prometheus.CounterVec
dropped prometheus.Counter
queueLength prometheus.Gauge
queueCapacity prometheus.Metric
alertmanagers []*alertmanagerSet
@@ -96,42 +134,6 @@ func New(o *Options) *Notifier {
more: make(chan struct{}, 1),
opts: o,
latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "latency_seconds",
Help: "Latency quantiles for sending alert notifications (not including dropped notifications).",
},
[]string{alertmanagerLabel},
),
errors: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "errors_total",
Help: "Total number of errors sending alert notifications.",
},
[]string{alertmanagerLabel},
),
sent: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sent_total",
Help: "Total number of alerts successfully sent.",
},
[]string{alertmanagerLabel},
),
dropped: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dropped_total",
Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
}),
queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "queue_length",
Help: "The number of alert notifications in the queue.",
}),
queueCapacity: prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
@@ -160,6 +162,12 @@ func (n *Notifier) ApplyConfig(conf *config.Config) error {
if err != nil {
return err
}
for _, am := range ams.ams {
alertErrors.WithLabelValues(am.url())
alertSent.WithLabelValues(am.url())
}
amSets = append(amSets, ams)
}
@@ -216,7 +224,7 @@ func (n *Notifier) Run() {
alerts := n.nextBatch()
if !n.sendAll(alerts...) {
n.dropped.Add(float64(len(alerts)))
alertDropped.Add(float64(len(alerts)))
}
// If the queue still has items left, kick off the next iteration.
if n.queueLen() > 0 {
@@ -248,7 +256,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
alerts = alerts[d:]
log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
n.dropped.Add(float64(d))
alertDropped.Add(float64(d))
}
// If the queue is full, remove the oldest alerts in favor
@@ -257,7 +265,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
n.queue = n.queue[d:]
log.Warnf("Alert notification queue full, dropping %d alerts", d)
n.dropped.Add(float64(d))
alertDropped.Add(float64(d))
}
n.queue = append(n.queue, alerts...)
@@ -339,12 +347,12 @@ func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
if err := n.sendOne(ctx, ams.client, u, b); err != nil {
log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
n.errors.WithLabelValues(u).Inc()
alertErrors.WithLabelValues(u).Inc()
} else {
atomic.AddUint64(&numSuccess, 1)
}
n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
n.sent.WithLabelValues(u).Add(float64(len(alerts)))
alertLatency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
alertSent.WithLabelValues(u).Add(float64(len(alerts)))
wg.Done()
}(am)
@@ -383,25 +391,25 @@ func (n *Notifier) Stop() {
// Describe implements prometheus.Collector.
func (n *Notifier) Describe(ch chan<- *prometheus.Desc) {
n.latency.Describe(ch)
n.errors.Describe(ch)
n.sent.Describe(ch)
alertLatency.Describe(ch)
alertErrors.Describe(ch)
alertSent.Describe(ch)
ch <- n.dropped.Desc()
ch <- n.queueLength.Desc()
ch <- alertDropped.Desc()
ch <- alertQueueLength.Desc()
ch <- n.queueCapacity.Desc()
}
// Collect implements prometheus.Collector.
func (n *Notifier) Collect(ch chan<- prometheus.Metric) {
n.queueLength.Set(float64(n.queueLen()))
alertQueueLength.Set(float64(n.queueLen()))
n.latency.Collect(ch)
n.errors.Collect(ch)
n.sent.Collect(ch)
alertLatency.Collect(ch)
alertErrors.Collect(ch)
alertSent.Collect(ch)
ch <- n.dropped
ch <- n.queueLength
ch <- alertDropped
ch <- alertQueueLength
ch <- n.queueCapacity
}
@@ -474,6 +482,9 @@ func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
continue
}
alertSent.WithLabelValues(us)
alertErrors.WithLabelValues(us)
seen[us] = struct{}{}
s.ams = append(s.ams, am)
}
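
A usage note, not part of the commit: the effect of the WithLabelValues calls added in ApplyConfig and Sync can be checked with client_golang's prometheus/testutil package (which postdates this commit). The test name, metric options, and URL below are illustrative assumptions.

package notifier

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestPreInitialisedSeries(t *testing.T) {
	// A fresh CounterVec standing in for alertSent.
	sent := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "sent_total", Help: "Total number of alerts sent."},
		[]string{"alertmanager"},
	)

	const u = "http://am.example:9093/api/v1/alerts" // illustrative URL

	// What ApplyConfig/Sync now do for every discovered Alertmanager:
	// create the labelled child without incrementing it.
	sent.WithLabelValues(u)

	// The series exists and reads 0 even though nothing was sent.
	if got := testutil.ToFloat64(sent.WithLabelValues(u)); got != 0 {
		t.Fatalf("expected pre-initialised series to read 0, got %v", got)
	}
}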