mirror of
https://github.com/prometheus/prometheus
synced 2024-12-26 00:23:18 +00:00
Better Metrics For Alerts
* Closes prometheus/prometheus#2429 * Moved metrics to top of file for easier access * Initialised CounterVecs
This commit is contained in:
parent
7e14533b32
commit
41da5c4ef2
@ -50,6 +50,49 @@ const (
|
||||
alertmanagerLabel = "alertmanager"
|
||||
)
|
||||
|
||||
var (
|
||||
alertLatency = prometheus.NewSummaryVec(prometheus.SummaryOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "latency_seconds",
|
||||
Help: "Latency quantiles for sending alert notifications (not including dropped notifications).",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
)
|
||||
|
||||
alertErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "errors_total",
|
||||
Help: "Total number of errors sending alert notifications.",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
)
|
||||
|
||||
alertSent = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "sent_total",
|
||||
Help: "Total number of alerts sent.",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
)
|
||||
|
||||
alertDropped = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "dropped_total",
|
||||
Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
|
||||
})
|
||||
|
||||
alertQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "queue_length",
|
||||
Help: "The number of alert notifications in the queue.",
|
||||
})
|
||||
)
|
||||
|
||||
// Notifier is responsible for dispatching alert notifications to an
|
||||
// alert manager service.
|
||||
type Notifier struct {
|
||||
@ -61,11 +104,6 @@ type Notifier struct {
|
||||
ctx context.Context
|
||||
cancel func()
|
||||
|
||||
latency *prometheus.SummaryVec
|
||||
errors *prometheus.CounterVec
|
||||
sent *prometheus.CounterVec
|
||||
dropped prometheus.Counter
|
||||
queueLength prometheus.Gauge
|
||||
queueCapacity prometheus.Metric
|
||||
|
||||
alertmanagers []*alertmanagerSet
|
||||
@ -96,42 +134,6 @@ func New(o *Options) *Notifier {
|
||||
more: make(chan struct{}, 1),
|
||||
opts: o,
|
||||
|
||||
latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "latency_seconds",
|
||||
Help: "Latency quantiles for sending alert notifications (not including dropped notifications).",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
),
|
||||
errors: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "errors_total",
|
||||
Help: "Total number of errors sending alert notifications.",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
),
|
||||
sent: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "sent_total",
|
||||
Help: "Total number of alerts successfully sent.",
|
||||
},
|
||||
[]string{alertmanagerLabel},
|
||||
),
|
||||
dropped: prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "dropped_total",
|
||||
Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
|
||||
}),
|
||||
queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "queue_length",
|
||||
Help: "The number of alert notifications in the queue.",
|
||||
}),
|
||||
queueCapacity: prometheus.MustNewConstMetric(
|
||||
prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
|
||||
@ -160,6 +162,12 @@ func (n *Notifier) ApplyConfig(conf *config.Config) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, am := range ams.ams {
|
||||
alertErrors.WithLabelValues(am.url())
|
||||
alertSent.WithLabelValues(am.url())
|
||||
}
|
||||
|
||||
amSets = append(amSets, ams)
|
||||
}
|
||||
|
||||
@ -216,7 +224,7 @@ func (n *Notifier) Run() {
|
||||
alerts := n.nextBatch()
|
||||
|
||||
if !n.sendAll(alerts...) {
|
||||
n.dropped.Add(float64(len(alerts)))
|
||||
alertDropped.Add(float64(len(alerts)))
|
||||
}
|
||||
// If the queue still has items left, kick off the next iteration.
|
||||
if n.queueLen() > 0 {
|
||||
@ -248,7 +256,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
|
||||
alerts = alerts[d:]
|
||||
|
||||
log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
|
||||
n.dropped.Add(float64(d))
|
||||
alertDropped.Add(float64(d))
|
||||
}
|
||||
|
||||
// If the queue is full, remove the oldest alerts in favor
|
||||
@ -257,7 +265,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
|
||||
n.queue = n.queue[d:]
|
||||
|
||||
log.Warnf("Alert notification queue full, dropping %d alerts", d)
|
||||
n.dropped.Add(float64(d))
|
||||
alertDropped.Add(float64(d))
|
||||
}
|
||||
n.queue = append(n.queue, alerts...)
|
||||
|
||||
@ -339,12 +347,12 @@ func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
|
||||
|
||||
if err := n.sendOne(ctx, ams.client, u, b); err != nil {
|
||||
log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
|
||||
n.errors.WithLabelValues(u).Inc()
|
||||
alertErrors.WithLabelValues(u).Inc()
|
||||
} else {
|
||||
atomic.AddUint64(&numSuccess, 1)
|
||||
}
|
||||
n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
|
||||
n.sent.WithLabelValues(u).Add(float64(len(alerts)))
|
||||
alertLatency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
|
||||
alertSent.WithLabelValues(u).Add(float64(len(alerts)))
|
||||
|
||||
wg.Done()
|
||||
}(am)
|
||||
@ -383,25 +391,25 @@ func (n *Notifier) Stop() {
|
||||
|
||||
// Describe implements prometheus.Collector.
|
||||
func (n *Notifier) Describe(ch chan<- *prometheus.Desc) {
|
||||
n.latency.Describe(ch)
|
||||
n.errors.Describe(ch)
|
||||
n.sent.Describe(ch)
|
||||
alertLatency.Describe(ch)
|
||||
alertErrors.Describe(ch)
|
||||
alertSent.Describe(ch)
|
||||
|
||||
ch <- n.dropped.Desc()
|
||||
ch <- n.queueLength.Desc()
|
||||
ch <- alertDropped.Desc()
|
||||
ch <- alertQueueLength.Desc()
|
||||
ch <- n.queueCapacity.Desc()
|
||||
}
|
||||
|
||||
// Collect implements prometheus.Collector.
|
||||
func (n *Notifier) Collect(ch chan<- prometheus.Metric) {
|
||||
n.queueLength.Set(float64(n.queueLen()))
|
||||
alertQueueLength.Set(float64(n.queueLen()))
|
||||
|
||||
n.latency.Collect(ch)
|
||||
n.errors.Collect(ch)
|
||||
n.sent.Collect(ch)
|
||||
alertLatency.Collect(ch)
|
||||
alertErrors.Collect(ch)
|
||||
alertSent.Collect(ch)
|
||||
|
||||
ch <- n.dropped
|
||||
ch <- n.queueLength
|
||||
ch <- alertDropped
|
||||
ch <- alertQueueLength
|
||||
ch <- n.queueCapacity
|
||||
}
|
||||
|
||||
@ -474,6 +482,9 @@ func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
|
||||
continue
|
||||
}
|
||||
|
||||
alertSent.WithLabelValues(us)
|
||||
alertErrors.WithLabelValues(us)
|
||||
|
||||
seen[us] = struct{}{}
|
||||
s.ams = append(s.ams, am)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user