Improve notification instrumentation (#1335)

* Improve notification instrumentation

- Add a notificationLatencySeconds histogram to
help debug duplicate messages. It can help determine
whether duplicate messages are caused by
excessive latency when sending a notification.

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
This commit is contained in:
stuart nelson 2018-04-23 14:23:01 +02:00 committed by GitHub
parent 80f2eeb2ca
commit bc263d3e61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 21 additions and 1 deletion

View File

@@ -48,6 +48,13 @@ var (
Name: "notifications_failed_total", Name: "notifications_failed_total",
Help: "The total number of failed notifications.", Help: "The total number of failed notifications.",
}, []string{"integration"}) }, []string{"integration"})
// notificationLatencySeconds is a histogram of notification latency in
// seconds, partitioned by the "integration" label, with buckets at
// 1, 5, 10, 15 and 20.
notificationLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
Help: "The latency of notifications in seconds.",
Buckets: []float64{1, 5, 10, 15, 20},
}, []string{"integration"})
) )
func init() { func init() {
@@ -69,9 +76,19 @@ func init() {
numFailedNotifications.WithLabelValues("opsgenie") numFailedNotifications.WithLabelValues("opsgenie")
numFailedNotifications.WithLabelValues("webhook") numFailedNotifications.WithLabelValues("webhook")
numFailedNotifications.WithLabelValues("victorops") numFailedNotifications.WithLabelValues("victorops")
notificationLatencySeconds.WithLabelValues("email")
notificationLatencySeconds.WithLabelValues("hipchat")
notificationLatencySeconds.WithLabelValues("pagerduty")
notificationLatencySeconds.WithLabelValues("wechat")
notificationLatencySeconds.WithLabelValues("pushover")
notificationLatencySeconds.WithLabelValues("slack")
notificationLatencySeconds.WithLabelValues("opsgenie")
notificationLatencySeconds.WithLabelValues("webhook")
notificationLatencySeconds.WithLabelValues("victorops")
prometheus.Register(numNotifications) prometheus.Register(numNotifications)
prometheus.Register(numFailedNotifications) prometheus.Register(numFailedNotifications)
prometheus.Register(notificationLatencySeconds)
} }
// MinTimeout is the minimum timeout that is set for the context of a call // MinTimeout is the minimum timeout that is set for the context of a call
@@ -624,7 +641,10 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
select { select {
case <-tick.C: case <-tick.C:
if retry, err := r.integration.Notify(ctx, alerts...); err != nil { now := time.Now()
retry, err := r.integration.Notify(ctx, alerts...)
notificationLatencySeconds.WithLabelValues(r.integration.name).Observe(time.Since(now).Seconds())
if err != nil {
numFailedNotifications.WithLabelValues(r.integration.name).Inc() numFailedNotifications.WithLabelValues(r.integration.name).Inc()
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.name, "receiver", r.groupName, "err", err) level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.name, "receiver", r.groupName, "err", err)
if !retry { if !retry {