feature: add native histogram support to latency metrics (#3737)

Note that this does not stop exposing the classic histogram metrics; for now
it is up to the scrape config to decide whether to keep the classic
histograms, the native histograms, or both.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
George Krajcsovits 2024-02-29 15:53:47 +01:00 committed by GitHub
parent d1fe4b7f6f
commit d85bef20d9
6 changed files with 38 additions and 17 deletions
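
As context for the diffs below, here is a minimal, self-contained sketch (not part of the patch) of the pattern applied in each file: the classic Buckets are kept so existing dashboards and queries keep working, while the NativeHistogram* fields of prometheus.HistogramOpts additionally expose a native (exponential-bucket) histogram to scrapers that support it. The metric name and the helper function are hypothetical; the bucket factor, bucket cap, and reset interval mirror the values used in this commit.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// newLatencyHistogram is a hypothetical helper illustrating the options
// used throughout this commit.
func newLatencyHistogram(reg prometheus.Registerer) prometheus.Histogram {
	h := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "example_request_duration_seconds", // hypothetical metric name
		Help:    "Duration of example requests.",
		Buckets: prometheus.DefBuckets, // classic buckets remain exposed

		// Native histogram settings, mirroring the values in this commit:
		// consecutive bucket boundaries differ by a factor of at most 1.1
		// (roughly 10% resolution), at most 100 buckets are tracked, and a
		// full reset of the native histogram happens at most once per hour
		// when that limit is hit.
		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 1 * time.Hour,
	})
	reg.MustRegister(h)
	return h
}

func main() {
	h := newLatencyHistogram(prometheus.NewRegistry())
	h.Observe(0.42) // record a 420ms observation
}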

@@ -70,9 +70,13 @@ func NewChannel(
         ConstLabels: prometheus.Labels{"key": key},
     })
     oversizeGossipDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
-        Name:        "alertmanager_oversize_gossip_message_duration_seconds",
-        Help:        "Duration of oversized gossip message requests.",
-        ConstLabels: prometheus.Labels{"key": key},
+        Name:                            "alertmanager_oversize_gossip_message_duration_seconds",
+        Help:                            "Duration of oversized gossip message requests.",
+        ConstLabels:                     prometheus.Labels{"key": key},
+        Buckets:                         prometheus.DefBuckets,
+        NativeHistogramBucketFactor:     1.1,
+        NativeHistogramMaxBucketNumber:  100,
+        NativeHistogramMinResetDuration: 1 * time.Hour,
     })
     reg.MustRegister(oversizeGossipDuration, oversizeGossipMessageFailureTotal, oversizeGossipMessageDroppedTotal, oversizeGossipMessageSentTotal)

@@ -104,9 +104,12 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
     }, []string{"peer"},
     )
     nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
-        Name:    "alertmanager_cluster_pings_seconds",
-        Help:    "Histogram of latencies for ping messages.",
-        Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
+        Name:                            "alertmanager_cluster_pings_seconds",
+        Help:                            "Histogram of latencies for ping messages.",
+        Buckets:                         []float64{.005, .01, .025, .05, .1, .25, .5},
+        NativeHistogramBucketFactor:     1.1,
+        NativeHistogramMaxBucketNumber:  100,
+        NativeHistogramMinResetDuration: 1 * time.Hour,
     }, []string{"peer"},
     )

@@ -64,9 +64,12 @@ import (
 var (
     requestDuration = prometheus.NewHistogramVec(
         prometheus.HistogramOpts{
-            Name:    "alertmanager_http_request_duration_seconds",
-            Help:    "Histogram of latencies for HTTP requests.",
-            Buckets: []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60},
+            Name:                            "alertmanager_http_request_duration_seconds",
+            Help:                            "Histogram of latencies for HTTP requests.",
+            Buckets:                         []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60},
+            NativeHistogramBucketFactor:     1.1,
+            NativeHistogramMaxBucketNumber:  100,
+            NativeHistogramMinResetDuration: 1 * time.Hour,
         },
         []string{"handler", "method"},
     )

@@ -139,8 +139,12 @@ func newMetrics(r prometheus.Registerer) *metrics {
         Help: "Number notification log received queries that failed.",
     })
     m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-        Name: "alertmanager_nflog_query_duration_seconds",
-        Help: "Duration of notification log query evaluation.",
+        Name:                            "alertmanager_nflog_query_duration_seconds",
+        Help:                            "Duration of notification log query evaluation.",
+        Buckets:                         prometheus.DefBuckets,
+        NativeHistogramBucketFactor:     1.1,
+        NativeHistogramMaxBucketNumber:  100,
+        NativeHistogramMinResetDuration: 1 * time.Hour,
     })
     m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
         Name: "alertmanager_nflog_gossip_messages_propagated_total",

@@ -291,10 +291,13 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
             Help: "The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.",
         }, []string{"reason"}),
         notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
-            Namespace: "alertmanager",
-            Name:      "notification_latency_seconds",
-            Help:      "The latency of notifications in seconds.",
-            Buckets:   []float64{1, 5, 10, 15, 20},
+            Namespace:                       "alertmanager",
+            Name:                            "notification_latency_seconds",
+            Help:                            "The latency of notifications in seconds.",
+            Buckets:                         []float64{1, 5, 10, 15, 20},
+            NativeHistogramBucketFactor:     1.1,
+            NativeHistogramMaxBucketNumber:  100,
+            NativeHistogramMinResetDuration: 1 * time.Hour,
         }, labels),
         ff: ff,
     }

@@ -271,8 +271,12 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
         Help: "How many silence received queries did not succeed.",
     })
     m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-        Name: "alertmanager_silences_query_duration_seconds",
-        Help: "Duration of silence query evaluation.",
+        Name:                            "alertmanager_silences_query_duration_seconds",
+        Help:                            "Duration of silence query evaluation.",
+        Buckets:                         prometheus.DefBuckets,
+        NativeHistogramBucketFactor:     1.1,
+        NativeHistogramMaxBucketNumber:  100,
+        NativeHistogramMinResetDuration: 1 * time.Hour,
     })
     m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
         Name: "alertmanager_silences_gossip_messages_propagated_total",