From d85bef20d95cda14a5e0eba749d9aa5d2c825004 Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Thu, 29 Feb 2024 15:53:47 +0100 Subject: [PATCH] feature: add native histogram support to latency metrics (#3737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note that this does not stop showing classic metrics; for now, it is up to the scrape config to decide whether to keep those instead or to keep both. Signed-off-by: György Krajcsovits --- cluster/channel.go | 10 +++++++--- cluster/delegate.go | 9 ++++++--- cmd/alertmanager/main.go | 9 ++++++--- nflog/nflog.go | 8 ++++++-- notify/notify.go | 11 +++++++---- silence/silence.go | 8 ++++++-- 6 files changed, 38 insertions(+), 17 deletions(-) diff --git a/cluster/channel.go b/cluster/channel.go index ba0b834c..5548d508 100644 --- a/cluster/channel.go +++ b/cluster/channel.go @@ -70,9 +70,13 @@ func NewChannel( ConstLabels: prometheus.Labels{"key": key}, }) oversizeGossipDuration := prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "alertmanager_oversize_gossip_message_duration_seconds", - Help: "Duration of oversized gossip message requests.", - ConstLabels: prometheus.Labels{"key": key}, + Name: "alertmanager_oversize_gossip_message_duration_seconds", + Help: "Duration of oversized gossip message requests.", + ConstLabels: prometheus.Labels{"key": key}, + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }) reg.MustRegister(oversizeGossipDuration, oversizeGossipMessageFailureTotal, oversizeGossipMessageDroppedTotal, oversizeGossipMessageSentTotal) diff --git a/cluster/delegate.go b/cluster/delegate.go index 9957f69b..edfda107 100644 --- a/cluster/delegate.go +++ b/cluster/delegate.go @@ -104,9 +104,12 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in }, []string{"peer"}, ) nodePingDuration := 
prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "alertmanager_cluster_pings_seconds", - Help: "Histogram of latencies for ping messages.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5}, + Name: "alertmanager_cluster_pings_seconds", + Help: "Histogram of latencies for ping messages.", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }, []string{"peer"}, ) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index b2938189..c3e9d1b2 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -64,9 +64,12 @@ import ( var ( requestDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "alertmanager_http_request_duration_seconds", - Help: "Histogram of latencies for HTTP requests.", - Buckets: []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60}, + Name: "alertmanager_http_request_duration_seconds", + Help: "Histogram of latencies for HTTP requests.", + Buckets: []float64{.05, 0.1, .25, .5, .75, 1, 2, 5, 20, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }, []string{"handler", "method"}, ) diff --git a/nflog/nflog.go b/nflog/nflog.go index c533dd0e..6ce12a8e 100644 --- a/nflog/nflog.go +++ b/nflog/nflog.go @@ -139,8 +139,12 @@ func newMetrics(r prometheus.Registerer) *metrics { Help: "Number notification log received queries that failed.", }) m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "alertmanager_nflog_query_duration_seconds", - Help: "Duration of notification log query evaluation.", + Name: "alertmanager_nflog_query_duration_seconds", + Help: "Duration of notification log query evaluation.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }) 
m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ Name: "alertmanager_nflog_gossip_messages_propagated_total", diff --git a/notify/notify.go b/notify/notify.go index 1d7597c9..3752148f 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -291,10 +291,13 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics { Help: "The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.", }, []string{"reason"}), notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "alertmanager", - Name: "notification_latency_seconds", - Help: "The latency of notifications in seconds.", - Buckets: []float64{1, 5, 10, 15, 20}, + Namespace: "alertmanager", + Name: "notification_latency_seconds", + Help: "The latency of notifications in seconds.", + Buckets: []float64{1, 5, 10, 15, 20}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }, labels), ff: ff, } diff --git a/silence/silence.go b/silence/silence.go index 710323f7..c87ab76e 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -271,8 +271,12 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics { Help: "How many silence received queries did not succeed.", }) m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "alertmanager_silences_query_duration_seconds", - Help: "Duration of silence query evaluation.", + Name: "alertmanager_silences_query_duration_seconds", + Help: "Duration of silence query evaluation.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, }) m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ Name: "alertmanager_silences_gossip_messages_propagated_total",