From 019ace3298ec92ca01762e5ec48a9d4352699c67 Mon Sep 17 00:00:00 2001
From: Simon Pasquier
Date: Mon, 1 Jul 2019 10:24:41 +0200
Subject: [PATCH] cluster: add more metrics (#1941)

- alertmanager_cluster_alive_messages_total, total number of alive messages received.
- alertmanager_cluster_peer_info, a constant metric labeled by peer name.
- alertmanager_cluster_pings_seconds, histogram of latencies for ping messages.

Signed-off-by: Simon Pasquier
---
 cluster/cluster.go  | 16 +++++++++++++---
 cluster/delegate.go | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/cluster/cluster.go b/cluster/cluster.go
index 76905b7a..20c7a1c3 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -179,7 +179,7 @@ func Create(
 		knownPeers:    knownPeers,
 	}
 
-	p.register(reg)
+	p.register(reg, name.String())
 
 	retransmit := len(knownPeers) / 2
 	if retransmit < 3 {
@@ -192,6 +192,8 @@ func Create(
 	cfg.BindAddr = bindHost
 	cfg.BindPort = bindPort
 	cfg.Delegate = p.delegate
+	cfg.Ping = p.delegate
+	cfg.Alive = p.delegate
 	cfg.Events = p.delegate
 	cfg.GossipInterval = gossipInterval
 	cfg.PushPullInterval = pushPullInterval
@@ -304,7 +306,15 @@ func (l *logWriter) Write(b []byte) (int, error) {
 	return len(b), level.Debug(l.l).Log("memberlist", string(b))
 }
 
-func (p *Peer) register(reg prometheus.Registerer) {
+func (p *Peer) register(reg prometheus.Registerer, name string) {
+	peerInfo := prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name:        "alertmanager_cluster_peer_info",
+			Help:        "A metric with a constant '1' value labeled by peer name.",
+			ConstLabels: prometheus.Labels{"peer": name},
+		},
+	)
+	peerInfo.Set(1)
 	clusterFailedPeers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "alertmanager_cluster_failed_peers",
 		Help: "Number indicating the current number of failed peers in the cluster.",
@@ -346,7 +356,7 @@ func (p *Peer) register(reg prometheus.Registerer) {
 		Help: "A counter of the number of peers that have joined.",
 	})
 
-	reg.MustRegister(clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
+	reg.MustRegister(peerInfo, clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
 		p.peerLeaveCounter, p.peerUpdateCounter, p.peerJoinCounter, p.refreshCounter, p.failedRefreshCounter)
 }
 
diff --git a/cluster/delegate.go b/cluster/delegate.go
index 81cc3d53..bcacf83e 100644
--- a/cluster/delegate.go
+++ b/cluster/delegate.go
@@ -44,6 +44,8 @@ type delegate struct {
 	messagesSent         *prometheus.CounterVec
 	messagesSentSize     *prometheus.CounterVec
 	messagesPruned       prometheus.Counter
+	nodeAlive            *prometheus.CounterVec
+	nodePingDuration     *prometheus.HistogramVec
 }
 
 func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit int) *delegate {
@@ -95,6 +97,17 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
 	}, func() float64 {
 		return float64(bcast.NumQueued())
 	})
+	nodeAlive := prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "alertmanager_cluster_alive_messages_total",
+		Help: "Total number of received alive messages.",
+	}, []string{"peer"},
+	)
+	nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    "alertmanager_cluster_pings_seconds",
+		Help:    "Histogram of latencies for ping messages.",
+		Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
+	}, []string{"peer"},
+	)
 
 	messagesReceived.WithLabelValues(fullState)
 	messagesReceivedSize.WithLabelValues(fullState)
@@ -106,7 +119,9 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
 	messagesSentSize.WithLabelValues(update)
 
 	reg.MustRegister(messagesReceived, messagesReceivedSize, messagesSent, messagesSentSize,
-		gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned)
+		gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned,
+		nodeAlive, nodePingDuration,
+	)
 
 	d := &delegate{
 		logger:               l,
@@ -117,6 +132,8 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
 		messagesSent:         messagesSent,
 		messagesSentSize:     messagesSentSize,
 		messagesPruned:       messagesPruned,
+		nodeAlive:            nodeAlive,
+		nodePingDuration:     nodePingDuration,
 	}
 
 	go d.handleQueueDepth()
@@ -226,6 +243,22 @@ func (d *delegate) NotifyUpdate(n *memberlist.Node) {
 	d.Peer.peerUpdate(n)
 }
 
+// NotifyAlive implements the memberlist.AliveDelegate interface.
+func (d *delegate) NotifyAlive(peer *memberlist.Node) error {
+	d.nodeAlive.WithLabelValues(peer.Name).Inc()
+	return nil
+}
+
+// AckPayload implements the memberlist.PingDelegate interface.
+func (d *delegate) AckPayload() []byte {
+	return []byte{}
+}
+
+// NotifyPingComplete implements the memberlist.PingDelegate interface.
+func (d *delegate) NotifyPingComplete(peer *memberlist.Node, rtt time.Duration, payload []byte) {
+	d.nodePingDuration.WithLabelValues(peer.Name).Observe(rtt.Seconds())
+}
+
 // handleQueueDepth ensures that the queue doesn't grow unbounded by pruning
 // older messages at regular interval.
 func (d *delegate) handleQueueDepth() {