cluster: add more metrics (#1941)

- alertmanager_cluster_alive_messages_total, total number of alive
messages received.
- alertmanager_cluster_peer_info, a constant metric labeled by peer name.
- alertmanager_cluster_pings_seconds, histogram of latencies for ping
messages.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
Simon Pasquier 2019-07-01 10:24:41 +02:00 committed by GitHub
parent c29b987ec6
commit 019ace3298
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 4 deletions

View File

@ -179,7 +179,7 @@ func Create(
knownPeers: knownPeers,
}
p.register(reg)
p.register(reg, name.String())
retransmit := len(knownPeers) / 2
if retransmit < 3 {
@ -192,6 +192,8 @@ func Create(
cfg.BindAddr = bindHost
cfg.BindPort = bindPort
cfg.Delegate = p.delegate
cfg.Ping = p.delegate
cfg.Alive = p.delegate
cfg.Events = p.delegate
cfg.GossipInterval = gossipInterval
cfg.PushPullInterval = pushPullInterval
@ -304,7 +306,15 @@ func (l *logWriter) Write(b []byte) (int, error) {
return len(b), level.Debug(l.l).Log("memberlist", string(b))
}
func (p *Peer) register(reg prometheus.Registerer) {
func (p *Peer) register(reg prometheus.Registerer, name string) {
peerInfo := prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "alertmanager_cluster_peer_info",
Help: "A metric with a constant '1' value labeled by peer name.",
ConstLabels: prometheus.Labels{"peer": name},
},
)
peerInfo.Set(1)
clusterFailedPeers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "alertmanager_cluster_failed_peers",
Help: "Number indicating the current number of failed peers in the cluster.",
@ -346,7 +356,7 @@ func (p *Peer) register(reg prometheus.Registerer) {
Help: "A counter of the number of peers that have joined.",
})
reg.MustRegister(clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
reg.MustRegister(peerInfo, clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
p.peerLeaveCounter, p.peerUpdateCounter, p.peerJoinCounter, p.refreshCounter, p.failedRefreshCounter)
}

View File

@ -44,6 +44,8 @@ type delegate struct {
messagesSent *prometheus.CounterVec
messagesSentSize *prometheus.CounterVec
messagesPruned prometheus.Counter
nodeAlive *prometheus.CounterVec
nodePingDuration *prometheus.HistogramVec
}
func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit int) *delegate {
@ -95,6 +97,17 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
}, func() float64 {
return float64(bcast.NumQueued())
})
nodeAlive := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_cluster_alive_messages_total",
Help: "Total number of received alive messages.",
}, []string{"peer"},
)
nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "alertmanager_cluster_pings_seconds",
Help: "Histogram of latencies for ping messages.",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
}, []string{"peer"},
)
messagesReceived.WithLabelValues(fullState)
messagesReceivedSize.WithLabelValues(fullState)
@ -106,7 +119,9 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
messagesSentSize.WithLabelValues(update)
reg.MustRegister(messagesReceived, messagesReceivedSize, messagesSent, messagesSentSize,
gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned)
gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned,
nodeAlive, nodePingDuration,
)
d := &delegate{
logger: l,
@ -117,6 +132,8 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
messagesSent: messagesSent,
messagesSentSize: messagesSentSize,
messagesPruned: messagesPruned,
nodeAlive: nodeAlive,
nodePingDuration: nodePingDuration,
}
go d.handleQueueDepth()
@ -226,6 +243,22 @@ func (d *delegate) NotifyUpdate(n *memberlist.Node) {
d.Peer.peerUpdate(n)
}
// NotifyAlive implements the memberlist.AliveDelegate interface.
//
// It bumps the alive-message counter for the peer identified by peer.Name
// and always reports success (nil error).
func (d *delegate) NotifyAlive(peer *memberlist.Node) error {
	// One counter child per peer name; fetch it first, then increment.
	aliveCounter := d.nodeAlive.WithLabelValues(peer.Name)
	aliveCounter.Inc()

	return nil
}
// AckPayload implements the memberlist.PingDelegate interface.
//
// It returns an empty, non-nil byte slice.
func (d *delegate) AckPayload() []byte {
	// Deliberately non-nil: the result is an allocated slice of length zero.
	emptyPayload := make([]byte, 0)
	return emptyPayload
}
// NotifyPingComplete implements the memberlist.PingDelegate interface.
//
// It records the round-trip time of a ping, expressed in seconds, in the ping
// latency histogram under the label of the peer's name. The payload argument
// is not used.
func (d *delegate) NotifyPingComplete(peer *memberlist.Node, rtt time.Duration, payload []byte) {
	// Resolve the per-peer histogram child before converting the duration.
	pingLatency := d.nodePingDuration.WithLabelValues(peer.Name)
	pingLatency.Observe(rtt.Seconds())
}
// handleQueueDepth ensures that the queue doesn't grow unbounded by pruning
// older messages at regular interval.
func (d *delegate) handleQueueDepth() {