cluster: add more metrics (#1941)
- alertmanager_cluster_alive_messages_total, total number of alive messages received.
- alertmanager_cluster_peer_info, a constant metric labeled by peer name.
- alertmanager_cluster_pings_seconds, histogram of latencies for ping messages.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
c29b987ec6
commit
019ace3298
|
@ -179,7 +179,7 @@ func Create(
|
|||
knownPeers: knownPeers,
|
||||
}
|
||||
|
||||
p.register(reg)
|
||||
p.register(reg, name.String())
|
||||
|
||||
retransmit := len(knownPeers) / 2
|
||||
if retransmit < 3 {
|
||||
|
@ -192,6 +192,8 @@ func Create(
|
|||
cfg.BindAddr = bindHost
|
||||
cfg.BindPort = bindPort
|
||||
cfg.Delegate = p.delegate
|
||||
cfg.Ping = p.delegate
|
||||
cfg.Alive = p.delegate
|
||||
cfg.Events = p.delegate
|
||||
cfg.GossipInterval = gossipInterval
|
||||
cfg.PushPullInterval = pushPullInterval
|
||||
|
@ -304,7 +306,15 @@ func (l *logWriter) Write(b []byte) (int, error) {
|
|||
return len(b), level.Debug(l.l).Log("memberlist", string(b))
|
||||
}
|
||||
|
||||
func (p *Peer) register(reg prometheus.Registerer) {
|
||||
func (p *Peer) register(reg prometheus.Registerer, name string) {
|
||||
peerInfo := prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_cluster_peer_info",
|
||||
Help: "A metric with a constant '1' value labeled by peer name.",
|
||||
ConstLabels: prometheus.Labels{"peer": name},
|
||||
},
|
||||
)
|
||||
peerInfo.Set(1)
|
||||
clusterFailedPeers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
|
||||
Name: "alertmanager_cluster_failed_peers",
|
||||
Help: "Number indicating the current number of failed peers in the cluster.",
|
||||
|
@ -346,7 +356,7 @@ func (p *Peer) register(reg prometheus.Registerer) {
|
|||
Help: "A counter of the number of peers that have joined.",
|
||||
})
|
||||
|
||||
reg.MustRegister(clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
|
||||
reg.MustRegister(peerInfo, clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter,
|
||||
p.peerLeaveCounter, p.peerUpdateCounter, p.peerJoinCounter, p.refreshCounter, p.failedRefreshCounter)
|
||||
}
|
||||
|
||||
|
|
|
@ -44,6 +44,8 @@ type delegate struct {
|
|||
messagesSent *prometheus.CounterVec
|
||||
messagesSentSize *prometheus.CounterVec
|
||||
messagesPruned prometheus.Counter
|
||||
nodeAlive *prometheus.CounterVec
|
||||
nodePingDuration *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit int) *delegate {
|
||||
|
@ -95,6 +97,17 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
|
|||
}, func() float64 {
|
||||
return float64(bcast.NumQueued())
|
||||
})
|
||||
nodeAlive := prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "alertmanager_cluster_alive_messages_total",
|
||||
Help: "Total number of received alive messages.",
|
||||
}, []string{"peer"},
|
||||
)
|
||||
nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Name: "alertmanager_cluster_pings_seconds",
|
||||
Help: "Histogram of latencies for ping messages.",
|
||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
|
||||
}, []string{"peer"},
|
||||
)
|
||||
|
||||
messagesReceived.WithLabelValues(fullState)
|
||||
messagesReceivedSize.WithLabelValues(fullState)
|
||||
|
@ -106,7 +119,9 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
|
|||
messagesSentSize.WithLabelValues(update)
|
||||
|
||||
reg.MustRegister(messagesReceived, messagesReceivedSize, messagesSent, messagesSentSize,
|
||||
gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned)
|
||||
gossipClusterMembers, peerPosition, healthScore, messagesQueued, messagesPruned,
|
||||
nodeAlive, nodePingDuration,
|
||||
)
|
||||
|
||||
d := &delegate{
|
||||
logger: l,
|
||||
|
@ -117,6 +132,8 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer, retransmit in
|
|||
messagesSent: messagesSent,
|
||||
messagesSentSize: messagesSentSize,
|
||||
messagesPruned: messagesPruned,
|
||||
nodeAlive: nodeAlive,
|
||||
nodePingDuration: nodePingDuration,
|
||||
}
|
||||
|
||||
go d.handleQueueDepth()
|
||||
|
@ -226,6 +243,22 @@ func (d *delegate) NotifyUpdate(n *memberlist.Node) {
|
|||
d.Peer.peerUpdate(n)
|
||||
}
|
||||
|
||||
// NotifyAlive implements the memberlist.AliveDelegate interface.
|
||||
func (d *delegate) NotifyAlive(peer *memberlist.Node) error {
|
||||
d.nodeAlive.WithLabelValues(peer.Name).Inc()
|
||||
return nil
|
||||
}
|
||||
|
||||
// AckPayload implements the memberlist.PingDelegate interface.
|
||||
func (d *delegate) AckPayload() []byte {
|
||||
return []byte{}
|
||||
}
|
||||
|
||||
// NotifyPingComplete implements the memberlist.PingDelegate interface.
|
||||
func (d *delegate) NotifyPingComplete(peer *memberlist.Node, rtt time.Duration, payload []byte) {
|
||||
d.nodePingDuration.WithLabelValues(peer.Name).Observe(rtt.Seconds())
|
||||
}
|
||||
|
||||
// handleQueueDepth ensures that the queue doesn't grow unbounded by pruning
|
||||
// older messages at regular interval.
|
||||
func (d *delegate) handleQueueDepth() {
|
||||
|
|
Loading…
Reference in New Issue