From 3f2e00fbead29c4ba6186d75b97d5cd45a208dcf Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Fri, 9 Feb 2018 11:16:00 +0100 Subject: [PATCH] cluster/api: improve metrics and cluster status --- api/api.go | 56 +++++++++++++++++----------------------------- cluster/cluster.go | 32 +++++++++++++++++--------- 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/api/api.go b/api/api.go index 6e4cb997..7ae71837 100644 --- a/api/api.go +++ b/api/api.go @@ -189,11 +189,11 @@ func (api *API) status(w http.ResponseWriter, req *http.Request) { api.mtx.RLock() var status = struct { - ConfigYAML string `json:"configYAML"` - ConfigJSON *config.Config `json:"configJSON"` - VersionInfo map[string]string `json:"versionInfo"` - Uptime time.Time `json:"uptime"` - MeshStatus *meshStatus `json:"meshStatus"` + ConfigYAML string `json:"configYAML"` + ConfigJSON *config.Config `json:"configJSON"` + VersionInfo map[string]string `json:"versionInfo"` + Uptime time.Time `json:"uptime"` + ClusterStatus *clusterStatus `json:"clusterStatus"` }{ ConfigYAML: api.config.String(), ConfigJSON: api.config, @@ -205,8 +205,8 @@ func (api *API) status(w http.ResponseWriter, req *http.Request) { "buildDate": version.BuildDate, "goVersion": version.GoVersion, }, - Uptime: api.uptime, - MeshStatus: getMeshStatus(api), + Uptime: api.uptime, + ClusterStatus: getClusterStatus(api.peer), } api.mtx.RUnlock() @@ -214,45 +214,29 @@ func (api *API) status(w http.ResponseWriter, req *http.Request) { api.respond(w, status) } -type meshStatus struct { - Name string `json:"name"` - NickName string `json:"nickName"` - Peers []peerStatus `json:"peers"` - Connections []connectionStatus `json:"connections"` -} - type peerStatus struct { - Name string `json:"name"` // e.g. "00:00:00:00:00:01" - NickName string `json:"nickName"` // e.g. "a" - UID uint64 `json:"uid"` // e.g. 
"14015114173033265000" + Name string `json:"name"` + Address string `json:"address"` } -type connectionStatus struct { - Address string `json:"address"` - Outbound bool `json:"outbound"` - State string `json:"state"` - Info string `json:"info"` +type clusterStatus struct { + Name string `json:"name"` + Peers []peerStatus `json:"peers"` } -func getMeshStatus(api *API) *meshStatus { - if api.peer == nil { +func getClusterStatus(p *cluster.Peer) *clusterStatus { + if p == nil { return nil } + s := &clusterStatus{Name: p.Name()} - strippedStatus := &meshStatus{ - Name: api.peer.Name(), - NickName: "", - } - - for _, p := range api.peer.Peers() { - strippedStatus.Peers = append(strippedStatus.Peers, peerStatus{ - Name: p.Name, - NickName: "", - UID: 0, + for _, n := range p.Peers() { + s.Peers = append(s.Peers, peerStatus{ + Name: n.Name, + Address: n.Address(), }) } - - return strippedStatus + return s } func (api *API) alertGroups(w http.ResponseWriter, r *http.Request) { diff --git a/cluster/cluster.go b/cluster/cluster.go index f6cd4926..28c0dc4e 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -228,7 +228,8 @@ type delegate struct { logger log.Logger bcast *memberlist.TransmitLimitedQueue - gossipMsgsReceived prometheus.Counter + messagesReceived *prometheus.CounterVec + messagesReceivedSize *prometheus.CounterVec } func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer) *delegate { @@ -236,10 +237,14 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer) *delegate { NumNodes: p.ClusterSize, RetransmitMult: 3, } - gossipMsgsReceived := prometheus.NewCounter(prometheus.CounterOpts{ - Name: "alertmanager_gossip_messages_received_total", - Help: "Total gossip NotifyMsg calls.", - }) + messagesReceived := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_cluster_messages_received_total", + Help: "Total number of cluster messages received.", + }, []string{"msg_type"}) + messagesReceivedSize :=
prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_cluster_messages_received_size_total", + Help: "Total size of cluster messages received.", + }, []string{"msg_type"}) gossipClusterMembers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ Name: "alertmanager_cluster_members", Help: "Number indicating current number of members in cluster.", @@ -247,13 +252,14 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer) *delegate { return float64(p.ClusterSize()) }) - reg.MustRegister(gossipMsgsReceived, gossipClusterMembers) + reg.MustRegister(messagesReceived, messagesReceivedSize, gossipClusterMembers) return &delegate{ - logger: l, - Peer: p, - bcast: bcast, - gossipMsgsReceived: gossipMsgsReceived, + logger: l, + Peer: p, + bcast: bcast, + messagesReceived: messagesReceived, + messagesReceivedSize: messagesReceivedSize, } } @@ -264,7 +270,8 @@ func (d *delegate) NodeMeta(limit int) []byte { // NotifyMsg is the callback invoked when a user-level gossip message is received. func (d *delegate) NotifyMsg(b []byte) { - d.gossipMsgsReceived.Inc() + d.messagesReceived.WithLabelValues("update").Inc() + d.messagesReceivedSize.WithLabelValues("update").Add(float64(len(b))) var p clusterpb.Part if err := proto.Unmarshal(b, &p); err != nil { @@ -308,6 +315,9 @@ func (d *delegate) LocalState(_ bool) []byte { } func (d *delegate) MergeRemoteState(buf []byte, _ bool) { + d.messagesReceived.WithLabelValues("full_state").Inc() + d.messagesReceivedSize.WithLabelValues("full_state").Add(float64(len(buf))) + var fs clusterpb.FullState if err := proto.Unmarshal(buf, &fs); err != nil { level.Warn(d.logger).Log("msg", "merge remote state", "err", err)