From 6f290751c9318590837b0056eaaff876706b726f Mon Sep 17 00:00:00 2001
From: Vaibhav Bhembre
Date: Sat, 23 Jun 2018 18:51:06 -0400
Subject: [PATCH] monitors: add back clock skew and latency metric support

---
 collectors/monitors.go      |  46 ++++++++++++++
 collectors/monitors_test.go | 121 ++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)

diff --git a/collectors/monitors.go b/collectors/monitors.go
index bea642b..9f44981 100644
--- a/collectors/monitors.go
+++ b/collectors/monitors.go
@@ -210,6 +210,14 @@ func (m *MonitorCollector) metricsList() []prometheus.Metric {
 	}
 }
 
+type cephTimeSyncStatus struct {
+	TimeChecks map[string]struct {
+		Health  string      `json:"health"`
+		Latency json.Number `json:"latency"`
+		Skew    json.Number `json:"skew"`
+	} `json:"time_skew_status"`
+}
+
 type cephMonitorStats struct {
 	Health struct {
 		Health struct {
@@ -252,6 +260,17 @@ func (m *MonitorCollector) collect() error {
 		return err
 	}
 
+	cmd = m.cephTimeSyncStatusCommand()
+	buf, _, err = m.conn.MonCommand(cmd)
+	if err != nil {
+		return err
+	}
+
+	timeStats := &cephTimeSyncStatus{}
+	if err := json.Unmarshal(buf, timeStats); err != nil {
+		return err
+	}
+
 	for _, healthService := range stats.Health.Health.HealthServices {
 		for _, monstat := range healthService.Mons {
 			kbTotal, err := monstat.KBTotal.Float64()
@@ -318,6 +337,20 @@ func (m *MonitorCollector) collect() error {
 		m.Latency.WithLabelValues(monstat.Name).Set(latency)
 	}
 
+	for monNode, tstat := range timeStats.TimeChecks {
+		skew, err := tstat.Skew.Float64()
+		if err != nil {
+			return err
+		}
+		m.ClockSkew.WithLabelValues(monNode).Set(skew)
+
+		latency, err := tstat.Latency.Float64()
+		if err != nil {
+			return err
+		}
+		m.Latency.WithLabelValues(monNode).Set(latency)
+	}
+
 	m.NodesinQuorum.Set(float64(len(stats.Quorum)))
 
 	return nil
@@ -336,6 +369,19 @@ func (m *MonitorCollector) cephUsageCommand() []byte {
 	return cmd
 }
 
+func (m *MonitorCollector) cephTimeSyncStatusCommand() []byte {
+	cmd, err := json.Marshal(map[string]interface{}{
+		"prefix": "time-sync-status",
+		"format": "json",
+	})
+	if err != nil {
+		// panic! because ideally in no world this hard-coded input
+		// should fail.
+		panic(err)
+	}
+	return cmd
+}
+
 // Describe sends the descriptors of each Monitor related metric we have defined
 // to the channel provided.
 func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) {
diff --git a/collectors/monitors_test.go b/collectors/monitors_test.go
index b0fbcc0..dd96473 100644
--- a/collectors/monitors_test.go
+++ b/collectors/monitors_test.go
@@ -290,3 +290,124 @@ func TestMonitorCollector(t *testing.T) {
 		}()
 	}
 }
+
+func TestMonitorTimeSyncStats(t *testing.T) {
+	for _, tt := range []struct {
+		input   string
+		regexes []*regexp.Regexp
+	}{
+		{`
+{
+    "time_skew_status": {
+        "test-mon01": {
+            "skew": 0.000022,
+            "latency": 0.000677,
+            "health": "HEALTH_OK"
+        },
+        "test-mon02": {
+            "skew": 0.001051,
+            "latency": 0.000682,
+            "health": "HEALTH_OK"
+        },
+        "test-mon03": {
+            "skew": 0.003029,
+            "latency": 0.000582,
+            "health": "HEALTH_OK"
+        },
+        "test-mon04": {
+            "skew": 0.000330,
+            "latency": 0.000667,
+            "health": "HEALTH_OK"
+        },
+        "test-mon05": {
+            "skew": 0.003682,
+            "latency": 0.000667,
+            "health": "HEALTH_OK"
+        }
+    },
+    "timechecks": {
+        "epoch": 84,
+        "round": 69600,
+        "round_status": "finished"
+    }
+}
+`,
+			[]*regexp.Regexp{
+				regexp.MustCompile(`ceph_monitor_clock_skew_seconds{cluster="ceph",monitor="test-mon01"} 2.2e\-05`),
+				regexp.MustCompile(`ceph_monitor_clock_skew_seconds{cluster="ceph",monitor="test-mon02"} 0.001051`),
+				regexp.MustCompile(`ceph_monitor_clock_skew_seconds{cluster="ceph",monitor="test-mon03"} 0.003029`),
+				regexp.MustCompile(`ceph_monitor_clock_skew_seconds{cluster="ceph",monitor="test-mon04"} 0.00033`),
+				regexp.MustCompile(`ceph_monitor_clock_skew_seconds{cluster="ceph",monitor="test-mon05"} 0.003682`),
+				regexp.MustCompile(`ceph_monitor_latency_seconds{cluster="ceph",monitor="test-mon01"} 0.000677`),
+				regexp.MustCompile(`ceph_monitor_latency_seconds{cluster="ceph",monitor="test-mon02"} 0.000682`),
+				regexp.MustCompile(`ceph_monitor_latency_seconds{cluster="ceph",monitor="test-mon03"} 0.000582`),
+				regexp.MustCompile(`ceph_monitor_latency_seconds{cluster="ceph",monitor="test-mon04"} 0.000667`),
+				regexp.MustCompile(`ceph_monitor_latency_seconds{cluster="ceph",monitor="test-mon05"} 0.000667`),
+			},
+		},
+		{`
+{
+    "time_skew_status": {
+        "test-mon01": {
+            "skew": "wrong!",
+            "latency": 0.000677,
+            "health": "HEALTH_OK"
+        }
+    }
+`,
+			[]*regexp.Regexp{},
+		},
+		{`
+{
+    "time_skew_status": {
+        "test-mon01": {
+            "skew": 0.000334,
+            "latency": "wrong!",
+            "health": "HEALTH_OK"
+        }
+    }
+`,
+			[]*regexp.Regexp{},
+		},
+		{`
+{
+    "time_skew_status": {
+        "test-mon01": {
+            "skew"::: "0.000334",
+            "latency"::: "0.000677",
+            "health": "HEALTH_OK"
+        }
+    }
+`,
+			[]*regexp.Regexp{},
+		},
+	} {
+		func() {
+			collector := NewMonitorCollector(NewNoopConn(tt.input), "ceph")
+			if err := prometheus.Register(collector); err != nil {
+				t.Fatalf("collector failed to register: %s", err)
+			}
+			defer prometheus.Unregister(collector)
+
+			server := httptest.NewServer(prometheus.Handler())
+			defer server.Close()
+
+			resp, err := http.Get(server.URL)
+			if err != nil {
+				t.Fatalf("unexpected failed response from prometheus: %s", err)
+			}
+			defer resp.Body.Close()
+
+			buf, err := ioutil.ReadAll(resp.Body)
+			if err != nil {
+				t.Fatalf("failed reading server response: %s", err)
+			}
+
+			for _, re := range tt.regexes {
+				if !re.Match(buf) {
+					t.Errorf("failed matching: %q", re)
+				}
+			}
+		}()
+	}
+}