Merge pull request #124 from digitalocean/skrutiy/HealthWarningsMap

Moved the health checks map out of exporter.yml and into NewClusterHealthCollector
2019-09-09 11:49:51 -04:00 · 2019-09-09 11:49:51 -04:00 · 1df560f980
parent 4e1a458925 1a3dff593e
commit 1df560f980
5 changed files with 106 additions and 114 deletions
--- a/collectors/health.go
+++ b/collectors/health.go
@ -247,14 +247,87 @@ const (

 // NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health
 // metrics on.
-func NewClusterHealthCollector(conn Conn, cluster string, healthChecksMap map[string]int) *ClusterHealthCollector {
+func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
 	labels := make(prometheus.Labels)
 	labels["cluster"] = cluster

 	return &ClusterHealthCollector{
 		conn: conn,

-		healthChecksMap: healthChecksMap,
+		healthChecksMap: map[string]int{
+			"AUTH_BAD_CAPS":                        2,
+			"BLUEFS_AVAILABLE_SPACE":               1,
+			"BLUEFS_LOW_SPACE":                     1,
+			"BLUEFS_SPILLOVER":                     1,
+			"BLUESTORE_DISK_SIZE_MISMATCH":         1,
+			"BLUESTORE_FRAGMENTATION":              1,
+			"BLUESTORE_LEGACY_STATFS":              1,
+			"BLUESTORE_NO_COMPRESSION":             1,
+			"BLUESTORE_NO_PER_POOL_MAP":            1,
+			"CACHE_POOL_NEAR_FULL":                 1,
+			"CACHE_POOL_NO_HIT_SET":                1,
+			"DEVICE_HEALTH":                        1,
+			"DEVICE_HEALTH_IN_USE":                 2,
+			"DEVICE_HEALTH_TOOMANY":                2,
+			"LARGE_OMAP_OBJECTS":                   1,
+			"MANY_OBJECTS_PER_PG":                  1,
+			"MGR_DOWN":                             2,
+			"MGR_MODULE_DEPENDENCY":                1,
+			"MGR_MODULE_ERROR":                     2,
+			"MON_CLOCK_SKEW":                       2,
+			"MON_DISK_BIG":                         1,
+			"MON_DISK_CRIT":                        2,
+			"MON_DISK_LOW":                         2,
+			"MON_DOWN":                             2,
+			"MON_MSGR2_NOT_ENABLED":                2,
+			"OBJECT_MISPLACED":                     1,
+			"OBJECT_UNFOUND":                       2,
+			"OLD_CRUSH_STRAW_CALC_VERSION":         1,
+			"OLD_CRUSH_TUNABLES":                   2,
+			"OSDMAP_FLAGS":                         1,
+			"OSD_BACKFILLFULL":                     2,
+			"OSD_CHASSIS_DOWN":                     1,
+			"OSD_DATACENTER_DOWN":                  1,
+			"OSD_DOWN":                             1,
+			"OSD_FLAGS":                            1,
+			"OSD_FULL":                             2,
+			"OSD_HOST_DOWN":                        1,
+			"OSD_NEARFULL":                         2,
+			"OSD_NO_DOWN_OUT_INTERVAL":             2,
+			"OSD_NO_SORTBITWISE":                   2,
+			"OSD_ORPHAN":                           2,
+			"OSD_OSD_DOWN":                         1,
+			"OSD_OUT_OF_ORDER_FULL":                2,
+			"OSD_PDU_DOWN":                         1,
+			"OSD_POD_DOWN":                         1,
+			"OSD_RACK_DOWN":                        1,
+			"OSD_REGION_DOWN":                      1,
+			"OSD_ROOM_DOWN":                        1,
+			"OSD_ROOT_DOWN":                        1,
+			"OSD_ROW_DOWN":                         1,
+			"OSD_SCRUB_ERRORS":                     2,
+			"PG_AVAILABILITY":                      1,
+			"PG_BACKFILL_FULL":                     2,
+			"PG_DAMAGED":                           2,
+			"PG_DEGRADED":                          1,
+			"PG_NOT_DEEP_SCRUBBED":                 1,
+			"PG_NOT_SCRUBBED":                      1,
+			"PG_RECOVERY_FULL":                     2,
+			"PG_SLOW_SNAP_TRIMMING":                1,
+			"POOL_APP_NOT_ENABLED":                 2,
+			"POOL_FULL":                            2,
+			"POOL_NEAR_FULL":                       2,
+			"POOL_TARGET_SIZE_BYTES_OVERCOMMITTED": 1,
+			"POOL_TARGET_SIZE_RATIO_OVERCOMMITTED": 1,
+			"POOL_TOO_FEW_PGS":                     1,
+			"POOL_TOO_MANY_PGS":                    1,
+			"RECENT_CRASH":                         1,
+			"SLOW_OPS":                             1,
+			"SMALLER_PGP_NUM":                      1,
+			"TELEMETRY_CHANGED":                    1,
+			"TOO_FEW_OSDS":                         1,
+			"TOO_FEW_PGS":                          1,
+			"TOO_MANY_PGS":                         1},

 		HealthStatus: prometheus.NewGauge(
 			prometheus.GaugeOpts{
--- a/collectors/health_test.go
+++ b/collectors/health_test.go
@ -15,7 +15,6 @@
 package collectors

 import (
-	"gopkg.in/yaml.v2"
 	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
@ -25,21 +24,6 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )

-var (
-	healthChecksString = `health_check_criticality:
-      MON_DOWN: 2
-      MDR_DOWN: 2
-      OSD_DOWN: 1
-      OSD_FULL: 2
-      OSDMAP_FLAGS: 1
-      DEVICE_HEALTH: 1
-      PG_DEGRADED: 1
-      PG_DAMAGED: 2
-      SLOW_OPS: 1
-      RECENT_CRASH: 1
-      TELEMETRY_CHANGED: 1`
-)
-
 func TestClusterHealthCollector(t *testing.T) {
 	for _, tt := range []struct {
 		input   string
@ -281,6 +265,7 @@ func TestClusterHealthCollector(t *testing.T) {
 	"health": { "overall_status": "HEALTH_WARN", "status": "HEALTH_OK } }`,
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`health_status{cluster="ceph"} 0`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
 			},
 		},
 		{
@ -297,6 +282,7 @@ func TestClusterHealthCollector(t *testing.T) {
 	"health": { "status": "HEALTH_OK } }`,
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`health_status{cluster="ceph"} 0`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
 			},
 		},
 		{
@ -313,6 +299,7 @@ func TestClusterHealthCollector(t *testing.T) {
 	"health": { "overall_status": "HEALTH_WARN" } }`,
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`health_status{cluster="ceph"} 1`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
 			},
 		},
 		{
@ -329,6 +316,7 @@ func TestClusterHealthCollector(t *testing.T) {
 	"health": { "overall_status": "HEALTH_ERR" } }`,
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`health_status{cluster="ceph"} 2`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 3`),
 			},
 		},
 		{
@ -490,6 +478,7 @@ $ sudo ceph -s
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`degraded_objects{cluster="ceph"} 1.54443937e\+08`),
 				regexp.MustCompile(`unclean_pgs{cluster="ceph"} 4886`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
 			},
 		},
 		{
@ -508,6 +497,25 @@ $ sudo ceph -s
 }`,
 			regexes: []*regexp.Regexp{
 				regexp.MustCompile(`misplaced_objects{cluster="ceph"} 4.31295341e\+08`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
+			},
+		},
+		{
+			input: `
+{
+  "health": {
+    "checks": {
+      "POOL_APP_NOT_ENABLED": {
+        "severity": "HEALTH_WARN",
+        "summary": {
+          "message": "application not enabled on 1 pool(s)"
+        }
+      }
+    }
+  }
+}`,
+			regexes: []*regexp.Regexp{
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
 			},
 		},
 		{
@ -635,6 +643,7 @@ $ sudo ceph -s
 				regexp.MustCompile(`osdmap_flag_noscrub{cluster="ceph"} 1`),
 				regexp.MustCompile(`osdmap_flag_nodeep_scrub{cluster="ceph"} 0`),
 				regexp.MustCompile(`osdmap_flag_notieragent{cluster="ceph"} 1`),
+				regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
 			},
 		},
 		{
@ -778,15 +787,7 @@ $ sudo ceph -s
 		},
 	} {
 		func() {
-			type WarningCategorization struct {
-				CheckNames map[string]int `yaml:"health_check_criticality"`
-			}
-			var healthChecksMap WarningCategorization
-			err := yaml.Unmarshal([]byte(healthChecksString), &healthChecksMap)
-			if err != nil {
-				t.Fatalf("failed to parse yaml of warning checks: %s", err)
-			}
-			collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph", healthChecksMap.CheckNames)
+			collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph")
 			if err := prometheus.Register(collector); err != nil {
 				t.Fatalf("collector failed to register: %s", err)
 			}
--- a/config.go
+++ b/config.go
@ -13,14 +13,9 @@ type ClusterConfig struct {
 	ConfigFile   string `yaml:"config_file"`
 }

-type WarningCategorization struct {
-	CheckNames map[string]int `yaml:"health_check_criticality"`
-}
-
 // Config is the top-level configuration for Metastord.
 type Config struct {
-	Cluster  []*ClusterConfig
-	Warnings *WarningCategorization
+	Cluster []*ClusterConfig
 }

 // fileExists returns true if the path exists and is a file.
--- a/exporter.go
+++ b/exporter.go
@ -79,14 +79,13 @@ var _ prometheus.Collector = &CephExporter{}
 // NewCephExporter creates an instance to CephExporter and returns a reference
 // to it. We can choose to enable a collector to extract stats out of by adding
 // it to the list of collectors.
-func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int,
-	healthChecksMap WarningCategorization) *CephExporter {
+func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int) *CephExporter {
 	c := &CephExporter{
 		collectors: []prometheus.Collector{
 			collectors.NewClusterUsageCollector(conn, cluster),
 			collectors.NewPoolUsageCollector(conn, cluster),
 			collectors.NewPoolInfoCollector(conn, cluster),
-			collectors.NewClusterHealthCollector(conn, cluster, healthChecksMap.CheckNames),
+			collectors.NewClusterHealthCollector(conn, cluster),
 			collectors.NewMonitorCollector(conn, cluster),
 			collectors.NewOSDCollector(conn, cluster),
 		},
@ -172,7 +171,7 @@ func main() {
 			defer conn.Shutdown()

 			log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
-			err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode, *cfg.Warnings))
+			err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode))
 			if err != nil {
 				log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
 			}
@ -197,7 +196,7 @@ func main() {
 		}
 		defer conn.Shutdown()

-		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode, WarningCategorization{CheckNames: map[string]int{}}))
+		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode))
 	}

 	http.Handle(*metricsPath, promhttp.Handler())
--- a/exporter.yml
+++ b/exporter.yml
@ -7,79 +7,3 @@ cluster:
    user: admin
    config_file: /etc/ceph/ceph2.conf

-health_check_criticality:
-  MON_DOWN: 2
-  MON_CLOCK_SKEW: 2
-  MON_MSGR2_NOT_ENABLED: 2
-  MON_DISK_LOW: 2
-  MON_DISK_CRIT: 2
-  MON_DISK_BIG: 1
-  MGR_MODULE_DEPENDENCY: 1
-  MGR_MODULE_ERROR: 2
-  MGR_DOWN: 2
-  OSD_DOWN: 1
-  OSD_OSD_DOWN: 1
-  OSD_HOST_DOWN: 1
-  OSD_CHASSIS_DOWN: 1
-  OSD_RACK_DOWN: 1
-  OSD_ROW_DOWN: 1
-  OSD_PDU_DOWN: 1
-  OSD_POD_DOWN: 1
-  OSD_ROOM_DOWN: 1
-  OSD_DATACENTER_DOWN: 1
-  OSD_REGION_DOWN: 1
-  OSD_ROOT_DOWN: 1
-  OSD_ORPHAN: 2
-  OSD_OUT_OF_ORDER_FULL: 2
-  OSD_FULL: 2
-  OSD_BACKFILLFULL: 2
-  OSD_NEARFULL: 2
-  OSDMAP_FLAGS: 1
-  OSD_FLAGS: 1
-  OLD_CRUSH_TUNABLES: 2
-  OLD_CRUSH_STRAW_CALC_VERSION: 1
-  CACHE_POOL_NO_HIT_SET: 1
-  OSD_NO_SORTBITWISE: 2
-  POOL_FULL: 2
-  BLUEFS_SPILLOVER: 1
-  BLUEFS_AVAILABLE_SPACE: 1
-  BLUEFS_LOW_SPACE: 1
-  BLUESTORE_FRAGMENTATION: 1
-  BLUESTORE_LEGACY_STATFS: 1
-  BLUESTORE_NO_PER_POOL_MAP: 1
-  BLUESTORE_DISK_SIZE_MISMATCH: 1
-  BLUESTORE_NO_COMPRESSION: 1
-  DEVICE_HEALTH: 1
-  DEVICE_HEALTH_IN_USE: 2
-  DEVICE_HEALTH_TOOMANY: 2
-  PG_AVAILABILITY: 1
-  PG_DEGRADED: 1
-  PG_RECOVERY_FULL: 2
-  PG_BACKFILL_FULL: 2
-  PG_DAMAGED: 2
-  OSD_SCRUB_ERRORS: 2
-  LARGE_OMAP_OBJECTS: 1
-  CACHE_POOL_NEAR_FULL: 1
-  TOO_FEW_PGS: 1
-  POOL_TOO_FEW_PGS: 1
-  TOO_MANY_PGS: 1
-  POOL_TOO_MANY_PGS: 1
-  POOL_TARGET_SIZE_RATIO_OVERCOMMITTED: 1
-  POOL_TARGET_SIZE_BYTES_OVERCOMMITTED: 1
-  TOO_FEW_OSDS: 1
-  SMALLER_PGP_NUM: 1
-  MANY_OBJECTS_PER_PG: 1
-  POOL_APP_NOT_ENABLED: 2
-  POOL_NEAR_FULL: 2
-  OBJECT_MISPLACED: 1
-  OBJECT_UNFOUND: 2
-  SLOW_OPS: 1
-  PG_NOT_SCRUBBED: 1
-  PG_NOT_DEEP_SCRUBBED: 1
-  PG_SLOW_SNAP_TRIMMING: 1
-  RECENT_CRASH: 1
-  TELEMETRY_CHANGED: 1
-  AUTH_BAD_CAPS: 2
-  OSD_NO_DOWN_OUT_INTERVAL: 2
-
-