Merge pull request #124 from digitalocean/skrutiy/HealthWarningsMap

Moved Health Checks Map
This commit is contained in:
Sasha Krutiy 2019-09-09 11:49:51 -04:00 committed by GitHub
commit 1df560f980
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 114 deletions

View File

@ -247,14 +247,87 @@ const (
// NewClusterHealthCollector creates a new instance of ClusterHealthCollector,
// which collects health metrics for the given cluster.
func NewClusterHealthCollector(conn Conn, cluster string, healthChecksMap map[string]int) *ClusterHealthCollector {
func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
labels := make(prometheus.Labels)
labels["cluster"] = cluster
return &ClusterHealthCollector{
conn: conn,
healthChecksMap: healthChecksMap,
healthChecksMap: map[string]int{
"AUTH_BAD_CAPS": 2,
"BLUEFS_AVAILABLE_SPACE": 1,
"BLUEFS_LOW_SPACE": 1,
"BLUEFS_SPILLOVER": 1,
"BLUESTORE_DISK_SIZE_MISMATCH": 1,
"BLUESTORE_FRAGMENTATION": 1,
"BLUESTORE_LEGACY_STATFS": 1,
"BLUESTORE_NO_COMPRESSION": 1,
"BLUESTORE_NO_PER_POOL_MAP": 1,
"CACHE_POOL_NEAR_FULL": 1,
"CACHE_POOL_NO_HIT_SET": 1,
"DEVICE_HEALTH": 1,
"DEVICE_HEALTH_IN_USE": 2,
"DEVICE_HEALTH_TOOMANY": 2,
"LARGE_OMAP_OBJECTS": 1,
"MANY_OBJECTS_PER_PG": 1,
"MGR_DOWN": 2,
"MGR_MODULE_DEPENDENCY": 1,
"MGR_MODULE_ERROR": 2,
"MON_CLOCK_SKEW": 2,
"MON_DISK_BIG": 1,
"MON_DISK_CRIT": 2,
"MON_DISK_LOW": 2,
"MON_DOWN": 2,
"MON_MSGR2_NOT_ENABLED": 2,
"OBJECT_MISPLACED": 1,
"OBJECT_UNFOUND": 2,
"OLD_CRUSH_STRAW_CALC_VERSION": 1,
"OLD_CRUSH_TUNABLES": 2,
"OSDMAP_FLAGS": 1,
"OSD_BACKFILLFULL": 2,
"OSD_CHASSIS_DOWN": 1,
"OSD_DATACENTER_DOWN": 1,
"OSD_DOWN": 1,
"OSD_FLAGS": 1,
"OSD_FULL": 2,
"OSD_HOST_DOWN": 1,
"OSD_NEARFULL": 2,
"OSD_NO_DOWN_OUT_INTERVAL": 2,
"OSD_NO_SORTBITWISE": 2,
"OSD_ORPHAN": 2,
"OSD_OSD_DOWN": 1,
"OSD_OUT_OF_ORDER_FULL": 2,
"OSD_PDU_DOWN": 1,
"OSD_POD_DOWN": 1,
"OSD_RACK_DOWN": 1,
"OSD_REGION_DOWN": 1,
"OSD_ROOM_DOWN": 1,
"OSD_ROOT_DOWN": 1,
"OSD_ROW_DOWN": 1,
"OSD_SCRUB_ERRORS": 2,
"PG_AVAILABILITY": 1,
"PG_BACKFILL_FULL": 2,
"PG_DAMAGED": 2,
"PG_DEGRADED": 1,
"PG_NOT_DEEP_SCRUBBED": 1,
"PG_NOT_SCRUBBED": 1,
"PG_RECOVERY_FULL": 2,
"PG_SLOW_SNAP_TRIMMING": 1,
"POOL_APP_NOT_ENABLED": 2,
"POOL_FULL": 2,
"POOL_NEAR_FULL": 2,
"POOL_TARGET_SIZE_BYTES_OVERCOMMITTED": 1,
"POOL_TARGET_SIZE_RATIO_OVERCOMMITTED": 1,
"POOL_TOO_FEW_PGS": 1,
"POOL_TOO_MANY_PGS": 1,
"RECENT_CRASH": 1,
"SLOW_OPS": 1,
"SMALLER_PGP_NUM": 1,
"TELEMETRY_CHANGED": 1,
"TOO_FEW_OSDS": 1,
"TOO_FEW_PGS": 1,
"TOO_MANY_PGS": 1},
HealthStatus: prometheus.NewGauge(
prometheus.GaugeOpts{

View File

@ -15,7 +15,6 @@
package collectors
import (
"gopkg.in/yaml.v2"
"io/ioutil"
"net/http"
"net/http/httptest"
@ -25,21 +24,6 @@ import (
"github.com/prometheus/client_golang/prometheus"
)
var (
healthChecksString = `health_check_criticality:
MON_DOWN: 2
MDR_DOWN: 2
OSD_DOWN: 1
OSD_FULL: 2
OSDMAP_FLAGS: 1
DEVICE_HEALTH: 1
PG_DEGRADED: 1
PG_DAMAGED: 2
SLOW_OPS: 1
RECENT_CRASH: 1
TELEMETRY_CHANGED: 1`
)
func TestClusterHealthCollector(t *testing.T) {
for _, tt := range []struct {
input string
@ -281,6 +265,7 @@ func TestClusterHealthCollector(t *testing.T) {
"health": { "overall_status": "HEALTH_WARN", "status": "HEALTH_OK } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 0`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
},
},
{
@ -297,6 +282,7 @@ func TestClusterHealthCollector(t *testing.T) {
"health": { "status": "HEALTH_OK } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 0`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
},
},
{
@ -313,6 +299,7 @@ func TestClusterHealthCollector(t *testing.T) {
"health": { "overall_status": "HEALTH_WARN" } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 1`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
},
},
{
@ -329,6 +316,7 @@ func TestClusterHealthCollector(t *testing.T) {
"health": { "overall_status": "HEALTH_ERR" } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status{cluster="ceph"} 2`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 3`),
},
},
{
@ -490,6 +478,7 @@ $ sudo ceph -s
regexes: []*regexp.Regexp{
regexp.MustCompile(`degraded_objects{cluster="ceph"} 1.54443937e\+08`),
regexp.MustCompile(`unclean_pgs{cluster="ceph"} 4886`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
},
},
{
@ -508,6 +497,25 @@ $ sudo ceph -s
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`misplaced_objects{cluster="ceph"} 4.31295341e\+08`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
},
},
{
input: `
{
"health": {
"checks": {
"POOL_APP_NOT_ENABLED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "application not enabled on 1 pool(s)"
}
}
}
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
},
},
{
@ -635,6 +643,7 @@ $ sudo ceph -s
regexp.MustCompile(`osdmap_flag_noscrub{cluster="ceph"} 1`),
regexp.MustCompile(`osdmap_flag_nodeep_scrub{cluster="ceph"} 0`),
regexp.MustCompile(`osdmap_flag_notieragent{cluster="ceph"} 1`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
},
},
{
@ -778,15 +787,7 @@ $ sudo ceph -s
},
} {
func() {
type WarningCategorization struct {
CheckNames map[string]int `yaml:"health_check_criticality"`
}
var healthChecksMap WarningCategorization
err := yaml.Unmarshal([]byte(healthChecksString), &healthChecksMap)
if err != nil {
t.Fatalf("failed to parse yaml of warning checks: %s", err)
}
collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph", healthChecksMap.CheckNames)
collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph")
if err := prometheus.Register(collector); err != nil {
t.Fatalf("collector failed to register: %s", err)
}

View File

@ -13,14 +13,9 @@ type ClusterConfig struct {
ConfigFile string `yaml:"config_file"`
}
type WarningCategorization struct {
CheckNames map[string]int `yaml:"health_check_criticality"`
}
// Config is the top-level configuration for the Ceph exporter.
type Config struct {
Cluster []*ClusterConfig
Warnings *WarningCategorization
Cluster []*ClusterConfig
}
// fileExists returns true if the path exists and is a file.

View File

@ -79,14 +79,13 @@ var _ prometheus.Collector = &CephExporter{}
// NewCephExporter creates an instance of CephExporter and returns a reference
// to it. A collector is enabled by adding it to the list of collectors built
// here; stats are then extracted from every collector in that list.
func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int,
healthChecksMap WarningCategorization) *CephExporter {
func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int) *CephExporter {
c := &CephExporter{
collectors: []prometheus.Collector{
collectors.NewClusterUsageCollector(conn, cluster),
collectors.NewPoolUsageCollector(conn, cluster),
collectors.NewPoolInfoCollector(conn, cluster),
collectors.NewClusterHealthCollector(conn, cluster, healthChecksMap.CheckNames),
collectors.NewClusterHealthCollector(conn, cluster),
collectors.NewMonitorCollector(conn, cluster),
collectors.NewOSDCollector(conn, cluster),
},
@ -172,7 +171,7 @@ func main() {
defer conn.Shutdown()
log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode, *cfg.Warnings))
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode))
if err != nil {
log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
}
@ -197,7 +196,7 @@ func main() {
}
defer conn.Shutdown()
prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode, WarningCategorization{CheckNames: map[string]int{}}))
prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode))
}
http.Handle(*metricsPath, promhttp.Handler())

View File

@ -7,79 +7,3 @@ cluster:
user: admin
config_file: /etc/ceph/ceph2.conf
health_check_criticality:
MON_DOWN: 2
MON_CLOCK_SKEW: 2
MON_MSGR2_NOT_ENABLED: 2
MON_DISK_LOW: 2
MON_DISK_CRIT: 2
MON_DISK_BIG: 1
MGR_MODULE_DEPENDENCY: 1
MGR_MODULE_ERROR: 2
MGR_DOWN: 2
OSD_DOWN: 1
OSD_OSD_DOWN: 1
OSD_HOST_DOWN: 1
OSD_CHASSIS_DOWN: 1
OSD_RACK_DOWN: 1
OSD_ROW_DOWN: 1
OSD_PDU_DOWN: 1
OSD_POD_DOWN: 1
OSD_ROOM_DOWN: 1
OSD_DATACENTER_DOWN: 1
OSD_REGION_DOWN: 1
OSD_ROOT_DOWN: 1
OSD_ORPHAN: 2
OSD_OUT_OF_ORDER_FULL: 2
OSD_FULL: 2
OSD_BACKFILLFULL: 2
OSD_NEARFULL: 2
OSDMAP_FLAGS: 1
OSD_FLAGS: 1
OLD_CRUSH_TUNABLES: 2
OLD_CRUSH_STRAW_CALC_VERSION: 1
CACHE_POOL_NO_HIT_SET: 1
OSD_NO_SORTBITWISE: 2
POOL_FULL: 2
BLUEFS_SPILLOVER: 1
BLUEFS_AVAILABLE_SPACE: 1
BLUEFS_LOW_SPACE: 1
BLUESTORE_FRAGMENTATION: 1
BLUESTORE_LEGACY_STATFS: 1
BLUESTORE_NO_PER_POOL_MAP: 1
BLUESTORE_DISK_SIZE_MISMATCH: 1
BLUESTORE_NO_COMPRESSION: 1
DEVICE_HEALTH: 1
DEVICE_HEALTH_IN_USE: 2
DEVICE_HEALTH_TOOMANY: 2
PG_AVAILABILITY: 1
PG_DEGRADED: 1
PG_RECOVERY_FULL: 2
PG_BACKFILL_FULL: 2
PG_DAMAGED: 2
OSD_SCRUB_ERRORS: 2
LARGE_OMAP_OBJECTS: 1
CACHE_POOL_NEAR_FULL: 1
TOO_FEW_PGS: 1
POOL_TOO_FEW_PGS: 1
TOO_MANY_PGS: 1
POOL_TOO_MANY_PGS: 1
POOL_TARGET_SIZE_RATIO_OVERCOMMITTED: 1
POOL_TARGET_SIZE_BYTES_OVERCOMMITTED: 1
TOO_FEW_OSDS: 1
SMALLER_PGP_NUM: 1
MANY_OBJECTS_PER_PG: 1
POOL_APP_NOT_ENABLED: 2
POOL_NEAR_FULL: 2
OBJECT_MISPLACED: 1
OBJECT_UNFOUND: 2
SLOW_OPS: 1
PG_NOT_SCRUBBED: 1
PG_NOT_DEEP_SCRUBBED: 1
PG_SLOW_SNAP_TRIMMING: 1
RECENT_CRASH: 1
TELEMETRY_CHANGED: 1
AUTH_BAD_CAPS: 2
OSD_NO_DOWN_OUT_INTERVAL: 2