Mirror of https://github.com/digitalocean/ceph_exporter (synced 2025-03-06 20:17:26 +00:00)

Merge pull request #124 from digitalocean/skrutiy/HealthWarningsMap

Moved Health Checks Map

Commit 1df560f980
@@ -247,14 +247,87 @@ const (

 // NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health
 // metrics on.
-func NewClusterHealthCollector(conn Conn, cluster string, healthChecksMap map[string]int) *ClusterHealthCollector {
+func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
 	labels := make(prometheus.Labels)
 	labels["cluster"] = cluster

 	return &ClusterHealthCollector{
 		conn: conn,

-		healthChecksMap: healthChecksMap,
+		healthChecksMap: map[string]int{
+			"AUTH_BAD_CAPS": 2,
+			"BLUEFS_AVAILABLE_SPACE": 1,
+			"BLUEFS_LOW_SPACE": 1,
+			"BLUEFS_SPILLOVER": 1,
+			"BLUESTORE_DISK_SIZE_MISMATCH": 1,
+			"BLUESTORE_FRAGMENTATION": 1,
+			"BLUESTORE_LEGACY_STATFS": 1,
+			"BLUESTORE_NO_COMPRESSION": 1,
+			"BLUESTORE_NO_PER_POOL_MAP": 1,
+			"CACHE_POOL_NEAR_FULL": 1,
+			"CACHE_POOL_NO_HIT_SET": 1,
+			"DEVICE_HEALTH": 1,
+			"DEVICE_HEALTH_IN_USE": 2,
+			"DEVICE_HEALTH_TOOMANY": 2,
+			"LARGE_OMAP_OBJECTS": 1,
+			"MANY_OBJECTS_PER_PG": 1,
+			"MGR_DOWN": 2,
+			"MGR_MODULE_DEPENDENCY": 1,
+			"MGR_MODULE_ERROR": 2,
+			"MON_CLOCK_SKEW": 2,
+			"MON_DISK_BIG": 1,
+			"MON_DISK_CRIT": 2,
+			"MON_DISK_LOW": 2,
+			"MON_DOWN": 2,
+			"MON_MSGR2_NOT_ENABLED": 2,
+			"OBJECT_MISPLACED": 1,
+			"OBJECT_UNFOUND": 2,
+			"OLD_CRUSH_STRAW_CALC_VERSION": 1,
+			"OLD_CRUSH_TUNABLES": 2,
+			"OSDMAP_FLAGS": 1,
+			"OSD_BACKFILLFULL": 2,
+			"OSD_CHASSIS_DOWN": 1,
+			"OSD_DATACENTER_DOWN": 1,
+			"OSD_DOWN": 1,
+			"OSD_FLAGS": 1,
+			"OSD_FULL": 2,
+			"OSD_HOST_DOWN": 1,
+			"OSD_NEARFULL": 2,
+			"OSD_NO_DOWN_OUT_INTERVAL": 2,
+			"OSD_NO_SORTBITWISE": 2,
+			"OSD_ORPHAN": 2,
+			"OSD_OSD_DOWN": 1,
+			"OSD_OUT_OF_ORDER_FULL": 2,
+			"OSD_PDU_DOWN": 1,
+			"OSD_POD_DOWN": 1,
+			"OSD_RACK_DOWN": 1,
+			"OSD_REGION_DOWN": 1,
+			"OSD_ROOM_DOWN": 1,
+			"OSD_ROOT_DOWN": 1,
+			"OSD_ROW_DOWN": 1,
+			"OSD_SCRUB_ERRORS": 2,
+			"PG_AVAILABILITY": 1,
+			"PG_BACKFILL_FULL": 2,
+			"PG_DAMAGED": 2,
+			"PG_DEGRADED": 1,
+			"PG_NOT_DEEP_SCRUBBED": 1,
+			"PG_NOT_SCRUBBED": 1,
+			"PG_RECOVERY_FULL": 2,
+			"PG_SLOW_SNAP_TRIMMING": 1,
+			"POOL_APP_NOT_ENABLED": 2,
+			"POOL_FULL": 2,
+			"POOL_NEAR_FULL": 2,
+			"POOL_TARGET_SIZE_BYTES_OVERCOMMITTED": 1,
+			"POOL_TARGET_SIZE_RATIO_OVERCOMMITTED": 1,
+			"POOL_TOO_FEW_PGS": 1,
+			"POOL_TOO_MANY_PGS": 1,
+			"RECENT_CRASH": 1,
+			"SLOW_OPS": 1,
+			"SMALLER_PGP_NUM": 1,
+			"TELEMETRY_CHANGED": 1,
+			"TOO_FEW_OSDS": 1,
+			"TOO_FEW_PGS": 1,
+			"TOO_MANY_PGS": 1},

 		HealthStatus: prometheus.NewGauge(
 			prometheus.GaugeOpts{
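The hunk above bakes the check-criticality map into the collector itself: each Ceph health check name maps to 1 (a routine warning) or 2 (a warning worth escalating). The sketch below is not the exporter's code; it is a minimal, self-contained Go illustration of how such a map can drive the interpreted status that the tests later assert as health_status_interp. The semantics (0 = HEALTH_OK, 1 = HEALTH_WARN with only criticality-1 checks, 2 = HEALTH_WARN with at least one criticality-2 check, 3 = HEALTH_ERR) are inferred from the test expectations, not stated in the diff, and the function name interpretedStatus is hypothetical.

package main

import "fmt"

// Hypothetical excerpt of the map hardcoded above: 1 marks a routine
// warning, 2 a warning that should escalate.
var healthChecksMap = map[string]int{
	"OSD_DOWN":   1,
	"MON_DOWN":   2,
	"PG_DAMAGED": 2,
}

// interpretedStatus mirrors the behaviour the tests below appear to assert
// for health_status_interp; this is an inference, not the exporter's code.
func interpretedStatus(overall string, active []string) int {
	switch overall {
	case "HEALTH_OK":
		return 0
	case "HEALTH_ERR":
		return 3
	}
	// HEALTH_WARN: escalate to 2 if any active check has criticality 2.
	status := 1
	for _, check := range active {
		if healthChecksMap[check] == 2 {
			status = 2
		}
	}
	return status
}

func main() {
	fmt.Println(interpretedStatus("HEALTH_WARN", []string{"OSD_DOWN"}))             // 1
	fmt.Println(interpretedStatus("HEALTH_WARN", []string{"OSD_DOWN", "MON_DOWN"})) // 2
}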
@@ -15,7 +15,6 @@
 package collectors

 import (
-	"gopkg.in/yaml.v2"
 	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
@@ -25,21 +24,6 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )

-var (
-	healthChecksString = `health_check_criticality:
-  MON_DOWN: 2
-  MDR_DOWN: 2
-  OSD_DOWN: 1
-  OSD_FULL: 2
-  OSDMAP_FLAGS: 1
-  DEVICE_HEALTH: 1
-  PG_DEGRADED: 1
-  PG_DAMAGED: 2
-  SLOW_OPS: 1
-  RECENT_CRASH: 1
-  TELEMETRY_CHANGED: 1`
-)
-
 func TestClusterHealthCollector(t *testing.T) {
 	for _, tt := range []struct {
 		input string
@@ -281,6 +265,7 @@ func TestClusterHealthCollector(t *testing.T) {
 			"health": { "overall_status": "HEALTH_WARN", "status": "HEALTH_OK" } }`,
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`health_status{cluster="ceph"} 0`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
 		},
 	},
 	{
@@ -297,6 +282,7 @@ func TestClusterHealthCollector(t *testing.T) {
 			"health": { "status": "HEALTH_OK" } }`,
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`health_status{cluster="ceph"} 0`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 0`),
 		},
 	},
 	{
@@ -313,6 +299,7 @@ func TestClusterHealthCollector(t *testing.T) {
 			"health": { "overall_status": "HEALTH_WARN" } }`,
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`health_status{cluster="ceph"} 1`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
 		},
 	},
 	{
@@ -329,6 +316,7 @@ func TestClusterHealthCollector(t *testing.T) {
 			"health": { "overall_status": "HEALTH_ERR" } }`,
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`health_status{cluster="ceph"} 2`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 3`),
 		},
 	},
 	{
@@ -490,6 +478,7 @@ $ sudo ceph -s
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`degraded_objects{cluster="ceph"} 1.54443937e\+08`),
 			regexp.MustCompile(`unclean_pgs{cluster="ceph"} 4886`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
 		},
 	},
 	{
@@ -508,6 +497,25 @@ $ sudo ceph -s
 		}`,
 		regexes: []*regexp.Regexp{
 			regexp.MustCompile(`misplaced_objects{cluster="ceph"} 4.31295341e\+08`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
 		},
 	},
+	{
+		input: `
+{
+	"health": {
+		"checks": {
+			"POOL_APP_NOT_ENABLED": {
+				"severity": "HEALTH_WARN",
+				"summary": {
+					"message": "application not enabled on 1 pool(s)"
+				}
+			}
+		}
+	}
+}`,
+		regexes: []*regexp.Regexp{
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`),
+		},
+	},
 	{
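The test cases above all follow one pattern: feed canned JSON to the collector through the package's NewNoopConn helper, scrape the registry over HTTP, and match the text exposition with regexes. Below is a minimal, self-contained sketch of that scrape-and-match pattern using only client_golang; the gauge and its pinned value are illustrative stand-ins for the real collector, not the repository's test code.

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"regexp"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Stand-in for the collector under test: a gauge shaped like the
	// exporter's health_status_interp metric, pinned to the value 2.
	reg := prometheus.NewRegistry()
	g := prometheus.NewGauge(prometheus.GaugeOpts{
		Name:        "health_status_interp",
		Help:        "Interpreted health status of the cluster.",
		ConstLabels: prometheus.Labels{"cluster": "ceph"},
	})
	reg.MustRegister(g)
	g.Set(2)

	// Serve the registry and scrape it, as the test harness does.
	srv := httptest.NewServer(promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	defer srv.Close()

	resp, err := http.Get(srv.URL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// Assert on the text exposition with a regex, mirroring the cases above.
	re := regexp.MustCompile(`health_status_interp{cluster="ceph"} 2`)
	fmt.Println("matched:", re.Match(body))
}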
@@ -635,6 +643,7 @@ $ sudo ceph -s
 			regexp.MustCompile(`osdmap_flag_noscrub{cluster="ceph"} 1`),
 			regexp.MustCompile(`osdmap_flag_nodeep_scrub{cluster="ceph"} 0`),
 			regexp.MustCompile(`osdmap_flag_notieragent{cluster="ceph"} 1`),
+			regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
 		},
 	},
 	{
@@ -778,15 +787,7 @@ $ sudo ceph -s
 		},
 	} {
 		func() {
-			type WarningCategorization struct {
-				CheckNames map[string]int `yaml:"health_check_criticality"`
-			}
-			var healthChecksMap WarningCategorization
-			err := yaml.Unmarshal([]byte(healthChecksString), &healthChecksMap)
-			if err != nil {
-				t.Fatalf("failed to parse yaml of warning checks: %s", err)
-			}
-			collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph", healthChecksMap.CheckNames)
+			collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph")
 			if err := prometheus.Register(collector); err != nil {
 				t.Fatalf("collector failed to register: %s", err)
 			}
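For reference, the pattern being deleted above (and from the config struct in the next hunk) is plain gopkg.in/yaml.v2 struct-tag decoding: the health_check_criticality block is mapped onto a map[string]int. A standalone sketch of that pattern, with an abbreviated document in place of the removed healthChecksString fixture:

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// WarningCategorization reproduces the struct being deleted: the yaml tag
// maps the health_check_criticality block onto a plain map[string]int.
type WarningCategorization struct {
	CheckNames map[string]int `yaml:"health_check_criticality"`
}

func main() {
	// Abbreviated stand-in for the healthChecksString fixture removed above.
	doc := "health_check_criticality:\n  MON_DOWN: 2\n  OSD_DOWN: 1"

	var w WarningCategorization
	if err := yaml.Unmarshal([]byte(doc), &w); err != nil {
		panic(err)
	}
	fmt.Println(w.CheckNames["MON_DOWN"]) // 2
}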
@@ -13,14 +13,9 @@ type ClusterConfig struct {
 	ConfigFile string `yaml:"config_file"`
 }

-type WarningCategorization struct {
-	CheckNames map[string]int `yaml:"health_check_criticality"`
-}
-
 // Config is the top-level configuration for Metastord.
 type Config struct {
-	Cluster  []*ClusterConfig
-	Warnings *WarningCategorization
+	Cluster []*ClusterConfig
 }

 // fileExists returns true if the path exists and is a file.
@@ -79,14 +79,13 @@ var _ prometheus.Collector = &CephExporter{}
 // NewCephExporter creates an instance to CephExporter and returns a reference
 // to it. We can choose to enable a collector to extract stats out of by adding
 // it to the list of collectors.
-func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int,
-	healthChecksMap WarningCategorization) *CephExporter {
+func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int) *CephExporter {
 	c := &CephExporter{
 		collectors: []prometheus.Collector{
 			collectors.NewClusterUsageCollector(conn, cluster),
 			collectors.NewPoolUsageCollector(conn, cluster),
 			collectors.NewPoolInfoCollector(conn, cluster),
-			collectors.NewClusterHealthCollector(conn, cluster, healthChecksMap.CheckNames),
+			collectors.NewClusterHealthCollector(conn, cluster),
 			collectors.NewMonitorCollector(conn, cluster),
 			collectors.NewOSDCollector(conn, cluster),
 		},
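The `var _ prometheus.Collector = &CephExporter{}` assertion and the collectors list above imply a fan-out design: the exporter satisfies prometheus.Collector by delegating to each sub-collector. A minimal sketch of that shape (assumed from the hunk, not a copy of the repository's file):

package main

import "github.com/prometheus/client_golang/prometheus"

// CephExporter fans a single Describe/Collect call out to every enabled
// sub-collector.
type CephExporter struct {
	collectors []prometheus.Collector
}

var _ prometheus.Collector = &CephExporter{}

func (c *CephExporter) Describe(ch chan<- *prometheus.Desc) {
	for _, cc := range c.collectors {
		cc.Describe(ch)
	}
}

func (c *CephExporter) Collect(ch chan<- prometheus.Metric) {
	for _, cc := range c.collectors {
		cc.Collect(ch)
	}
}

func main() {
	// Registering the exporter registers every sub-collector's metrics at once.
	prometheus.MustRegister(&CephExporter{})
}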
@@ -172,7 +171,7 @@ func main() {
 		defer conn.Shutdown()

 		log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
-		err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode, *cfg.Warnings))
+		err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode))
 		if err != nil {
 			log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
 		}
@@ -197,7 +196,7 @@ func main() {
 		}
 		defer conn.Shutdown()

-		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode, WarningCategorization{CheckNames: map[string]int{}}))
+		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode))
 	}

 	http.Handle(*metricsPath, promhttp.Handler())
exporter.yml (76 changed lines)
@@ -7,79 +7,3 @@ cluster:
     user: admin
     config_file: /etc/ceph/ceph2.conf

-health_check_criticality:
-  MON_DOWN: 2
-  MON_CLOCK_SKEW: 2
-  MON_MSGR2_NOT_ENABLED: 2
-  MON_DISK_LOW: 2
-  MON_DISK_CRIT: 2
-  MON_DISK_BIG: 1
-  MGR_MODULE_DEPENDENCY: 1
-  MGR_MODULE_ERROR: 2
-  MGR_DOWN: 2
-  OSD_DOWN: 1
-  OSD_OSD_DOWN: 1
-  OSD_HOST_DOWN: 1
-  OSD_CHASSIS_DOWN: 1
-  OSD_RACK_DOWN: 1
-  OSD_ROW_DOWN: 1
-  OSD_PDU_DOWN: 1
-  OSD_POD_DOWN: 1
-  OSD_ROOM_DOWN: 1
-  OSD_DATACENTER_DOWN: 1
-  OSD_REGION_DOWN: 1
-  OSD_ROOT_DOWN: 1
-  OSD_ORPHAN: 2
-  OSD_OUT_OF_ORDER_FULL: 2
-  OSD_FULL: 2
-  OSD_BACKFILLFULL: 2
-  OSD_NEARFULL: 2
-  OSDMAP_FLAGS: 1
-  OSD_FLAGS: 1
-  OLD_CRUSH_TUNABLES: 2
-  OLD_CRUSH_STRAW_CALC_VERSION: 1
-  CACHE_POOL_NO_HIT_SET: 1
-  OSD_NO_SORTBITWISE: 2
-  POOL_FULL: 2
-  BLUEFS_SPILLOVER: 1
-  BLUEFS_AVAILABLE_SPACE: 1
-  BLUEFS_LOW_SPACE: 1
-  BLUESTORE_FRAGMENTATION: 1
-  BLUESTORE_LEGACY_STATFS: 1
-  BLUESTORE_NO_PER_POOL_MAP: 1
-  BLUESTORE_DISK_SIZE_MISMATCH: 1
-  BLUESTORE_NO_COMPRESSION: 1
-  DEVICE_HEALTH: 1
-  DEVICE_HEALTH_IN_USE: 2
-  DEVICE_HEALTH_TOOMANY: 2
-  PG_AVAILABILITY: 1
-  PG_DEGRADED: 1
-  PG_RECOVERY_FULL: 2
-  PG_BACKFILL_FULL: 2
-  PG_DAMAGED: 2
-  OSD_SCRUB_ERRORS: 2
-  LARGE_OMAP_OBJECTS: 1
-  CACHE_POOL_NEAR_FULL: 1
-  TOO_FEW_PGS: 1
-  POOL_TOO_FEW_PGS: 1
-  TOO_MANY_PGS: 1
-  POOL_TOO_MANY_PGS: 1
-  POOL_TARGET_SIZE_RATIO_OVERCOMMITTED: 1
-  POOL_TARGET_SIZE_BYTES_OVERCOMMITTED: 1
-  TOO_FEW_OSDS: 1
-  SMALLER_PGP_NUM: 1
-  MANY_OBJECTS_PER_PG: 1
-  POOL_APP_NOT_ENABLED: 2
-  POOL_NEAR_FULL: 2
-  OBJECT_MISPLACED: 1
-  OBJECT_UNFOUND: 2
-  SLOW_OPS: 1
-  PG_NOT_SCRUBBED: 1
-  PG_NOT_DEEP_SCRUBBED: 1
-  PG_SLOW_SNAP_TRIMMING: 1
-  RECENT_CRASH: 1
-  TELEMETRY_CHANGED: 1
-  AUTH_BAD_CAPS: 2
-  OSD_NO_DOWN_OUT_INTERVAL: 2
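With the criticality map compiled into the exporter, exporter.yml shrinks to the cluster list alone. A minimal post-change config might look like the sketch below; the cluster_label key is an assumption inferred from cluster.ClusterLabel in main, so check the actual struct tags before relying on it.

cluster:
  - cluster_label: ceph   # assumed key name, inferred from cluster.ClusterLabel
    user: admin
    config_file: /etc/ceph/ceph.conf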