introduce constmetrics for osdmap flags

This commit is contained in:
Daniel R 2022-10-07 14:17:29 -04:00
parent c3a3d581aa
commit 1a741d7606
2 changed files with 71 additions and 29 deletions

View File

@ -203,6 +203,11 @@ type ClusterHealthCollector struct {
OSDMapFlagNoDeepScrub prometheus.Gauge OSDMapFlagNoDeepScrub prometheus.Gauge
OSDMapFlagNoTierAgent prometheus.Gauge OSDMapFlagNoTierAgent prometheus.Gauge
// OSDMapFlags exposes the OSD map flags as a single ConstMetric, with each flag reported under its own "flag" label value.
OSDMapFlags *prometheus.Desc
// OSDFlagToGaugeMap maps flags to gauges
OSDFlagToGaugeMap map[string]*prometheus.Gauge
// OSDsDown show the no. of OSDs that are in the DOWN state. // OSDsDown show the no. of OSDs that are in the DOWN state.
OSDsDown *prometheus.Desc OSDsDown *prometheus.Desc
@ -505,6 +510,8 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
ConstLabels: labels, ConstLabels: labels,
}, },
), ),
OSDMapFlags: prometheus.NewDesc(fmt.Sprintf("%s_osd_map_flags", cephNamespace), "A metric for all OSD flags", []string{"flag"}, labels),
OSDsDown: prometheus.NewDesc(fmt.Sprintf("%s_osds_down", cephNamespace), "Count of OSDs that are in DOWN state", nil, labels), OSDsDown: prometheus.NewDesc(fmt.Sprintf("%s_osds_down", cephNamespace), "Count of OSDs that are in DOWN state", nil, labels),
OSDsUp: prometheus.NewDesc(fmt.Sprintf("%s_osds_up", cephNamespace), "Count of OSDs that are in UP state", nil, labels), OSDsUp: prometheus.NewDesc(fmt.Sprintf("%s_osds_up", cephNamespace), "Count of OSDs that are in UP state", nil, labels),
OSDsIn: prometheus.NewDesc(fmt.Sprintf("%s_osds_in", cephNamespace), "Count of OSDs that are in IN state and available to serve requests", nil, labels), OSDsIn: prometheus.NewDesc(fmt.Sprintf("%s_osds_in", cephNamespace), "Count of OSDs that are in IN state and available to serve requests", nil, labels),
@ -526,6 +533,23 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
RbdMirrorUp: prometheus.NewDesc(fmt.Sprintf("%s_rbd_mirror_up", cephNamespace), "Alive rbd-mirror daemons", []string{"name"}, labels), RbdMirrorUp: prometheus.NewDesc(fmt.Sprintf("%s_rbd_mirror_up", cephNamespace), "Alive rbd-mirror daemons", []string{"name"}, labels),
} }
// Maps each known OSD map flag to its legacy gauge. It is kept for backwards compatibility with the per-flag gauges, and its keys double as the list of known flags.
collector.OSDFlagToGaugeMap = map[string]*prometheus.Gauge{
"full": &collector.OSDMapFlagFull,
"pauserd": &collector.OSDMapFlagPauseRd,
"pausewr": &collector.OSDMapFlagPauseWr,
"noup": &collector.OSDMapFlagNoUp,
"nodown": &collector.OSDMapFlagNoDown,
"noin": &collector.OSDMapFlagNoIn,
"noout": &collector.OSDMapFlagNoOut,
"nobackfill": &collector.OSDMapFlagNoBackfill,
"norecover": &collector.OSDMapFlagNoRecover,
"norebalance": &collector.OSDMapFlagNoRebalance,
"noscrub": &collector.OSDMapFlagNoScrub,
"nodeep_scrub": &collector.OSDMapFlagNoDeepScrub,
"notieragent": &collector.OSDMapFlagNoTierAgent,
}
if exporter.Version.IsAtLeast(Pacific) { if exporter.Version.IsAtLeast(Pacific) {
// pacific adds the DAEMON_OLD_VERSION health check // pacific adds the DAEMON_OLD_VERSION health check
// that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay // that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay
@ -785,6 +809,8 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
} }
} }
// This stores OSD map flags that were found, so the rest can be set to 0
matchedOsdMapFlags := make(map[string]bool)
for k, check := range stats.Health.Checks { for k, check := range stats.Health.Checks {
if k == "MON_DOWN" { if k == "MON_DOWN" {
matched := monsDownRegex.FindStringSubmatch(check.Summary.Message) matched := monsDownRegex.FindStringSubmatch(check.Summary.Message)
@ -835,37 +861,14 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
if len(matched) > 0 { if len(matched) > 0 {
flags := strings.Split(matched[1], ",") flags := strings.Split(matched[1], ",")
for _, f := range flags { for _, f := range flags {
switch f { // Update the global metric for this specific flag
case "full": ch <- prometheus.MustNewConstMetric(c.OSDMapFlags, prometheus.GaugeValue, float64(1), f)
c.OSDMapFlagFull.Set(1) // Update the legacy gauges, based on the map
case "pauserd": (*c.OSDFlagToGaugeMap[f]).Set(1)
c.OSDMapFlagPauseRd.Set(1) // Mark the flag as having been set
case "pausewr": matchedOsdMapFlags[f] = true
c.OSDMapFlagPauseWr.Set(1)
case "noup":
c.OSDMapFlagNoUp.Set(1)
case "nodown":
c.OSDMapFlagNoDown.Set(1)
case "noin":
c.OSDMapFlagNoIn.Set(1)
case "noout":
c.OSDMapFlagNoOut.Set(1)
case "nobackfill":
c.OSDMapFlagNoBackfill.Set(1)
case "norecover":
c.OSDMapFlagNoRecover.Set(1)
case "norebalance":
c.OSDMapFlagNoRebalance.Set(1)
case "noscrub":
c.OSDMapFlagNoScrub.Set(1)
case "nodeep_scrub":
c.OSDMapFlagNoDeepScrub.Set(1)
case "notieragent":
c.OSDMapFlagNoTierAgent.Set(1)
}
} }
} }
} }
if !mapEmpty { if !mapEmpty {
if val, present := c.healthChecksMap[k]; present { if val, present := c.healthChecksMap[k]; present {
@ -874,6 +877,13 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
} }
} }
// Zero-fill the OSD Map ConstMetrics (the ones that haven't already been set to 1)
for flagKey := range c.OSDFlagToGaugeMap {
if matchedOsdMapFlags[flagKey] == false {
ch <- prometheus.MustNewConstMetric(c.OSDMapFlags, prometheus.GaugeValue, float64(0), flagKey)
}
}
var ( var (
degradedPGs float64 degradedPGs float64
activePGs float64 activePGs float64

View File

@ -500,6 +500,38 @@ $ sudo ceph -s
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`), regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
}, },
}, },
{
name: "many flags set new osdmap constmetrics filled",
input: `
{
"health": {
"checks": {
"OSDMAP_FLAGS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "pauserd,pausewr,noout,noin,norecover,noscrub,notieragent flag(s) set; mon 482f68d873d2 is low on available space"
}
}
}
}
}`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="full"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="pauserd"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="pausewr"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="noup"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="nodown"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="noin"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="noout"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="nobackfill"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="norecover"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="norebalance"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="noscrub"} 1`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="nodeep_scrub"} 0`),
regexp.MustCompile(`osd_map_flags{cluster="ceph",flag="notieragent"} 1`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
},
},
{ {
name: "lots of PG data", name: "lots of PG data",
input: ` input: `