// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collectors

import (
	"encoding/json"
	"log"
	"regexp"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
)

// ClusterHealthCollector collects information about the health of the cluster
// overall. Unlike ClusterUsageCollector, which reports data usage, it surfaces
// changes in Ceph health parameters.
type ClusterHealthCollector struct {
	// conn holds the connection to the Ceph cluster
	conn Conn

	// DegradedPGs shows the number of PGs that have some of their replicas
	// missing.
	DegradedPGs prometheus.Gauge

	// UncleanPGs shows the number of PGs that do not contain all of the
	// objects that are supposed to be in them.
	UncleanPGs prometheus.Gauge

	// UndersizedPGs shows the number of PGs that have fewer copies than the
	// configured replication level.
	UndersizedPGs prometheus.Gauge

	// StalePGs shows the number of PGs that are in an unknown state, i.e. the
	// monitors know nothing about their latest state since their PG mapping
	// was modified.
	StalePGs prometheus.Gauge

	// DegradedObjectsCount gives the number of RADOS objects that constitute
	// the degraded PGs.
	DegradedObjectsCount prometheus.Gauge

	// OSDsDown shows the number of OSDs that are in the DOWN state.
	OSDsDown prometheus.Gauge

	// OSDsUp shows the number of OSDs that are in the UP state and able to
	// serve requests.
	OSDsUp prometheus.Gauge

	// OSDsIn shows the number of OSDs that are marked as IN in the cluster.
	OSDsIn prometheus.Gauge

	// OSDsNum shows the total count of OSDs in the cluster.
	OSDsNum prometheus.Gauge

	// RemappedPGs shows the number of PGs that are currently remapped and
	// need to be moved to other OSDs.
	RemappedPGs prometheus.Gauge
}

// NewClusterHealthCollector creates a new instance of ClusterHealthCollector
// which collects health metrics from the given Ceph cluster connection.
func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
	return &ClusterHealthCollector{
		conn: conn,

		DegradedPGs: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "degraded_pgs",
				Help:      "No. of PGs in a degraded state",
			},
		),
		UncleanPGs: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "unclean_pgs",
				Help:      "No. of PGs in an unclean state",
			},
		),
		UndersizedPGs: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "undersized_pgs",
				Help:      "No. of undersized PGs in the cluster",
			},
		),
		StalePGs: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "stale_pgs",
				Help:      "No. of stale PGs in the cluster",
			},
		),
		DegradedObjectsCount: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "degraded_objects",
				Help:      "No. of degraded objects across all PGs",
			},
		),
		OSDsDown: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "osds_down",
				Help:      "Count of OSDs that are in DOWN state",
			},
		),
		OSDsUp: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "osds_up",
				Help:      "Count of OSDs that are in UP state",
			},
		),
		OSDsIn: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "osds_in",
				Help:      "Count of OSDs that are in IN state and available to serve requests",
			},
		),
		OSDsNum: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "osds",
				Help:      "Count of total OSDs in the cluster",
			},
		),
		RemappedPGs: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: cephNamespace,
				Name:      "pgs_remapped",
				Help:      "No. of PGs that are remapped and incurring cluster-wide movement",
			},
		),
	}
}
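
// Note: with Namespace set to cephNamespace (a constant defined elsewhere in
// this package), each gauge above is exported with that prefix, e.g. the
// "degraded_pgs" gauge appears as <cephNamespace>_degraded_pgs in /metrics.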

func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
	return []prometheus.Metric{
		c.DegradedPGs,
		c.UncleanPGs,
		c.UndersizedPGs,
		c.StalePGs,
		c.DegradedObjectsCount,
		c.OSDsDown,
		c.OSDsUp,
		c.OSDsIn,
		c.OSDsNum,
		c.RemappedPGs,
	}
}

type cephHealthStats struct {
	Health struct {
		Summary []struct {
			Severity string `json:"severity"`
			Summary  string `json:"summary"`
		} `json:"summary"`
	} `json:"health"`
	OSDMap struct {
		OSDMap struct {
			NumOSDs        json.Number `json:"num_osds"`
			NumUpOSDs      json.Number `json:"num_up_osds"`
			NumInOSDs      json.Number `json:"num_in_osds"`
			NumRemappedPGs json.Number `json:"num_remapped_pgs"`
		} `json:"osdmap"`
	} `json:"osdmap"`
}
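
// cephHealthStats mirrors the relevant parts of the JSON returned by the
// "status" mon command. An abridged, illustrative example of that output
// (all values are hypothetical):
//
//	{
//	  "health": {
//	    "summary": [
//	      {"severity": "HEALTH_WARN", "summary": "4 pgs degraded"}
//	    ]
//	  },
//	  "osdmap": {
//	    "osdmap": {"num_osds": 3, "num_up_osds": 3, "num_in_osds": 3, "num_remapped_pgs": 0}
//	  }
//	}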

func (c *ClusterHealthCollector) collect() error {
	cmd := c.cephUsageCommand()
	buf, _, err := c.conn.MonCommand(cmd)
	if err != nil {
		return err
	}

	stats := &cephHealthStats{}
	if err := json.Unmarshal(buf, stats); err != nil {
		return err
	}

	// Reset every gauge to zero first: the health summary only lists problem
	// states, so a state that has since cleared would otherwise retain its
	// value from a previous scrape.
	for _, metric := range c.metricsList() {
		if gauge, ok := metric.(prometheus.Gauge); ok {
			gauge.Set(0)
		}
	}

	var (
		degradedRegex        = regexp.MustCompile(`([\d]+) pgs degraded`)
		uncleanRegex         = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
		undersizedRegex      = regexp.MustCompile(`([\d]+) pgs undersized`)
		staleRegex           = regexp.MustCompile(`([\d]+) pgs stale`)
		degradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
		osdsDownRegex        = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
	)
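
	// Illustrative example (hypothetical value): for a health summary line
	// such as "4 pgs degraded", degradedRegex.FindStringSubmatch returns
	// ["4 pgs degraded", "4"], so matched[1] carries the count parsed below.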
	for _, s := range stats.Health.Summary {
		matched := degradedRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 2 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.DegradedPGs.Set(float64(v))
		}

		matched = uncleanRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 2 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.UncleanPGs.Set(float64(v))
		}

		matched = undersizedRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 2 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.UndersizedPGs.Set(float64(v))
		}

		matched = staleRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 2 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.StalePGs.Set(float64(v))
		}

		matched = degradedObjectsRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 3 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.DegradedObjectsCount.Set(float64(v))
		}

		matched = osdsDownRegex.FindStringSubmatch(s.Summary)
		if len(matched) == 3 {
			v, err := strconv.Atoi(matched[1])
			if err != nil {
				return err
			}
			c.OSDsDown.Set(float64(v))
		}
	}

	osdsUp, err := stats.OSDMap.OSDMap.NumUpOSDs.Float64()
	if err != nil {
		return err
	}
	c.OSDsUp.Set(osdsUp)

	osdsIn, err := stats.OSDMap.OSDMap.NumInOSDs.Float64()
	if err != nil {
		return err
	}
	c.OSDsIn.Set(osdsIn)

	osdsNum, err := stats.OSDMap.OSDMap.NumOSDs.Float64()
	if err != nil {
		return err
	}
	c.OSDsNum.Set(osdsNum)

	remappedPGs, err := stats.OSDMap.OSDMap.NumRemappedPGs.Float64()
	if err != nil {
		return err
	}
	c.RemappedPGs.Set(remappedPGs)

	return nil
}

func (c *ClusterHealthCollector) cephUsageCommand() []byte {
	cmd, err := json.Marshal(map[string]interface{}{
		"prefix": "status",
		"format": "json",
	})
	if err != nil {
		// Marshaling this hard-coded map should never fail, so a failure
		// here indicates a programming error; panic.
		panic(err)
	}
	return cmd
}
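
// For reference, the marshaled command is (encoding/json sorts map keys):
//
//	{"format":"json","prefix":"status"}
//
// i.e. the mon-command equivalent of running `ceph status --format json`.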

// Describe sends all the descriptions of individual metrics of ClusterHealthCollector
// to the provided prometheus channel.
func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
	for _, metric := range c.metricsList() {
		ch <- metric.Desc()
	}
}

// Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization.
func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
	if err := c.collect(); err != nil {
		log.Println("failed collecting metrics:", err)
		return
	}

	for _, metric := range c.metricsList() {
		ch <- metric
	}
}
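
// Usage sketch (illustrative, not part of this file): given a value `conn`
// satisfying the Conn interface, the collector registers like any other
// prometheus.Collector, e.g.:
//
//	prometheus.MustRegister(NewClusterHealthCollector(conn))
//	http.Handle("/metrics", prometheus.Handler())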