diff --git a/ceph/cluster_usage.go b/ceph/cluster_usage.go index 9518a6a..d277bae 100644 --- a/ceph/cluster_usage.go +++ b/ceph/cluster_usage.go @@ -30,9 +30,8 @@ const ( // is growing or shrinking as a whole in order to zero in on the cause. The // pool specific stats are provided separately. type ClusterUsageCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // GlobalCapacity displays the total storage capacity of the cluster. This // information is based on the actual no. of objects that are @@ -55,9 +54,8 @@ func NewClusterUsageCollector(exporter *Exporter) *ClusterUsageCollector { labels["cluster"] = exporter.Cluster return &ClusterUsageCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, GlobalCapacity: prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -106,7 +104,6 @@ func (c *ClusterUsageCollector) collect() error { return err } - stats := &cephClusterStats{} if err := json.Unmarshal(buf, stats); err != nil { return err @@ -143,7 +140,7 @@ func (c *ClusterUsageCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends the metric values for each metric pertaining to the global // cluster usage over to the provided prometheus Metric channel. -func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric) { +func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric, version *Version) { c.logger.Debug("collecting cluster usage metrics") if err := c.collect(); err != nil { c.logger.WithError(err).Error("error collecting cluster usage metrics") diff --git a/ceph/crashes.go b/ceph/crashes.go index 425a3a6..06be930 100644 --- a/ceph/crashes.go +++ b/ceph/crashes.go @@ -31,9 +31,8 @@ var ( // This is NOT the same as new_crash_reports, that only counts new reports in the past // two weeks as reported by 'ceph health'. 
type CrashesCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger crashReportsDesc *prometheus.Desc } @@ -44,9 +43,8 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector { labels["cluster"] = exporter.Cluster collector := &CrashesCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, crashReportsDesc: prometheus.NewDesc( fmt.Sprintf("%s_crash_reports", cephNamespace), @@ -106,7 +104,7 @@ func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) { } // Collect sends all the collected metrics Prometheus. -func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric) { +func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric, version *Version) { crashes, err := c.getCrashLs() if err != nil { c.logger.WithError(err).Error("failed to run 'ceph crash ls'") diff --git a/ceph/exporter.go b/ceph/exporter.go index 78a406e..f052e8a 100644 --- a/ceph/exporter.go +++ b/ceph/exporter.go @@ -226,7 +226,26 @@ func (exporter *Exporter) Describe(ch chan<- *prometheus.Desc) { } for _, cc := range exporter.cc { - cc.Describe(ch) + switch cc.(type) { + case *ClusterUsageCollector: + cc.(*ClusterUsageCollector).Describe(ch) + case *PoolUsageCollector: + cc.(*PoolUsageCollector).Describe(ch) + case *PoolInfoCollector: + cc.(*PoolInfoCollector).Describe(ch) + case *ClusterHealthCollector: + cc.(*ClusterHealthCollector).Describe(ch) + case *MonitorCollector: + cc.(*MonitorCollector).Describe(ch) + case *OSDCollector: + cc.(*OSDCollector).Describe(ch) + case *CrashesCollector: + cc.(*CrashesCollector).Describe(ch) + case *RbdMirrorStatusCollector: + cc.(*RbdMirrorStatusCollector).Describe(ch) + case *RGWCollector: + cc.(*RGWCollector).Describe(ch) + } } } @@ -250,6 +269,25 @@ func (exporter *Exporter) Collect(ch chan<- prometheus.Metric) { } for _, cc := range exporter.cc { - cc.Collect(ch) + switch cc.(type) { + case 
*ClusterUsageCollector: + cc.(*ClusterUsageCollector).Collect(ch, exporter.Version) + case *PoolUsageCollector: + cc.(*PoolUsageCollector).Collect(ch, exporter.Version) + case *PoolInfoCollector: + cc.(*PoolInfoCollector).Collect(ch, exporter.Version) + case *ClusterHealthCollector: + cc.(*ClusterHealthCollector).Collect(ch, exporter.Version) + case *MonitorCollector: + cc.(*MonitorCollector).Collect(ch, exporter.Version) + case *OSDCollector: + cc.(*OSDCollector).Collect(ch, exporter.Version) + case *CrashesCollector: + cc.(*CrashesCollector).Collect(ch, exporter.Version) + case *RbdMirrorStatusCollector: + cc.(*RbdMirrorStatusCollector).Collect(ch, exporter.Version) + case *RGWCollector: + cc.(*RGWCollector).Collect(ch, exporter.Version) + } } } diff --git a/ceph/health.go b/ceph/health.go index 91d3026..1778098 100644 --- a/ceph/health.go +++ b/ceph/health.go @@ -47,9 +47,8 @@ var ( // It surfaces changes in the ceph parameters unlike data usage that ClusterUsageCollector // does. type ClusterHealthCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // healthChecksMap stores warnings and their criticality healthChecksMap map[string]int @@ -287,9 +286,8 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector { labels["cluster"] = exporter.Cluster collector := &ClusterHealthCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, healthChecksMap: map[string]int{ "AUTH_BAD_CAPS": 2, @@ -558,13 +556,6 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector { "notieragent": &collector.OSDMapFlagNoTierAgent, } - if exporter.Version.IsAtLeast(Pacific) { - // pacific adds the DAEMON_OLD_VERSION health check - // that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay - // we'll interpret this is a critical warning (2) - 
collector.healthChecksMap["DAEMON_OLD_VERSION"] = 2 - } - return collector } @@ -724,7 +715,7 @@ type cephHealthStats struct { } `json:"servicemap"` } -func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { +func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric, version *Version) error { cmd := c.cephUsageCommand(jsonFormat) buf, _, err := c.conn.MonCommand(cmd) if err != nil { @@ -883,6 +874,14 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { } } } + + if version.IsAtLeast(Pacific) { + // pacific adds the DAEMON_OLD_VERSION health check + // that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay + // we'll interpret this as a critical warning (2) + c.healthChecksMap["DAEMON_OLD_VERSION"] = 2 + } + if !mapEmpty { if val, present := c.healthChecksMap[k]; present { c.HealthStatusInterpreter.Set(float64(val)) @@ -991,7 +990,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { ch <- prometheus.MustNewConstMetric(c.CachePromoteIOOps, prometheus.GaugeValue, stats.PGMap.CachePromoteOpPerSec) var actualOsdMap osdMap - if c.version.IsAtLeast(Octopus) { + if version.IsAtLeast(Octopus) { if stats.OSDMap != nil { actualOsdMap = osdMap{ NumOSDs: stats.OSDMap["num_osds"].(float64), @@ -1031,7 +1030,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { activeMgr := 0 standByMgrs := 0 - if c.version.IsAtLeast(Octopus) { + if version.IsAtLeast(Octopus) { if stats.MgrMap.Available { activeMgr = 1 } @@ -1334,9 +1333,9 @@ func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided prometheus channel. // It requires the caller to handle synchronization. 
-func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) { +func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric, version *Version) { c.logger.Debug("collecting cluster health metrics") - if err := c.collect(ch); err != nil { + if err := c.collect(ch, version); err != nil { c.logger.WithError(err).Error("error collecting cluster health metrics " + err.Error()) } diff --git a/ceph/monitors.go b/ceph/monitors.go index 03734d0..54942ed 100644 --- a/ceph/monitors.go +++ b/ceph/monitors.go @@ -32,9 +32,8 @@ var versionRegexp = regexp.MustCompile(`ceph version (?P\d+\.\d+\.\ // to each monitor instance, there are various vector metrics we // need to use. type MonitorCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // TotalKBs display the total storage a given monitor node has. TotalKBs *prometheus.GaugeVec @@ -96,9 +95,8 @@ func NewMonitorCollector(exporter *Exporter) *MonitorCollector { labels["cluster"] = exporter.Cluster return &MonitorCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, TotalKBs: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -553,7 +551,7 @@ func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the given metrics from the Monitors and sends it to the prometheus // channel. 
-func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) { +func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric, version *Version) { m.logger.Debug("collecting ceph monitor metrics") if err := m.collect(); err != nil { m.logger.WithError(err).Error("error collecting ceph monitor metrics") diff --git a/ceph/osd.go b/ceph/osd.go index 4f57e6e..23e573c 100644 --- a/ceph/osd.go +++ b/ceph/osd.go @@ -40,9 +40,8 @@ const ( // An important aspect of monitoring OSDs is to ensure that when the cluster is // up and running that all OSDs that are in the cluster are up and running, too type OSDCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // osdScrubCache holds the cache of previous PG scrubs osdScrubCache map[int]int @@ -152,9 +151,6 @@ type OSDCollector struct { OldestInactivePG prometheus.Gauge } -// This ensures OSDCollector implements interface prometheus.Collector. -var _ prometheus.Collector = &OSDCollector{} - // NewOSDCollector creates an instance of the OSDCollector and instantiates the // individual metrics that show information about the OSD. func NewOSDCollector(exporter *Exporter) *OSDCollector { @@ -163,9 +159,8 @@ func NewOSDCollector(exporter *Exporter) *OSDCollector { osdLabels := []string{"osd", "device_class", "host", "rack", "root"} return &OSDCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, osdScrubCache: make(map[int]int), osdLabelsCache: make(map[int64]*cephOSDLabel), @@ -1119,7 +1114,7 @@ func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided Prometheus channel. // It requires the caller to handle synchronization. 
-func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) { +func (o *OSDCollector) Collect(ch chan<- prometheus.Metric, version *Version) { // Reset daemon specifc metrics; daemons can leave the cluster o.CrushWeight.Reset() o.Depth.Reset() diff --git a/ceph/pool.go b/ceph/pool.go index 33da01b..a059b06 100644 --- a/ceph/pool.go +++ b/ceph/pool.go @@ -32,9 +32,8 @@ const ( // PoolInfoCollector gives information about each pool that exists in a given // ceph cluster. type PoolInfoCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // PGNum contains the count of PGs allotted to a particular pool. PGNum *prometheus.GaugeVec @@ -75,9 +74,8 @@ func NewPoolInfoCollector(exporter *Exporter) *PoolInfoCollector { labels["cluster"] = exporter.Cluster return &PoolInfoCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, PGNum: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -261,7 +259,7 @@ func (p *PoolInfoCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the current values of all the metrics and sends them to the // prometheus channel. -func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric) { +func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric, version *Version) { p.logger.Debug("collecting pool metrics") if err := p.collect(); err != nil { p.logger.WithError(err).Error("error collecting pool metrics") diff --git a/ceph/pool_usage.go b/ceph/pool_usage.go index 2c64865..3846445 100644 --- a/ceph/pool_usage.go +++ b/ceph/pool_usage.go @@ -25,9 +25,8 @@ import ( // PoolUsageCollector displays statistics about each pool in the Ceph cluster. type PoolUsageCollector struct { - conn Conn - logger *logrus.Logger - version *Version + conn Conn + logger *logrus.Logger // UsedBytes tracks the amount of bytes currently allocated for the pool. 
This // does not factor in the overcommitment made for individual images. @@ -80,9 +79,8 @@ func NewPoolUsageCollector(exporter *Exporter) *PoolUsageCollector { labels["cluster"] = exporter.Cluster return &PoolUsageCollector{ - conn: exporter.Conn, - logger: exporter.Logger, - version: exporter.Version, + conn: exporter.Conn, + logger: exporter.Logger, UsedBytes: prometheus.NewDesc(fmt.Sprintf("%s_%s_used_bytes", cephNamespace, subSystem), "Capacity of the pool that is currently under use", poolLabel, labels, @@ -213,7 +211,7 @@ func (p *PoolUsageCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the current values of all the metrics and sends them to the // prometheus channel. -func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric) { +func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric, version *Version) { p.logger.Debug("collecting pool usage metrics") if err := p.collect(ch); err != nil { p.logger.WithError(err).Error("error collecting pool usage metrics") diff --git a/ceph/rbd_mirror_status.go b/ceph/rbd_mirror_status.go index 62a42c4..d18022f 100644 --- a/ceph/rbd_mirror_status.go +++ b/ceph/rbd_mirror_status.go @@ -155,7 +155,7 @@ func (c *RbdMirrorStatusCollector) Describe(ch chan<- *prometheus.Desc) { } // Collect sends all the collected metrics Prometheus. 
-func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric) { +func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric, version *Version) { status, err := rbdMirrorStatus(c.config, c.user) if err != nil { c.logger.WithError(err).Error("failed to run 'rbd mirror pool status'") @@ -166,6 +166,7 @@ func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric) { } c.RbdMirrorStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.Health)) + c.version = version if c.version.IsAtLeast(Pacific) { c.RbdMirrorDaemonStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.DaemonHealth)) diff --git a/ceph/rgw.go b/ceph/rgw.go index 98d8290..61481d2 100644 --- a/ceph/rgw.go +++ b/ceph/rgw.go @@ -76,7 +76,6 @@ type RGWCollector struct { user string background bool logger *logrus.Logger - version *Version // ActiveTasks reports the number of (expired) RGW GC tasks ActiveTasks *prometheus.GaugeVec @@ -101,7 +100,6 @@ func NewRGWCollector(exporter *Exporter, background bool) *RGWCollector { config: exporter.Config, background: background, logger: exporter.Logger, - version: exporter.Version, getRGWGCTaskList: rgwGetGCTaskList, ActiveTasks: prometheus.NewGaugeVec( @@ -219,7 +217,7 @@ func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided prometheus channel. // It requires the caller to handle synchronization. -func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) { +func (r *RGWCollector) Collect(ch chan<- prometheus.Metric, version *Version) { if !r.background { r.logger.WithField("background", r.background).Debug("collecting RGW GC stats") err := r.collect()