pass version to collectors when calling Collect()

This commit is contained in:
Alex Marangone 2023-02-14 11:10:54 -08:00
parent 69edc55596
commit ba15bf50a3
10 changed files with 90 additions and 70 deletions

View File

@ -30,9 +30,8 @@ const (
// is growing or shrinking as a whole in order to zero in on the cause. The // is growing or shrinking as a whole in order to zero in on the cause. The
// pool specific stats are provided separately. // pool specific stats are provided separately.
type ClusterUsageCollector struct { type ClusterUsageCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// GlobalCapacity displays the total storage capacity of the cluster. This // GlobalCapacity displays the total storage capacity of the cluster. This
// information is based on the actual no. of objects that are // information is based on the actual no. of objects that are
@ -55,9 +54,8 @@ func NewClusterUsageCollector(exporter *Exporter) *ClusterUsageCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
return &ClusterUsageCollector{ return &ClusterUsageCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
GlobalCapacity: prometheus.NewGauge(prometheus.GaugeOpts{ GlobalCapacity: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: cephNamespace, Namespace: cephNamespace,
@ -106,7 +104,6 @@ func (c *ClusterUsageCollector) collect() error {
return err return err
} }
stats := &cephClusterStats{} stats := &cephClusterStats{}
if err := json.Unmarshal(buf, stats); err != nil { if err := json.Unmarshal(buf, stats); err != nil {
return err return err
@ -143,7 +140,7 @@ func (c *ClusterUsageCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect sends the metric values for each metric pertaining to the global // Collect sends the metric values for each metric pertaining to the global
// cluster usage over to the provided prometheus Metric channel. // cluster usage over to the provided prometheus Metric channel.
func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric) { func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
c.logger.Debug("collecting cluster usage metrics") c.logger.Debug("collecting cluster usage metrics")
if err := c.collect(); err != nil { if err := c.collect(); err != nil {
c.logger.WithError(err).Error("error collecting cluster usage metrics") c.logger.WithError(err).Error("error collecting cluster usage metrics")

View File

@ -31,9 +31,8 @@ var (
// This is NOT the same as new_crash_reports, that only counts new reports in the past // This is NOT the same as new_crash_reports, that only counts new reports in the past
// two weeks as reported by 'ceph health'. // two weeks as reported by 'ceph health'.
type CrashesCollector struct { type CrashesCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
crashReportsDesc *prometheus.Desc crashReportsDesc *prometheus.Desc
} }
@ -44,9 +43,8 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
collector := &CrashesCollector{ collector := &CrashesCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
crashReportsDesc: prometheus.NewDesc( crashReportsDesc: prometheus.NewDesc(
fmt.Sprintf("%s_crash_reports", cephNamespace), fmt.Sprintf("%s_crash_reports", cephNamespace),
@ -106,7 +104,7 @@ func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) {
} }
// Collect sends all the collected metrics Prometheus. // Collect sends all the collected metrics Prometheus.
func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric) { func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
crashes, err := c.getCrashLs() crashes, err := c.getCrashLs()
if err != nil { if err != nil {
c.logger.WithError(err).Error("failed to run 'ceph crash ls'") c.logger.WithError(err).Error("failed to run 'ceph crash ls'")

View File

@ -226,7 +226,26 @@ func (exporter *Exporter) Describe(ch chan<- *prometheus.Desc) {
} }
for _, cc := range exporter.cc { for _, cc := range exporter.cc {
cc.Describe(ch) switch cc.(type) {
case *ClusterUsageCollector:
cc.(*ClusterUsageCollector).Describe(ch)
case *PoolUsageCollector:
cc.(*PoolUsageCollector).Describe(ch)
case *PoolInfoCollector:
cc.(*PoolInfoCollector).Describe(ch)
case *ClusterHealthCollector:
cc.(*ClusterHealthCollector).Describe(ch)
case *MonitorCollector:
cc.(*MonitorCollector).Describe(ch)
case *OSDCollector:
cc.(*OSDCollector).Describe(ch)
case *CrashesCollector:
cc.(*CrashesCollector).Describe(ch)
case *RbdMirrorStatusCollector:
cc.(*RbdMirrorStatusCollector).Describe(ch)
case *RGWCollector:
cc.(*RGWCollector).Describe(ch)
}
} }
} }
@ -250,6 +269,25 @@ func (exporter *Exporter) Collect(ch chan<- prometheus.Metric) {
} }
for _, cc := range exporter.cc { for _, cc := range exporter.cc {
cc.Collect(ch) switch cc.(type) {
case *ClusterUsageCollector:
cc.(*ClusterUsageCollector).Collect(ch, exporter.Version)
case *PoolUsageCollector:
cc.(*PoolUsageCollector).Collect(ch, exporter.Version)
case *PoolInfoCollector:
cc.(*PoolInfoCollector).Collect(ch, exporter.Version)
case *ClusterHealthCollector:
cc.(*ClusterHealthCollector).Collect(ch, exporter.Version)
case *MonitorCollector:
cc.(*MonitorCollector).Collect(ch, exporter.Version)
case *OSDCollector:
cc.(*OSDCollector).Collect(ch, exporter.Version)
case *CrashesCollector:
cc.(*CrashesCollector).Collect(ch, exporter.Version)
case *RbdMirrorStatusCollector:
cc.(*RbdMirrorStatusCollector).Collect(ch, exporter.Version)
case *RGWCollector:
cc.(*RGWCollector).Collect(ch, exporter.Version)
}
} }
} }

View File

@ -47,9 +47,8 @@ var (
// It surfaces changes in the ceph parameters unlike data usage that ClusterUsageCollector // It surfaces changes in the ceph parameters unlike data usage that ClusterUsageCollector
// does. // does.
type ClusterHealthCollector struct { type ClusterHealthCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// healthChecksMap stores warnings and their criticality // healthChecksMap stores warnings and their criticality
healthChecksMap map[string]int healthChecksMap map[string]int
@ -287,9 +286,8 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
collector := &ClusterHealthCollector{ collector := &ClusterHealthCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
healthChecksMap: map[string]int{ healthChecksMap: map[string]int{
"AUTH_BAD_CAPS": 2, "AUTH_BAD_CAPS": 2,
@ -558,13 +556,6 @@ func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
"notieragent": &collector.OSDMapFlagNoTierAgent, "notieragent": &collector.OSDMapFlagNoTierAgent,
} }
if exporter.Version.IsAtLeast(Pacific) {
// pacific adds the DAEMON_OLD_VERSION health check
// that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay
// we'll interpret this as a critical warning (2)
collector.healthChecksMap["DAEMON_OLD_VERSION"] = 2
}
return collector return collector
} }
@ -724,7 +715,7 @@ type cephHealthStats struct {
} `json:"servicemap"` } `json:"servicemap"`
} }
func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric, version *Version) error {
cmd := c.cephUsageCommand(jsonFormat) cmd := c.cephUsageCommand(jsonFormat)
buf, _, err := c.conn.MonCommand(cmd) buf, _, err := c.conn.MonCommand(cmd)
if err != nil { if err != nil {
@ -883,6 +874,14 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
} }
} }
} }
if version.IsAtLeast(Pacific) {
// pacific adds the DAEMON_OLD_VERSION health check
// that indicates that multiple versions of Ceph have been running for longer than mon_warn_older_version_delay
// we'll interpret this as a critical warning (2)
c.healthChecksMap["DAEMON_OLD_VERSION"] = 2
}
if !mapEmpty { if !mapEmpty {
if val, present := c.healthChecksMap[k]; present { if val, present := c.healthChecksMap[k]; present {
c.HealthStatusInterpreter.Set(float64(val)) c.HealthStatusInterpreter.Set(float64(val))
@ -991,7 +990,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
ch <- prometheus.MustNewConstMetric(c.CachePromoteIOOps, prometheus.GaugeValue, stats.PGMap.CachePromoteOpPerSec) ch <- prometheus.MustNewConstMetric(c.CachePromoteIOOps, prometheus.GaugeValue, stats.PGMap.CachePromoteOpPerSec)
var actualOsdMap osdMap var actualOsdMap osdMap
if c.version.IsAtLeast(Octopus) { if version.IsAtLeast(Octopus) {
if stats.OSDMap != nil { if stats.OSDMap != nil {
actualOsdMap = osdMap{ actualOsdMap = osdMap{
NumOSDs: stats.OSDMap["num_osds"].(float64), NumOSDs: stats.OSDMap["num_osds"].(float64),
@ -1031,7 +1030,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
activeMgr := 0 activeMgr := 0
standByMgrs := 0 standByMgrs := 0
if c.version.IsAtLeast(Octopus) { if version.IsAtLeast(Octopus) {
if stats.MgrMap.Available { if stats.MgrMap.Available {
activeMgr = 1 activeMgr = 1
} }
@ -1334,9 +1333,9 @@ func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect sends all the collected metrics to the provided prometheus channel. // Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization. // It requires the caller to handle synchronization.
func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) { func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
c.logger.Debug("collecting cluster health metrics") c.logger.Debug("collecting cluster health metrics")
if err := c.collect(ch); err != nil { if err := c.collect(ch, version); err != nil {
c.logger.WithError(err).Error("error collecting cluster health metrics " + err.Error()) c.logger.WithError(err).Error("error collecting cluster health metrics " + err.Error())
} }

View File

@ -32,9 +32,8 @@ var versionRegexp = regexp.MustCompile(`ceph version (?P<version_tag>\d+\.\d+\.\
// to each monitor instance, there are various vector metrics we // to each monitor instance, there are various vector metrics we
// need to use. // need to use.
type MonitorCollector struct { type MonitorCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// TotalKBs display the total storage a given monitor node has. // TotalKBs display the total storage a given monitor node has.
TotalKBs *prometheus.GaugeVec TotalKBs *prometheus.GaugeVec
@ -96,9 +95,8 @@ func NewMonitorCollector(exporter *Exporter) *MonitorCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
return &MonitorCollector{ return &MonitorCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
TotalKBs: prometheus.NewGaugeVec( TotalKBs: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
@ -553,7 +551,7 @@ func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect extracts the given metrics from the Monitors and sends it to the prometheus // Collect extracts the given metrics from the Monitors and sends it to the prometheus
// channel. // channel.
func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) { func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
m.logger.Debug("collecting ceph monitor metrics") m.logger.Debug("collecting ceph monitor metrics")
if err := m.collect(); err != nil { if err := m.collect(); err != nil {
m.logger.WithError(err).Error("error collecting ceph monitor metrics") m.logger.WithError(err).Error("error collecting ceph monitor metrics")

View File

@ -40,9 +40,8 @@ const (
// An important aspect of monitoring OSDs is to ensure that when the cluster is // An important aspect of monitoring OSDs is to ensure that when the cluster is
// up and running that all OSDs that are in the cluster are up and running, too // up and running that all OSDs that are in the cluster are up and running, too
type OSDCollector struct { type OSDCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// osdScrubCache holds the cache of previous PG scrubs // osdScrubCache holds the cache of previous PG scrubs
osdScrubCache map[int]int osdScrubCache map[int]int
@ -152,9 +151,6 @@ type OSDCollector struct {
OldestInactivePG prometheus.Gauge OldestInactivePG prometheus.Gauge
} }
// This ensures OSDCollector implements interface prometheus.Collector.
var _ prometheus.Collector = &OSDCollector{}
// NewOSDCollector creates an instance of the OSDCollector and instantiates the // NewOSDCollector creates an instance of the OSDCollector and instantiates the
// individual metrics that show information about the OSD. // individual metrics that show information about the OSD.
func NewOSDCollector(exporter *Exporter) *OSDCollector { func NewOSDCollector(exporter *Exporter) *OSDCollector {
@ -163,9 +159,8 @@ func NewOSDCollector(exporter *Exporter) *OSDCollector {
osdLabels := []string{"osd", "device_class", "host", "rack", "root"} osdLabels := []string{"osd", "device_class", "host", "rack", "root"}
return &OSDCollector{ return &OSDCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
osdScrubCache: make(map[int]int), osdScrubCache: make(map[int]int),
osdLabelsCache: make(map[int64]*cephOSDLabel), osdLabelsCache: make(map[int64]*cephOSDLabel),
@ -1119,7 +1114,7 @@ func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect sends all the collected metrics to the provided Prometheus channel. // Collect sends all the collected metrics to the provided Prometheus channel.
// It requires the caller to handle synchronization. // It requires the caller to handle synchronization.
func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) { func (o *OSDCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
// Reset daemon specific metrics; daemons can leave the cluster // Reset daemon specific metrics; daemons can leave the cluster
o.CrushWeight.Reset() o.CrushWeight.Reset()
o.Depth.Reset() o.Depth.Reset()

View File

@ -32,9 +32,8 @@ const (
// PoolInfoCollector gives information about each pool that exists in a given // PoolInfoCollector gives information about each pool that exists in a given
// ceph cluster. // ceph cluster.
type PoolInfoCollector struct { type PoolInfoCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// PGNum contains the count of PGs allotted to a particular pool. // PGNum contains the count of PGs allotted to a particular pool.
PGNum *prometheus.GaugeVec PGNum *prometheus.GaugeVec
@ -75,9 +74,8 @@ func NewPoolInfoCollector(exporter *Exporter) *PoolInfoCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
return &PoolInfoCollector{ return &PoolInfoCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
PGNum: prometheus.NewGaugeVec( PGNum: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
@ -261,7 +259,7 @@ func (p *PoolInfoCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect extracts the current values of all the metrics and sends them to the // Collect extracts the current values of all the metrics and sends them to the
// prometheus channel. // prometheus channel.
func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric) { func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
p.logger.Debug("collecting pool metrics") p.logger.Debug("collecting pool metrics")
if err := p.collect(); err != nil { if err := p.collect(); err != nil {
p.logger.WithError(err).Error("error collecting pool metrics") p.logger.WithError(err).Error("error collecting pool metrics")

View File

@ -25,9 +25,8 @@ import (
// PoolUsageCollector displays statistics about each pool in the Ceph cluster. // PoolUsageCollector displays statistics about each pool in the Ceph cluster.
type PoolUsageCollector struct { type PoolUsageCollector struct {
conn Conn conn Conn
logger *logrus.Logger logger *logrus.Logger
version *Version
// UsedBytes tracks the amount of bytes currently allocated for the pool. This // UsedBytes tracks the amount of bytes currently allocated for the pool. This
// does not factor in the overcommitment made for individual images. // does not factor in the overcommitment made for individual images.
@ -80,9 +79,8 @@ func NewPoolUsageCollector(exporter *Exporter) *PoolUsageCollector {
labels["cluster"] = exporter.Cluster labels["cluster"] = exporter.Cluster
return &PoolUsageCollector{ return &PoolUsageCollector{
conn: exporter.Conn, conn: exporter.Conn,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
UsedBytes: prometheus.NewDesc(fmt.Sprintf("%s_%s_used_bytes", cephNamespace, subSystem), "Capacity of the pool that is currently under use", UsedBytes: prometheus.NewDesc(fmt.Sprintf("%s_%s_used_bytes", cephNamespace, subSystem), "Capacity of the pool that is currently under use",
poolLabel, labels, poolLabel, labels,
@ -213,7 +211,7 @@ func (p *PoolUsageCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect extracts the current values of all the metrics and sends them to the // Collect extracts the current values of all the metrics and sends them to the
// prometheus channel. // prometheus channel.
func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric) { func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
p.logger.Debug("collecting pool usage metrics") p.logger.Debug("collecting pool usage metrics")
if err := p.collect(ch); err != nil { if err := p.collect(ch); err != nil {
p.logger.WithError(err).Error("error collecting pool usage metrics") p.logger.WithError(err).Error("error collecting pool usage metrics")

View File

@ -155,7 +155,7 @@ func (c *RbdMirrorStatusCollector) Describe(ch chan<- *prometheus.Desc) {
} }
// Collect sends all the collected metrics Prometheus. // Collect sends all the collected metrics Prometheus.
func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric) { func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
status, err := rbdMirrorStatus(c.config, c.user) status, err := rbdMirrorStatus(c.config, c.user)
if err != nil { if err != nil {
c.logger.WithError(err).Error("failed to run 'rbd mirror pool status'") c.logger.WithError(err).Error("failed to run 'rbd mirror pool status'")
@ -166,6 +166,7 @@ func (c *RbdMirrorStatusCollector) Collect(ch chan<- prometheus.Metric) {
} }
c.RbdMirrorStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.Health)) c.RbdMirrorStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.Health))
c.version = version
if c.version.IsAtLeast(Pacific) { if c.version.IsAtLeast(Pacific) {
c.RbdMirrorDaemonStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.DaemonHealth)) c.RbdMirrorDaemonStatus.Set(c.mirrorStatusStringToInt(rbdStatus.Summary.DaemonHealth))

View File

@ -76,7 +76,6 @@ type RGWCollector struct {
user string user string
background bool background bool
logger *logrus.Logger logger *logrus.Logger
version *Version
// ActiveTasks reports the number of (expired) RGW GC tasks // ActiveTasks reports the number of (expired) RGW GC tasks
ActiveTasks *prometheus.GaugeVec ActiveTasks *prometheus.GaugeVec
@ -101,7 +100,6 @@ func NewRGWCollector(exporter *Exporter, background bool) *RGWCollector {
config: exporter.Config, config: exporter.Config,
background: background, background: background,
logger: exporter.Logger, logger: exporter.Logger,
version: exporter.Version,
getRGWGCTaskList: rgwGetGCTaskList, getRGWGCTaskList: rgwGetGCTaskList,
ActiveTasks: prometheus.NewGaugeVec( ActiveTasks: prometheus.NewGaugeVec(
@ -219,7 +217,7 @@ func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect sends all the collected metrics to the provided prometheus channel. // Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization. // It requires the caller to handle synchronization.
func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) { func (r *RGWCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
if !r.background { if !r.background {
r.logger.WithField("background", r.background).Debug("collecting RGW GC stats") r.logger.WithField("background", r.background).Debug("collecting RGW GC stats")
err := r.collect() err := r.collect()