ceph_exporter/ceph/health.go

// Copyright 2022 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ceph
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
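// The regexes below parse fragments of the plain-format "ceph status" output.
// Illustrative (paraphrased, not captured from a live cluster) examples of the
// lines they are meant to match:
//
//   recovery io 1024 MB/s, 512 keys/s, 128 objects/s
//   client io 200 MB/s rd, 100 MB/s wr, 500 op/s rd, 250 op/s wr
//   cache io 64 MB/s flush, 32 MB/s evict, 20 op/s promote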
var (
recoveryIORateRegex = regexp.MustCompile(`(\d+) (\w{2})/s`)
recoveryIOKeysRegex = regexp.MustCompile(`(\d+) keys/s`)
recoveryIOObjectsRegex = regexp.MustCompile(`(\d+) objects/s`)
clientReadBytesPerSecRegex = regexp.MustCompile(`(\d+) ([kKmMgG][bB])/s rd`)
clientWriteBytesPerSecRegex = regexp.MustCompile(`(\d+) ([kKmMgG][bB])/s wr`)
clientIOReadOpsRegex = regexp.MustCompile(`(\d+) op/s rd`)
clientIOWriteOpsRegex = regexp.MustCompile(`(\d+) op/s wr`)
cacheFlushRateRegex = regexp.MustCompile(`(\d+) ([kKmMgG][bB])/s flush`)
cacheEvictRateRegex = regexp.MustCompile(`(\d+) ([kKmMgG][bB])/s evict`)
cachePromoteOpsRegex = regexp.MustCompile(`(\d+) op/s promote`)
// Older versions of Ceph, hammer (v0.94) and below, support this format.
clientIOOpsRegex = regexp.MustCompile(`(\d+) op/s[^ \w]*$`)
)
// ClusterHealthCollector collects information about the health of an overall cluster.
// It surfaces changes in ceph health parameters, unlike the data-usage metrics that
// ClusterUsageCollector reports.
type ClusterHealthCollector struct {
conn Conn
logger *logrus.Logger
version *Version
// healthChecksMap maps health check names to their criticality:
// 1 for a soft warning, 2 for a critical warning (see HealthStatusInterpreter).
healthChecksMap map[string]int
// HealthStatus shows the overall health status of a given cluster.
HealthStatus prometheus.Gauge
// HealthStatusInterpreter shows the overall health status of a given
// cluster, with a breakdown of the HEALTH_WARN status into two groups
// based on criticality.
HealthStatusInterpreter prometheus.Gauge
// MONsDown shows the no. of Monitors that are in the DOWN state.
MONsDown prometheus.Gauge
// TotalPGs shows the total no. of PGs in the cluster.
TotalPGs prometheus.Gauge
// PGState contains the count of PGs in each state, labelled with the state name.
PGState *prometheus.GaugeVec
// ActivePGs shows the no. of PGs the cluster is actively serving data
// from.
ActivePGs prometheus.Gauge
// DegradedPGs shows the no. of PGs that have some of the replicas
// missing.
DegradedPGs prometheus.Gauge
// StuckDegradedPGs shows the no. of PGs that have some of the replicas
// missing, and are stuck in that state.
StuckDegradedPGs prometheus.Gauge
// UncleanPGs shows the no. of PGs that do not have all objects in the PG
// that are supposed to be in it.
UncleanPGs prometheus.Gauge
// StuckUncleanPGs shows the no. of PGs that do not have all objects in the PG
// that are supposed to be in it, and are stuck in that state.
StuckUncleanPGs prometheus.Gauge
// UndersizedPGs depicts the no. of PGs that have fewer copies than configured
// replication level.
UndersizedPGs prometheus.Gauge
// StuckUndersizedPGs depicts the no. of PGs that have fewer copies than configured
// replication level, and are stuck in that state.
StuckUndersizedPGs prometheus.Gauge
// StalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
// anything about their latest state since their pg mapping was modified.
StalePGs prometheus.Gauge
// StuckStalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
// anything about their latest state since their pg mapping was modified, and are stuck
// in that state.
StuckStalePGs prometheus.Gauge
// PeeringPGs depicts no. of PGs whose OSDs are undergoing state changes that need
// to be communicated to the remaining peers.
PeeringPGs prometheus.Gauge
// ScrubbingPGs depicts no. of PGs that are in scrubbing state.
// Light scrubbing checks the object size and attributes.
ScrubbingPGs prometheus.Gauge
// DeepScrubbingPGs depicts no. of PGs that are in scrubbing+deep state.
// Deep scrubbing reads the data and uses checksums to ensure data integrity.
DeepScrubbingPGs prometheus.Gauge
// RecoveringPGs depicts no. of PGs that are in recovering state.
// The PGs in this state have been dequeued from recovery_wait queue and are
// actively undergoing recovery.
RecoveringPGs prometheus.Gauge
// RecoveryWaitPGs depicts no. of PGs that are in recovery_wait state.
// The PGs in this state are still in queue to start recovery on them.
RecoveryWaitPGs prometheus.Gauge
// BackfillingPGs depicts no. of PGs that are in backfilling state.
// The PGs in this state have been dequeued from backfill_wait queue and are
// actively undergoing backfill.
BackfillingPGs prometheus.Gauge
// BackfillWaitPGs depicts no. of PGs that are in backfill_wait state.
// The PGs in this state are still in queue to start backfill on them.
BackfillWaitPGs prometheus.Gauge
// ForcedRecoveryPGs depicts no. of PGs that are undergoing forced recovery.
ForcedRecoveryPGs prometheus.Gauge
// ForcedBackfillPGs depicts no. of PGs that are undergoing forced backfill.
ForcedBackfillPGs prometheus.Gauge
// DownPGs depicts no. of PGs that are currently down and not able to serve traffic.
DownPGs prometheus.Gauge
// IncompletePGs depicts no. of PGs that are currently incomplete and not able to serve traffic.
IncompletePGs prometheus.Gauge
// InconsistentPGs depicts no. of PGs that are currently inconsistent
InconsistentPGs prometheus.Gauge
// SnaptrimPGs depicts no. of PGs that are currently snaptrimming
SnaptrimPGs prometheus.Gauge
// SnaptrimWaitPGs depicts no. of PGs that are currently waiting to snaptrim
SnaptrimWaitPGs prometheus.Gauge
// RepairingPGs depicts no. of PGs that are currently repairing
RepairingPGs prometheus.Gauge
// SlowOps depicts no. of total slow ops in the cluster
SlowOps prometheus.Gauge
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
// This includes object replicas in its count.
DegradedObjectsCount prometheus.Gauge
// MisplacedObjectsCount gives the no. of RADOS objects that constitute the misplaced PGs.
// Misplaced PGs usually represent the PGs that are not in the storage locations that
// they should be in. This is different from degraded PGs, where a PG has fewer copies
// than it should.
// This includes object replicas in its count.
MisplacedObjectsCount prometheus.Gauge
// MisplacedRatio shows the ratio of misplaced objects to total objects
MisplacedRatio prometheus.Gauge
// NewCrashReportCount reports the number of new Ceph daemon crash reports available.
NewCrashReportCount prometheus.Gauge
// TooManyRepairs reports the number of OSDs exceeding mon_osd_warn_num_repaired
TooManyRepairs prometheus.Gauge
// Objects shows the total no. of RADOS objects that are currently allocated.
Objects prometheus.Gauge
// OSDMap flags: each gauge is set to 1 when the corresponding osdmap flag is set on the cluster.
OSDMapFlagFull prometheus.Gauge
OSDMapFlagPauseRd prometheus.Gauge
OSDMapFlagPauseWr prometheus.Gauge
OSDMapFlagNoUp prometheus.Gauge
OSDMapFlagNoDown prometheus.Gauge
OSDMapFlagNoIn prometheus.Gauge
OSDMapFlagNoOut prometheus.Gauge
OSDMapFlagNoBackfill prometheus.Gauge
OSDMapFlagNoRecover prometheus.Gauge
OSDMapFlagNoRebalance prometheus.Gauge
OSDMapFlagNoScrub prometheus.Gauge
OSDMapFlagNoDeepScrub prometheus.Gauge
OSDMapFlagNoTierAgent prometheus.Gauge
// OSDsDown shows the no. of OSDs that are in the DOWN state.
OSDsDown prometheus.Gauge
// OSDsUp shows the no. of OSDs that are in the UP state and are able to serve requests.
OSDsUp prometheus.Gauge
// OSDsIn shows the no. of OSDs that are marked as IN in the cluster.
OSDsIn prometheus.Gauge
// OSDsNum shows the no. of total OSDs the cluster has.
OSDsNum prometheus.Gauge
// RemappedPGs shows the no. of PGs that are currently remapped and need to be moved
// to different OSDs.
RemappedPGs prometheus.Gauge
// RecoveryIORate shows the i/o rate at which the cluster is performing its ongoing
// recovery.
RecoveryIORate prometheus.Gauge
// RecoveryIOKeys shows the rate of rados keys recovery.
RecoveryIOKeys prometheus.Gauge
// RecoveryIOObjects shows the rate of rados objects being recovered.
RecoveryIOObjects prometheus.Gauge
// ClientReadBytesPerSec shows the total client read i/o on the cluster.
ClientReadBytesPerSec prometheus.Gauge
// ClientWriteBytesPerSec shows the total client write i/o on the cluster.
ClientWriteBytesPerSec prometheus.Gauge
// ClientIOOps shows the rate of total operations conducted by all clients on the cluster.
ClientIOOps prometheus.Gauge
// ClientIOReadOps shows the rate of total read operations conducted by all clients on the cluster.
ClientIOReadOps prometheus.Gauge
// ClientIOWriteOps shows the rate of total write operations conducted by all clients on the cluster.
ClientIOWriteOps prometheus.Gauge
// CacheFlushIORate shows the i/o rate at which data is being flushed from the cache pool.
CacheFlushIORate prometheus.Gauge
// CacheEvictIORate shows the i/o rate at which data is being evicted from the cache pool.
CacheEvictIORate prometheus.Gauge
// CachePromoteIOOps shows the rate of operations promoting objects to the cache pool.
CachePromoteIOOps prometheus.Gauge
// MgrsActive shows the number of active mgrs, can be either 0 or 1.
MgrsActive prometheus.Gauge
// MgrsNum shows the total number of mgrs, including standbys.
MgrsNum prometheus.Gauge
// RbdMirrorUp shows the alive rbd-mirror daemons
RbdMirrorUp *prometheus.Desc
}
const (
// CephHealthOK denotes the status of ceph cluster when healthy.
CephHealthOK = "HEALTH_OK"
// CephHealthWarn denotes the status of ceph cluster when unhealthy but recovering.
CephHealthWarn = "HEALTH_WARN"
// CephHealthErr denotes the status of ceph cluster when unhealthy but usually needs
// manual intervention.
CephHealthErr = "HEALTH_ERR"
)
// NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health
// metrics on.
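//
// Typical wiring (a minimal sketch; assumes an *Exporter whose Conn, Logger,
// Cluster and Version fields have already been populated elsewhere):
//
//   collector := NewClusterHealthCollector(exporter)
//   prometheus.MustRegister(collector)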
func NewClusterHealthCollector(exporter *Exporter) *ClusterHealthCollector {
labels := make(prometheus.Labels)
labels["cluster"] = exporter.Cluster
collector := &ClusterHealthCollector{
conn: exporter.Conn,
logger: exporter.Logger,
version: exporter.Version,
healthChecksMap: map[string]int{
"AUTH_BAD_CAPS": 2,
"BLUEFS_AVAILABLE_SPACE": 1,
"BLUEFS_LOW_SPACE": 1,
"BLUEFS_SPILLOVER": 1,
"BLUESTORE_DISK_SIZE_MISMATCH": 1,
"BLUESTORE_FRAGMENTATION": 1,
"BLUESTORE_LEGACY_STATFS": 1,
"BLUESTORE_NO_COMPRESSION": 1,
"BLUESTORE_NO_PER_POOL_MAP": 1,
"CACHE_POOL_NEAR_FULL": 1,
"CACHE_POOL_NO_HIT_SET": 1,
"DEVICE_HEALTH": 1,
"DEVICE_HEALTH_IN_USE": 2,
"DEVICE_HEALTH_TOOMANY": 2,
"LARGE_OMAP_OBJECTS": 1,
"MANY_OBJECTS_PER_PG": 1,
"MGR_DOWN": 2,
"MGR_MODULE_DEPENDENCY": 1,
"MGR_MODULE_ERROR": 2,
"MON_CLOCK_SKEW": 2,
"MON_DISK_BIG": 1,
"MON_DISK_CRIT": 2,
"MON_DISK_LOW": 2,
"MON_DOWN": 2,
"MON_MSGR2_NOT_ENABLED": 2,
"OBJECT_MISPLACED": 1,
"OBJECT_UNFOUND": 2,
"OLD_CRUSH_STRAW_CALC_VERSION": 1,
"OLD_CRUSH_TUNABLES": 2,
"OSDMAP_FLAGS": 1,
"OSD_BACKFILLFULL": 2,
"OSD_CHASSIS_DOWN": 1,
"OSD_DATACENTER_DOWN": 1,
"OSD_DOWN": 1,
"OSD_FLAGS": 1,
"OSD_FULL": 2,
"OSD_HOST_DOWN": 1,
"OSD_NEARFULL": 2,
"OSD_NO_DOWN_OUT_INTERVAL": 2,
"OSD_NO_SORTBITWISE": 2,
"OSD_ORPHAN": 2,
"OSD_OSD_DOWN": 1,
"OSD_OUT_OF_ORDER_FULL": 2,
"OSD_PDU_DOWN": 1,
"OSD_POD_DOWN": 1,
"OSD_RACK_DOWN": 1,
"OSD_REGION_DOWN": 1,
"OSD_ROOM_DOWN": 1,
"OSD_ROOT_DOWN": 1,
"OSD_ROW_DOWN": 1,
"OSD_SCRUB_ERRORS": 2,
"OSD_TOO_MANY_REPAIRS": 1,
"PG_AVAILABILITY": 1,
"PG_BACKFILL_FULL": 2,
"PG_DAMAGED": 2,
"PG_DEGRADED": 1,
"PG_NOT_DEEP_SCRUBBED": 1,
"PG_NOT_SCRUBBED": 1,
"PG_RECOVERY_FULL": 2,
"PG_SLOW_SNAP_TRIMMING": 1,
"POOL_APP_NOT_ENABLED": 2,
"POOL_FULL": 2,
"POOL_NEAR_FULL": 2,
"POOL_TARGET_SIZE_BYTES_OVERCOMMITTED": 1,
"POOL_TARGET_SIZE_RATIO_OVERCOMMITTED": 1,
"POOL_TOO_FEW_PGS": 1,
"POOL_TOO_MANY_PGS": 1,
"RECENT_CRASH": 1,
"SLOW_OPS": 1,
"SMALLER_PGP_NUM": 1,
"TELEMETRY_CHANGED": 1,
"TOO_FEW_OSDS": 1,
"TOO_FEW_PGS": 1,
"TOO_MANY_PGS": 1},
HealthStatus: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "health_status",
Help: "Health status of Cluster, can vary only between 3 states (err:2, warn:1, ok:0)",
ConstLabels: labels,
},
),
HealthStatusInterpreter: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "health_status_interp",
Help: "Health status of Cluster, can vary only between 4 states (err:3, critical_warn:2, soft_warn:1, ok:0)",
ConstLabels: labels,
},
),
MONsDown: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "mons_down",
Help: "Count of Mons that are in DOWN state",
ConstLabels: labels,
},
),
TotalPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "total_pgs",
Help: "Total no. of PGs in the cluster",
ConstLabels: labels,
},
),
PGState: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "pg_state",
Help: "State of PGs in the cluster",
ConstLabels: labels,
},
[]string{"state"},
),
ActivePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "active_pgs",
Help: "No. of active PGs in the cluster",
ConstLabels: labels,
},
),
ScrubbingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "scrubbing_pgs",
Help: "No. of scrubbing PGs in the cluster",
ConstLabels: labels,
},
),
DeepScrubbingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "deep_scrubbing_pgs",
Help: "No. of deep scrubbing PGs in the cluster",
ConstLabels: labels,
},
),
RecoveringPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovering_pgs",
Help: "No. of recovering PGs in the cluster",
ConstLabels: labels,
},
),
RecoveryWaitPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovery_wait_pgs",
Help: "No. of PGs in the cluster with recovery_wait state",
ConstLabels: labels,
},
),
BackfillingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "backfilling_pgs",
Help: "No. of backfilling PGs in the cluster",
ConstLabels: labels,
},
),
BackfillWaitPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "backfill_wait_pgs",
Help: "No. of PGs in the cluster with backfill_wait state",
ConstLabels: labels,
},
),
ForcedRecoveryPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "forced_recovery_pgs",
Help: "No. of PGs in the cluster with forced_recovery state",
ConstLabels: labels,
},
),
ForcedBackfillPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "forced_backfill_pgs",
Help: "No. of PGs in the cluster with forced_backfill state",
ConstLabels: labels,
},
),
DownPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "down_pgs",
Help: "No. of PGs in the cluster in down state",
ConstLabels: labels,
},
),
IncompletePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "incomplete_pgs",
Help: "No. of PGs in the cluster in incomplete state",
ConstLabels: labels,
},
),
InconsistentPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "inconsistent_pgs",
Help: "No. of PGs in the cluster in inconsistent state",
ConstLabels: labels,
},
),
SnaptrimPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "snaptrim_pgs",
Help: "No. of snaptrim PGs in the cluster",
ConstLabels: labels,
},
),
SnaptrimWaitPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "snaptrim_wait_pgs",
Help: "No. of PGs in the cluster with snaptrim_wait state",
ConstLabels: labels,
},
),
RepairingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "repairing_pgs",
Help: "No. of PGs in the cluster with repair state",
ConstLabels: labels,
},
),
// With Nautilus, SLOW_OPS replaced both REQUEST_SLOW and REQUEST_STUCK;
// slow_requests is therefore deprecated, but for backwards compatibility
// the metric name is kept the same for the time being.
SlowOps: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "slow_requests",
Help: "No. of slow requests/slow ops",
ConstLabels: labels,
},
),
DegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "degraded_pgs",
Help: "No. of PGs in a degraded state",
ConstLabels: labels,
},
),
StuckDegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_degraded_pgs",
Help: "No. of PGs stuck in a degraded state",
ConstLabels: labels,
},
),
UncleanPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "unclean_pgs",
Help: "No. of PGs in an unclean state",
ConstLabels: labels,
},
),
StuckUncleanPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_unclean_pgs",
Help: "No. of PGs stuck in an unclean state",
ConstLabels: labels,
},
),
UndersizedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "undersized_pgs",
Help: "No. of undersized PGs in the cluster",
ConstLabels: labels,
},
),
StuckUndersizedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_undersized_pgs",
Help: "No. of stuck undersized PGs in the cluster",
ConstLabels: labels,
},
),
StalePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stale_pgs",
Help: "No. of stale PGs in the cluster",
ConstLabels: labels,
},
),
StuckStalePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_stale_pgs",
Help: "No. of stuck stale PGs in the cluster",
ConstLabels: labels,
},
),
PeeringPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "peering_pgs",
Help: "No. of peering PGs in the cluster",
ConstLabels: labels,
},
),
DegradedObjectsCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "degraded_objects",
Help: "No. of degraded objects across all PGs, includes replicas",
ConstLabels: labels,
},
),
MisplacedObjectsCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "misplaced_objects",
Help: "No. of misplaced objects across all PGs, includes replicas",
ConstLabels: labels,
},
),
MisplacedRatio: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "misplaced_ratio",
Help: "ratio of misplaced objects to total objects",
ConstLabels: labels,
},
),
NewCrashReportCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "new_crash_reports",
Help: "Number of new crash reports available",
ConstLabels: labels,
},
),
TooManyRepairs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_too_many_repair",
Help: "Number of OSDs with too many repaired reads",
ConstLabels: labels,
},
),
Objects: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "cluster_objects",
Help: "No. of rados objects within the cluster",
ConstLabels: labels,
},
),
OSDMapFlagFull: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_full",
Help: "The cluster is flagged as full and cannot service writes",
ConstLabels: labels,
},
),
OSDMapFlagPauseRd: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_pauserd",
Help: "Reads are paused",
ConstLabels: labels,
},
),
OSDMapFlagPauseWr: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_pausewr",
Help: "Writes are paused",
ConstLabels: labels,
},
),
OSDMapFlagNoUp: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_noup",
Help: "OSDs are not allowed to start",
ConstLabels: labels,
},
),
OSDMapFlagNoDown: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_nodown",
Help: "OSD failure reports are ignored, OSDs will not be marked as down",
ConstLabels: labels,
},
),
OSDMapFlagNoIn: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_noin",
Help: "OSDs that are out will not be automatically marked in",
ConstLabels: labels,
},
),
OSDMapFlagNoOut: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_noout",
Help: "OSDs will not be automatically marked out after the configured interval",
ConstLabels: labels,
},
),
OSDMapFlagNoBackfill: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_nobackfill",
Help: "OSDs will not be backfilled",
ConstLabels: labels,
},
),
OSDMapFlagNoRecover: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_norecover",
Help: "Recovery is suspended",
ConstLabels: labels,
},
),
OSDMapFlagNoRebalance: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_norebalance",
Help: "Data rebalancing is suspended",
ConstLabels: labels,
},
),
OSDMapFlagNoScrub: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_noscrub",
Help: "Scrubbing is disabled",
ConstLabels: labels,
},
),
OSDMapFlagNoDeepScrub: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_nodeep_scrub",
Help: "Deep scrubbing is disabled",
ConstLabels: labels,
},
),
OSDMapFlagNoTierAgent: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osdmap_flag_notieragent",
Help: "Cache tiering activity is suspended",
ConstLabels: labels,
},
),
OSDsDown: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_down",
Help: "Count of OSDs that are in DOWN state",
ConstLabels: labels,
},
),
OSDsUp: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_up",
Help: "Count of OSDs that are in UP state",
ConstLabels: labels,
},
),
OSDsIn: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_in",
Help: "Count of OSDs that are in IN state and available to serve requests",
ConstLabels: labels,
},
),
OSDsNum: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds",
Help: "Count of total OSDs in the cluster",
ConstLabels: labels,
},
),
RemappedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "pgs_remapped",
Help: "No. of PGs that are remapped and incurring cluster-wide movement",
ConstLabels: labels,
},
),
RecoveryIORate: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovery_io_bytes",
Help: "Rate of bytes being recovered in cluster per second",
ConstLabels: labels,
},
),
RecoveryIOKeys: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovery_io_keys",
Help: "Rate of keys being recovered in cluster per second",
ConstLabels: labels,
},
),
RecoveryIOObjects: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovery_io_objects",
Help: "Rate of objects being recovered in cluster per second",
ConstLabels: labels,
},
),
ClientReadBytesPerSec: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "client_io_read_bytes",
Help: "Rate of bytes being read by all clients per second",
ConstLabels: labels,
},
),
ClientWriteBytesPerSec: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "client_io_write_bytes",
Help: "Rate of bytes being written by all clients per second",
ConstLabels: labels,
},
),
ClientIOOps: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "client_io_ops",
Help: "Total client ops on the cluster measured per second",
ConstLabels: labels,
},
),
ClientIOReadOps: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "client_io_read_ops",
Help: "Total client read I/O ops on the cluster measured per second",
ConstLabels: labels,
},
),
ClientIOWriteOps: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "client_io_write_ops",
Help: "Total client write I/O ops on the cluster measured per second",
ConstLabels: labels,
},
),
CacheFlushIORate: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "cache_flush_io_bytes",
Help: "Rate of bytes being flushed from the cache pool per second",
ConstLabels: labels,
},
),
CacheEvictIORate: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "cache_evict_io_bytes",
Help: "Rate of bytes being evicted from the cache pool per second",
ConstLabels: labels,
},
),
CachePromoteIOOps: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "cache_promote_io_ops",
Help: "Total cache promote operations measured per second",
ConstLabels: labels,
},
),
MgrsActive: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "mgrs_active",
Help: "Count of active mgrs, can be either 0 or 1",
ConstLabels: labels,
},
),
MgrsNum: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "mgrs",
Help: "Total number of mgrs, including standbys",
ConstLabels: labels,
},
),
RbdMirrorUp: prometheus.NewDesc(
fmt.Sprintf("%s_rbd_mirror_up", cephNamespace),
"Alive rbd-mirror daemons",
[]string{"name"},
labels,
),
}
if exporter.Version.IsAtLeast(Pacific) {
// Pacific adds the DAEMON_OLD_VERSION health check, which indicates that multiple
// versions of Ceph have been running for longer than mon_warn_older_version_delay.
// We interpret this as a critical warning (2).
collector.healthChecksMap["DAEMON_OLD_VERSION"] = 2
}
return collector
}
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
return []prometheus.Metric{
c.HealthStatus,
c.HealthStatusInterpreter,
c.MONsDown,
c.TotalPGs,
c.DegradedPGs,
c.ActivePGs,
c.StuckDegradedPGs,
c.UncleanPGs,
c.StuckUncleanPGs,
c.UndersizedPGs,
c.StuckUndersizedPGs,
c.StalePGs,
c.StuckStalePGs,
c.PeeringPGs,
c.ScrubbingPGs,
c.DeepScrubbingPGs,
c.RecoveringPGs,
c.RecoveryWaitPGs,
c.BackfillingPGs,
c.BackfillWaitPGs,
c.ForcedRecoveryPGs,
c.ForcedBackfillPGs,
c.DownPGs,
c.IncompletePGs,
c.InconsistentPGs,
c.SnaptrimPGs,
c.SnaptrimWaitPGs,
c.RepairingPGs,
c.SlowOps,
c.DegradedObjectsCount,
c.MisplacedObjectsCount,
c.MisplacedRatio,
c.NewCrashReportCount,
c.TooManyRepairs,
c.Objects,
c.OSDMapFlagFull,
c.OSDMapFlagPauseRd,
c.OSDMapFlagPauseWr,
c.OSDMapFlagNoUp,
c.OSDMapFlagNoDown,
c.OSDMapFlagNoIn,
c.OSDMapFlagNoOut,
c.OSDMapFlagNoBackfill,
c.OSDMapFlagNoRecover,
c.OSDMapFlagNoRebalance,
c.OSDMapFlagNoScrub,
c.OSDMapFlagNoDeepScrub,
c.OSDMapFlagNoTierAgent,
c.OSDsDown,
c.OSDsUp,
c.OSDsIn,
c.OSDsNum,
c.RemappedPGs,
c.RecoveryIORate,
c.RecoveryIOKeys,
c.RecoveryIOObjects,
c.ClientReadBytesPerSec,
c.ClientWriteBytesPerSec,
c.ClientIOOps,
c.ClientIOReadOps,
c.ClientIOWriteOps,
c.CacheFlushIORate,
c.CacheEvictIORate,
c.CachePromoteIOOps,
c.MgrsActive,
c.MgrsNum,
}
}
func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
return []prometheus.Collector{
c.PGState,
}
}
type osdMap struct {
NumOSDs float64 `json:"num_osds"`
NumUpOSDs float64 `json:"num_up_osds"`
NumInOSDs float64 `json:"num_in_osds"`
NumRemappedPGs float64 `json:"num_remapped_pgs"`
}
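// cephHealthStats mirrors the subset of the "ceph status --format json" document
// that the collector consumes. An abridged, illustrative example (all values made up):
//
//   {
//     "health": {"status": "HEALTH_WARN", "checks": {"OSD_DOWN": {"severity": "HEALTH_WARN", "summary": {"message": "1 osds down"}}}},
//     "osdmap": {"num_osds": 30, "num_up_osds": 29, "num_in_osds": 30, "num_remapped_pgs": 12},
//     "pgmap": {"num_pgs": 2048, "num_objects": 1000000, "pgs_by_state": [{"state_name": "active+clean", "count": 2000}]},
//     "mgrmap": {"available": true, "num_standbys": 2},
//     "servicemap": {"services": {"rbd-mirror": {"daemons": {"summary": {}, "14": {"metadata": {"id": "a"}}}}}}
//   }
//
// Note: the "osdmap" shape shown is the Octopus+ flat form; older releases nest the
// same counters under an inner "osdmap" object (see collect).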
type cephHealthStats struct {
Health struct {
Summary []struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
Status string `json:"status"`
Checks map[string]struct {
Severity string `json:"severity"`
Summary struct {
Message string `json:"message"`
} `json:"summary"`
} `json:"checks"`
} `json:"health"`
OSDMap map[string]interface{} `json:"osdmap"`
PGMap struct {
NumPGs float64 `json:"num_pgs"`
TotalObjects float64 `json:"num_objects"`
WriteOpPerSec float64 `json:"write_op_per_sec"`
ReadOpPerSec float64 `json:"read_op_per_sec"`
WriteBytePerSec float64 `json:"write_bytes_sec"`
ReadBytePerSec float64 `json:"read_bytes_sec"`
RecoveringObjectsPerSec float64 `json:"recovering_objects_per_sec"`
RecoveringBytePerSec float64 `json:"recovering_bytes_per_sec"`
RecoveringKeysPerSec float64 `json:"recovering_keys_per_sec"`
CacheFlushBytePerSec float64 `json:"flush_bytes_sec"`
CacheEvictBytePerSec float64 `json:"evict_bytes_sec"`
CachePromoteOpPerSec float64 `json:"promote_op_per_sec"`
DegradedObjects float64 `json:"degraded_objects"`
MisplacedObjects float64 `json:"misplaced_objects"`
MisplacedRatio float64 `json:"misplaced_ratio"`
PGsByState []struct {
Count float64 `json:"count"`
States string `json:"state_name"`
} `json:"pgs_by_state"`
} `json:"pgmap"`
MgrMap struct {
// Octopus+ fields
Available bool `json:"available"`
NumStandBys int `json:"num_standbys"`
// Nautilus fields
ActiveName string `json:"active_name"`
StandBys []struct {
Name string `json:"name"`
} `json:"standbys"`
} `json:"mgrmap"`
ServiceMap struct {
Services struct {
RbdMirror struct {
Daemons map[string]json.RawMessage `json:"daemons"`
} `json:"rbd-mirror"`
} `json:"services"`
} `json:"servicemap"`
}
func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
cmd := c.cephUsageCommand(jsonFormat)
buf, _, err := c.conn.MonCommand(cmd)
if err != nil {
c.logger.WithError(err).WithField(
"args", string(cmd),
).Error("error executing mon command")
return err
}
stats := &cephHealthStats{}
if err := json.Unmarshal(buf, stats); err != nil {
return err
}
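// Reset every plain gauge to zero before repopulating it below, so that values from
// a previous scrape (for example an osdmap flag that has since been cleared) do not linger.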
for _, metric := range c.metricsList() {
if gauge, ok := metric.(prometheus.Gauge); ok {
gauge.Set(0)
}
}
switch stats.Health.Status {
case CephHealthOK:
c.HealthStatus.Set(0)
c.HealthStatusInterpreter.Set(0)
case CephHealthWarn:
c.HealthStatus.Set(1)
c.HealthStatusInterpreter.Set(2)
case CephHealthErr:
c.HealthStatus.Set(2)
c.HealthStatusInterpreter.Set(3)
}
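// Illustrative health-check messages (wording paraphrased from Ceph's health output,
// values made up) that the regexes below are meant to match:
//
//   1/3 mons down, quorum a,b
//   52 slow ops, oldest one blocked for 123 sec, daemons [osd.1,osd.2] have slow ops.
//   2 daemons have recently crashed
//   Too many repaired reads on 1 OSDs
//   noout,norebalance flag(s) set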
var (
monsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) mons down, quorum \b+`)
stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`)
stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`)
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
slowOpsRegexNautilus = regexp.MustCompile(`([\d]+) slow ops, oldest one blocked for ([\d]+) sec`)
newCrashreportRegex = regexp.MustCompile(`([\d]+) daemons have recently crashed`)
tooManyRepairs = regexp.MustCompile(`Too many repaired reads on ([\d]+) OSDs`)
osdmapFlagsRegex = regexp.MustCompile(`([^ ]+) flag\(s\) set`)
)
var mapEmpty = len(c.healthChecksMap) == 0
for _, s := range stats.Health.Summary {
matched := stuckDegradedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckDegradedPGs.Set(float64(v))
}
matched = stuckUncleanRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckUncleanPGs.Set(float64(v))
}
matched = stuckUndersizedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckUndersizedPGs.Set(float64(v))
}
matched = stuckStaleRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckStalePGs.Set(float64(v))
}
matched = slowOpsRegexNautilus.FindStringSubmatch(s.Summary)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.SlowOps.Set(float64(v))
}
}
for k, check := range stats.Health.Checks {
if k == "MON_DOWN" {
matched := monsDownRegex.FindStringSubmatch(check.Summary.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.MONsDown.Set(float64(v))
}
}
if k == "SLOW_OPS" {
matched := slowOpsRegexNautilus.FindStringSubmatch(check.Summary.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.SlowOps.Set(float64(v))
}
}
if k == "RECENT_CRASH" {
matched := newCrashreportRegex.FindStringSubmatch(check.Summary.Message)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.NewCrashReportCount.Set(float64(v))
}
}
if k == "OSD_TOO_MANY_REPAIRS" {
matched := tooManyRepairs.FindStringSubmatch(check.Summary.Message)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.TooManyRepairs.Set(float64(v))
}
}
if k == "OSDMAP_FLAGS" {
matched := osdmapFlagsRegex.FindStringSubmatch(check.Summary.Message)
if len(matched) > 0 {
flags := strings.Split(matched[1], ",")
for _, f := range flags {
switch f {
case "full":
c.OSDMapFlagFull.Set(1)
case "pauserd":
c.OSDMapFlagPauseRd.Set(1)
case "pausewr":
c.OSDMapFlagPauseWr.Set(1)
case "noup":
c.OSDMapFlagNoUp.Set(1)
case "nodown":
c.OSDMapFlagNoDown.Set(1)
case "noin":
c.OSDMapFlagNoIn.Set(1)
case "noout":
c.OSDMapFlagNoOut.Set(1)
case "nobackfill":
c.OSDMapFlagNoBackfill.Set(1)
case "norecover":
c.OSDMapFlagNoRecover.Set(1)
case "norebalance":
c.OSDMapFlagNoRebalance.Set(1)
case "noscrub":
c.OSDMapFlagNoScrub.Set(1)
case "nodeep_scrub":
c.OSDMapFlagNoDeepScrub.Set(1)
case "notieragent":
c.OSDMapFlagNoTierAgent.Set(1)
}
}
}
}
if !mapEmpty {
if val, present := c.healthChecksMap[k]; present {
c.HealthStatusInterpreter.Set(float64(val))
}
}
}
var (
degradedPGs float64
activePGs float64
uncleanPGs float64
undersizedPGs float64
peeringPGs float64
stalePGs float64
scrubbingPGs float64
deepScrubbingPGs float64
recoveringPGs float64
recoveryWaitPGs float64
backfillingPGs float64
backfillWaitPGs float64
forcedRecoveryPGs float64
forcedBackfillPGs float64
downPGs float64
incompletePGs float64
inconsistentPGs float64
snaptrimPGs float64
snaptrimWaitPGs float64
repairingPGs float64
pgStateCounterMap = map[string]*float64{
"degraded": &degradedPGs,
"active": &activePGs,
"unclean": &uncleanPGs,
"undersized": &undersizedPGs,
"peering": &peeringPGs,
"stale": &stalePGs,
"scrubbing": &scrubbingPGs,
"scrubbing+deep": &deepScrubbingPGs,
"recovering": &recoveringPGs,
"recovery_wait": &recoveryWaitPGs,
"backfilling": &backfillingPGs,
"backfill_wait": &backfillWaitPGs,
"forced_recovery": &forcedRecoveryPGs,
"forced_backfill": &forcedBackfillPGs,
"down": &downPGs,
"incomplete": &incompletePGs,
"inconsistent": &inconsistentPGs,
"snaptrim": &snaptrimPGs,
"snaptrim_wait": &snaptrimWaitPGs,
"repair": &repairingPGs,
}
pgStateGaugeMap = map[string]prometheus.Gauge{
"degraded": c.DegradedPGs,
"active": c.ActivePGs,
"unclean": c.UncleanPGs,
"undersized": c.UndersizedPGs,
"peering": c.PeeringPGs,
"stale": c.StalePGs,
"scrubbing": c.ScrubbingPGs,
"scrubbing+deep": c.DeepScrubbingPGs,
"recovering": c.RecoveringPGs,
"recovery_wait": c.RecoveryWaitPGs,
"backfilling": c.BackfillingPGs,
"backfill_wait": c.BackfillWaitPGs,
"forced_recovery": c.ForcedRecoveryPGs,
"forced_backfill": c.ForcedBackfillPGs,
"down": c.DownPGs,
"incomplete": c.IncompletePGs,
"inconsistent": c.InconsistentPGs,
"snaptrim": c.SnaptrimPGs,
"snaptrim_wait": c.SnaptrimWaitPGs,
"repair": c.RepairingPGs,
}
)
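// A PG reports a combined state string such as "active+undersized+degraded+backfill_wait";
// the substring matching below credits that PG's count to every individual state it
// contains (active, undersized, degraded and backfill_wait in this example).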
for _, p := range stats.PGMap.PGsByState {
for pgState := range pgStateCounterMap {
if strings.Contains(p.States, pgState) {
*pgStateCounterMap[pgState] += p.Count
}
}
}
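// "scrubbing+deep" also contains the substring "scrubbing", and "snaptrim_wait"
// contains "snaptrim", so the plain counts are corrected below by subtracting the
// more specific state before the gauges are set.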
for state, gauge := range pgStateGaugeMap {
val := *pgStateCounterMap[state]
if state == "scrubbing" {
val -= *pgStateCounterMap["scrubbing+deep"]
}
if state == "snaptrim" {
val -= *pgStateCounterMap["snaptrim_wait"]
}
gauge.Set(val)
if state == "scrubbing+deep" {
state = "deep_scrubbing"
}
c.PGState.WithLabelValues(state).Set(val)
}
c.ClientReadBytesPerSec.Set(stats.PGMap.ReadBytePerSec)
c.ClientWriteBytesPerSec.Set(stats.PGMap.WriteBytePerSec)
c.ClientIOOps.Set(stats.PGMap.ReadOpPerSec + stats.PGMap.WriteOpPerSec)
c.ClientIOReadOps.Set(stats.PGMap.ReadOpPerSec)
c.ClientIOWriteOps.Set(stats.PGMap.WriteOpPerSec)
c.RecoveryIOKeys.Set(stats.PGMap.RecoveringKeysPerSec)
c.RecoveryIOObjects.Set(stats.PGMap.RecoveringObjectsPerSec)
c.RecoveryIORate.Set(stats.PGMap.RecoveringBytePerSec)
c.CacheEvictIORate.Set(stats.PGMap.CacheEvictBytePerSec)
c.CacheFlushIORate.Set(stats.PGMap.CacheFlushBytePerSec)
c.CachePromoteIOOps.Set(stats.PGMap.CachePromoteOpPerSec)
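// The shape of the "osdmap" section depends on the Ceph release: Octopus and later
// report the counters directly under "osdmap", while older releases nest them one
// level deeper under an inner "osdmap" object, hence the two code paths below.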
var actualOsdMap osdMap
if c.version.IsAtLeast(Octopus) {
if stats.OSDMap != nil {
actualOsdMap = osdMap{
NumOSDs: stats.OSDMap["num_osds"].(float64),
NumUpOSDs: stats.OSDMap["num_up_osds"].(float64),
NumInOSDs: stats.OSDMap["num_in_osds"].(float64),
NumRemappedPGs: stats.OSDMap["num_remapped_pgs"].(float64),
}
}
} else {
if stats.OSDMap != nil {
innerMap := stats.OSDMap["osdmap"].(map[string]interface{})
actualOsdMap = osdMap{
NumOSDs: innerMap["num_osds"].(float64),
NumUpOSDs: innerMap["num_up_osds"].(float64),
NumInOSDs: innerMap["num_in_osds"].(float64),
NumRemappedPGs: innerMap["num_remapped_pgs"].(float64),
}
}
}
c.OSDsUp.Set(actualOsdMap.NumUpOSDs)
c.OSDsIn.Set(actualOsdMap.NumInOSDs)
c.OSDsNum.Set(actualOsdMap.NumOSDs)
// Ceph (until v10.2.3) doesn't expose the value of down OSDs
// from its status, which is why we have to compute it ourselves.
c.OSDsDown.Set(actualOsdMap.NumOSDs - actualOsdMap.NumUpOSDs)
c.RemappedPGs.Set(actualOsdMap.NumRemappedPGs)
c.TotalPGs.Set(stats.PGMap.NumPGs)
c.Objects.Set(stats.PGMap.TotalObjects)
c.DegradedObjectsCount.Set(stats.PGMap.DegradedObjects)
c.MisplacedObjectsCount.Set(stats.PGMap.MisplacedObjects)
c.MisplacedRatio.Set(stats.PGMap.MisplacedRatio)
activeMgr := 0
standByMgrs := 0
if c.version.IsAtLeast(Octopus) {
if stats.MgrMap.Available {
activeMgr = 1
}
standByMgrs = stats.MgrMap.NumStandBys
} else {
if len(stats.MgrMap.ActiveName) > 0 {
activeMgr = 1
}
standByMgrs = len(stats.MgrMap.StandBys)
}
c.MgrsActive.Set(float64(activeMgr))
c.MgrsNum.Set(float64(activeMgr + standByMgrs))
for name, data := range stats.ServiceMap.Services.RbdMirror.Daemons {
if name == "summary" {
continue
}
md := struct {
Metadata struct {
Id string `json:"id"`
} `json:"metadata"`
}{}
// Extract id from metadata
if err := json.Unmarshal(data, &md); err == nil {
ch <- prometheus.MustNewConstMetric(
c.RbdMirrorUp, prometheus.GaugeValue, 1.0, md.Metadata.Id)
}
}
return nil
}
type format string
const (
jsonFormat format = "json"
plainFormat format = "plain"
)
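// cephUsageCommand builds the "ceph status" mon command payload; for the JSON format
// the marshalled command comes out as {"format":"json","prefix":"status"}.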
func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "status",
"format": f,
})
if err != nil {
c.logger.WithError(err).Panic("error marshalling ceph status")
}
return cmd
}
func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
cmd := c.cephUsageCommand(plainFormat)
buf, _, err := c.conn.MonCommand(cmd)
if err != nil {
c.logger.WithError(err).WithField(
"args", string(cmd),
).Error("error executing mon command")
return err
}
sc := bufio.NewScanner(bytes.NewReader(buf))
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
// If we encounter the Luminous-and-later style status output (it begins with a
// "cluster:" section), stop extracting recovery/client I/O from the plain output,
// because we already get those values from the JSON status in collect().
if line == "cluster:" {
return nil
}
switch {
case strings.HasPrefix(line, "recovery io"):
if err := c.collectRecoveryIO(line); err != nil {
return err
}
case strings.HasPrefix(line, "recovery:"):
if err := c.collectRecoveryIO(line); err != nil {
return err
}
case strings.HasPrefix(line, "client io"):
if err := c.collectClientIO(line); err != nil {
return err
}
case strings.HasPrefix(line, "client:"):
if err := c.collectClientIO(line); err != nil {
return err
}
case strings.HasPrefix(line, "cache io"):
if err := c.collectCacheIO(line); err != nil {
return err
}
}
}
return nil
}
func (c *ClusterHealthCollector) collectClientIO(clientStr string) error {
matched := clientReadBytesPerSecRegex.FindStringSubmatch(clientStr)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
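// The matched unit is interpreted as a decimal multiplier (kB=1e3, MB=1e6, GB=1e9);
// e.g. a status line containing "200 MB/s rd" sets the read gauge to 2e8 bytes/s.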
switch strings.ToLower(matched[2]) {
case "gb":
v = v * 1e9
case "mb":
v = v * 1e6
case "kb":
v = v * 1e3
default:
return fmt.Errorf("can't parse units %q", matched[2])
}
c.ClientReadBytesPerSec.Set(float64(v))
}
matched = clientWriteBytesPerSecRegex.FindStringSubmatch(clientStr)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
switch strings.ToLower(matched[2]) {
case "gb":
v = v * 1e9
case "mb":
v = v * 1e6
case "kb":
v = v * 1e3
default:
return fmt.Errorf("can't parse units %q", matched[2])
}
c.ClientWriteBytesPerSec.Set(float64(v))
}
var clientIOOps float64
matched = clientIOOpsRegex.FindStringSubmatch(clientStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
clientIOOps = float64(v)
}
var clientIOReadOps, clientIOWriteOps float64
matched = clientIOReadOpsRegex.FindStringSubmatch(clientStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
clientIOReadOps = float64(v)
c.ClientIOReadOps.Set(clientIOReadOps)
}
matched = clientIOWriteOpsRegex.FindStringSubmatch(clientStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
clientIOWriteOps = float64(v)
c.ClientIOWriteOps.Set(clientIOWriteOps)
}
// In versions older than Jewel, we directly get access to total
// client I/O. But in Jewel and newer the format is changed to
// separately display read and write IOPs. In such a case, we
// compute and set the total IOPs ourselves.
if clientIOOps == 0 {
clientIOOps = clientIOReadOps + clientIOWriteOps
}
c.ClientIOOps.Set(clientIOOps)
return nil
}
func (c *ClusterHealthCollector) collectRecoveryIO(recoveryStr string) error {
matched := recoveryIORateRegex.FindStringSubmatch(recoveryStr)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
switch strings.ToLower(matched[2]) {
case "gb":
v = v * 1e9
case "mb":
v = v * 1e6
case "kb":
v = v * 1e3
default:
return fmt.Errorf("can't parse units %q", matched[2])
}
c.RecoveryIORate.Set(float64(v))
}
matched = recoveryIOKeysRegex.FindStringSubmatch(recoveryStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.RecoveryIOKeys.Set(float64(v))
}
matched = recoveryIOObjectsRegex.FindStringSubmatch(recoveryStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.RecoveryIOObjects.Set(float64(v))
}
return nil
}
func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
matched := cacheFlushRateRegex.FindStringSubmatch(clientStr)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
switch strings.ToLower(matched[2]) {
case "gb":
v = v * 1e9
case "mb":
v = v * 1e6
case "kb":
v = v * 1e3
default:
return fmt.Errorf("can't parse units %q", matched[2])
}
c.CacheFlushIORate.Set(float64(v))
}
matched = cacheEvictRateRegex.FindStringSubmatch(clientStr)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
switch strings.ToLower(matched[2]) {
case "gb":
v = v * 1e9
case "mb":
v = v * 1e6
case "kb":
v = v * 1e3
default:
return fmt.Errorf("can't parse units %q", matched[2])
}
c.CacheEvictIORate.Set(float64(v))
}
matched = cachePromoteOpsRegex.FindStringSubmatch(clientStr)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.CachePromoteIOOps.Set(float64(v))
}
return nil
}
// Describe sends all the descriptions of individual metrics of ClusterHealthCollector
// to the provided prometheus channel.
func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.RbdMirrorUp
for _, metric := range c.metricsList() {
ch <- metric.Desc()
}
for _, metric := range c.collectorList() {
metric.Describe(ch)
}
}
// Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization.
func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
c.logger.Debug("collecting cluster health metrics")
if err := c.collect(ch); err != nil {
c.logger.WithError(err).Error("error collecting cluster health metrics")
}
c.logger.Debug("collecting cluster recovery/client I/O metrics")
if err := c.collectRecoveryClientIO(); err != nil {
c.logger.WithError(err).Error("error collecting cluster recovery/client I/O metrics")
}
for _, metric := range c.metricsList() {
ch <- metric
}
for _, metric := range c.collectorList() {
metric.Collect(ch)
}
}