collectors/osd: backport oldest inactive PG to Luminous
This commit is contained in:
parent
9594165673
commit
c1e42db038
|
@ -8,8 +8,8 @@ import (
|
|||
"math"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
@ -36,6 +36,10 @@ type OSDCollector struct {
|
|||
// osdLabelsCache holds a cache of osd labels
|
||||
osdLabelsCache map[int64]*cephOSDLabel
|
||||
|
||||
// oldestInactivePGMap keeps track of how long we've known
|
||||
// a PG to not have an active state in it.
|
||||
oldestInactivePGMap map[string]time.Time
|
||||
|
||||
// pgDumpBrief holds the content of PG dump brief
|
||||
pgDumpBrief cephPGDumpBrief
|
||||
|
||||
|
@ -117,6 +121,12 @@ type OSDCollector struct {
|
|||
|
||||
// OSDObjectsBackfilled displays average number of objects backfilled in an OSD
|
||||
OSDObjectsBackfilled *prometheus.CounterVec
|
||||
|
||||
// OldestInactivePG gives us the amount of time that the oldest inactive PG
|
||||
// has been inactive for. This is useful to discern between rolling peering
|
||||
// (such as when issuing a bunch of upmaps or weight changes) and a single PG
|
||||
// stuck peering, for example.
|
||||
OldestInactivePG prometheus.Gauge
|
||||
}
|
||||
|
||||
// This ensures OSDCollector implements interface prometheus.Collector.
|
||||
|
@ -132,8 +142,9 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
|
|||
return &OSDCollector{
|
||||
conn: conn,
|
||||
|
||||
osdScrubCache: make(map[int]int),
|
||||
osdLabelsCache: make(map[int64]*cephOSDLabel),
|
||||
osdScrubCache: make(map[int]int),
|
||||
osdLabelsCache: make(map[int64]*cephOSDLabel),
|
||||
oldestInactivePGMap: make(map[string]time.Time),
|
||||
|
||||
CrushWeight: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
|
@ -369,6 +380,15 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
|
|||
},
|
||||
append([]string{"pgid"}, osdLabels...),
|
||||
),
|
||||
|
||||
OldestInactivePG: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: cephNamespace,
|
||||
Name: "pg_oldest_inactive",
|
||||
Help: "The amount of time in seconds that the oldest PG has been inactive for",
|
||||
ConstLabels: labels,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -396,6 +416,7 @@ func (o *OSDCollector) collectorList() []prometheus.Collector {
|
|||
o.OSDNearFull,
|
||||
o.OSDBackfillFull,
|
||||
o.OSDObjectsBackfilled,
|
||||
o.OldestInactivePG,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -515,7 +536,7 @@ type cephOSDLabel struct {
|
|||
Status string `json:"status"`
|
||||
DeviceClass string `json:"device_class"`
|
||||
CrushWeight float64 `json:"crush_weight"`
|
||||
Root string `json:"host"`
|
||||
Root string `json:"root"`
|
||||
Rack string `json:"rack"`
|
||||
Host string `json:"host"`
|
||||
parent int64 // parent id when building tables
|
||||
|
@ -1041,6 +1062,38 @@ func (o *OSDCollector) cephPGQueryCommand(pgid string) []byte {
|
|||
return cmd
|
||||
}
|
||||
|
||||
func (o *OSDCollector) collectPGStates(ch chan<- prometheus.Metric) error {
|
||||
// - See if there are PGs that we're tracking that are now active
|
||||
// - See if there are new ones to add
|
||||
// - Find the oldest one
|
||||
now := time.Now()
|
||||
oldestTime := now
|
||||
|
||||
for _, pg := range o.pgDumpBrief {
|
||||
// If we were tracking it, and it's now active, remove it
|
||||
active := strings.Contains(pg.State, "active")
|
||||
if active {
|
||||
delete(o.oldestInactivePGMap, pg.PGID)
|
||||
continue
|
||||
}
|
||||
|
||||
// Now see if it's not here, we'll need to track it now
|
||||
pgTime, ok := o.oldestInactivePGMap[pg.PGID]
|
||||
if !ok {
|
||||
pgTime = now
|
||||
o.oldestInactivePGMap[pg.PGID] = now
|
||||
}
|
||||
|
||||
// And finally, track our oldest time
|
||||
if pgTime.Before(oldestTime) {
|
||||
oldestTime = pgTime
|
||||
}
|
||||
}
|
||||
|
||||
o.OldestInactivePG.Set(float64(now.Unix() - oldestTime.Unix()))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Describe sends the descriptors of each OSDCollector related metrics we have
|
||||
// defined to the provided Prometheus channel.
|
||||
func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
|
@ -1096,6 +1149,10 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
|
|||
log.Println("failed collecting OSD scrub metrics:", err)
|
||||
}
|
||||
|
||||
if err := o.collectPGStates(ch); err != nil {
|
||||
log.Println("failed collecting PG state metrics:", err)
|
||||
}
|
||||
|
||||
for _, metric := range o.collectorList() {
|
||||
metric.Collect(ch)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue