// ceph_exporter/ceph/osd.go

//   Copyright 2022 DigitalOcean
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.

package ceph

import (
	"bytes"
	"encoding/json"
	"fmt"
	"math"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
)

// osdLabelFormat is the format string used to turn a numeric OSD ID into the
// value of the "osd" label.
const osdLabelFormat = "osd.%v"

// Values reported by the scrub state metric for an OSD.
const (
	scrubStateIdle = iota
	scrubStateScrubbing
	scrubStateDeepScrubbing
)

// oldestInactivePGUpdatePeriod is how long the oldest-inactive-PG loop sleeps
// between two PG dumps.
const oldestInactivePGUpdatePeriod = 10 * time.Second

// OSDCollector displays statistics about OSDs in the Ceph cluster.
// An important aspect of monitoring OSDs is to ensure that, when the cluster
// is up and running, all OSDs that are in the cluster are up and running too.
type OSDCollector struct {
	// conn is the connection used to issue mon and mgr commands.
	conn   Conn
	logger *logrus.Logger

	// osdScrubCache holds the last scrub state computed for every OSD that has
	// ever been seen in a PG acting set; it is keyed by OSD ID.
	osdScrubCache map[int]int

	// osdLabelsCache holds a cache of OSD labels keyed by OSD ID. It is
	// replaced wholesale by buildOSDLabelCache.
	osdLabelsCache map[int64]*cephOSDLabel

	// oldestInactivePGMap keeps track of how long we've known
	// a PG to not have an active state in it. It maps a PG ID to the time the
	// PG was first observed without an active state.
	oldestInactivePGMap map[string]time.Time

	// CrushWeight is a persistent setting, and it affects how CRUSH assigns data to OSDs.
	// It displays the CRUSH weight for the OSD.
	CrushWeight *prometheus.GaugeVec

	// Depth displays the OSD's level of hierarchy in the CRUSH map.
	Depth *prometheus.GaugeVec

	// Reweight sets an override weight on the OSD.
	// It displays a value within 0 to 1.
	Reweight *prometheus.GaugeVec

	// Bytes displays the total size of the OSD in bytes.
	Bytes *prometheus.GaugeVec

	// UsedBytes displays the total used bytes in the OSD.
	UsedBytes *prometheus.GaugeVec

	// AvailBytes displays the total available bytes in the OSD.
	AvailBytes *prometheus.GaugeVec

	// Utilization displays current utilization of the OSD.
	Utilization *prometheus.GaugeVec

	// Variance displays current variance of the OSD from the standard utilization.
	Variance *prometheus.GaugeVec

	// Pgs displays total number of placement groups in the OSD.
	// Available in Ceph Jewel version.
	Pgs *prometheus.GaugeVec

	// PgUpmapItemsTotal displays the total number of items in the pg-upmap exception table.
	PgUpmapItemsTotal prometheus.Gauge

	// CommitLatency displays in seconds how long it takes for an operation to be applied to disk.
	CommitLatency *prometheus.GaugeVec

	// ApplyLatency displays in seconds how long it takes to get applied to the backing filesystem.
	ApplyLatency *prometheus.GaugeVec

	// OSDIn displays the In state of the OSD.
	OSDIn *prometheus.GaugeVec

	// OSDUp displays the Up state of the OSD.
	OSDUp *prometheus.GaugeVec

	// OSDMetadata displays metadata of an OSD.
	OSDMetadata *prometheus.GaugeVec

	// OSDFullRatio displays current full_ratio of OSD.
	OSDFullRatio prometheus.Gauge

	// OSDBackfillFullRatio displays current backfillfull_ratio of OSD.
	OSDBackfillFullRatio prometheus.Gauge

	// OSDNearFullRatio displays current nearfull_ratio of OSD.
	OSDNearFullRatio prometheus.Gauge

	// OSDFull flags if an OSD is full.
	OSDFull *prometheus.GaugeVec

	// OSDNearFull flags if an OSD is near full.
	OSDNearFull *prometheus.GaugeVec

	// OSDBackfillFull flags if an OSD is backfill full.
	OSDBackfillFull *prometheus.GaugeVec

	// OSDDownDesc displays OSDs present in the cluster in "down" state.
	OSDDownDesc *prometheus.Desc

	// TotalBytes displays total bytes in all OSDs.
	TotalBytes prometheus.Gauge

	// TotalUsedBytes displays total used bytes in all OSDs.
	TotalUsedBytes prometheus.Gauge

	// TotalAvailBytes displays total available bytes in all OSDs.
	TotalAvailBytes prometheus.Gauge

	// AverageUtil displays average utilization in all OSDs.
	AverageUtil prometheus.Gauge

	// ScrubbingStateDesc depicts if an OSD is being scrubbed,
	// labeled by OSD.
	ScrubbingStateDesc *prometheus.Desc

	// PGObjectsRecoveredDesc displays total number of objects recovered in a PG.
	PGObjectsRecoveredDesc *prometheus.Desc

	// OSDObjectsBackfilled displays average number of objects backfilled in an OSD.
	OSDObjectsBackfilled *prometheus.CounterVec

	// OldestInactivePG gives us the amount of time that the oldest inactive PG
	// has been inactive for.  This is useful to discern between rolling peering
	// (such as when issuing a bunch of upmaps or weight changes) and a single PG
	// stuck peering, for example.
	OldestInactivePG prometheus.Gauge
}

// NewOSDCollector builds an OSDCollector bound to the exporter's connection
// and logger, creates every OSD metric with the exporter's cluster name as a
// constant label, and starts the background loop that maintains the
// oldest-inactive-PG gauge.
func NewOSDCollector(exporter *Exporter) *OSDCollector {
	constLabels := prometheus.Labels{"cluster": exporter.Cluster}
	perOSDLabels := []string{"osd", "device_class", "host", "rack", "root"}
	metadataLabels := []string{"osd", "objectstore", "ceph_version_when_created", "created_at"}

	// Small local constructors so each metric below only states what differs:
	// its name and its help text.
	gaugeOpts := func(name, help string) prometheus.GaugeOpts {
		return prometheus.GaugeOpts{
			Namespace:   cephNamespace,
			Name:        name,
			Help:        help,
			ConstLabels: constLabels,
		}
	}
	clusterGauge := func(name, help string) prometheus.Gauge {
		return prometheus.NewGauge(gaugeOpts(name, help))
	}
	osdGaugeVec := func(name, help string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(gaugeOpts(name, help), perOSDLabels)
	}
	newDesc := func(fqName, help string, variableLabels []string) *prometheus.Desc {
		return prometheus.NewDesc(fqName, help, variableLabels, constLabels)
	}

	collector := &OSDCollector{
		conn:   exporter.Conn,
		logger: exporter.Logger,

		osdScrubCache:       make(map[int]int),
		osdLabelsCache:      make(map[int64]*cephOSDLabel),
		oldestInactivePGMap: make(map[string]time.Time),

		// Per-OSD values taken from "osd df".
		CrushWeight: osdGaugeVec("osd_crush_weight", "OSD Crush Weight"),
		Depth:       osdGaugeVec("osd_depth", "OSD Depth"),
		Reweight:    osdGaugeVec("osd_reweight", "OSD Reweight"),
		Bytes:       osdGaugeVec("osd_bytes", "OSD Total Bytes"),
		UsedBytes:   osdGaugeVec("osd_used_bytes", "OSD Used Storage in Bytes"),
		AvailBytes:  osdGaugeVec("osd_avail_bytes", "OSD Available Storage in Bytes"),
		Utilization: osdGaugeVec("osd_utilization", "OSD Utilization"),
		Variance:    osdGaugeVec("osd_variance", "OSD Variance"),
		Pgs:         osdGaugeVec("osd_pgs", "OSD Placement Group Count"),

		PgUpmapItemsTotal: clusterGauge("osd_pg_upmap_items_total", "OSD PG-Upmap Exception Table Entry Count"),

		// Cluster-wide totals.
		TotalBytes:      clusterGauge("osd_total_bytes", "OSD Total Storage Bytes"),
		TotalUsedBytes:  clusterGauge("osd_total_used_bytes", "OSD Total Used Storage Bytes"),
		TotalAvailBytes: clusterGauge("osd_total_avail_bytes", "OSD Total Available Storage Bytes "),
		AverageUtil:     clusterGauge("osd_average_utilization", "OSD Average Utilization"),

		// Per-OSD latencies.
		CommitLatency: osdGaugeVec("osd_perf_commit_latency_seconds", "OSD Perf Commit Latency"),
		ApplyLatency:  osdGaugeVec("osd_perf_apply_latency_seconds", "OSD Perf Apply Latency"),

		// Per-OSD state.
		OSDIn: osdGaugeVec("osd_in", "OSD In Status"),
		OSDUp: osdGaugeVec("osd_up", "OSD Up Status"),

		// Cluster-wide fullness thresholds.
		OSDFullRatio:         clusterGauge("osd_full_ratio", "OSD Full Ratio Value"),
		OSDNearFullRatio:     clusterGauge("osd_near_full_ratio", "OSD Near Full Ratio Value"),
		OSDBackfillFullRatio: clusterGauge("osd_backfill_full_ratio", "OSD Backfill Full Ratio Value"),

		// Per-OSD fullness flags.
		OSDFull:         osdGaugeVec("osd_full", "OSD Full Status"),
		OSDNearFull:     osdGaugeVec("osd_near_full", "OSD Near Full Status"),
		OSDBackfillFull: osdGaugeVec("osd_backfill_full", "OSD Backfill Full Status"),

		OSDMetadata: prometheus.NewGaugeVec(
			gaugeOpts("osd_metadata", "OSD Metadata"),
			metadataLabels,
		),

		OSDDownDesc: newDesc(
			fmt.Sprintf("%s_osd_down", cephNamespace),
			"Number of OSDs down in the cluster",
			append([]string{"status"}, perOSDLabels...),
		),

		ScrubbingStateDesc: newDesc(
			fmt.Sprintf("%s_osd_scrub_state", cephNamespace),
			"State of OSDs involved in a scrub",
			perOSDLabels,
		),

		PGObjectsRecoveredDesc: newDesc(
			fmt.Sprintf("%s_pg_objects_recovered", cephNamespace),
			"Number of objects recovered in a PG",
			[]string{"pgid"},
		),

		OSDObjectsBackfilled: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace:   cephNamespace,
				Name:        "osd_objects_backfilled",
				Help:        "Average number of objects backfilled in an OSD",
				ConstLabels: constLabels,
			},
			append([]string{"pgid"}, perOSDLabels...),
		),

		OldestInactivePG: clusterGauge(
			"pg_oldest_inactive",
			"The amount of time in seconds that the oldest PG has been inactive for",
		),
	}

	// The loop runs for the lifetime of the process; nothing in this file
	// stops it.
	go collector.oldestInactivePGLoop()
	return collector
}

// collectorList returns every metric owned by the collector that implements
// prometheus.Collector, in the order in which they are described and
// collected. Metrics exposed through bare descriptors are not part of it.
func (o *OSDCollector) collectorList() []prometheus.Collector {
	list := make([]prometheus.Collector, 0, 27)

	// "osd df" per-OSD values and the upmap table size.
	list = append(list,
		o.CrushWeight, o.Depth, o.Reweight,
		o.Bytes, o.UsedBytes, o.AvailBytes,
		o.Utilization, o.Variance, o.Pgs,
		o.PgUpmapItemsTotal,
	)

	// Cluster-wide totals.
	list = append(list, o.TotalBytes, o.TotalUsedBytes, o.TotalAvailBytes, o.AverageUtil)

	// Latency, state and metadata.
	list = append(list, o.CommitLatency, o.ApplyLatency, o.OSDIn, o.OSDUp, o.OSDMetadata)

	// Fullness thresholds and flags.
	list = append(list,
		o.OSDFullRatio, o.OSDNearFullRatio, o.OSDBackfillFullRatio,
		o.OSDFull, o.OSDNearFull, o.OSDBackfillFull,
	)

	list = append(list, o.OSDObjectsBackfilled, o.OldestInactivePG)
	return list
}

// cephOSDDF is the subset of the "osd df" JSON response that collectOSDDF
// decodes. Numeric fields are kept as json.Number and converted to float64
// by the caller.
type cephOSDDF struct {
	// OSDNodes holds one entry per node of the "nodes" array.
	OSDNodes []struct {
		Name        string      `json:"name"`
		CrushWeight json.Number `json:"crush_weight"`
		Depth       json.Number `json:"depth"`
		Reweight    json.Number `json:"reweight"`
		KB          json.Number `json:"kb"`       // multiplied by 1024 before being exported
		UsedKB      json.Number `json:"kb_used"`  // multiplied by 1024 before being exported
		AvailKB     json.Number `json:"kb_avail"` // multiplied by 1024 before being exported
		Utilization json.Number `json:"utilization"`
		Variance    json.Number `json:"var"`
		Pgs         json.Number `json:"pgs"`
	} `json:"nodes"`

	// Summary holds the totals of the "summary" object.
	Summary struct {
		TotalKB      json.Number `json:"total_kb"`
		TotalUsedKB  json.Number `json:"total_kb_used"`
		TotalAvailKB json.Number `json:"total_kb_avail"`
		AverageUtil  json.Number `json:"average_utilization"`
	} `json:"summary"`
}

// cephPerfStat lists the per-OSD latency figures found under
// "osd_perf_infos". The latencies are in milliseconds; collectOSDPerf divides
// them by 1000 before exporting them.
type cephPerfStat struct {
	PerfInfo []struct {
		ID    json.Number `json:"id"`
		Stats struct {
			CommitLatency json.Number `json:"commit_latency_ms"`
			ApplyLatency  json.Number `json:"apply_latency_ms"`
		} `json:"perf_stats"`
	} `json:"osd_perf_infos"`
}

// CephOSDPerfStat is the type collectOSDPerf decodes the "osd perf" response
// into. The embedded cephPerfStat carries an explicit JSON name, so
// encoding/json reads it from the "osdstats" key rather than from the top
// level of the document, while Go code can still reach PerfInfo directly.
type CephOSDPerfStat struct {
	cephPerfStat `json:"osdstats"`
}

// cephOSDDump is the subset of the "osd dump" JSON response that
// collectOSDDump decodes.
type cephOSDDump struct {
	// OSDs holds one entry per OSD of the "osds" array.
	OSDs []struct {
		OSD   json.Number `json:"osd"`
		Up    json.Number `json:"up"`
		In    json.Number `json:"in"`
		State []string    `json:"state"` // checked for "full", "nearfull" and "backfillfull"
	} `json:"osds"`

	// PgUpmapItems is the pg-upmap exception table; only its length is
	// exported.
	PgUpmapItems []struct {
		PgID     string `json:"pgid"`
		Mappings []struct {
			From int `json:"from"`
			To   int `json:"to"`
		} `json:"mappings"`
	} `json:"pg_upmap_items"`

	FullRatio         json.Number `json:"full_ratio"`
	NearFullRatio     json.Number `json:"nearfull_ratio"`
	BackfillFullRatio json.Number `json:"backfillfull_ratio"`
}

// cephOSDTree is the shape buildOSDLabels decodes the "osd tree" JSON
// response into. Only Nodes is read by buildOSDLabels; Stray is decoded but
// not used there.
type cephOSDTree struct {
	// Nodes lists every node of the tree; Children holds the IDs of the
	// nodes directly below a node.
	Nodes []struct {
		ID          int64   `json:"id"`
		Name        string  `json:"name"`
		Type        string  `json:"type"`
		Status      string  `json:"status"`
		Class       string  `json:"device_class"`
		CrushWeight float64 `json:"crush_weight"`
		Children    []int64 `json:"children"`
	} `json:"nodes"`
	Stray []struct {
		ID          int64   `json:"id"`
		Name        string  `json:"name"`
		Type        string  `json:"type"`
		Status      string  `json:"status"`
		CrushWeight float64 `json:"crush_weight"`
		Children    []int   `json:"children"`
	} `json:"stray"`
}

// osdNode is a single entry of the "nodes" or "stray" array as decoded by
// cephOSDTreeDown.
type osdNode struct {
	ID     int64  `json:"id"`
	Name   string `json:"name"`
	Type   string `json:"type"` // collectOSDTreeDown only reports entries whose type is "osd"
	Status string `json:"status"`
}

// cephOSDTreeDown is the shape collectOSDTreeDown decodes the response of
// "osd tree" restricted to the "down" state into.
type cephOSDTreeDown struct {
	Nodes []osdNode `json:"nodes"`
	Stray []osdNode `json:"stray"`
}

// cephPGDumpBrief is the shape performPGDumpBrief decodes the brief PG dump
// into.
type cephPGDumpBrief struct {
	PGStats []struct {
		PGID          string `json:"pgid"`
		ActingPrimary int64  `json:"acting_primary"`
		Acting        []int  `json:"acting"` // OSD IDs; used as keys of the scrub cache
		State         string `json:"state"`  // searched for the substrings "scrubbing", "deep" and "active"
	} `json:"pg_stats"`
}

// cephOSDLabel describes one node of the OSD tree together with the names of
// the host, rack and root found above it. buildOSDLabels creates one per tree
// node and returns those whose type is "osd".
type cephOSDLabel struct {
	ID          int64   `json:"id"`
	Name        string  `json:"name"`
	Type        string  `json:"type"`
	Status      string  `json:"status"`
	DeviceClass string  `json:"device_class"`
	CrushWeight float64 `json:"crush_weight"`
	Root        string  `json:"root"` // name of the closest ancestor of type "root", if any
	Rack        string  `json:"rack"` // name of the closest ancestor of type "rack", if any
	Host        string  `json:"host"` // name of the closest ancestor of type "host", if any
	parent      int64   // parent id when building tables
}

// cephOSDMetadata is one element of the "osd metadata" JSON array as decoded
// by collectOSDMetadata; every field becomes a label value of the metadata
// metric.
type cephOSDMetadata struct {
	ID                     int    `json:"id"`
	CephVersionWhenCreated string `json:"ceph_version_when_created"`
	CreatedAt              string `json:"created_at"`
	OsdObjectstore         string `json:"osd_objectstore"`
}

// collectOSDDF runs "osd df" through the manager and updates the per-OSD
// weight, capacity and utilization gauges as well as the cluster-wide summary
// gauges.
//
// It returns the first error encountered while running the command, decoding
// the response or parsing a required numeric field; gauges set before that
// point keep their new value. A "pgs" value that cannot be parsed is skipped
// without failing the collection (the struct documents the field as only
// available from Ceph Jewel onwards).
func (o *OSDCollector) collectOSDDF() error {
	args := o.cephOSDDFCommand()
	buf, _, err := o.conn.MgrCommand(args)
	if err != nil {
		o.logger.WithError(err).WithField(
			"args", string(bytes.Join(args, []byte(","))),
		).Error("error executing mgr command")

		return err
	}

	// Workaround for Ceph Jewel after 10.2.5 produces invalid json when OSD is out
	buf = bytes.Replace(buf, []byte("-nan"), []byte("0"), -1)

	osdDF := &cephOSDDF{}
	if err := json.Unmarshal(buf, osdDF); err != nil {
		return err
	}

	for _, node := range osdDF.OSDNodes {
		lb := o.getOSDLabelFromName(node.Name)
		labelValues := []string{node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root}

		// Required fields, in the order they are exported. scale converts the
		// reported unit into the exported one (KB to bytes where relevant).
		required := []struct {
			field string
			raw   json.Number
			vec   *prometheus.GaugeVec
			scale float64
		}{
			{"crush_weight", node.CrushWeight, o.CrushWeight, 1},
			{"depth", node.Depth, o.Depth, 1},
			{"reweight", node.Reweight, o.Reweight, 1},
			{"kb", node.KB, o.Bytes, 1024},
			{"kb_used", node.UsedKB, o.UsedBytes, 1024},
			{"kb_avail", node.AvailKB, o.AvailBytes, 1024},
			{"utilization", node.Utilization, o.Utilization, 1},
			{"var", node.Variance, o.Variance, 1},
		}
		for _, m := range required {
			value, err := m.raw.Float64()
			if err != nil {
				// Every required field fails the collection, "kb" included:
				// returning nil here would silently drop all remaining OSDs
				// and the summary while reporting success.
				return fmt.Errorf("parsing %s of %s: %w", m.field, node.Name, err)
			}
			m.vec.WithLabelValues(labelValues...).Set(value * m.scale)
		}

		// Optional field: best effort only.
		if pgs, err := node.Pgs.Float64(); err == nil {
			o.Pgs.WithLabelValues(labelValues...).Set(pgs)
		}
	}

	summary := []struct {
		field string
		raw   json.Number
		gauge prometheus.Gauge
		scale float64
	}{
		{"total_kb", osdDF.Summary.TotalKB, o.TotalBytes, 1024},
		{"total_kb_used", osdDF.Summary.TotalUsedKB, o.TotalUsedBytes, 1024},
		{"total_kb_avail", osdDF.Summary.TotalAvailKB, o.TotalAvailBytes, 1024},
		{"average_utilization", osdDF.Summary.AverageUtil, o.AverageUtil, 1},
	}
	for _, m := range summary {
		value, err := m.raw.Float64()
		if err != nil {
			return fmt.Errorf("parsing summary %s: %w", m.field, err)
		}
		m.gauge.Set(value * m.scale)
	}

	return nil
}

// collectOSDMetadata runs "osd metadata" through the monitor and sets the
// metadata gauge to 1 for every OSD returned, labeled with the OSD ID, its
// object store, the Ceph version it was created with and its creation time.
// It returns the command or decoding error, if any.
func (o *OSDCollector) collectOSDMetadata() error {
	request := o.cephOSDMetadataCommand()

	response, _, err := o.conn.MonCommand(request)
	if err != nil {
		entry := o.logger.WithError(err).WithField("args", string(request))
		entry.Error("error executing mon command")
		return err
	}

	var entries []cephOSDMetadata
	if err = json.Unmarshal(response, &entries); err != nil {
		return err
	}

	for i := range entries {
		md := entries[i]
		o.OSDMetadata.WithLabelValues(
			strconv.Itoa(md.ID),
			md.OsdObjectstore,
			md.CephVersionWhenCreated,
			md.CreatedAt,
		).Set(1)
	}

	return nil
}

// collectOSDPerf runs "osd perf" through the manager and updates the commit
// and apply latency gauges of every OSD in the response. The reported
// latencies are in milliseconds and are exported in seconds.
//
// It returns the first error encountered while running the command, decoding
// the response or parsing a numeric field; gauges set before that point keep
// their new value.
func (o *OSDCollector) collectOSDPerf() error {
	args := o.cephOSDPerfCommand()
	buf, _, err := o.conn.MgrCommand(args)
	if err != nil {
		// The command goes to the manager, so say so in the log; the other
		// MgrCommand callers in this file use the same wording.
		o.logger.WithError(err).WithField(
			"args", string(bytes.Join(args, []byte(","))),
		).Error("error executing mgr command")

		return err
	}

	osdPerf := &CephOSDPerfStat{}
	if err := json.Unmarshal(buf, osdPerf); err != nil {
		return err
	}

	for _, perfStat := range osdPerf.PerfInfo {
		osdID, err := perfStat.ID.Int64()
		if err != nil {
			return err
		}
		osdName := fmt.Sprintf(osdLabelFormat, osdID)

		lb := o.getOSDLabelFromID(osdID)

		commitLatency, err := perfStat.Stats.CommitLatency.Float64()
		if err != nil {
			return err
		}
		o.CommitLatency.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(commitLatency / 1000)

		applyLatency, err := perfStat.Stats.ApplyLatency.Float64()
		if err != nil {
			return err
		}
		o.ApplyLatency.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(applyLatency / 1000)
	}

	return nil
}

// buildOSDLabels decodes an "osd tree" JSON document and returns, keyed by
// node ID, a label for every node of type "osd". Each label carries the names
// of the closest "host", "rack" and "root" ancestors of the node, left empty
// when the node has no ancestor of that type.
//
// It returns an error only when data cannot be decoded.
func buildOSDLabels(data []byte) (map[int64]*cephOSDLabel, error) {
	var tree cephOSDTree
	if err := json.Unmarshal(data, &tree); err != nil {
		return nil, err
	}

	// First pass: index every node by ID. math.MaxInt64 marks "no parent".
	byID := make(map[int64]*cephOSDLabel)
	for _, n := range tree.Nodes {
		byID[n.ID] = &cephOSDLabel{
			ID:          n.ID,
			Name:        n.Name,
			Type:        n.Type,
			Status:      n.Status,
			DeviceClass: n.Class,
			CrushWeight: n.CrushWeight,
			parent:      math.MaxInt64,
		}
	}

	// Second pass: each node tells its known children who their parent is.
	for _, n := range tree.Nodes {
		for _, childID := range n.Children {
			child, known := byID[childID]
			if !known {
				continue
			}
			child.parent = n.ID
		}
	}

	// ancestorOfType climbs the parent links from start until it meets a node
	// of the requested type.
	var ancestorOfType func(start *cephOSDLabel, kind string) (*cephOSDLabel, bool)
	ancestorOfType = func(start *cephOSDLabel, kind string) (*cephOSDLabel, bool) {
		up, found := byID[start.parent]
		if !found {
			return nil, false
		}
		if up.Type == kind {
			return up, true
		}
		return ancestorOfType(up, kind)
	}

	// Resolve the ancestors of every node and keep only the OSDs.
	osds := make(map[int64]*cephOSDLabel)
	for id, label := range byID {
		if host, ok := ancestorOfType(label, "host"); ok {
			label.Host = host.Name
		}
		if rack, ok := ancestorOfType(label, "rack"); ok {
			label.Rack = rack.Name
		}
		if root, ok := ancestorOfType(label, "root"); ok {
			label.Root = root.Name
		}

		if label.Type == "osd" {
			osds[id] = label
		}
	}
	return osds, nil
}

// buildOSDLabelCache runs "osd tree" through the monitor and replaces the
// label cache with the labels built from the response. On any error the
// existing cache is left untouched and the error is returned.
func (o *OSDCollector) buildOSDLabelCache() error {
	request := o.cephOSDTreeCommand()

	tree, _, err := o.conn.MonCommand(request)
	if err != nil {
		entry := o.logger.WithError(err).WithField("args", string(request))
		entry.Error("error executing mon command")
		return err
	}

	labels, err := buildOSDLabels(tree)
	if err == nil {
		o.osdLabelsCache = labels
	}
	return err
}

// getOSDLabelFromID returns the cached label of the OSD with the given ID, or
// a fresh empty label when the ID is not in the cache. It never returns nil.
func (o *OSDCollector) getOSDLabelFromID(id int64) *cephOSDLabel {
	cached, found := o.osdLabelsCache[id]
	if !found {
		return &cephOSDLabel{}
	}
	return cached
}

// getOSDLabelFromName extracts the numeric ID from a name of the form
// "osd.<id>" and returns the cached label for it. A name that cannot be
// scanned that way yields a fresh empty label.
func (o *OSDCollector) getOSDLabelFromName(osdid string) *cephOSDLabel {
	var parsedID int64
	if matched, err := fmt.Sscanf(osdid, "osd.%d", &parsedID); err == nil && matched == 1 {
		return o.getOSDLabelFromID(parsedID)
	}
	return &cephOSDLabel{}
}

// collectOSDTreeDown runs "osd tree" restricted to the "down" state through
// the monitor and sends one constant gauge of value 1 on ch for every entry
// of type "osd" found under "nodes" and then under "stray". It returns the
// command or decoding error, if any.
func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error {
	request := o.cephOSDTreeCommand("down")

	response, _, err := o.conn.MonCommand(request)
	if err != nil {
		entry := o.logger.WithError(err).WithField("args", string(request))
		entry.Error("error executing mon command")
		return err
	}

	var tree cephOSDTreeDown
	if err = json.Unmarshal(response, &tree); err != nil {
		return err
	}

	// Regular nodes first, stray ones afterwards.
	for _, group := range [][]osdNode{tree.Nodes, tree.Stray} {
		for _, item := range group {
			if item.Type != "osd" {
				continue
			}

			lb := o.getOSDLabelFromName(item.Name)
			ch <- prometheus.MustNewConstMetric(
				o.OSDDownDesc,
				prometheus.GaugeValue,
				1,
				item.Status, item.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root,
			)
		}
	}

	return nil
}

// collectOSDDump runs "osd dump" through the monitor and updates the
// cluster-wide fullness ratios, the size of the pg-upmap table and, for every
// OSD, its in/up state and its full, nearfull and backfillfull flags.
//
// It returns the first error encountered while running the command, decoding
// the response or parsing a numeric field; gauges set before that point keep
// their new value.
func (o *OSDCollector) collectOSDDump() error {
	request := o.cephOSDDump()

	response, _, err := o.conn.MonCommand(request)
	if err != nil {
		entry := o.logger.WithError(err).WithField("args", string(request))
		entry.Error("error executing mon command")
		return err
	}

	var dump cephOSDDump
	if err = json.Unmarshal(response, &dump); err != nil {
		return err
	}

	// All three ratios must parse before any of them is published.
	fullRatio, err := dump.FullRatio.Float64()
	if err != nil {
		return err
	}
	nearFullRatio, err := dump.NearFullRatio.Float64()
	if err != nil {
		return err
	}
	backfillFullRatio, err := dump.BackfillFullRatio.Float64()
	if err != nil {
		return err
	}

	o.OSDFullRatio.Set(fullRatio)
	o.OSDNearFullRatio.Set(nearFullRatio)
	o.OSDBackfillFullRatio.Set(backfillFullRatio)
	o.PgUpmapItemsTotal.Set(float64(len(dump.PgUpmapItems)))

	for _, osd := range dump.OSDs {
		id, err := osd.OSD.Int64()
		if err != nil {
			return err
		}
		lb := o.getOSDLabelFromID(id)
		labelValues := []string{fmt.Sprintf(osdLabelFormat, id), lb.DeviceClass, lb.Host, lb.Rack, lb.Root}

		inState, err := osd.In.Float64()
		if err != nil {
			return err
		}
		o.OSDIn.WithLabelValues(labelValues...).Set(inState)

		upState, err := osd.Up.Float64()
		if err != nil {
			return err
		}
		o.OSDUp.WithLabelValues(labelValues...).Set(upState)

		// Each flag is 1 when the matching state is listed and 0 otherwise.
		var full, nearFull, backfillFull float64
		for _, state := range osd.State {
			if state == "full" {
				full = 1
			} else if state == "nearfull" {
				nearFull = 1
			} else if state == "backfillfull" {
				backfillFull = 1
			}
		}
		o.OSDFull.WithLabelValues(labelValues...).Set(full)
		o.OSDNearFull.WithLabelValues(labelValues...).Set(nearFull)
		o.OSDBackfillFull.WithLabelValues(labelValues...).Set(backfillFull)
	}

	return nil
}

// performPGDumpBrief runs the brief PG dump through the manager and returns
// the decoded response. A command failure is logged and returned; a decoding
// failure is returned as is.
func (o *OSDCollector) performPGDumpBrief() (*cephPGDumpBrief, error) {
	request := o.cephPGDumpCommand()

	response, _, err := o.conn.MgrCommand(request)
	if err != nil {
		joined := string(bytes.Join(request, []byte(",")))
		o.logger.WithError(err).WithField("args", joined).Error("error executing mgr command")
		return nil, err
	}

	dump := new(cephPGDumpBrief)
	if err = json.Unmarshal(response, dump); err != nil {
		return nil, err
	}
	return dump, nil
}

// collectOSDScrubState fetches the brief PG dump, derives the scrub state of
// every OSD from the PGs it acts for, and sends one constant gauge per known
// OSD on ch. It returns the error of the PG dump, if any.
func (o *OSDCollector) collectOSDScrubState(ch chan<- prometheus.Metric) error {
	dump, err := o.performPGDumpBrief()
	if err != nil {
		return err
	}

	// A scrub may have finished since the previous scrape, so every OSD seen
	// so far goes back to idle before the current states are applied. Keeping
	// the cache means previously discovered OSDs are still reported (as idle)
	// instead of disappearing. It may become unnecessary with Prometheus 2.0
	// if unreported/abandoned gauges can be tuned to read as 0.
	for osdID := range o.osdScrubCache {
		o.osdScrubCache[osdID] = scrubStateIdle
	}

	for _, pg := range dump.PGStats {
		if !strings.Contains(pg.State, "scrubbing") {
			continue
		}

		state := scrubStateScrubbing
		if strings.Contains(pg.State, "deep") {
			state = scrubStateDeepScrubbing
		}
		for _, osdID := range pg.Acting {
			o.osdScrubCache[osdID] = state
		}
	}

	for osdID, state := range o.osdScrubCache {
		lb := o.getOSDLabelFromID(int64(osdID))
		ch <- prometheus.MustNewConstMetric(
			o.ScrubbingStateDesc,
			prometheus.GaugeValue,
			float64(state),
			fmt.Sprintf(osdLabelFormat, osdID), lb.DeviceClass, lb.Host, lb.Rack, lb.Root,
		)
	}

	return nil
}

// cephOSDDump returns the JSON-encoded "osd dump" command. A marshalling
// failure is reported through the logger's Panic method.
func (o *OSDCollector) cephOSDDump() []byte {
	request := map[string]interface{}{
		"prefix": "osd dump",
		"format": jsonFormat,
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph osd dump")
	}
	return encoded
}

// cephOSDDFCommand returns the JSON-encoded "osd df" command wrapped in the
// slice form expected by MgrCommand. A marshalling failure is reported
// through the logger's Panic method.
func (o *OSDCollector) cephOSDDFCommand() [][]byte {
	request := map[string]interface{}{
		"prefix": "osd df",
		"format": jsonFormat,
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph osd df")
	}
	return [][]byte{encoded}
}

// cephOSDPerfCommand returns the JSON-encoded "osd perf" command wrapped in
// the slice form expected by MgrCommand. A marshalling failure is reported
// through the logger's Panic method.
func (o *OSDCollector) cephOSDPerfCommand() [][]byte {
	request := map[string]interface{}{
		"prefix": "osd perf",
		"format": jsonFormat,
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph osd perf")
	}
	return [][]byte{encoded}
}

// cephOSDMetadataCommand returns the JSON-encoded "osd metadata" command. A
// marshalling failure is reported through the logger's Panic method.
func (o *OSDCollector) cephOSDMetadataCommand() []byte {
	request := map[string]interface{}{
		"prefix": "osd metadata",
		"format": jsonFormat,
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph osd metadata")
	}
	return encoded
}

// cephOSDTreeCommand returns the JSON-encoded "osd tree" command. When at
// least one state is given, the states are passed along under the "states"
// key; otherwise the key is omitted. A marshalling failure is reported
// through the logger's Panic method.
func (o *OSDCollector) cephOSDTreeCommand(states ...string) []byte {
	request := make(map[string]interface{}, 3)
	request["prefix"] = "osd tree"
	request["format"] = jsonFormat
	if len(states) != 0 {
		request["states"] = states
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph osd tree")
	}
	return encoded
}

// cephPGDumpCommand returns the JSON-encoded "pg dump" command limited to the
// "pgs_brief" contents, wrapped in the slice form expected by MgrCommand. A
// marshalling failure is reported through the logger's Panic method.
func (o *OSDCollector) cephPGDumpCommand() [][]byte {
	request := map[string]interface{}{
		"prefix":       "pg dump",
		"dumpcontents": []string{"pgs_brief"},
		"format":       jsonFormat,
	}

	encoded, err := json.Marshal(request)
	if err != nil {
		o.logger.WithError(err).Panic("error marshalling ceph pg dump")
	}
	return [][]byte{encoded}
}

// oldestInactivePGLoop never returns. On every pass it fetches the brief PG
// dump, updates the record of when each PG was first seen without an active
// state, and sets the oldest-inactive-PG gauge to the age in whole seconds of
// the oldest such record (0 when every PG is active). It sleeps for
// oldestInactivePGUpdatePeriod after each pass, failed ones included.
func (o *OSDCollector) oldestInactivePGLoop() {
	for ; ; time.Sleep(oldestInactivePGUpdatePeriod) {
		dump, err := o.performPGDumpBrief()
		if err != nil {
			o.logger.WithError(err).Warning("failed to get latest PG dump for oldest inactive PG update")
			continue
		}

		now := time.Now()
		oldest := now

		for _, pg := range dump.PGStats {
			// A PG that is active again is no longer tracked.
			if strings.Contains(pg.State, "active") {
				delete(o.oldestInactivePGMap, pg.PGID)
				continue
			}

			// Start tracking a PG the first time it is seen inactive.
			firstSeen, tracked := o.oldestInactivePGMap[pg.PGID]
			if !tracked {
				firstSeen = now
				o.oldestInactivePGMap[pg.PGID] = now
			}

			if firstSeen.Before(oldest) {
				oldest = firstSeen
			}
		}

		o.OldestInactivePG.Set(float64(now.Unix() - oldest.Unix()))
	}
}

// Describe writes to ch the descriptors of every metric in collectorList,
// followed by the descriptors of the metrics that are emitted as constant
// metrics.
func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) {
	collectors := o.collectorList()
	for i := range collectors {
		collectors[i].Describe(ch)
	}

	standalone := [...]*prometheus.Desc{
		o.OSDDownDesc,
		o.ScrubbingStateDesc,
		o.PGObjectsRecoveredDesc,
	}
	for _, desc := range standalone {
		ch <- desc
	}
}

// Collect refreshes every OSD metric and sends the results to ch.
//
// It first resets the per-OSD vectors, then rebuilds the OSD label cache, then
// runs the individual collection steps concurrently and waits for all of
// them, and finally collects every metric in collectorList. A failing step is
// logged and does not prevent the others from reporting. If the label cache
// cannot be rebuilt the failure is logged and the previous cache is used.
//
// version is accepted for interface compatibility and is not used here.
// It requires the caller to handle synchronization.
func (o *OSDCollector) Collect(ch chan<- prometheus.Metric, version *Version) {
	// Reset daemon specific metrics; daemons can leave the cluster. The
	// fullness flags are per-OSD too, so they are reset with the rest to
	// avoid exporting stale series for OSDs that no longer exist.
	for _, vec := range []*prometheus.GaugeVec{
		o.CrushWeight,
		o.Depth,
		o.Reweight,
		o.Bytes,
		o.UsedBytes,
		o.AvailBytes,
		o.Utilization,
		o.Variance,
		o.Pgs,
		o.CommitLatency,
		o.ApplyLatency,
		o.OSDIn,
		o.OSDUp,
		o.OSDMetadata,
		o.OSDFull,
		o.OSDNearFull,
		o.OSDBackfillFull,
	} {
		vec.Reset()
	}

	// The cache is rebuilt before any goroutine starts, so the steps below
	// only ever read it.
	if err := o.buildOSDLabelCache(); err != nil {
		o.logger.WithError(err).Error("error building OSD label cache")
	}

	steps := []struct {
		failMsg string
		run     func() error
	}{
		{"error collecting OSD perf metrics", o.collectOSDPerf},
		{"error collecting OSD metadata metrics", o.collectOSDMetadata},
		{"error collecting OSD dump metrics", o.collectOSDDump},
		{"error collecting OSD df metrics", o.collectOSDDF},
		{"error collecting OSD tree down metrics", func() error { return o.collectOSDTreeDown(ch) }},
		{"error collecting OSD scrub metrics", func() error { return o.collectOSDScrubState(ch) }},
	}

	var wg sync.WaitGroup
	wg.Add(len(steps))
	for _, step := range steps {
		// Pass the step by value so the goroutine does not depend on the
		// loop variable's per-iteration semantics.
		go func(failMsg string, run func() error) {
			defer wg.Done()
			if err := run(); err != nil {
				o.logger.WithError(err).Error(failMsg)
			}
		}(step.failMsg, step.run)
	}
	wg.Wait()

	for _, metric := range o.collectorList() {
		metric.Collect(ch)
	}
}