Mirror of https://github.com/digitalocean/ceph_exporter (synced 2025-03-09 13:37:32 +00:00)

health: capture slow request per osd

Commit: 219fb69bde
Parent: afd5a2c4bf
.travis.yml

@@ -9,6 +9,8 @@ env:
   - DOCKER_TAG=$TRAVIS_TAG
 
 before_install:
+  - wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
+  - echo deb https://download.ceph.com/debian-luminous/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
   - sudo apt-get update
   - sudo apt-get install -y librados-dev librbd-dev
 
Dockerfile (10 changed lines)

@@ -7,9 +7,10 @@ ENV PATH $GOROOT/bin:$PATH
 ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https build-essential git curl
+    apt-get install -y apt-transport-https build-essential git curl wget
 
-RUN echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list
 
 RUN apt-get update && \
     apt-get install -y --force-yes librados-dev librbd-dev

@@ -28,8 +29,9 @@ FROM ubuntu:16.04
 MAINTAINER Vaibhav Bhembre <vaibhav@digitalocean.com>
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https curl && \
-    echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list && \
+    apt-get install -y apt-transport-https curl wget
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
     apt-get update && \
     apt-get install -y --force-yes librados2 librbd1 && \
     rm -rf /var/lib/apt/lists/*
collectors/health.go

@@ -21,6 +21,7 @@ import (
     "fmt"
     "log"
     "regexp"
+    "sort"
     "strconv"
     "strings"
 
@@ -102,8 +103,13 @@ type ClusterHealthCollector struct {
     DeepScrubbingPGs prometheus.Gauge
 
     // SlowRequests depicts no. of total slow requests in the cluster
+    // This stat exists only for backwards compatibility.
     SlowRequests prometheus.Gauge
 
+    // SlowRequestsByOSD depicts no. of total slow requests in the cluster
+    // labelled by OSD
+    SlowRequestsByOSD *prometheus.GaugeVec
+
     // DegradedObjectsCount gives the no. of RADOS objects are constitute the degraded PGs.
     // This includes object replicas in its count.
     DegradedObjectsCount prometheus.Gauge
@@ -227,6 +233,15 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
             ConstLabels: labels,
             },
         ),
+        SlowRequestsByOSD: prometheus.NewGaugeVec(
+            prometheus.GaugeOpts{
+                Namespace:   cephNamespace,
+                Name:        "slow_requests_osd",
+                Help:        "No. of slow requests",
+                ConstLabels: labels,
+            },
+            []string{"osd"},
+        ),
         DegradedPGs: prometheus.NewGauge(
             prometheus.GaugeOpts{
                 Namespace: cephNamespace,
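For context, cephNamespace elsewhere in this exporter resolves to "ceph" and the collector attaches a constant cluster label, so this gauge vector is exported as ceph_slow_requests_osd with one child series per osd label (the test expectations near the end of this diff show exactly that form). A minimal standalone sketch, assuming the "ceph" namespace and a cluster="ceph" constant label, of how such a vector surfaces through a registry:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    // Mirrors the GaugeVec added in the diff, assuming cephNamespace == "ceph"
    // and a constant cluster="ceph" label.
    slowByOSD := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Namespace:   "ceph",
            Name:        "slow_requests_osd",
            Help:        "No. of slow requests",
            ConstLabels: prometheus.Labels{"cluster": "ceph"},
        },
        []string{"osd"},
    )

    reg := prometheus.NewRegistry()
    reg.MustRegister(slowByOSD)
    slowByOSD.WithLabelValues("349").Set(272)

    mfs, err := reg.Gather()
    if err != nil {
        panic(err)
    }
    // Prints the fully-qualified metric family name: ceph_slow_requests_osd
    fmt.Println(mfs[0].GetName())
}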
@@ -446,6 +461,12 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
     }
 }
 
+func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
+    return []prometheus.Collector{
+        c.SlowRequestsByOSD,
+    }
+}
+
 func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
     return []prometheus.Metric{
         c.HealthStatus,
@@ -527,6 +548,18 @@ type cephHealthStats struct {
     } `json:"pgmap"`
 }
 
+type cephHealthDetailStats struct {
+    Checks map[string]struct {
+        Details []struct {
+            Message string `json:"message"`
+        } `json:"detail"`
+        Summary struct {
+            Message string `json:"message"`
+        } `json:"summary"`
+        Severity string `json:"severity"`
+    } `json:"checks"`
+}
+
 func (c *ClusterHealthCollector) collect() error {
     cmd := c.cephJSONUsage()
     buf, _, err := c.conn.MonCommand(cmd)
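The new cephHealthDetailStats type mirrors the JSON emitted by `ceph health detail -f json`. A small standalone sketch (not part of the commit) that decodes a trimmed-down REQUEST_SLOW payload, shaped like the test fixtures near the end of this diff, into an equivalent struct:

package main

import (
    "encoding/json"
    "fmt"
)

// healthDetail has the same shape as the cephHealthDetailStats type added above.
type healthDetail struct {
    Checks map[string]struct {
        Details []struct {
            Message string `json:"message"`
        } `json:"detail"`
        Summary struct {
            Message string `json:"message"`
        } `json:"summary"`
        Severity string `json:"severity"`
    } `json:"checks"`
}

func main() {
    payload := []byte(`{"checks":{"REQUEST_SLOW":{"severity":"HEALTH_WARN",
        "summary":{"message":"286 slow requests are blocked > 32 sec"},
        "detail":[{"message":"osd.349 has blocked requests > 524.288 sec"}]}}}`)

    var hd healthDetail
    if err := json.Unmarshal(payload, &hd); err != nil {
        panic(err)
    }
    fmt.Println(hd.Checks["REQUEST_SLOW"].Severity)           // HEALTH_WARN
    fmt.Println(hd.Checks["REQUEST_SLOW"].Details[0].Message) // osd.349 has blocked requests > 524.288 sec
}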
@@ -769,6 +802,105 @@ func (c *ClusterHealthCollector) collect() error {
     c.RemappedPGs.Set(stats.OSDMap.OSDMap.NumRemappedPGs)
     c.TotalPGs.Set(stats.PGMap.NumPGs)
 
+    cmd = c.cephHealthDetailCommand()
+    buf, _, err = c.conn.MonCommand(cmd)
+    if err != nil {
+        return err
+    }
+
+    hdstats := &cephHealthDetailStats{}
+    if err := json.Unmarshal(buf, hdstats); err != nil {
+        return err
+    }
+
+    var (
+        slowOpsBlockedRegex         = regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
+        slowRequestSingleOSDRegex   = regexp.MustCompile(`osd.([\d]+) has blocked requests > ([\d\.]+) sec`)
+        slowRequestMultipleOSDRegex = regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)
+
+        secToOpsBlocked     = make(map[float64]int)
+        osdToSecondsBlocked = make(map[int]float64)
+    )
+
+    for key, check := range hdstats.Checks {
+        if key == "REQUEST_SLOW" {
+            for _, detail := range check.Details {
+                matched := slowOpsBlockedRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    v, err := strconv.Atoi(matched[1])
+                    if err != nil {
+                        return err
+                    }
+
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    secToOpsBlocked[f] = v
+                    continue
+                }
+
+                matched = slowRequestSingleOSDRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    v, err := strconv.Atoi(matched[1])
+                    if err != nil {
+                        return err
+                    }
+
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    osdToSecondsBlocked[v] = f
+                    continue
+                }
+
+                matched = slowRequestMultipleOSDRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    for _, osdID := range strings.Split(matched[1], ",") {
+                        oid, err := strconv.Atoi(osdID)
+                        if err != nil {
+                            return err
+                        }
+
+                        osdToSecondsBlocked[oid] = f
+                    }
+                    continue
+                }
+            }
+        }
+    }
+
+    secs := make([]float64, len(secToOpsBlocked))
+    for sec := range secToOpsBlocked {
+        secs = append(secs, sec)
+    }
+    sort.Float64s(secs)
+
+    totalOpsUntilNow := 0
+    totalOpsSet := false
+    for _, sec := range secs {
+        totalOpsUntilNow += secToOpsBlocked[sec]
+        for osd, osec := range osdToSecondsBlocked {
+            if sec == osec {
+                c.SlowRequestsByOSD.WithLabelValues(strconv.Itoa(osd)).Set(float64(totalOpsUntilNow))
+                totalOpsSet = true
+            }
+        }
+
+        if totalOpsSet {
+            totalOpsUntilNow = 0
+            totalOpsSet = false
+        }
+    }
+
     return nil
 }
 
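The block above fetches `health detail` from the monitors, classifies each REQUEST_SLOW detail message with one of three regular expressions (per-bucket op counts, a single blocked OSD, a comma-separated list of blocked OSDs), then walks the blocked-time buckets in ascending order and credits the running op total to whichever OSDs are reported at that threshold, resetting the total after each match. A standalone sketch of just the message classification, reusing the patterns from the diff against messages taken from the test fixture below:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    opsBlocked := regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
    singleOSD := regexp.MustCompile(`osd.([\d]+) has blocked requests > ([\d\.]+) sec`)
    multiOSD := regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)

    msgs := []string{
        "102 ops are blocked > 524.288 sec",
        "osd.349 has blocked requests > 524.288 sec",
        "osds 363,463 have blocked requests > 32.768 sec",
    }
    for _, m := range msgs {
        // Checked in the same order as in collect().
        switch {
        case opsBlocked.MatchString(m):
            fmt.Println("bucket:", opsBlocked.FindStringSubmatch(m)[1:]) // [count seconds]
        case singleOSD.MatchString(m):
            fmt.Println("single osd:", singleOSD.FindStringSubmatch(m)[1:]) // [osd seconds]
        case multiOSD.MatchString(m):
            fmt.Println("multiple osds:", multiOSD.FindStringSubmatch(m)[1:]) // [osd-list seconds]
        }
    }
}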
@@ -800,6 +932,20 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
     return cmd
 }
 
+func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
+    cmd, err := json.Marshal(map[string]interface{}{
+        "prefix": "health",
+        "detail": "detail",
+        "format": jsonFormat,
+    })
+    if err != nil {
+        // panic! because ideally in no world this hard-coded input
+        // should fail.
+        panic(err)
+    }
+    return cmd
+}
+
 func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
     cmd := c.cephPlainUsage()
     buf, _, err := c.conn.MonCommand(cmd)
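The helper only builds the mon command payload. Assuming the package-level jsonFormat constant is the string "json" (as the other command builders here suggest), the marshaled payload comes out as sketched below; note that json.Marshal emits map keys in sorted order:

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // Hypothetical stand-in for the collector's package-level jsonFormat constant.
    const jsonFormat = "json"

    cmd, err := json.Marshal(map[string]interface{}{
        "prefix": "health",
        "detail": "detail",
        "format": jsonFormat,
    })
    if err != nil {
        panic(err)
    }
    fmt.Println(string(cmd)) // {"detail":"detail","format":"json","prefix":"health"}
}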
@@ -1036,6 +1182,10 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
 // Describe sends all the descriptions of individual metrics of ClusterHealthCollector
 // to the provided prometheus channel.
 func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
+    for _, metric := range c.collectorList() {
+        metric.Describe(ch)
+    }
+
     for _, metric := range c.metricsList() {
         ch <- metric.Desc()
     }
@@ -1052,6 +1202,10 @@ func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
         log.Println("failed collecting cluster recovery/client io:", err)
     }
 
+    for _, metric := range c.collectorList() {
+        metric.Collect(ch)
+    }
+
     for _, metric := range c.metricsList() {
         ch <- metric
     }
collectors/health_test.go

@@ -440,6 +440,88 @@ $ sudo ceph -s
         },
         {
             input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 32.768 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+            regexes: []*regexp.Regexp{
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 14`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 14`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 272`),
+            },
+        },
+        {
+            input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 131.072 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+            regexes: []*regexp.Regexp{
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 100`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 100`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 186`),
+            },
+        },
+        {
+            input: `
 {
     "pgmap": {
         "write_op_per_sec": 500,
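The expected gauge values in these fixtures follow from the attribution rule in collect(): per-bucket op counts are accumulated in ascending blocked-time order and the running total is credited to the OSDs reported at that threshold, then reset. A small standalone sketch (not part of the test suite) reproducing the first fixture's numbers; in the second fixture, osds 363 and 463 sit at 131.072 sec and absorb 14+33+53 = 100, leaving 84+102 = 186 for osd.349:

package main

import "fmt"

func main() {
    // Ops-per-bucket from the first fixture, in ascending blocked-time order,
    // with the OSDs reported as blocked at exactly that threshold.
    buckets := []struct {
        sec  float64
        ops  int
        osds []string
    }{
        {32.768, 14, []string{"363", "463"}},
        {65.536, 33, nil},
        {131.072, 53, nil},
        {262.144, 84, nil},
        {524.288, 102, []string{"349"}},
    }

    total := 0
    for _, b := range buckets {
        total += b.ops
        if len(b.osds) > 0 {
            for _, osd := range b.osds {
                // Prints osd=363 14, osd=463 14, osd=349 272, matching the regexes above.
                fmt.Printf("osd=%s slow_requests=%d\n", osd, total)
            }
            total = 0
        }
    }
}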