health: capture slow request per osd

Vaibhav Bhembre 2018-05-09 23:04:07 -04:00
parent afd5a2c4bf
commit 219fb69bde
4 changed files with 244 additions and 4 deletions
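
In effect, the exporter gains a per-OSD slow-request gauge alongside the existing cluster-wide SlowRequests gauge. A rough sketch of how the new series might appear on the exporter's /metrics endpoint, using values from the first test fixture below (the ceph_ prefix and HELP text are assumptions based on the GaugeOpts added in this commit):

# HELP ceph_slow_requests_osd No. of slow requests
# TYPE ceph_slow_requests_osd gauge
ceph_slow_requests_osd{cluster="ceph",osd="349"} 272
ceph_slow_requests_osd{cluster="ceph",osd="363"} 14
ceph_slow_requests_osd{cluster="ceph",osd="463"} 14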

View File

@@ -9,6 +9,8 @@ env:
- DOCKER_TAG=$TRAVIS_TAG
before_install:
- wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
- echo deb https://download.ceph.com/debian-luminous/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
- sudo apt-get update
- sudo apt-get install -y librados-dev librbd-dev

View File

@@ -7,9 +7,10 @@ ENV PATH $GOROOT/bin:$PATH
ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter
RUN apt-get update && \
-apt-get install -y apt-transport-https build-essential git curl
-RUN echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list
+apt-get install -y apt-transport-https build-essential git curl wget
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list
RUN apt-get update && \
apt-get install -y --force-yes librados-dev librbd-dev
@@ -28,8 +29,9 @@ FROM ubuntu:16.04
MAINTAINER Vaibhav Bhembre <vaibhav@digitalocean.com>
RUN apt-get update && \
-apt-get install -y apt-transport-https curl && \
-echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list && \
+apt-get install -y apt-transport-https curl wget
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y --force-yes librados2 librbd1 && \
rm -rf /var/lib/apt/lists/*

View File

@@ -21,6 +21,7 @@ import (
"fmt"
"log"
"regexp"
"sort"
"strconv"
"strings"
@@ -102,8 +103,13 @@ type ClusterHealthCollector struct {
DeepScrubbingPGs prometheus.Gauge
// SlowRequests depicts no. of total slow requests in the cluster
// This stat exists only for backwards compatibility.
SlowRequests prometheus.Gauge
// SlowRequestsByOSD depicts no. of total slow requests in the cluster
// labelled by OSD
SlowRequestsByOSD *prometheus.GaugeVec
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
// This includes object replicas in its count.
DegradedObjectsCount prometheus.Gauge
@@ -227,6 +233,15 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
ConstLabels: labels,
},
),
SlowRequestsByOSD: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "slow_requests_osd",
Help: "No. of slow requests",
ConstLabels: labels,
},
[]string{"osd"},
),
DegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@@ -446,6 +461,12 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
}
}
func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
return []prometheus.Collector{
c.SlowRequestsByOSD,
}
}
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
return []prometheus.Metric{
c.HealthStatus,
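// Note: unlike the plain gauges returned by metricsList below, SlowRequestsByOSD
// is a GaugeVec, i.e. a prometheus.Collector rather than a single prometheus.Metric,
// which is why Describe and Collect further down iterate collectorList separately.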
@@ -527,6 +548,18 @@ type cephHealthStats struct {
} `json:"pgmap"`
}
type cephHealthDetailStats struct {
Checks map[string]struct {
Details []struct {
Message string `json:"message"`
} `json:"detail"`
Summary struct {
Message string `json:"message"`
} `json:"summary"`
Severity string `json:"severity"`
} `json:"checks"`
}
func (c *ClusterHealthCollector) collect() error {
cmd := c.cephJSONUsage()
buf, _, err := c.conn.MonCommand(cmd)
@@ -769,6 +802,105 @@ func (c *ClusterHealthCollector) collect() error {
c.RemappedPGs.Set(stats.OSDMap.OSDMap.NumRemappedPGs)
c.TotalPGs.Set(stats.PGMap.NumPGs)
cmd = c.cephHealthDetailCommand()
buf, _, err = c.conn.MonCommand(cmd)
if err != nil {
return err
}
hdstats := &cephHealthDetailStats{}
if err := json.Unmarshal(buf, hdstats); err != nil {
return err
}
var (
slowOpsBlockedRegex = regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
slowRequestSingleOSDRegex = regexp.MustCompile(`osd.([\d]+) has blocked requests > ([\d\.]+) sec`)
slowRequestMultipleOSDRegex = regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)
secToOpsBlocked = make(map[float64]int)
osdToSecondsBlocked = make(map[int]float64)
)
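// For reference, the health-detail messages these regexes are expected to match,
// taken from the test fixtures added in this commit:
//   "102 ops are blocked > 524.288 sec"               -> slowOpsBlockedRegex
//   "osd.349 has blocked requests > 524.288 sec"      -> slowRequestSingleOSDRegex
//   "osds 363,463 have blocked requests > 32.768 sec" -> slowRequestMultipleOSDRegex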
for key, check := range hdstats.Checks {
if key == "REQUEST_SLOW" {
for _, detail := range check.Details {
matched := slowOpsBlockedRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}
secToOpsBlocked[f] = v
continue
}
matched = slowRequestSingleOSDRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}
osdToSecondsBlocked[v] = f
continue
}
matched = slowRequestMultipleOSDRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}
for _, osdID := range strings.Split(matched[1], ",") {
oid, err := strconv.Atoi(osdID)
if err != nil {
return err
}
osdToSecondsBlocked[oid] = f
}
continue
}
}
}
}
secs := make([]float64, 0, len(secToOpsBlocked))
for sec := range secToOpsBlocked {
secs = append(secs, sec)
}
sort.Float64s(secs)
totalOpsUntilNow := 0
totalOpsSet := false
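// Walk the blocked-op buckets in ascending order of seconds, accumulating op
// counts until a bucket's threshold matches the one reported for one or more
// OSDs; the running total is then attributed to those OSDs and reset. With the
// first test fixture below, osds 363 and 463 (blocked > 32.768 sec) are
// attributed 14 ops, and osd.349 (blocked > 524.288 sec) gets the remaining
// 33+53+84+102 = 272 ops.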
for _, sec := range secs {
totalOpsUntilNow += secToOpsBlocked[sec]
for osd, osec := range osdToSecondsBlocked {
if sec == osec {
c.SlowRequestsByOSD.WithLabelValues(strconv.Itoa(osd)).Set(float64(totalOpsUntilNow))
totalOpsSet = true
}
}
if totalOpsSet {
totalOpsUntilNow = 0
totalOpsSet = false
}
}
return nil
}
@@ -800,6 +932,20 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
return cmd
}
func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "health",
"detail": "detail",
"format": jsonFormat,
})
if err != nil {
// panic! because ideally in no world should this
// hard-coded input fail.
panic(err)
}
return cmd
}
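// Note: the command built above is roughly the JSON-API equivalent of running
// "ceph health detail --format json" against the monitors; its output is what
// cephHealthDetailStats is unmarshalled from in collect().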
func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
cmd := c.cephPlainUsage()
buf, _, err := c.conn.MonCommand(cmd)
@@ -1036,6 +1182,10 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
// Describe sends all the descriptions of individual metrics of ClusterHealthCollector
// to the provided prometheus channel.
func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
for _, metric := range c.collectorList() {
metric.Describe(ch)
}
for _, metric := range c.metricsList() {
ch <- metric.Desc()
}
@@ -1052,6 +1202,10 @@ func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
log.Println("failed collecting cluster recovery/client io:", err)
}
for _, metric := range c.collectorList() {
metric.Collect(ch)
}
for _, metric := range c.metricsList() {
ch <- metric
}

View File

@@ -440,6 +440,88 @@ $ sudo ceph -s
},
{
input: ` input: `
{
"checks": {
"REQUEST_SLOW": {
"severity": "HEALTH_WARN",
"summary": {
"message": "286 slow requests are blocked > 32 sec"
},
"detail": [
{
"message": "102 ops are blocked > 524.288 sec"
},
{
"message": "84 ops are blocked > 262.144 sec"
},
{
"message": "53 ops are blocked > 131.072 sec"
},
{
"message": "33 ops are blocked > 65.536 sec"
},
{
"message": "14 ops are blocked > 32.768 sec"
},
{
"message": "osds 363,463 have blocked requests > 32.768 sec"
},
{
"message": "osd.349 has blocked requests > 524.288 sec"
}
]
}
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 14`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 14`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 272`),
},
},
{
input: `
{
"checks": {
"REQUEST_SLOW": {
"severity": "HEALTH_WARN",
"summary": {
"message": "286 slow requests are blocked > 32 sec"
},
"detail": [
{
"message": "102 ops are blocked > 524.288 sec"
},
{
"message": "84 ops are blocked > 262.144 sec"
},
{
"message": "53 ops are blocked > 131.072 sec"
},
{
"message": "33 ops are blocked > 65.536 sec"
},
{
"message": "14 ops are blocked > 32.768 sec"
},
{
"message": "osds 363,463 have blocked requests > 131.072 sec"
},
{
"message": "osd.349 has blocked requests > 524.288 sec"
}
]
}
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 100`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 100`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 186`),
},
},
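// Expected values for the fixture above: ops accumulate in ascending order of
// seconds (14+33+53 = 100) until the 131.072 sec bucket matches osds 363 and
// 463; the accumulator then resets, so osd.349 (524.288 sec) receives the
// remaining 84+102 = 186 ops.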
{
input: `
{
"pgmap": {
"write_op_per_sec": 500,