mirror of https://github.com/digitalocean/ceph_exporter (synced 2025-03-08 21:17:28 +00:00)
health: capture slow request per osd
This commit is contained in:
parent afd5a2c4bf
commit 219fb69bde
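This commit teaches the cluster health collector to parse `ceph health detail` output and export the number of slow requests per OSD as a labelled gauge, alongside the existing cluster-wide SlowRequests total. A sketch of the exposition this adds, assuming the exporter's usual `ceph` namespace and `cluster` const-label (values are illustrative, taken from the test fixtures below):

    # HELP ceph_slow_requests_osd No. of slow requests
    # TYPE ceph_slow_requests_osd gauge
    ceph_slow_requests_osd{cluster="ceph",osd="349"} 272
    ceph_slow_requests_osd{cluster="ceph",osd="363"} 14
    ceph_slow_requests_osd{cluster="ceph",osd="463"} 14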
.travis.yml

@@ -9,6 +9,8 @@ env:
 - DOCKER_TAG=$TRAVIS_TAG
 
 before_install:
+- wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
+- echo deb https://download.ceph.com/debian-luminous/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
 - sudo apt-get update
 - sudo apt-get install -y librados-dev librbd-dev
 
Dockerfile (10 changed lines)
@@ -7,9 +7,10 @@ ENV PATH $GOROOT/bin:$PATH
 ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https build-essential git curl
+    apt-get install -y apt-transport-https build-essential git curl wget
 
-RUN echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list
 
 RUN apt-get update && \
     apt-get install -y --force-yes librados-dev librbd-dev
@@ -28,8 +29,9 @@ FROM ubuntu:16.04
 MAINTAINER Vaibhav Bhembre <vaibhav@digitalocean.com>
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https curl && \
-    echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list && \
+    apt-get install -y apt-transport-https curl wget
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
     apt-get update && \
     apt-get install -y --force-yes librados2 librbd1 && \
     rm -rf /var/lib/apt/lists/*
collectors/health.go

@@ -21,6 +21,7 @@ import (
     "fmt"
     "log"
     "regexp"
+    "sort"
     "strconv"
     "strings"
@@ -102,8 +103,13 @@ type ClusterHealthCollector struct {
     DeepScrubbingPGs prometheus.Gauge
 
     // SlowRequests depicts no. of total slow requests in the cluster.
+    // This stat exists only for backwards compatibility.
     SlowRequests prometheus.Gauge
 
+    // SlowRequestsByOSD depicts no. of total slow requests in the cluster,
+    // labelled by OSD.
+    SlowRequestsByOSD *prometheus.GaugeVec
+
     // DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
     // This includes object replicas in its count.
     DegradedObjectsCount prometheus.Gauge
@@ -227,6 +233,15 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
             ConstLabels: labels,
         },
     ),
+    SlowRequestsByOSD: prometheus.NewGaugeVec(
+        prometheus.GaugeOpts{
+            Namespace:   cephNamespace,
+            Name:        "slow_requests_osd",
+            Help:        "No. of slow requests",
+            ConstLabels: labels,
+        },
+        []string{"osd"},
+    ),
     DegradedPGs: prometheus.NewGauge(
         prometheus.GaugeOpts{
             Namespace: cephNamespace,
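Unlike the plain prometheus.Gauge used for SlowRequests, a GaugeVec materializes one child gauge per distinct label value. A minimal, self-contained sketch of that behaviour using the same client library (not part of the commit; values are illustrative):

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    dto "github.com/prometheus/client_model/go"
)

func main() {
    v := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Namespace: "ceph",
        Name:      "slow_requests_osd",
        Help:      "No. of slow requests",
    }, []string{"osd"})

    // Each distinct label value materializes its own child gauge.
    v.WithLabelValues("349").Set(272)
    v.WithLabelValues("363").Set(14)

    // Reading a child back through the dto confirms the stored value.
    m := &dto.Metric{}
    if err := v.WithLabelValues("349").Write(m); err == nil {
        fmt.Println(m.GetGauge().GetValue()) // 272
    }
}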
@@ -446,6 +461,12 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector {
     }
 }
 
+func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
+    return []prometheus.Collector{
+        c.SlowRequestsByOSD,
+    }
+}
+
 func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
     return []prometheus.Metric{
         c.HealthStatus,
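A *prometheus.GaugeVec satisfies prometheus.Collector but not prometheus.Metric (each of its children is the Metric), so it cannot be returned from metricsList; this new collectorList gives Describe and Collect a separate path for vec-typed metrics, wired up further below. A compile-time sketch of the distinction, assuming the standard client library:

package main

import "github.com/prometheus/client_golang/prometheus"

// Compile-time assertions: a GaugeVec is a Collector, a Gauge is a Metric.
var _ prometheus.Collector = prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "x"}, []string{"l"})
var _ prometheus.Metric = prometheus.NewGauge(prometheus.GaugeOpts{Name: "y"})

func main() {}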
@@ -527,6 +548,18 @@ type cephHealthStats struct {
     } `json:"pgmap"`
 }
 
+type cephHealthDetailStats struct {
+    Checks map[string]struct {
+        Details []struct {
+            Message string `json:"message"`
+        } `json:"detail"`
+        Summary struct {
+            Message string `json:"message"`
+        } `json:"summary"`
+        Severity string `json:"severity"`
+    } `json:"checks"`
+}
+
 func (c *ClusterHealthCollector) collect() error {
     cmd := c.cephJSONUsage()
     buf, _, err := c.conn.MonCommand(cmd)
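The new struct mirrors the `checks` map that luminous-era `ceph health detail -f json` emits. A small standalone sketch of decoding a payload shaped like the test fixtures below (abridged; not part of the commit):

package main

import (
    "encoding/json"
    "fmt"
)

// Local copy of the commit's struct, repeated so the sketch is self-contained.
type cephHealthDetailStats struct {
    Checks map[string]struct {
        Details []struct {
            Message string `json:"message"`
        } `json:"detail"`
        Summary struct {
            Message string `json:"message"`
        } `json:"summary"`
        Severity string `json:"severity"`
    } `json:"checks"`
}

func main() {
    buf := []byte(`{"checks": {"REQUEST_SLOW": {
        "severity": "HEALTH_WARN",
        "summary": {"message": "286 slow requests are blocked > 32 sec"},
        "detail": [{"message": "osd.349 has blocked requests > 524.288 sec"}]}}}`)

    hd := &cephHealthDetailStats{}
    if err := json.Unmarshal(buf, hd); err != nil {
        panic(err)
    }
    fmt.Println(hd.Checks["REQUEST_SLOW"].Details[0].Message)
}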
@@ -769,6 +802,105 @@ func (c *ClusterHealthCollector) collect() error {
     c.RemappedPGs.Set(stats.OSDMap.OSDMap.NumRemappedPGs)
     c.TotalPGs.Set(stats.PGMap.NumPGs)
 
+    cmd = c.cephHealthDetailCommand()
+    buf, _, err = c.conn.MonCommand(cmd)
+    if err != nil {
+        return err
+    }
+
+    hdstats := &cephHealthDetailStats{}
+    if err := json.Unmarshal(buf, hdstats); err != nil {
+        return err
+    }
+
+    var (
+        slowOpsBlockedRegex         = regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
+        slowRequestSingleOSDRegex   = regexp.MustCompile(`osd.([\d]+) has blocked requests > ([\d\.]+) sec`)
+        slowRequestMultipleOSDRegex = regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)
+
+        secToOpsBlocked     = make(map[float64]int)
+        osdToSecondsBlocked = make(map[int]float64)
+    )
+
+    for key, check := range hdstats.Checks {
+        if key == "REQUEST_SLOW" {
+            for _, detail := range check.Details {
+                matched := slowOpsBlockedRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    v, err := strconv.Atoi(matched[1])
+                    if err != nil {
+                        return err
+                    }
+
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    secToOpsBlocked[f] = v
+                    continue
+                }
+
+                matched = slowRequestSingleOSDRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    v, err := strconv.Atoi(matched[1])
+                    if err != nil {
+                        return err
+                    }
+
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    osdToSecondsBlocked[v] = f
+                    continue
+                }
+
+                matched = slowRequestMultipleOSDRegex.FindStringSubmatch(detail.Message)
+                if len(matched) == 3 {
+                    f, err := strconv.ParseFloat(matched[2], 64)
+                    if err != nil {
+                        return err
+                    }
+
+                    for _, osdID := range strings.Split(matched[1], ",") {
+                        oid, err := strconv.Atoi(osdID)
+                        if err != nil {
+                            return err
+                        }
+
+                        osdToSecondsBlocked[oid] = f
+                    }
+                    continue
+                }
+            }
+        }
+    }
+
+    secs := make([]float64, 0, len(secToOpsBlocked))
+    for sec := range secToOpsBlocked {
+        secs = append(secs, sec)
+    }
+    sort.Float64s(secs)
+
+    totalOpsUntilNow := 0
+    totalOpsSet := false
+    for _, sec := range secs {
+        totalOpsUntilNow += secToOpsBlocked[sec]
+        for osd, osec := range osdToSecondsBlocked {
+            if sec == osec {
+                c.SlowRequestsByOSD.WithLabelValues(strconv.Itoa(osd)).Set(float64(totalOpsUntilNow))
+                totalOpsSet = true
+            }
+        }
+
+        if totalOpsSet {
+            totalOpsUntilNow = 0
+            totalOpsSet = false
+        }
+    }
+
     return nil
 }
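The attribution logic above leans on how Ceph phrases REQUEST_SLOW details: op counts are bucketed per blocking threshold, while OSDs are named only at the highest threshold they exceed. Walking thresholds in ascending order, the loop accumulates op counts until it reaches a threshold that has OSDs attached, charges the accumulated total to those OSDs, and resets the accumulator. With the first test fixture below, osds 363 and 463 sit at 32.768 sec and receive 14, while osd.349 sits at 524.288 sec and receives 33 + 53 + 84 + 102 = 272. A standalone sketch of that fold with the fixture's numbers hard-coded (illustrative only):

package main

import (
    "fmt"
    "sort"
)

func main() {
    secToOps := map[float64]int{32.768: 14, 65.536: 33, 131.072: 53, 262.144: 84, 524.288: 102}
    osdToSec := map[int]float64{363: 32.768, 463: 32.768, 349: 524.288}

    secs := make([]float64, 0, len(secToOps))
    for s := range secToOps {
        secs = append(secs, s)
    }
    sort.Float64s(secs)

    total := 0
    for _, s := range secs {
        total += secToOps[s]
        attributed := false
        for osd, osec := range osdToSec {
            if osec == s {
                fmt.Printf("osd=%d slow=%d\n", osd, total)
                attributed = true
            }
        }
        // Reset only when something was attributed at this threshold.
        if attributed {
            total = 0
        }
    }
    // Prints osd=363 slow=14 and osd=463 slow=14, then osd=349 slow=272.
}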
@@ -800,6 +932,20 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
     return cmd
 }
 
+func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
+    cmd, err := json.Marshal(map[string]interface{}{
+        "prefix": "health",
+        "detail": "detail",
+        "format": jsonFormat,
+    })
+    if err != nil {
+        // panic! because ideally in no world this hard-coded input
+        // should fail.
+        panic(err)
+    }
+    return cmd
+}
+
 func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
     cmd := c.cephPlainUsage()
     buf, _, err := c.conn.MonCommand(cmd)
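Since encoding/json marshals map keys in sorted order, the bytes handed to MonCommand should look like the following, assuming jsonFormat is the "json" constant used by the other command builders in this file:

    {"detail":"detail","format":"json","prefix":"health"}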
@@ -1036,6 +1182,10 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
 // Describe sends all the descriptions of individual metrics of ClusterHealthCollector
 // to the provided prometheus channel.
 func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
+    for _, metric := range c.collectorList() {
+        metric.Describe(ch)
+    }
+
     for _, metric := range c.metricsList() {
         ch <- metric.Desc()
     }
@@ -1052,6 +1202,10 @@ func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
         log.Println("failed collecting cluster recovery/client io:", err)
     }
 
+    for _, metric := range c.collectorList() {
+        metric.Collect(ch)
+    }
+
     for _, metric := range c.metricsList() {
         ch <- metric
     }
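Describe and Collect are the two halves of the prometheus.Collector contract, so iterating collectorList in both keeps the registry satisfied for the new vec. A minimal sketch of how a registry drives these hooks (port 9128 is this exporter's conventional listen port; the stock Go collector stands in for the ceph collector here):

package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    reg := prometheus.NewRegistry()
    // Registering any prometheus.Collector makes the registry call its
    // Describe once (at registration) and its Collect on every scrape.
    reg.MustRegister(prometheus.NewGoCollector())

    http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
    log.Fatal(http.ListenAndServe(":9128", nil))
}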
collectors/health_test.go

@@ -440,6 +440,88 @@ $ sudo ceph -s
         },
+        {
+            input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 32.768 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+            regexes: []*regexp.Regexp{
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 14`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 14`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 272`),
+            },
+        },
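The expected values follow from the fold described earlier: 363 and 463 are named at the 32.768 sec threshold, whose bucket alone holds 14 ops, while 349 is named at 524.288 sec and so absorbs everything accumulated above 32.768 sec, i.e. 33 + 53 + 84 + 102 = 272.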
+        {
+            input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 131.072 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+            regexes: []*regexp.Regexp{
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 100`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 100`),
+                regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 186`),
+            },
+        },
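Here 363 and 463 are instead named at 131.072 sec, absorbing 14 + 33 + 53 = 100 and resetting the accumulator, which leaves 84 + 102 = 186 for osd.349 at 524.288 sec.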
         {
             input: `
 {
     "pgmap": {
         "write_op_per_sec": 500,