Merge pull request #37590 from pcuzner/prometheus-slow-ops

mgr/prometheus: Add SLOW_OPS healthcheck as a metric

Reviewed-by: Kefu Chai <kchai@redhat.com>
This commit is contained in:
Kefu Chai 2020-11-02 11:32:06 +08:00 committed by GitHub
commit ac33cea383
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 67 additions and 0 deletions

View File

@ -241,3 +241,15 @@ groups:
description: >
Pool {{ $labels.name }} will be full in less than 5 days
assuming the average fill-up rate of the past 48 hours.
- name: healthchecks
# Alert rules driven by Ceph health checks that the mgr/prometheus module
# publishes as gauges (one gauge per entry in its HEALTH_CHECKS list).
rules:
# Raised when the slow-ops gauge has been above zero for the whole `for`
# window; {{ $value }} in the description is the gauge value itself.
# NOTE(review): the metric name assumes the exporter prepends `ceph_` to
# `healthcheck_slow_ops` -- confirm against the module's output.
# NOTE(review): the wording below mentions only OSD requests, while the
# metric's help text says "OSD or Monitor requests" -- confirm which is meant.
- alert: Slow OSD Ops
expr: ceph_healthcheck_slow_ops > 0
for: 30s
labels:
severity: warning
type: ceph_default
annotations:
description: >
{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)

View File

@ -30,6 +30,8 @@ class SlowOps final : public DaemonHealthMetricCollector {
if (daemons.empty()) {
return;
}
// Note this message format is used in mgr/prometheus, so any change in format
// requires a corresponding change in the mgr/prometheus module.
ostringstream ss;
if (daemons.size() > 1) {
if (daemons.size() > 10) {

View File

@ -11,6 +11,7 @@ import time
from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
from mgr_util import get_default_addr, profile_method
from rbd import RBD
from collections import namedtuple
try:
from typing import Optional, Dict, Any, Set
except:
@ -108,6 +109,11 @@ DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
# Object states that each get their own "number of <state> objects" metric.
NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']

# Pairs a Ceph health check code with the help text used for its metric.
alert_metric = namedtuple('alert_metric', ['name', 'description'])

# Health checks that are turned into gauges named 'healthcheck_<name.lower()>'.
HEALTH_CHECKS = [
    alert_metric(
        name='SLOW_OPS',
        description='OSD or Monitor requests taking a long time to process',
    ),
]
class Metric(object):
def __init__(self, mtype, name, desc, labels=None):
@ -432,15 +438,62 @@ class Module(MgrModule):
'Number of {} objects'.format(state),
)
for check in HEALTH_CHECKS:
path = 'healthcheck_{}'.format(check.name.lower())
metrics[path] = Metric(
'gauge',
path,
check.description,
)
return metrics
@profile_method()
def get_health(self):
    """Refresh the overall health gauge and the per-healthcheck gauges.

    Reads the 'health' report via ``self.get``, stores the numeric form of
    its status in the 'health_status' metric, then walks HEALTH_CHECKS:
    a check absent from the report is set to 0, an active check is set to
    the number parsed from its summary message, and a check whose message
    cannot be parsed is logged and removed from ``self.metrics``.
    """

    def _get_value(message, delim=' ', word_pos=0):
        """Return (value, error) for the field at word_pos of message.

        error is 0 when the field is all digits, otherwise 1 with value 0.
        """
        field = message.split(delim)[word_pos]
        return (int(field), 0) if field.isdigit() else (0, 1)

    health = json.loads(self.get('health')['json'])

    # overall cluster health
    self.metrics['health_status'].set(
        health_status_to_number(health['status'])
    )

    # Any triggered health check listed in HEALTH_CHECKS becomes a metric.
    triggered = health.get('checks', {})
    triggered_names = triggered.keys()

    for check in HEALTH_CHECKS:
        path = 'healthcheck_{}'.format(check.name.lower())

        if path not in self.metrics:
            # nothing registered under this path (e.g. dropped earlier)
            continue

        if check.name not in triggered_names:
            # check is not currently raised, report a default of 0
            self.metrics[path].set(0)
            continue

        details = triggered[check.name]
        message = details['summary'].get('message', '')
        value, failed = 0, 0

        if check.name == "SLOW_OPS":
            # e.g. "42 slow ops, oldest one blocked for 12 sec, daemons
            # [osd.0, osd.3] have slow ops." -> leading count is the value
            value, failed = _get_value(message)

        if not failed:
            self.metrics[path].set(value)
            continue

        self.log.error("healthcheck {} message format is incompatible and has been dropped".format(check.name))
        # remove the metric so it stops being emitted
        del self.metrics[path]
@profile_method()
def get_pool_stats(self):
# retrieve pool stats to provide per pool recovery metrics