Report MDS slow requests in Ceph status output for better monitoring

Signed-off-by: Zhi Zhang <zhangz.david@outlook.com>
This commit is contained in:
Zhi Zhang 2016-06-15 11:28:34 +08:00
parent ff18083d23
commit 92154c80a4
6 changed files with 34 additions and 7 deletions

View File

@ -173,7 +173,7 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
}
}
bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector, int *slow)
{
RWLock::RLocker l(lock);
if (!tracking_enabled)
@ -214,7 +214,11 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
//store summary message
warning_vector.push_back("");
int slow = 0; // total slow
int _slow = 0; // total slow
if (!slow)
slow = &_slow;
else
*slow = _slow; // start from 0 anyway
int warned = 0; // total logged
for (uint32_t iter = 0; iter < num_optracker_shards; iter++) {
ShardedTrackingData* sdata = sharded_in_flight_list[iter];
@ -224,7 +228,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
continue;
xlist<TrackedOp*>::iterator i = sdata->ops_in_flight_sharded.begin();
while (!i.end() && (*i)->get_initiated() < too_old) {
slow++;
(*slow)++;
// exponential backoff of warning intervals
if (warned < log_threshold &&
@ -252,7 +256,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
// off, we will stay silent.
if (warned > 0) {
stringstream ss;
ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
ss << *slow << " slow requests, " << warned << " included below; oldest blocked for > "
<< oldest_secs << " secs";
warning_vector[0] = ss.str();
}

View File

@ -120,7 +120,7 @@ public:
* with a warning string for each old Op.
* @return True if there are any Ops to warn on, false otherwise.
*/
bool check_ops_in_flight(std::vector<string> &warning_strings);
bool check_ops_in_flight(std::vector<string> &warning_strings, int *slow = NULL);
void mark_event(TrackedOp *op, const string &evt,
utime_t time = ceph_clock_now(g_ceph_context));

View File

@ -444,6 +444,19 @@ void Beacon::notify_health(MDSRank const *mds)
}
}
// Detect MDS_HEALTH_SLOW_REQUEST condition
{
int slow = mds->get_mds_slow_req_count();
dout(20) << slow << " slow request found" << dendl;
if (slow) {
std::ostringstream oss;
oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec";
MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
health.metrics.push_back(m);
}
}
// Report a health warning if we are readonly
if (mds->mdcache->is_readonly()) {
MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,

View File

@ -66,6 +66,7 @@ MDSRank::MDSRank(
stopping(false),
progress_thread(this), dispatch_depth(0),
hb(NULL), last_tid(0), osd_epoch_barrier(0), beacon(beacon_),
mds_slow_req_count(0),
last_client_mdsmap_bcast(0),
messenger(msgr), monc(monc_),
respawn_hook(respawn_hook_),
@ -2388,13 +2389,17 @@ void MDSRank::create_logger()
// Asks the op tracker for warnings about in-flight ops, sends each returned
// warning string to the cluster log at warn level, and stores the slow-op
// count reported by the tracker in mds_slow_req_count.
// NOTE(review): this span looks like a rendered diff hunk with the +/-
// markers stripped, so the superseded call and its replacement appear back
// to back below -- confirm against the real diff before reading it as code.
void MDSRank::check_ops_in_flight()
{
vector<string> warnings;
// NOTE(review): presumably the pre-change call (no slow-count argument).
if (op_tracker.check_ops_in_flight(warnings)) {
// Receives the tracker's slow-op total through the pointer argument;
// initialised to 0 so it is well-defined even if the tracker never writes it.
int slow = 0;
if (op_tracker.check_ops_in_flight(warnings, &slow)) {
// Forward every warning string the tracker produced to the cluster log.
for (vector<string>::iterator i = warnings.begin();
i != warnings.end();
++i) {
clog->warn() << *i;
}
}
// Record the slow request count. This assignment sits outside the if above,
// so the member is refreshed on every call, including calls that produced
// no warnings.
mds_slow_req_count = slow;
return;
}

View File

@ -248,6 +248,8 @@ class MDSRank {
* Emit clog warnings for any ops reported as warnings by optracker
*/
void check_ops_in_flight();
int mds_slow_req_count;
/**
* Share MDSMap with clients
@ -361,6 +363,8 @@ class MDSRank {
MDSMap *get_mds_map() { return mdsmap; }
int get_req_rate() { return logger->get(l_mds_request); }
int get_mds_slow_req_count() const { return mds_slow_req_count; }
void dump_status(Formatter *f) const;

View File

@ -38,7 +38,8 @@ enum mds_metric_t {
MDS_HEALTH_CLIENT_OLDEST_TID,
MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
MDS_HEALTH_DAMAGE,
MDS_HEALTH_READ_ONLY
MDS_HEALTH_READ_ONLY,
MDS_HEALTH_SLOW_REQUEST
};
/**