mirror of
https://github.com/ceph/ceph
synced 2025-02-22 10:37:15 +00:00
Ceph status: report MDS slow requests for better monitoring
Signed-off-by: Zhi Zhang <zhangz.david@outlook.com>
This commit is contained in:
parent
ff18083d23
commit
92154c80a4
@ -173,7 +173,7 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
|
||||
}
|
||||
}
|
||||
|
||||
bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
|
||||
bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector, int *slow)
|
||||
{
|
||||
RWLock::RLocker l(lock);
|
||||
if (!tracking_enabled)
|
||||
@ -214,7 +214,11 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
|
||||
//store summary message
|
||||
warning_vector.push_back("");
|
||||
|
||||
int slow = 0; // total slow
|
||||
int _slow = 0; // total slow
|
||||
if (!slow)
|
||||
slow = &_slow;
|
||||
else
|
||||
*slow = _slow; // start from 0 anyway
|
||||
int warned = 0; // total logged
|
||||
for (uint32_t iter = 0; iter < num_optracker_shards; iter++) {
|
||||
ShardedTrackingData* sdata = sharded_in_flight_list[iter];
|
||||
@ -224,7 +228,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
|
||||
continue;
|
||||
xlist<TrackedOp*>::iterator i = sdata->ops_in_flight_sharded.begin();
|
||||
while (!i.end() && (*i)->get_initiated() < too_old) {
|
||||
slow++;
|
||||
(*slow)++;
|
||||
|
||||
// exponential backoff of warning intervals
|
||||
if (warned < log_threshold &&
|
||||
@ -252,7 +256,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
|
||||
// off, we will stay silent.
|
||||
if (warned > 0) {
|
||||
stringstream ss;
|
||||
ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
|
||||
ss << *slow << " slow requests, " << warned << " included below; oldest blocked for > "
|
||||
<< oldest_secs << " secs";
|
||||
warning_vector[0] = ss.str();
|
||||
}
|
||||
|
@ -120,7 +120,7 @@ public:
|
||||
* with a warning string for each old Op.
|
||||
* @return True if there are any Ops to warn on, false otherwise.
|
||||
*/
|
||||
bool check_ops_in_flight(std::vector<string> &warning_strings);
|
||||
bool check_ops_in_flight(std::vector<string> &warning_strings, int *slow = NULL);
|
||||
void mark_event(TrackedOp *op, const string &evt,
|
||||
utime_t time = ceph_clock_now(g_ceph_context));
|
||||
|
||||
|
@ -444,6 +444,19 @@ void Beacon::notify_health(MDSRank const *mds)
|
||||
}
|
||||
}
|
||||
|
||||
// Detect MDS_HEALTH_SLOW_REQUEST condition
|
||||
{
|
||||
int slow = mds->get_mds_slow_req_count();
|
||||
dout(20) << slow << " slow request found" << dendl;
|
||||
if (slow) {
|
||||
std::ostringstream oss;
|
||||
oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec";
|
||||
|
||||
MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
|
||||
health.metrics.push_back(m);
|
||||
}
|
||||
}
|
||||
|
||||
// Report a health warning if we are readonly
|
||||
if (mds->mdcache->is_readonly()) {
|
||||
MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
|
||||
|
@ -66,6 +66,7 @@ MDSRank::MDSRank(
|
||||
stopping(false),
|
||||
progress_thread(this), dispatch_depth(0),
|
||||
hb(NULL), last_tid(0), osd_epoch_barrier(0), beacon(beacon_),
|
||||
mds_slow_req_count(0),
|
||||
last_client_mdsmap_bcast(0),
|
||||
messenger(msgr), monc(monc_),
|
||||
respawn_hook(respawn_hook_),
|
||||
@ -2388,13 +2389,17 @@ void MDSRank::create_logger()
|
||||
void MDSRank::check_ops_in_flight()
|
||||
{
|
||||
vector<string> warnings;
|
||||
if (op_tracker.check_ops_in_flight(warnings)) {
|
||||
int slow = 0;
|
||||
if (op_tracker.check_ops_in_flight(warnings, &slow)) {
|
||||
for (vector<string>::iterator i = warnings.begin();
|
||||
i != warnings.end();
|
||||
++i) {
|
||||
clog->warn() << *i;
|
||||
}
|
||||
}
|
||||
|
||||
// set mds slow request count
|
||||
mds_slow_req_count = slow;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -248,6 +248,8 @@ class MDSRank {
|
||||
* Emit clog warnings for any ops reported as warnings by optracker
|
||||
*/
|
||||
void check_ops_in_flight();
|
||||
|
||||
int mds_slow_req_count;
|
||||
|
||||
/**
|
||||
* Share MDSMap with clients
|
||||
@ -361,6 +363,8 @@ class MDSRank {
|
||||
MDSMap *get_mds_map() { return mdsmap; }
|
||||
|
||||
int get_req_rate() { return logger->get(l_mds_request); }
|
||||
|
||||
int get_mds_slow_req_count() const { return mds_slow_req_count; }
|
||||
|
||||
void dump_status(Formatter *f) const;
|
||||
|
||||
|
@ -38,7 +38,8 @@ enum mds_metric_t {
|
||||
MDS_HEALTH_CLIENT_OLDEST_TID,
|
||||
MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
|
||||
MDS_HEALTH_DAMAGE,
|
||||
MDS_HEALTH_READ_ONLY
|
||||
MDS_HEALTH_READ_ONLY,
|
||||
MDS_HEALTH_SLOW_REQUEST
|
||||
};
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user