mds: health metric for late releasing caps

Follow up on Yan Zheng's "mds: warn clients which
aren't revoking cap" to include a health metric
for this condition as well as the clog messages.

Signed-off-by: John Spray <john.spray@redhat.com>
This commit is contained in:
John Spray 2014-09-07 16:22:37 +01:00
parent 05d69580b0
commit fd04d5e662
5 changed files with 53 additions and 4 deletions

View File

@ -264,6 +264,24 @@ void Beacon::notify_health(MDS const *mds)
health.metrics.push_back(m);
}
// Detect clients failing to respond to modifications to capabilities in
// CLIENT_CAPS messages.
std::list<const Capability*> late_caps;
mds->locker->get_late_cap_releases(&late_caps);
std::set<client_t> late_clients;
for (std::list<const Capability*>::iterator i =late_caps.begin(); i != late_caps.end(); ++i) {
const Capability *cap = *i;
late_clients.insert(cap->get_client());
}
for (std::set<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
std::ostringstream oss;
oss << "client." << *i << " failing to respond to capability release";
MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
m.metadata["client_id"] = stringify(i->v);
health.metrics.push_back(m);
}
// Detect clients failing to generate cap releases from SESSION_RECALL messages
// May be due to buggy client or resource-hogging application.
set<Session*> sessions;

View File

@ -286,7 +286,7 @@ public:
void clear_new() { state &= ~STATE_NEW; }
CInode *get_inode() { return inode; }
client_t get_client() { return client; }
client_t get_client() const { return client; }
// caps this client wants to hold
int wanted() { return _wanted; }

View File

@ -3215,20 +3215,47 @@ void Locker::caps_tick()
{
// Walk the caps that are currently being revoked and warn about every one
// whose revoke has been outstanding for longer than mds_revoke_cap_timeout.
utime_t now = ceph_clock_now(g_ceph_context);
dout(20) << __func__ << " " << revoking_caps.size() << " revoking caps" << dendl;
for (xlist<Capability*>::iterator p = revoking_caps.begin(); !p.end(); ++p) {
Capability *cap = *p;
// Time elapsed since the last revoke stamp recorded on this cap.
utime_t age = now - cap->get_last_revoke_stamp();
// NOTE(review): the " age = " debug line below streams the age and the
// client id back to back with no separator, so the two values run together
// in the log output.
if (age <= g_conf->mds_revoke_cap_timeout)
dout(20) << __func__ << " age = " << age << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
// Stop at the first cap that is still within the timeout; the entries after
// it are not examined.
// NOTE(review): presumably revoking_caps is ordered oldest-first -- confirm.
if (age <= g_conf->mds_revoke_cap_timeout) {
dout(20) << __func__ << " age below timeout " << g_conf->mds_revoke_cap_timeout << dendl;
break;
}
// exponential backoff of warning intervals: a new warning is only issued once
// the age exceeds timeout * 2^(number of warnings already issued for this cap)
if (age > g_conf->mds_revoke_cap_timeout * (1 << cap->get_num_revoke_warnings())) {
cap->inc_num_revoke_warnings();
stringstream ss;
ss << "client." << cap->get_client() << " isn't responding to MClientCaps(revoke), ino "
ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
<< cap->get_inode()->ino() << " pending " << ccap_string(cap->pending())
<< " issued " << ccap_string(cap->issued()) << ", sent " << age << " seconds ago\n";
// Send the warning to the clog and mirror the same text at debug level 20.
mds->clog->warn() << ss.str();
dout(20) << __func__ << " " << ss.str() << dendl;
} else {
dout(20) << __func__ << " silencing log message (backoff) for " << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
}
}
}
/**
 * Gather the capabilities whose revocation has been pending for longer
 * than mds_revoke_cap_timeout.
 *
 * Iteration stops at the first entry of revoking_caps that is still within
 * the timeout; entries after it are not examined.
 * NOTE(review): presumably revoking_caps is ordered oldest-first -- confirm.
 *
 * @param late_caps  output list, must not be NULL; every late capability
 *                   found is appended to it (existing contents are kept).
 */
void Locker::get_late_cap_releases(std::list<const Capability*> *late_caps) const
{
  assert(late_caps != NULL);

  utime_t current = ceph_clock_now(g_ceph_context);
  xlist<Capability*>::const_iterator it = revoking_caps.begin();
  while (!it.end()) {
    Capability *revoking = *it;
    utime_t elapsed = current - revoking->get_last_revoke_stamp();
    if (elapsed <= g_conf->mds_revoke_cap_timeout) {
      // Still inside the grace period: nothing from here on is reported.
      break;
    }
    late_caps->push_back(revoking);
    ++it;
  }
}

View File

@ -49,6 +49,7 @@ class LogSegment;
class SimpleLock;
class ScatterLock;
class LocalLock;
class MDCache;
typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
@ -195,6 +196,8 @@ public:
void remove_client_cap(CInode *in, client_t client);
void get_late_cap_releases(std::list<const Capability*> *late_caps) const;
protected:
void adjust_cap_wanted(Capability *cap, int wanted, int issue_seq);
void handle_client_caps(class MClientCaps *m);

View File

@ -33,7 +33,8 @@
// Identifies the kind of condition an MDS health metric reports.
// Every enumerator is given an explicit numeric value.
enum mds_metric_t {
MDS_HEALTH_NULL = 0,
MDS_HEALTH_TRIM = 1,
MDS_HEALTH_CLIENT_RECALL = 2
MDS_HEALTH_CLIENT_RECALL = 2,
// Reported once per client that is failing to respond to a capability release.
MDS_HEALTH_CLIENT_LATE_RELEASE = 3
};
/**