mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
mds: warn when clients are not advancing their oldest_client_tid
Fixes: #10657 Signed-off-by: Yan, Zheng <zyan@redhat.com>
This commit is contained in:
parent
a84ae7b14e
commit
dcd9302227
@ -450,6 +450,8 @@ OPTION(mds_snap_min_uid, OPT_U32, 0) // The minimum UID required to create a sna
|
||||
OPTION(mds_snap_max_uid, OPT_U32, 65536) // The maximum UID allowed to create a snapshot
|
||||
OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disable nested stat for snapshot
|
||||
OPTION(mds_verify_backtrace, OPT_U32, 1)
|
||||
// detect clients which aren't trimming completed requests
|
||||
OPTION(mds_max_completed_requests, OPT_U32, 100000)
|
||||
|
||||
OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash
|
||||
OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE, 5)
|
||||
|
@ -343,6 +343,8 @@ void Beacon::notify_health(MDS const *mds)
|
||||
|
||||
// Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
|
||||
// messages. May be due to buggy client or resource-hogging application.
|
||||
//
|
||||
// Detect clients failing to advance their oldest_client_tid
|
||||
{
|
||||
set<Session*> sessions;
|
||||
mds->sessionmap.get_client_session_set(sessions);
|
||||
@ -350,6 +352,7 @@ void Beacon::notify_health(MDS const *mds)
|
||||
cutoff -= g_conf->mds_recall_state_timeout;
|
||||
|
||||
std::list<MDSHealthMetric> late_recall_metrics;
|
||||
std::list<MDSHealthMetric> large_completed_requests_metrics;
|
||||
for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
|
||||
Session *session = *i;
|
||||
if (!session->recalled_at.is_zero()) {
|
||||
@ -359,7 +362,7 @@ void Beacon::notify_health(MDS const *mds)
|
||||
if (session->recalled_at < cutoff) {
|
||||
dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
|
||||
std::ostringstream oss;
|
||||
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
|
||||
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
|
||||
MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
|
||||
m.metadata["client_id"] = session->info.inst.name.num();
|
||||
late_recall_metrics.push_back(m);
|
||||
@ -367,6 +370,14 @@ void Beacon::notify_health(MDS const *mds)
|
||||
dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
|
||||
}
|
||||
}
|
||||
if (session->get_num_trim_requests_warnings() > 0 &&
|
||||
session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) {
|
||||
std::ostringstream oss;
|
||||
oss << "Client " << session->get_human_name() << " failing to advance its oldest_client_tid";
|
||||
MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
|
||||
m.metadata["client_id"] = session->info.inst.name.num();
|
||||
large_completed_requests_metrics.push_back(m);
|
||||
}
|
||||
}
|
||||
|
||||
if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
|
||||
@ -380,6 +391,18 @@ void Beacon::notify_health(MDS const *mds)
|
||||
health.metrics.push_back(m);
|
||||
late_recall_metrics.clear();
|
||||
}
|
||||
|
||||
if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
|
||||
health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
|
||||
} else {
|
||||
std::ostringstream oss;
|
||||
oss << "Many clients (" << large_completed_requests_metrics.size()
|
||||
<< ") failing to advance their oldest_client_tid";
|
||||
MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
|
||||
m.metadata["client_count"] = large_completed_requests_metrics.size();
|
||||
health.metrics.push_back(m);
|
||||
large_completed_requests_metrics.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1356,6 +1356,22 @@ void Server::handle_client_request(MClientRequest *req)
|
||||
// Session's 'completed_requests' was dirtied, mark it to be
|
||||
// potentially flushed at segment expiry.
|
||||
mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
|
||||
|
||||
if (session->get_num_trim_requests_warnings() > 0 &&
|
||||
session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
|
||||
session->reset_num_trim_requests_warnings();
|
||||
} else {
|
||||
if (session->get_num_completed_requests() >=
|
||||
(g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
|
||||
session->inc_num_trim_requests_warnings();
|
||||
stringstream ss;
|
||||
ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
|
||||
<< req->get_oldest_client_tid() << "), "
|
||||
<< session->get_num_completed_requests()
|
||||
<< " completed requests recorded in session\n";
|
||||
mds->clog->warn() << ss.str();
|
||||
dout(20) << __func__ << " " << ss.str() << dendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -194,9 +194,6 @@ private:
|
||||
version_t cap_push_seq; // cap push seq #
|
||||
map<version_t, list<MDSInternalContextBase*> > waitfor_flush; // flush session messages
|
||||
|
||||
// Has completed_requests been modified since the last time we
|
||||
// wrote this session out?
|
||||
bool completed_requests_dirty;
|
||||
public:
|
||||
xlist<Capability*> caps; // inodes with caps; front=most recently used
|
||||
xlist<ClientLease*> leases; // metadata leases to clients
|
||||
@ -231,8 +228,11 @@ public:
|
||||
|
||||
// -- completed requests --
|
||||
private:
|
||||
// Has completed_requests been modified since the last time we
|
||||
// wrote this session out?
|
||||
bool completed_requests_dirty;
|
||||
|
||||
|
||||
unsigned num_trim_requests_warnings;
|
||||
public:
|
||||
void add_completed_request(ceph_tid_t t, inodeno_t created) {
|
||||
info.completed_requests[t] = created;
|
||||
@ -261,6 +261,11 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned get_num_completed_requests() const { return info.completed_requests.size(); }
|
||||
unsigned get_num_trim_requests_warnings() { return num_trim_requests_warnings; }
|
||||
void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; }
|
||||
void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; }
|
||||
|
||||
bool has_dirty_completed_requests() const
|
||||
{
|
||||
return completed_requests_dirty;
|
||||
@ -278,8 +283,9 @@ public:
|
||||
connection(NULL), item_session_list(this),
|
||||
requests(0), // member_offset passed to front() manually
|
||||
cap_push_seq(0),
|
||||
lease_seq(0),
|
||||
completed_requests_dirty(false),
|
||||
lease_seq(0) { }
|
||||
num_trim_requests_warnings(0) { }
|
||||
~Session() {
|
||||
assert(!item_session_list.is_on_list());
|
||||
while (!preopen_out_queue.empty()) {
|
||||
|
@ -36,7 +36,9 @@ enum mds_metric_t {
|
||||
MDS_HEALTH_CLIENT_RECALL,
|
||||
MDS_HEALTH_CLIENT_LATE_RELEASE,
|
||||
MDS_HEALTH_CLIENT_RECALL_MANY,
|
||||
MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
|
||||
MDS_HEALTH_CLIENT_LATE_RELEASE_MANY,
|
||||
MDS_HEALTH_CLIENT_OLDEST_TID,
|
||||
MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
|
||||
};
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user