mon: subtree-based crush type down health warnings

Signed-off-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
Neha Ojha 2017-05-12 14:20:26 -07:00
parent d69aaccbc4
commit 72682e57ce
5 changed files with 136 additions and 71 deletions

View File

@ -587,6 +587,20 @@ int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string>
return 0; return 0;
} }
/*
 * Format the location of item `id` as a single "type=name,type=name" string.
 *
 * The (first, second) pairs are obtained from get_full_location_ordered() and
 * emitted in the reverse of the order that call produced them, each pair
 * written as first=second and pairs separated by "," with no trailing
 * separator.
 *
 * The return code of get_full_location_ordered() is not checked; whatever
 * pairs it left in the vector are formatted. If the vector is empty the
 * result is an empty string.
 */
string CrushWrapper::get_full_location_ordered_string(int id)
{
  vector<pair<string, string> > full_location_ordered;
  get_full_location_ordered(id, full_location_ordered);

  string full_location;
  // Walk back-to-front instead of reversing the vector in place, and append
  // with += so no temporary copy of the whole string is made per pair.
  for (auto i = full_location_ordered.rbegin();
       i != full_location_ordered.rend(); ++i) {
    if (i != full_location_ordered.rbegin()) {
      full_location += ",";
    }
    full_location += i->first;
    full_location += "=";
    full_location += i->second;
  }
  return full_location;
}
map<int, string> CrushWrapper::get_parent_hierarchy(int id) map<int, string> CrushWrapper::get_parent_hierarchy(int id)
{ {

View File

@ -590,6 +590,15 @@ public:
*/ */
int get_full_location_ordered(int id, vector<pair<string, string> >& path); int get_full_location_ordered(int id, vector<pair<string, string> >& path);
/**
 * String form of
 * get_full_location_ordered(int id, vector<pair<string, string> >& path).
 *
 * Fetches the same type/name pairs for id and joins them, in the reverse of
 * the order that call yields, as key1=val1,key2=val2 (pairs separated by ","
 * with no trailing separator). The intent is that the pairs read in
 * descending hierarchical order.
 *
 * @param id  item whose location is looked up
 * @return the location as a single string; empty when no pairs were found
 */
string get_full_location_ordered_string(int id);
/** /**
* returns (type_id, type) of all parent buckets between id and * returns (type_id, type) of all parent buckets between id and
* default, can be used to check for anomalous CRUSH maps * default, can be used to check for anomalous CRUSH maps

View File

@ -3460,15 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
} else { } else {
int num_in_osds = 0; int num_in_osds = 0;
int num_down_in_osds = 0; int num_down_in_osds = 0;
int num_in_subtrees = 0;
int num_down_in_subtrees = 0;
set<int> osds; set<int> osds;
set<int> down_cache; // quick cache of down subtrees set<int> down_in_osds;
set<int> in_subtrees; set<int> up_in_osds;
set<int> up_in_subtrees; set<int> subtree_up;
set<int> down_in_subtrees; unordered_map<int, set<int> > subtree_type_down;
set<string> down_in_subtree_names; unordered_map<int, int> num_osds_subtree;
int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit); int max_type = osdmap.crush->get_num_type_names() - 1;
for (int i = 0; i < osdmap.get_max_osd(); i++) { for (int i = 0; i < osdmap.get_max_osd(); i++) {
if (!osdmap.exists(i)) { if (!osdmap.exists(i)) {
if (osdmap.crush->item_exists(i)) { if (osdmap.crush->item_exists(i)) {
@ -3479,57 +3478,97 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
if (osdmap.is_out(i)) if (osdmap.is_out(i))
continue; continue;
++num_in_osds; ++num_in_osds;
// get the id of the parent subtree if (down_in_osds.count(i) || up_in_osds.count(i))
int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache); continue;
if (subtree_id != -ENOENT) {
in_subtrees.insert(subtree_id);
}
if (!osdmap.is_up(i)) { if (!osdmap.is_up(i)) {
++num_down_in_osds; down_in_osds.insert(i);
if (detail) { int parent_id = 0;
const osd_info_t& info = osdmap.get_info(i); int current = i;
ostringstream ss; for (int type = 0; type <= max_type; type++) {
map<string, string> loc; int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
loc = osdmap.crush->get_full_location(i); if (r == -ENOENT)
ss << "osd." << i << loc << " is down since epoch " << info.down_at << ", last address " break;
<< osdmap.get_addr(i); // break early if this parent is already marked as up
detail->push_back(make_pair(HEALTH_WARN, ss.str())); if (subtree_up.count(parent_id))
break;
type = osdmap.crush->get_bucket_type(parent_id);
if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
break;
current = parent_id;
} }
} else {
// if an osd in a subtree is up, implies subtree is not down
up_in_subtrees.insert(subtree_id);
} }
} }
set_difference(in_subtrees.begin(), in_subtrees.end(), // calculate the number of down osds in each down subtree and store it in num_osds_subtree
up_in_subtrees.begin(), up_in_subtrees.end(), for (int type = 1; type <= max_type; type++) {
inserter(down_in_subtrees, down_in_subtrees.end())); for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
num_in_subtrees = in_subtrees.size(); if (type == 1) {
num_down_in_subtrees = down_in_subtrees.size(); list<int> children;
for (set<int>::iterator it = down_in_subtrees.begin(); int num = osdmap.crush->get_children(*j, &children);
it != down_in_subtrees.end(); ++it) { num_osds_subtree[*j] = num;
down_in_subtree_names.insert(osdmap.crush->get_item_name(*it)); } else {
list<int> children;
int num = 0;
int num_children = osdmap.crush->get_children(*j, &children);
if (num_children == 0)
continue;
for (auto l = children.begin(); l != children.end(); ++l) {
if (num_osds_subtree[*l] > 0) {
num = num + num_osds_subtree[*l];
} }
}
num_osds_subtree[*j] = num;
}
}
}
num_down_in_osds = down_in_osds.size();
assert(num_down_in_osds <= num_in_osds); assert(num_down_in_osds <= num_in_osds);
assert(num_down_in_subtrees <= num_in_subtrees);
if (num_down_in_osds > 0) { if (num_down_in_osds > 0) {
ostringstream ss; ostringstream ss;
ss << num_down_in_osds << "/" << num_in_osds << " in osds are down"; ss << "\n";
summary.push_back(make_pair(HEALTH_WARN, ss.str())); // summary of down subtree types and osds
if (num_down_in_subtrees > 0) { for (int type = max_type; type > 0; type--) {
ostringstream sst; if (subtree_type_down[type].size() > 0) {
if (num_in_subtrees == 1) { ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type);
sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit << if (subtree_type_down[type].size() > 1) {
" is down"; ss << "s";
sst << "(" << down_in_subtree_names << ")";
summary.push_back(make_pair(HEALTH_WARN, sst.str()));
} else {
sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
"s are down";
sst << "(" << down_in_subtree_names << ")";
summary.push_back(make_pair(HEALTH_WARN, sst.str()));
} }
int sum_down_osds = 0;
for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
sum_down_osds = sum_down_osds + num_osds_subtree[*j];
}
ss << " (" << sum_down_osds << " osds) down\n";
}
}
ss << down_in_osds.size() << " osds are down\n";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail) {
ostringstream ss;
// details of down subtree types
for (int type = max_type; type > 0; type--) {
for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) {
ss << osdmap.crush->get_type_name(type);
ss << " ";
ss << osdmap.crush->get_item_name(*j);
// at the top level, do not print location
if (type != max_type) {
ss << " (";
ss << osdmap.crush->get_full_location_ordered_string(*j);
ss << ")";
}
int num = num_osds_subtree[*j];
ss << " (" << num << " osds)";
ss << " is down\n";
}
}
// details of down osds
for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
ss << "osd." << *it << " (";
ss << osdmap.crush->get_full_location_ordered_string(*it);
ss << ") is down\n";
}
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
} }
} }

View File

@ -293,32 +293,36 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
} }
} }
int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const
{ {
set<int> local_down_cache; if (id >= 0) {
if (!down_cache) { bool is_down_ret = is_down(id);
down_cache = &local_down_cache; if (is_down_ret) {
} down_in_osds->insert(id);
int current = id;
while (true) {
int type;
if (current >= 0) {
type = 0;
} else { } else {
type = crush->get_bucket_type(current); up_in_osds->insert(id);
} }
assert(type >= 0); return is_down_ret;
if (type >= subtree_type) {
return current;
} }
int r = crush->get_immediate_parent_id(current, &current); if (subtree_type_down &&
if (r < 0) { (*subtree_type_down)[subtree_type].count(id)) {
return -ENOENT; return true;
}
list<int> children;
crush->get_children(id, &children);
for (const auto &child : children) {
if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
subtree_up->insert(id);
return false;
} }
} }
if (subtree_type_down) {
(*subtree_type_down)[subtree_type].insert(id);
}
return true;
} }
void OSDMap::Incremental::encode_client_old(bufferlist& bl) const void OSDMap::Incremental::encode_client_old(bufferlist& bl) const

View File

@ -495,10 +495,9 @@ public:
bool subtree_is_down(int id, set<int> *down_cache) const; bool subtree_is_down(int id, set<int> *down_cache) const;
bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
/** bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
* get the id of the parent subtree set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;
*/
int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
int identify_osd(const entity_addr_t& addr) const; int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const; int identify_osd(const uuid_d& u) const;
int identify_osd_on_all_channels(const entity_addr_t& addr) const; int identify_osd_on_all_channels(const entity_addr_t& addr) const;