From 4fec5b85f1763eb6b5a8d68eae2af97f6fd63d0e Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Tue, 25 Apr 2017 14:41:09 +0800 Subject: [PATCH] mon/OSDMonitor: batch OSDs nodown/noout support This patch allows us to add a batch of OSDs (e.g., from a specific host which is currently in maintenance) to a specific nodown/noout list; OSDs on that list cannot be automatically marked down/out and hence can be eliminated from data migration. This has the same effect as the global nodown/noout flag but is more fine-grained. Signed-off-by: xie xingguo --- qa/workunits/cephtool/test.sh | 10 ++ src/common/ceph_strings.cc | 4 + src/include/rados.h | 52 ++++---- src/mon/MonCommands.h | 22 ++++ src/mon/OSDMonitor.cc | 239 +++++++++++++++++++++++++++++++++- src/osd/OSDMap.h | 55 ++++++++ 6 files changed, 354 insertions(+), 28 deletions(-) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 74e098c34ca..296c93d7af4 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1430,6 +1430,16 @@ function test_mon_osd() ceph osd dump | grep 'osd.0.*in' ceph osd find 0 + ceph osd add-noout 0 + ceph health detail | grep 'noout osd(s).*0' + ceph osd rm-noout 0 + ! ceph health detail | grep 'noout osds(s).*0' + + ceph osd add-nodown 0 1 + ceph health detail | grep 'nodown osd(s).*0.*1' + ceph osd rm-nodown all + ! 
ceph health detail | grep 'nodown osd(s).*0.*1' + # make sure mark out preserves weight ceph osd reweight osd.0 .5 ceph osd dump | grep ^osd.0 | grep 'weight 0.5' diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc index ef3aa802761..8af6bc470a9 100644 --- a/src/common/ceph_strings.cc +++ b/src/common/ceph_strings.cc @@ -46,6 +46,10 @@ const char *ceph_osd_state_name(int s) return "backfillfull"; case CEPH_OSD_DESTROYED: return "destroyed"; + case CEPH_OSD_NODOWN: + return "nodown"; + case CEPH_OSD_NOOUT: + return "noout"; default: return "???"; } diff --git a/src/include/rados.h b/src/include/rados.h index b2fa55b18e1..61e85626df9 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -110,14 +110,16 @@ struct ceph_eversion { */ /* status bits */ -#define CEPH_OSD_EXISTS (1<<0) -#define CEPH_OSD_UP (1<<1) -#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ -#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ -#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ -#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ +#define CEPH_OSD_EXISTS (1<<0) +#define CEPH_OSD_UP (1<<1) +#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ +#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ +#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ +#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ #define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */ -#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */ +#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */ +#define CEPH_OSD_NODOWN (1<<8) /* osd can not be marked down */ +#define CEPH_OSD_NOOUT (1<<9) /* osd can not be marked out */ extern const char *ceph_osd_state_name(int s); @@ -132,24 +134,24 @@ extern const char *ceph_osd_state_name(int s); /* * osd map flag bits */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync 
writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ -#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ -#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ -#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ -#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ -#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ -#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ -#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ -#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ -#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ -#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ -#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ -#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ -#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ -#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ -#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ -#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ +#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ +#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ +#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ +#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ +#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ +#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block 
periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ #define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ /* these are hidden in 'ceph status' view */ diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 0b4fa4e4e58..9d574ede878 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -650,6 +650,28 @@ COMMAND("osd in " \ COMMAND("osd rm " \ "name=ids,type=CephString,n=N", \ "remove osd(s) [...] in", "osd", "rw", "cli,rest") +COMMAND("osd add-noout " \ + "name=ids,type=CephString,n=N", \ + "mark osd(s) [...] as noout, " \ + "or use to mark all osds as noout", \ + "osd", "rw", "cli,rest") +COMMAND("osd add-nodown " \ + "name=ids,type=CephString,n=N", \ + "mark osd(s) [...] as nodown, " \ + "or use to mark all osds as nodown", \ + "osd", "rw", "cli,rest") +COMMAND("osd rm-noout " \ + "name=ids,type=CephString,n=N", \ + "allow osd(s) [...] to be marked out " \ + "(if they are currently marked as noout), " \ + "can use to automatically filter out all noout osds", \ + "osd", "rw", "cli,rest") +COMMAND("osd rm-nodown " \ + "name=ids,type=CephString,n=N", \ + "allow osd(s) [...] 
to be marked down " \ + "(if they are currently marked as nodown), " \ + "can use to automatically filter out all nodown osds", \ + "osd", "rw", "cli,rest") COMMAND("osd reweight " \ "name=id,type=CephOsdName " \ "type=CephFloat,name=weight,range=0.0|1.0", \ diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7dc58af8780..5108e16fef1 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1888,18 +1888,26 @@ bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op) bool OSDMonitor::can_mark_down(int i) { if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) { - dout(5) << "can_mark_down NODOWN flag set, will not mark osd." << i << " down" << dendl; + dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i + << " down" << dendl; return false; } + + if (osdmap.is_nodown(i)) { + dout(5) << __func__ << " osd." << i << " is marked as nodown, " + << "will not mark it down" << dendl; + return false; + } + int num_osds = osdmap.get_num_osds(); if (num_osds == 0) { - dout(5) << "can_mark_down no osds" << dendl; + dout(5) << __func__ << " no osds" << dendl; return false; } int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap); float up_ratio = (float)up / (float)num_osds; if (up_ratio < g_conf->mon_osd_min_up_ratio) { - dout(2) << "can_mark_down current up_ratio " << up_ratio << " < min " + dout(2) << __func__ << " current up_ratio " << up_ratio << " < min " << g_conf->mon_osd_min_up_ratio << ", will not mark osd." << i << " down" << dendl; return false; @@ -1926,6 +1934,13 @@ bool OSDMonitor::can_mark_out(int i) dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl; return false; } + + if (osdmap.is_noout(i)) { + dout(5) << __func__ << " osd." 
<< i << " is marked as noout, " + << "will not mark it out" << dendl; + return false; + } + int num_osds = osdmap.get_num_osds(); if (num_osds == 0) { dout(5) << __func__ << " no osds" << dendl; @@ -3787,6 +3802,32 @@ void OSDMonitor::get_health(list >& summary, detail->push_back(make_pair(HEALTH_WARN, ss.str())); } } + + // warn if there is any nodown osds. + vector nodown_osds; + osdmap.get_nodown_osds(&nodown_osds); + if (nodown_osds.size()) { + ostringstream ss; + ss << nodown_osds.size() << " nodown osd(s)"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + if (detail) { + ss << ": " << nodown_osds; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } + + // warn if there is any noout osds. + vector noout_osds; + osdmap.get_noout_osds(&noout_osds); + if (noout_osds.size()) { + ostringstream ss; + ss << noout_osds.size() << " noout osd(s)"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + if (detail) { + ss << ": " << noout_osds; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } } // note: we leave it to ceph-mgr to generate details health warnings // with actual osd utilizations @@ -8417,6 +8458,198 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, get_last_committed() + 1)); return true; } + } else if (prefix == "osd add-noout" || + prefix == "osd add-nodown") { + + bool noout = prefix == "osd add-noout"; + bool any = false; + bool stop = false; + + vector idvec; + cmd_getval(g_ceph_context, cmdmap, "ids", idvec); + for (unsigned j = 0; j < idvec.size() && !stop; j++) { + + set osds; + + // wildcard? + if (j == 0 && + (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) { + osdmap.get_all_osds(osds); + stop = true; + } else { + // try traditional single osd way + + long osd = parse_osd_id(idvec[j].c_str(), &ss); + if (osd < 0) { + // ss has reason for failure + ss << ", unable to parse osd id:\"" << idvec[j] << "\". 
"; + err = -EINVAL; + continue; + } + + osds.insert(osd); + } + + for (auto &osd : osds) { + + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. "; + continue; + } + + if (noout) { + if (osdmap.is_out(osd)) { + ss << "osd." << osd << " is already out. "; + continue; + } + + if (osdmap.is_noout(osd)) { // already noout? + // continue to check if there is any pending "rm-noout" request + if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NOOUT)) { + // cancel it + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT); + any = true; + } + + continue; + } + + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT); + any = true; + } else { + // nodown + + if (osdmap.is_down(osd)) { + ss << "osd." << osd << " is already down. "; + continue; + } + + if (osdmap.is_nodown(osd)) { // already nodown? + // continue to check if there is any pending "rm-nodown" request + if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NODOWN)) { + // cancel it + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN); + any = true; + } + + continue; + } + + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN); + any = true; + } + } + } + + if (any) { + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs, + get_last_committed() + 1)); + return true; + } + } else if (prefix == "osd rm-noout" || + prefix == "osd rm-nodown") { + + bool any = false; + bool stop = false; + bool noout = prefix == "osd rm-noout"; + + vector idvec; + cmd_getval(g_ceph_context, cmdmap, "ids", idvec); + + for (unsigned j = 0; j < idvec.size() && !stop; j++) { + vector osds; + + // wildcard? 
+ if (j == 0 && + (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) { + + // touch previous noout/nodown osds only + if (noout) { + osdmap.get_noout_osds(&osds); + } else { + osdmap.get_nodown_osds(&osds); + } + + // cancel pending noout/nodown requests too, + // if there is any + vector pending_state_osds; + (void) pending_inc.get_pending_state_osds(&pending_state_osds); + for (auto &p : pending_state_osds) { + if (noout) { + if (!osdmap.is_noout(p) && + pending_inc.pending_osd_has_state(p, CEPH_OSD_NOOUT)) { + pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT); + any = true; + } + } else { + if (!osdmap.is_nodown(p) && + pending_inc.pending_osd_has_state(p, CEPH_OSD_NODOWN)) { + pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN); + any = true; + } + } + } + + stop = true; + } else { + // try traditional single osd way + + long osd = parse_osd_id(idvec[j].c_str(), &ss); + if (osd < 0) { + // ss has reason for failure + ss << ", unable to parse osd id:\"" << idvec[j] << "\". "; + err = -EINVAL; + continue; + } + + osds.push_back(osd); + } + + for (auto &osd : osds) { + + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. 
"; + continue; + } + + if (noout) { + if (osdmap.is_noout(osd)) { + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT); + any = true; + } else { + // noout flag is not set or has already been successfully cancelled + // continue to check pending_inc + if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NOOUT)) { + // cancel pending noout flag + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT); + any = true; + } + } + } else { + // nodown + if (osdmap.is_nodown(osd)) { + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN); + any = true; + } else { + // nodown flag is not set or has already been successfully cancelled + // continue to check pending_inc + if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NODOWN)) { + // cancel pending nodown flag + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN); + any = true; + } + } + } + } + } + + if (any) { + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs, + get_last_committed() + 1)); + return true; + } } else if (prefix == "osd pg-temp") { string pgidstr; if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) { diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 2b5d15790d3..18e8df7674b 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -457,6 +457,31 @@ public: /// propage update pools' snap metadata to any of their tiers int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base); + + /// filter out osds with any pending state changing + size_t get_pending_state_osds(vector *osds) { + assert(osds); + osds->clear(); + + for (auto &p : new_state) { + osds->push_back(p.first); + } + + return osds->size(); + } + + bool pending_osd_has_state(int osd, unsigned state) { + return new_state.count(osd) && (new_state[osd] & state) != 0; + } + + void pending_osd_state_set(int osd, unsigned state) { + new_state[osd] |= state; + } + + void pending_osd_state_clear(int osd, unsigned state) { + new_state[osd] &= ~state; + } + }; private: @@ -745,6 +770,36 
@@ public: return !is_out(osd); } + bool is_nodown(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); + } + + bool is_noout(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); + } + + void get_nodown_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_nodown(i)) { + osds->push_back(i); + } + } + } + + void get_noout_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_noout(i)) { + osds->push_back(i); + } + } + } + /** * check if an entire crush subtree is down */