mon/OSDMonitor: batch OSDs nodown/noout support

This patch allows us to add a batch of OSDs (e.g., those on a specific host that is currently under maintenance)
to a per-OSD nodown/noout list. OSDs on that list cannot be automatically marked down/out
and hence are excluded from data migration.

This has the same effect as the global nodown/noout flag but is more fine-grained.

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
This commit is contained in:
xie xingguo 2017-04-25 14:41:09 +08:00
parent c41ab152e9
commit 4fec5b85f1
6 changed files with 354 additions and 28 deletions

View File

@ -1430,6 +1430,16 @@ function test_mon_osd()
ceph osd dump | grep 'osd.0.*in'
ceph osd find 0
# per-osd noout: the flag must be reported by 'health detail' once set,
# and must no longer be reported after rm-noout
ceph osd add-noout 0
ceph health detail | grep 'noout osd(s).*0'
ceph osd rm-noout 0
# note: the pattern must be spelled exactly like the positive check above,
# otherwise this negated grep can never fail
! ceph health detail | grep 'noout osd(s).*0'
# per-osd nodown: set it on two osds, then clear them via the 'all' wildcard
ceph osd add-nodown 0 1
ceph health detail | grep 'nodown osd(s).*0.*1'
ceph osd rm-nodown all
! ceph health detail | grep 'nodown osd(s).*0.*1'
# make sure mark out preserves weight
ceph osd reweight osd.0 .5
ceph osd dump | grep ^osd.0 | grep 'weight 0.5'

View File

@ -46,6 +46,10 @@ const char *ceph_osd_state_name(int s)
return "backfillfull";
case CEPH_OSD_DESTROYED:
return "destroyed";
case CEPH_OSD_NODOWN:
return "nodown";
case CEPH_OSD_NOOUT:
return "noout";
default:
return "???";
}

View File

@ -110,14 +110,16 @@ struct ceph_eversion {
*/
/* status bits */
#define CEPH_OSD_EXISTS (1<<0)
#define CEPH_OSD_UP (1<<1)
#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
#define CEPH_OSD_EXISTS (1<<0)
#define CEPH_OSD_UP (1<<1)
#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */
#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
#define CEPH_OSD_NODOWN (1<<8) /* osd can not be marked down */
#define CEPH_OSD_NOOUT (1<<9) /* osd can not be marked out */
extern const char *ceph_osd_state_name(int s);
@ -132,24 +134,24 @@ extern const char *ceph_osd_state_name(int s);
/*
* osd map flag bits
*/
#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
/* these are hidden in 'ceph status' view */

View File

@ -650,6 +650,28 @@ COMMAND("osd in " \
COMMAND("osd rm " \
"name=ids,type=CephString,n=N", \
"remove osd(s) <id> [<id>...] in", "osd", "rw", "cli,rest")
/*
 * per-osd noout/nodown flags
 *
 * "add-*" marks the listed osds as noout/nodown; "rm-*" allows them to be
 * marked out/down again.  Each command takes a list of osd ids or one of
 * the wildcards all|any|*.
 */
COMMAND("osd add-noout " \
"name=ids,type=CephString,n=N", \
"mark osd(s) <id> [<id>...] as noout, " \
"or use <all|any|*> to mark all osds as noout", \
"osd", "rw", "cli,rest")
COMMAND("osd add-nodown " \
"name=ids,type=CephString,n=N", \
"mark osd(s) <id> [<id>...] as nodown, " \
"or use <all|any|*> to mark all osds as nodown", \
"osd", "rw", "cli,rest")
/* for the rm-* variants the wildcard selects only osds currently flagged */
COMMAND("osd rm-noout " \
"name=ids,type=CephString,n=N", \
"allow osd(s) <id> [<id>...] to be marked out " \
"(if they are currently marked as noout), " \
"can use <all|any|*> to automatically filter out all noout osds", \
"osd", "rw", "cli,rest")
COMMAND("osd rm-nodown " \
"name=ids,type=CephString,n=N", \
"allow osd(s) <id> [<id>...] to be marked down " \
"(if they are currently marked as nodown), " \
"can use <all|any|*> to automatically filter out all nodown osds", \
"osd", "rw", "cli,rest")
COMMAND("osd reweight " \
"name=id,type=CephOsdName " \
"type=CephFloat,name=weight,range=0.0|1.0", \

View File

@ -1888,18 +1888,26 @@ bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
bool OSDMonitor::can_mark_down(int i)
{
if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
dout(5) << "can_mark_down NODOWN flag set, will not mark osd." << i << " down" << dendl;
dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
<< " down" << dendl;
return false;
}
if (osdmap.is_nodown(i)) {
dout(5) << __func__ << " osd." << i << " is marked as nodown, "
<< "will not mark it down" << dendl;
return false;
}
int num_osds = osdmap.get_num_osds();
if (num_osds == 0) {
dout(5) << "can_mark_down no osds" << dendl;
dout(5) << __func__ << " no osds" << dendl;
return false;
}
int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
float up_ratio = (float)up / (float)num_osds;
if (up_ratio < g_conf->mon_osd_min_up_ratio) {
dout(2) << "can_mark_down current up_ratio " << up_ratio << " < min "
dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
<< g_conf->mon_osd_min_up_ratio
<< ", will not mark osd." << i << " down" << dendl;
return false;
@ -1926,6 +1934,13 @@ bool OSDMonitor::can_mark_out(int i)
dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
return false;
}
if (osdmap.is_noout(i)) {
dout(5) << __func__ << " osd." << i << " is marked as noout, "
<< "will not mark it out" << dendl;
return false;
}
int num_osds = osdmap.get_num_osds();
if (num_osds == 0) {
dout(5) << __func__ << " no osds" << dendl;
@ -3787,6 +3802,32 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
// warn if there are any nodown osds.
vector<int> nodown_osds;
osdmap.get_nodown_osds(&nodown_osds);
if (nodown_osds.size()) {
ostringstream ss;
ss << nodown_osds.size() << " nodown osd(s)";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail) {
ss << ": " << nodown_osds;
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
// warn if there are any noout osds.
vector<int> noout_osds;
osdmap.get_noout_osds(&noout_osds);
if (noout_osds.size()) {
ostringstream ss;
ss << noout_osds.size() << " noout osd(s)";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail) {
ss << ": " << noout_osds;
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
}
// note: we leave it to ceph-mgr to generate details health warnings
// with actual osd utilizations
@ -8417,6 +8458,198 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
get_last_committed() + 1));
return true;
}
} else if (prefix == "osd add-noout" ||
prefix == "osd add-nodown") {
bool noout = prefix == "osd add-noout";
bool any = false;
bool stop = false;
vector<string> idvec;
cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
for (unsigned j = 0; j < idvec.size() && !stop; j++) {
set<int> osds;
// wildcard?
if (j == 0 &&
(idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
osdmap.get_all_osds(osds);
stop = true;
} else {
// try traditional single osd way
long osd = parse_osd_id(idvec[j].c_str(), &ss);
if (osd < 0) {
// ss has reason for failure
ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
err = -EINVAL;
continue;
}
osds.insert(osd);
}
for (auto &osd : osds) {
if (!osdmap.exists(osd)) {
ss << "osd." << osd << " does not exist. ";
continue;
}
if (noout) {
if (osdmap.is_out(osd)) {
ss << "osd." << osd << " is already out. ";
continue;
}
if (osdmap.is_noout(osd)) { // already noout?
// continue to check if there is any pending "rm-noout" request
if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NOOUT)) {
// cancel it
pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
any = true;
}
continue;
}
pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
any = true;
} else {
// nodown
if (osdmap.is_down(osd)) {
ss << "osd." << osd << " is already down. ";
continue;
}
if (osdmap.is_nodown(osd)) { // already nodown?
// continue to check if there is any pending "rm-nodown" request
if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NODOWN)) {
// cancel it
pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
any = true;
}
continue;
}
pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
any = true;
}
}
}
if (any) {
getline(ss, rs);
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
get_last_committed() + 1));
return true;
}
} else if (prefix == "osd rm-noout" ||
prefix == "osd rm-nodown") {
bool any = false;
bool stop = false;
bool noout = prefix == "osd rm-noout";
vector<string> idvec;
cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
for (unsigned j = 0; j < idvec.size() && !stop; j++) {
vector<int> osds;
// wildcard?
if (j == 0 &&
(idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
// touch previous noout/nodown osds only
if (noout) {
osdmap.get_noout_osds(&osds);
} else {
osdmap.get_nodown_osds(&osds);
}
// cancel pending noout/nodown requests too,
// if there is any
vector<int> pending_state_osds;
(void) pending_inc.get_pending_state_osds(&pending_state_osds);
for (auto &p : pending_state_osds) {
if (noout) {
if (!osdmap.is_noout(p) &&
pending_inc.pending_osd_has_state(p, CEPH_OSD_NOOUT)) {
pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT);
any = true;
}
} else {
if (!osdmap.is_nodown(p) &&
pending_inc.pending_osd_has_state(p, CEPH_OSD_NODOWN)) {
pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN);
any = true;
}
}
}
stop = true;
} else {
// try traditional single osd way
long osd = parse_osd_id(idvec[j].c_str(), &ss);
if (osd < 0) {
// ss has reason for failure
ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
err = -EINVAL;
continue;
}
osds.push_back(osd);
}
for (auto &osd : osds) {
if (!osdmap.exists(osd)) {
ss << "osd." << osd << " does not exist. ";
continue;
}
if (noout) {
if (osdmap.is_noout(osd)) {
pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
any = true;
} else {
// noout flag is not set or has already been successfully cancelled
// continue to check pending_inc
if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NOOUT)) {
// cancel pending noout flag
pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
any = true;
}
}
} else {
// nodown
if (osdmap.is_nodown(osd)) {
pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
any = true;
} else {
// nodown flag is not set or has already been successfully cancelled
// continue to check pending_inc
if (pending_inc.pending_osd_has_state(osd, CEPH_OSD_NODOWN)) {
// cancel pending nodown flag
pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
any = true;
}
}
}
}
}
if (any) {
getline(ss, rs);
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
get_last_committed() + 1));
return true;
}
} else if (prefix == "osd pg-temp") {
string pgidstr;
if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {

View File

@ -457,6 +457,31 @@ public:
/// propage update pools' snap metadata to any of their tiers
int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);
/// collect the ids of all osds that have an entry in new_state
///
/// @param osds output vector (must not be NULL); emptied first, then
///             filled with one id per new_state entry
/// @return the number of ids stored in *osds
size_t get_pending_state_osds(vector<int> *osds) {
  assert(osds);
  osds->clear();
  for (auto it = new_state.begin(); it != new_state.end(); ++it) {
    osds->push_back(it->first);
  }
  return osds->size();
}
/// @return true if new_state has an entry for @p osd and at least one of
///         the bits in @p state is set in it
bool pending_osd_has_state(int osd, unsigned state) {
  auto entry = new_state.find(osd);
  if (entry == new_state.end()) {
    return false;
  }
  return (entry->second & state) != 0;
}
/// turn on the bits of @p state in the new_state entry of @p osd
/// (operator[] adds the entry first if the osd has none yet)
void pending_osd_state_set(int osd, unsigned state) {
  auto &pending = new_state[osd];
  pending |= state;
}
/// drop the bits of @p state from the pending state change of @p osd
///
/// Does nothing when none of the requested bits is pending for this osd,
/// so that no empty entry is created in new_state as a side effect of
/// operator[]. When clearing leaves no bit behind, the entry is removed,
/// so that get_pending_state_osds() no longer reports the osd.
void pending_osd_state_clear(int osd, unsigned state) {
  auto p = new_state.find(osd);
  if (p == new_state.end() || (p->second & state) == 0) {
    // never set, or already cancelled: leave new_state untouched
    return;
  }
  p->second &= ~state;
  if (p->second == 0) {
    // every pending bit of this osd has been cancelled
    new_state.erase(p);
  }
}
};
private:
@ -745,6 +770,36 @@ public:
return !is_out(osd);
}
/// @return true if @p osd exists and has the CEPH_OSD_NODOWN bit set
///         in its osd_state entry
bool is_nodown(int osd) const {
  if (!exists(osd)) {
    return false;
  }
  return (osd_state[osd] & CEPH_OSD_NODOWN) != 0;
}
/// @return true if @p osd exists and has the CEPH_OSD_NOOUT bit set
///         in its osd_state entry
bool is_noout(int osd) const {
  if (!exists(osd)) {
    return false;
  }
  return (osd_state[osd] & CEPH_OSD_NOOUT) != 0;
}
/// gather the ids of every osd for which is_nodown() holds
///
/// @param osds output vector (must not be NULL); emptied first, then
///             filled in ascending id order over [0, max_osd)
void get_nodown_osds(vector<int> *osds) const {
  assert(osds);
  osds->clear();
  for (int id = 0; id < max_osd; ++id) {
    if (!is_nodown(id)) {
      continue;
    }
    osds->push_back(id);
  }
}
/// gather the ids of every osd for which is_noout() holds
///
/// @param osds output vector (must not be NULL); emptied first, then
///             filled in ascending id order over [0, max_osd)
void get_noout_osds(vector<int> *osds) const {
  assert(osds);
  osds->clear();
  for (int id = 0; id < max_osd; ++id) {
    if (!is_noout(id)) {
      continue;
    }
    osds->push_back(id);
  }
}
/**
* check if an entire crush subtree is down
*/