Merge pull request #17371 from xiexingguo/wip-per-pool-full-control

mon, osd: per pool space-full flag support

Reviewed-by: Sage Weil <sage@redhat.com>
Xie Xingguo 2017-09-14 18:26:12 +08:00 committed by GitHub
commit b4bb7ce2da
8 changed files with 368 additions and 50 deletions


@@ -9,6 +9,7 @@ overrides:
- (OSDMAP_FLAGS)
- (OSD_FULL)
- (MDS_READ_ONLY)
- (POOL_FULL)
tasks:
- install:
- ceph:


@@ -2076,6 +2076,44 @@ int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
return 0;
}
// return rules that might reference the given osd
int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
{
assert(rules);
rules->clear();
if (osd < 0) {
return -EINVAL;
}
for (unsigned i = 0; i < crush->max_rules; ++i) {
crush_rule *r = crush->rules[i];
if (!r)
continue;
for (unsigned j = 0; j < r->len; ++j) {
if (r->steps[j].op == CRUSH_RULE_TAKE) {
int step_item = r->steps[j].arg1;
list<int> unordered;
int rc = _get_leaves(step_item, &unordered);
if (rc < 0) {
return rc; // propagate fatal errors!
}
bool match = false;
for (auto &o: unordered) {
assert(o >= 0);
if (o == osd) {
match = true;
break;
}
}
if (match) {
rules->insert(i);
break;
}
}
}
}
return 0;
}
bool CrushWrapper::_class_is_dead(int class_id)
{
for (auto &p: class_map) {

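For intuition, the new CrushWrapper::get_rules_by_osd() above boils down to a membership test over the leaves reachable from each rule's TAKE steps. Below is a minimal standalone sketch of that idea in plain STL; the RuleLeaves type and the sample data are illustrative only, not the real crush structures.

#include <iostream>
#include <set>
#include <vector>

// Illustrative stand-in for a rule: the set of OSD ids reachable from its
// TAKE steps, i.e. what _get_leaves() would return for each TAKE item.
using RuleLeaves = std::vector<std::set<int>>;

// Analogue of get_rules_by_osd(): indices of rules whose leaves contain osd.
std::set<int> rules_by_osd(const RuleLeaves &rules, int osd) {
  std::set<int> out;
  for (size_t i = 0; i < rules.size(); ++i) {
    if (rules[i].count(osd))
      out.insert(static_cast<int>(i));
  }
  return out;
}

int main() {
  RuleLeaves rules = {
    {0, 1, 2},  // rule 0: takes a root containing osd.0-2
    {3, 4, 5},  // rule 1: takes a different root
    {0, 3},     // rule 2: spans both roots
  };
  for (int r : rules_by_osd(rules, 3))
    std::cout << "rule " << r << " may reference osd.3\n";  // prints rules 1 and 2
}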

@@ -1253,6 +1253,7 @@ public:
int populate_classes(
const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
int get_rules_by_class(const string &class_name, set<int> *rules);
int get_rules_by_osd(int osd, set<int> *rules);
bool _class_is_dead(int class_id);
void cleanup_dead_classes();
int rebuild_roots_with_classes();


@@ -917,31 +917,190 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
tmp.apply_incremental(pending_inc);
if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
// set or clear full/nearfull?
int full, backfill, nearfull;
tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
if (full > 0) {
if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
dout(10) << __func__ << " setting full flag" << dendl;
add_flag(CEPH_OSDMAP_FULL);
remove_flag(CEPH_OSDMAP_NEARFULL);
}
} else {
if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
dout(10) << __func__ << " clearing full flag" << dendl;
remove_flag(CEPH_OSDMAP_FULL);
}
if (nearfull > 0) {
if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
dout(10) << __func__ << " setting nearfull flag" << dendl;
add_flag(CEPH_OSDMAP_NEARFULL);
}
} else {
if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
dout(10) << __func__ << " clearing nearfull flag" << dendl;
remove_flag(CEPH_OSDMAP_NEARFULL);
}
}
// remove any legacy osdmap nearfull/full flags
{
if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
<< dendl;
remove_flag(CEPH_OSDMAP_NEARFULL);
remove_flag(CEPH_OSDMAP_FULL);
}
}
// collect which pools are currently affected by
// the near/backfill/full osd(s),
// and set per-pool near/backfill/full flag instead
set<int64_t> full_pool_ids;
set<int64_t> backfillfull_pool_ids;
set<int64_t> nearfull_pool_ids;
tmp.get_full_pools(g_ceph_context,
&full_pool_ids,
&backfillfull_pool_ids,
&nearfull_pool_ids);
if (full_pool_ids.empty() ||
backfillfull_pool_ids.empty() ||
nearfull_pool_ids.empty()) {
// normal case - no nearfull, backfillfull or full osds
// try to cancel any improper nearfull/backfillfull/full pool
// flags first
for (auto &pool: tmp.get_pools()) {
auto p = pool.first;
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
nearfull_pool_ids.empty()) {
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s nearfull flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
// load original pool info first!
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
backfillfull_pool_ids.empty()) {
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s backfillfull flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
full_pool_ids.empty()) {
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
// set by EQUOTA, skipping
continue;
}
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s full flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
}
}
}
if (!full_pool_ids.empty()) {
dout(10) << __func__ << " marking pool(s) " << full_pool_ids
<< " as full" << dendl;
for (auto &p: full_pool_ids) {
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
continue;
}
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = tmp.pools[p];
}
pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
}
// cancel FLAG_FULL for pools which are no longer full too
for (auto &pool: tmp.get_pools()) {
auto p = pool.first;
if (full_pool_ids.count(p)) {
// skip pools we have just marked as full above
continue;
}
if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
// don't touch if it is not currently full
// or is running out of quota (and hence considered as full)
continue;
}
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s full flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
}
}
if (!backfillfull_pool_ids.empty()) {
for (auto &p: backfillfull_pool_ids) {
if (full_pool_ids.count(p)) {
// skip pools we have already considered as full above
continue;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
// make sure FLAG_FULL is truly set, so we are safe not
// to set an extra (redundant) FLAG_BACKFILLFULL flag
assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
continue;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
// don't bother if pool is already marked as backfillfull
continue;
}
dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
<< "'s as backfillfull" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = tmp.pools[p];
}
pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
}
// cancel FLAG_BACKFILLFULL for pools
// which are no longer backfillfull too
for (auto &pool: tmp.get_pools()) {
auto p = pool.first;
if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
// skip pools we have just marked as backfillfull/full above
continue;
}
if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
// and don't touch if it is not currently backfillfull
continue;
}
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s backfillfull flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
}
}
if (!nearfull_pool_ids.empty()) {
for (auto &p: nearfull_pool_ids) {
if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
continue;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
// make sure FLAG_FULL is truly set, so we are safe not
// to set an extra (redundant) FLAG_NEARFULL flag
assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
continue;
}
if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
// don't bother if pool is already marked as nearfull
continue;
}
dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
<< "'s as nearfull" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = tmp.pools[p];
}
pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
}
// cancel FLAG_NEARFULL for pools
// which are no longer nearfull too
for (auto &pool: tmp.get_pools()) {
auto p = pool.first;
if (full_pool_ids.count(p) ||
backfillfull_pool_ids.count(p) ||
nearfull_pool_ids.count(p)) {
// skip pools we have just marked as
// nearfull/backfillfull/full above
continue;
}
if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
// and don't touch if it is not currently nearfull
continue;
}
dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
<< "'s nearfull flag" << dendl;
if (pending_inc.new_pools.count(p) == 0) {
pending_inc.new_pools[p] = pool.second;
}
pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
}
}
// min_compat_client?
@@ -4828,10 +4987,20 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
return true;
}
void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
{
const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
pending_inc.get_new_pool(pool_id, pool)->flags = flags;
pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
osdmap.get_pg_pool(pool_id));
assert(pool);
pool->set_flag(flags);
}
void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
{
pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
osdmap.get_pg_pool(pool_id));
assert(pool);
pool->unset_flag(flags);
}
bool OSDMonitor::update_pools_status()
@@ -4854,14 +5023,16 @@ bool OSDMonitor::update_pools_status()
(pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
(pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
if (pool_is_full)
continue;
mon->clog->info() << "pool '" << pool_name
<< "' no longer full; removing FULL flag";
update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
<< "' no longer out of quota; removing NO_QUOTA flag";
// below we cancel FLAG_FULL too; we'll set it again in
// OSDMonitor::encode_pending if it still fails the osd-full checking.
clear_pool_flags(it->first,
pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
ret = true;
} else {
if (!pool_is_full)
@@ -4879,7 +5050,14 @@ bool OSDMonitor::update_pools_status()
<< " (reached quota's max_objects: "
<< pool.quota_max_objects << ")";
}
update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
// set both FLAG_FULL_NO_QUOTA and FLAG_FULL
// note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
// since FLAG_FULL should always take precedence
set_pool_flags(it->first,
pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
clear_pool_flags(it->first,
pg_pool_t::FLAG_NEARFULL |
pg_pool_t::FLAG_BACKFILLFULL);
ret = true;
}
}

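To make the quota path above concrete: crossing a pool quota now sets FLAG_FULL_NO_QUOTA together with FLAG_FULL and drops the softer flags, while falling back below the quota clears both, leaving encode_pending() to re-set FLAG_FULL if the pool is still full for space reasons. A self-contained sketch of that transition follows; it is a standalone model, not monitor code, and FLAG_FULL's bit value is quoted from the pre-existing pg_pool_t enum rather than this diff.

#include <cstdint>
#include <iostream>

// pg_pool_t flag bits used by the quota path. The three *FULL* variants
// below are added by this PR (see osd_types.h); FLAG_FULL itself is from
// the pre-existing enum.
constexpr uint64_t FLAG_FULL          = 1 << 1;
constexpr uint64_t FLAG_FULL_NO_QUOTA = 1 << 10;
constexpr uint64_t FLAG_NEARFULL      = 1 << 11;
constexpr uint64_t FLAG_BACKFILLFULL  = 1 << 12;

// Model of the update_pools_status() transition: over quota -> set
// FULL_NO_QUOTA|FULL and clear the softer flags (FULL takes precedence);
// back under quota -> clear FULL_NO_QUOTA|FULL and let encode_pending()
// decide whether space usage still warrants FULL.
uint64_t apply_quota_state(uint64_t flags, bool over_quota) {
  if (over_quota) {
    flags |= FLAG_FULL_NO_QUOTA | FLAG_FULL;
    flags &= ~(FLAG_NEARFULL | FLAG_BACKFILLFULL);
  } else if (flags & FLAG_FULL_NO_QUOTA) {
    flags &= ~(FLAG_FULL_NO_QUOTA | FLAG_FULL);
  }
  return flags;
}

int main() {
  uint64_t f = apply_quota_state(FLAG_NEARFULL, /*over_quota=*/true);
  std::cout << std::hex << f << "\n";  // 402: FULL | FULL_NO_QUOTA, NEARFULL dropped
  f = apply_quota_state(f, /*over_quota=*/false);
  std::cout << std::hex << f << "\n";  // 0: both quota flags cleared again
}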

@@ -357,7 +357,8 @@ private:
ostream *ss);
int prepare_new_pool(MonOpRequestRef op);
void update_pool_flags(int64_t pool_id, uint64_t flags);
void set_pool_flags(int64_t pool_id, uint64_t flags);
void clear_pool_flags(int64_t pool_id, uint64_t flags);
bool update_pools_status();
bool prepare_set_flag(MonOpRequestRef op, int flag);


@@ -20,6 +20,7 @@
#include "OSDMap.h"
#include <algorithm>
#include "common/config.h"
#include "common/errno.h"
#include "common/Formatter.h"
#include "common/TextTable.h"
#include "include/ceph_features.h"
@@ -1145,21 +1146,41 @@ int OSDMap::calc_num_osds()
return num_osd;
}
void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
void OSDMap::get_full_pools(CephContext *cct,
set<int64_t> *full,
set<int64_t> *backfillfull,
set<int64_t> *nearfull) const
{
*full = 0;
*backfill = 0;
*nearfull = 0;
assert(full);
assert(backfillfull);
assert(nearfull);
full->clear();
backfillfull->clear();
nearfull->clear();
vector<int> full_osds;
vector<int> backfillfull_osds;
vector<int> nearfull_osds;
for (int i = 0; i < max_osd; ++i) {
if (exists(i) && is_up(i) && is_in(i)) {
if (osd_state[i] & CEPH_OSD_FULL)
++(*full);
full_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
++(*backfill);
backfillfull_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_NEARFULL)
++(*nearfull);
nearfull_osds.push_back(i);
}
}
for (auto i: full_osds) {
get_pool_ids_by_osd(cct, i, full);
}
for (auto i: backfillfull_osds) {
get_pool_ids_by_osd(cct, i, backfillfull);
}
for (auto i: nearfull_osds) {
get_pool_ids_by_osd(cct, i, nearfull);
}
}
void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
@@ -3925,6 +3946,31 @@ int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
return crush->get_leaves(name, osds);
}
// get pools whose crush rules might reference the given osd
void OSDMap::get_pool_ids_by_osd(CephContext *cct,
int osd,
set<int64_t> *pool_ids) const
{
assert(pool_ids);
set<int> raw_rules;
int r = crush->get_rules_by_osd(osd, &raw_rules);
if (r < 0) {
lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
<< dendl;
assert(r >= 0);
}
set<int> rules;
for (auto &i: raw_rules) {
// exclude any dead rule
if (crush_ruleset_in_use(i)) {
rules.insert(i);
}
}
for (auto &r: rules) {
get_pool_ids_by_rule(r, pool_ids);
}
}
template <typename F>
class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
public:
@@ -4480,6 +4526,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
{
// warn about flags
uint64_t warn_flags =
CEPH_OSDMAP_NEARFULL |
CEPH_OSDMAP_FULL |
CEPH_OSDMAP_PAUSERD |
CEPH_OSDMAP_PAUSEWR |
@@ -4586,23 +4633,49 @@ void OSDMap::check_health(health_check_map_t *checks) const
// OSD_UPGRADE_FINISHED
// none of these (yet) since we don't run until luminous upgrade is done.
// POOL_FULL
// POOL_NEARFULL/BACKFILLFULL/FULL
{
list<string> detail;
list<string> full_detail, backfillfull_detail, nearfull_detail;
for (auto it : get_pools()) {
const pg_pool_t &pool = it.second;
const string& pool_name = get_pool_name(it.first);
if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
const string& pool_name = get_pool_name(it.first);
stringstream ss;
ss << "pool '" << pool_name << "' is full";
detail.push_back(ss.str());
if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
// may run out of space too,
// but we want EQUOTA taking precedence
ss << "pool '" << pool_name << "' is full (no quota)";
} else {
ss << "pool '" << pool_name << "' is full (no space)";
}
full_detail.push_back(ss.str());
} else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
stringstream ss;
ss << "pool '" << pool_name << "' is backfillfull";
backfillfull_detail.push_back(ss.str());
} else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
stringstream ss;
ss << "pool '" << pool_name << "' is nearfull";
nearfull_detail.push_back(ss.str());
}
}
if (!detail.empty()) {
if (!full_detail.empty()) {
ostringstream ss;
ss << detail.size() << " pool(s) full";
ss << full_detail.size() << " pool(s) full";
auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
d.detail.swap(detail);
d.detail.swap(full_detail);
}
if (!backfillfull_detail.empty()) {
ostringstream ss;
ss << backfillfull_detail.size() << " pool(s) backfillfull";
auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
d.detail.swap(backfillfull_detail);
}
if (!nearfull_detail.empty()) {
ostringstream ss;
ss << nearfull_detail.size() << " pool(s) nearfull";
auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
d.detail.swap(nearfull_detail);
}
}
}

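The health-check branch above places each pool in at most one bucket, FULL taking precedence over BACKFILLFULL over NEARFULL, and distinguishes a quota-driven full pool ("no quota") from a space-driven one ("no space"). Below is a small standalone model of that classification; the pool names and flag combinations are made up for illustration, and the bit values mirror osd_types.h.

#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <string>

constexpr uint64_t FLAG_FULL          = 1 << 1;   // from the existing enum
constexpr uint64_t FLAG_FULL_NO_QUOTA = 1 << 10;  // added by this PR
constexpr uint64_t FLAG_NEARFULL      = 1 << 11;  // added by this PR
constexpr uint64_t FLAG_BACKFILLFULL  = 1 << 12;  // added by this PR

int main() {
  // Hypothetical pools and their current flags.
  std::map<std::string, uint64_t> pools = {
    {"rbd",     FLAG_FULL | FLAG_FULL_NO_QUOTA},
    {"cephfs",  FLAG_FULL},
    {"backup",  FLAG_BACKFILLFULL},
    {"scratch", FLAG_NEARFULL},
  };
  std::list<std::string> full_detail, backfillfull_detail, nearfull_detail;
  for (auto &p : pools) {
    if (p.second & FLAG_FULL) {
      // quota-driven fullness is reported separately from space fullness
      full_detail.push_back("pool '" + p.first + "' is full " +
                            ((p.second & FLAG_FULL_NO_QUOTA) ? "(no quota)"
                                                             : "(no space)"));
    } else if (p.second & FLAG_BACKFILLFULL) {
      backfillfull_detail.push_back("pool '" + p.first + "' is backfillfull");
    } else if (p.second & FLAG_NEARFULL) {
      nearfull_detail.push_back("pool '" + p.first + "' is nearfull");
    }
  }
  std::cout << full_detail.size() << " pool(s) full\n";                  // 2
  std::cout << backfillfull_detail.size() << " pool(s) backfillfull\n";  // 1
  std::cout << nearfull_detail.size() << " pool(s) nearfull\n";          // 1
}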

@@ -644,7 +644,10 @@ public:
float get_nearfull_ratio() const {
return nearfull_ratio;
}
void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
void get_full_pools(CephContext *cct,
set<int64_t> *full,
set<int64_t> *backfillfull,
set<int64_t> *nearfull) const;
void get_full_osd_counts(set<int> *full, set<int> *backfill,
set<int> *nearfull) const;
@@ -1165,6 +1168,17 @@ public:
mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
return pools;
}
void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const {
assert(pool_ids);
for (auto &p: pools) {
if ((int)p.second.get_crush_rule() == rule_id) {
pool_ids->insert(p.first);
}
}
}
void get_pool_ids_by_osd(CephContext *cct,
int osd,
set<int64_t> *pool_ids) const;
const string& get_pool_name(int64_t p) const {
auto i = pool_name.find(p);
assert(i != pool_name.end());

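Putting the new helpers together, get_full_pools() projects per-OSD fullness onto pools by way of get_rules_by_osd() and get_pool_ids_by_rule(). The sketch below is a compact standalone model of that composition; the types, names and sample topology are illustrative, not the OSDMap API.

#include <iostream>
#include <map>
#include <set>
#include <vector>

enum class OsdState { ok, nearfull, backfillfull, full };

int main() {
  // Toy topology: three OSDs, two rules, three pools.
  std::vector<OsdState> osd_state = {
    OsdState::ok, OsdState::full, OsdState::nearfull};          // osd.0-2
  std::map<int, std::set<int>> rule_leaves = {
    {0, {0, 1}}, {1, {2}}};                                      // rule -> osds it can reference
  std::map<int64_t, int> pool_rule = {{1, 0}, {2, 0}, {3, 1}};   // pool -> crush rule

  std::set<int64_t> full, backfillfull, nearfull;
  for (int osd = 0; osd < (int)osd_state.size(); ++osd) {
    std::set<int64_t> *target =
      osd_state[osd] == OsdState::full ? &full :
      osd_state[osd] == OsdState::backfillfull ? &backfillfull :
      osd_state[osd] == OsdState::nearfull ? &nearfull : nullptr;
    if (!target)
      continue;                        // osd is neither full nor close to it
    for (auto &r : rule_leaves) {
      if (!r.second.count(osd))
        continue;                      // rule cannot reference this osd
      for (auto &p : pool_rule)
        if (p.second == r.first)
          target->insert(p.first);     // pool uses an affected rule
    }
  }
  std::cout << "full pools: " << full.size()                     // 2 (pools 1 and 2)
            << ", nearfull pools: " << nearfull.size() << "\n";  // 1 (pool 3)
}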

@@ -1154,6 +1154,9 @@ struct pg_pool_t {
FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
FLAG_NOSCRUB = 1<<8, // block periodic scrub
FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
FLAG_NEARFULL = 1<<11, // pool is nearfull
FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
};
static const char *get_flag_name(int f) {
@@ -1168,6 +1171,9 @@ struct pg_pool_t {
case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
case FLAG_NOSCRUB: return "noscrub";
case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
case FLAG_FULL_NO_QUOTA: return "full_no_quota";
case FLAG_NEARFULL: return "nearfull";
case FLAG_BACKFILLFULL: return "backfillfull";
default: return "???";
}
}
@@ -1206,6 +1212,12 @@ struct pg_pool_t {
return FLAG_NOSCRUB;
if (name == "nodeep-scrub")
return FLAG_NODEEP_SCRUB;
if (name == "full_no_quota")
return FLAG_FULL_NO_QUOTA;
if (name == "nearfull")
return FLAG_NEARFULL;
if (name == "backfillfull")
return FLAG_BACKFILLFULL;
return 0;
}
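For completeness, the new flag names round-trip through get_flag_name() and get_flag_by_name() just like the existing ones. A tiny standalone demo of the name/bit mapping these cases implement, using the bit values declared above:

#include <cstdint>
#include <iostream>
#include <string>

constexpr uint64_t FLAG_FULL_NO_QUOTA = 1 << 10;
constexpr uint64_t FLAG_NEARFULL      = 1 << 11;
constexpr uint64_t FLAG_BACKFILLFULL  = 1 << 12;

// Mirrors the new cases in pg_pool_t::get_flag_name().
const char *flag_name(uint64_t f) {
  switch (f) {
  case FLAG_FULL_NO_QUOTA: return "full_no_quota";
  case FLAG_NEARFULL:      return "nearfull";
  case FLAG_BACKFILLFULL:  return "backfillfull";
  default:                 return "???";
  }
}

// Mirrors the new cases in pg_pool_t::get_flag_by_name().
uint64_t flag_by_name(const std::string &name) {
  if (name == "full_no_quota") return FLAG_FULL_NO_QUOTA;
  if (name == "nearfull")      return FLAG_NEARFULL;
  if (name == "backfillfull")  return FLAG_BACKFILLFULL;
  return 0;
}

int main() {
  for (auto f : {FLAG_FULL_NO_QUOTA, FLAG_NEARFULL, FLAG_BACKFILLFULL})
    std::cout << flag_name(f) << " = 0x" << std::hex << f << std::dec << "\n";
  std::cout << "0x" << std::hex << flag_by_name("backfillfull") << "\n";  // 0x1000
}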