mirror of
https://github.com/ceph/ceph
synced 2025-02-20 17:37:29 +00:00
mon: refactor PGMap updating code for reuse in mgr
...and remove the copypasta from mgr. mgr still doesn't do all the same logic (it doesn't have osdmap history handy) but it is now much easier to see which bits are used one place vs. the other. Signed-off-by: John Spray <john.spray@redhat.com>
This commit is contained in:
parent
58dd3db0be
commit
afa7078763
@ -99,239 +99,13 @@ void ClusterState::notify_osdmap(const OSDMap &osd_map)
|
||||
PGMap::Incremental pending_inc;
|
||||
pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
|
||||
|
||||
_update_creating_pgs(osd_map, &pending_inc);
|
||||
_register_new_pgs(osd_map, &pending_inc);
|
||||
PGMapUpdater::update_creating_pgs(osd_map, &pg_map, &pending_inc);
|
||||
PGMapUpdater::register_new_pgs(osd_map, &pg_map, &pending_inc);
|
||||
|
||||
pg_map.apply_incremental(g_ceph_context, pending_inc);
|
||||
|
||||
// TODO: Reinstate check_down_pgs logic?
|
||||
}
|
||||
|
||||
void ClusterState::_register_new_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
// iterate over crush mapspace
|
||||
epoch_t epoch = osd_map.get_epoch();
|
||||
dout(10) << "checking pg pools for osdmap epoch " << epoch
|
||||
<< ", last_pg_scan " << pg_map.last_pg_scan << dendl;
|
||||
|
||||
int created = 0;
|
||||
for (const auto & p : osd_map.pools) {
|
||||
int64_t poolid = p.first;
|
||||
const pg_pool_t &pool = p.second;
|
||||
|
||||
int ruleno = osd_map.crush->find_rule(pool.get_crush_ruleset(),
|
||||
pool.get_type(), pool.get_size());
|
||||
if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
|
||||
continue;
|
||||
|
||||
if (pool.get_last_change() <= pg_map.last_pg_scan ||
|
||||
pool.get_last_change() <= pending_inc->pg_scan) {
|
||||
dout(10) << " no change in pool " << poolid << " " << pool << dendl;
|
||||
continue;
|
||||
}
|
||||
|
||||
dout(10) << "scanning pool " << poolid
|
||||
<< " " << pool << dendl;
|
||||
|
||||
// first pgs in this pool
|
||||
bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
|
||||
|
||||
for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
|
||||
pg_t pgid(ps, poolid, -1);
|
||||
if (pg_map.pg_stat.count(pgid)) {
|
||||
dout(20) << "register_new_pgs have " << pgid << dendl;
|
||||
continue;
|
||||
}
|
||||
created++;
|
||||
_register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
|
||||
pending_inc);
|
||||
}
|
||||
}
|
||||
|
||||
int removed = 0;
|
||||
for (const auto &p : pg_map.creating_pgs) {
|
||||
if (p.preferred() >= 0) {
|
||||
dout(20) << " removing creating_pg " << p
|
||||
<< " because it is localized and obsolete" << dendl;
|
||||
pending_inc->pg_remove.insert(p);
|
||||
removed++;
|
||||
}
|
||||
if (!osd_map.have_pg_pool(p.pool())) {
|
||||
dout(20) << " removing creating_pg " << p
|
||||
<< " because containing pool deleted" << dendl;
|
||||
pending_inc->pg_remove.insert(p);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// deleted pools?
|
||||
for (const auto & p : pg_map.pg_stat) {
|
||||
if (!osd_map.have_pg_pool(p.first.pool())) {
|
||||
dout(20) << " removing pg_stat " << p.first << " because "
|
||||
<< "containing pool deleted" << dendl;
|
||||
pending_inc->pg_remove.insert(p.first);
|
||||
++removed;
|
||||
}
|
||||
if (p.first.preferred() >= 0) {
|
||||
dout(20) << " removing localized pg " << p.first << dendl;
|
||||
pending_inc->pg_remove.insert(p.first);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// we don't want to redo this work if we can avoid it.
|
||||
pending_inc->pg_scan = epoch;
|
||||
|
||||
dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
|
||||
<< removed << " uncreated pgs" << dendl;
|
||||
}
|
||||
|
||||
void ClusterState::_register_pg(
|
||||
const OSDMap &osd_map,
|
||||
pg_t pgid, epoch_t epoch,
|
||||
bool new_pool,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
pg_t parent;
|
||||
int split_bits = 0;
|
||||
bool parent_found = false;
|
||||
if (!new_pool) {
|
||||
parent = pgid;
|
||||
while (1) {
|
||||
// remove most significant bit
|
||||
int msb = cbits(parent.ps());
|
||||
if (!msb)
|
||||
break;
|
||||
parent.set_ps(parent.ps() & ~(1<<(msb-1)));
|
||||
split_bits++;
|
||||
dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
|
||||
if (pg_map.pg_stat.count(parent) &&
|
||||
pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
|
||||
dout(10) << " parent is " << parent << dendl;
|
||||
parent_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
|
||||
stats.state = PG_STATE_CREATING;
|
||||
stats.created = epoch;
|
||||
stats.parent = parent;
|
||||
stats.parent_split_bits = split_bits;
|
||||
stats.mapping_epoch = epoch;
|
||||
|
||||
if (parent_found) {
|
||||
pg_stat_t &ps = pg_map.pg_stat[parent];
|
||||
stats.last_fresh = ps.last_fresh;
|
||||
stats.last_active = ps.last_active;
|
||||
stats.last_change = ps.last_change;
|
||||
stats.last_peered = ps.last_peered;
|
||||
stats.last_clean = ps.last_clean;
|
||||
stats.last_unstale = ps.last_unstale;
|
||||
stats.last_undegraded = ps.last_undegraded;
|
||||
stats.last_fullsized = ps.last_fullsized;
|
||||
stats.last_scrub_stamp = ps.last_scrub_stamp;
|
||||
stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
|
||||
stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
|
||||
} else {
|
||||
utime_t now = ceph_clock_now(g_ceph_context);
|
||||
stats.last_fresh = now;
|
||||
stats.last_active = now;
|
||||
stats.last_change = now;
|
||||
stats.last_peered = now;
|
||||
stats.last_clean = now;
|
||||
stats.last_unstale = now;
|
||||
stats.last_undegraded = now;
|
||||
stats.last_fullsized = now;
|
||||
stats.last_scrub_stamp = now;
|
||||
stats.last_deep_scrub_stamp = now;
|
||||
stats.last_clean_scrub_stamp = now;
|
||||
}
|
||||
|
||||
osd_map.pg_to_up_acting_osds(
|
||||
pgid,
|
||||
&stats.up,
|
||||
&stats.up_primary,
|
||||
&stats.acting,
|
||||
&stats.acting_primary);
|
||||
|
||||
if (split_bits == 0) {
|
||||
dout(10) << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< dendl;
|
||||
} else {
|
||||
dout(10) << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< " parent " << parent
|
||||
<< " by " << split_bits << " bits"
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
|
||||
// This was PGMonitor::map_pg_creates
|
||||
void ClusterState::_update_creating_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
assert(pending_inc != nullptr);
|
||||
|
||||
dout(10) << "to " << pg_map.creating_pgs.size()
|
||||
<< " pgs, osdmap epoch " << osd_map.get_epoch()
|
||||
<< dendl;
|
||||
|
||||
for (set<pg_t>::const_iterator p = pg_map.creating_pgs.begin();
|
||||
p != pg_map.creating_pgs.end();
|
||||
++p) {
|
||||
pg_t pgid = *p;
|
||||
pg_t on = pgid;
|
||||
ceph::unordered_map<pg_t,pg_stat_t>::const_iterator q =
|
||||
pg_map.pg_stat.find(pgid);
|
||||
assert(q != pg_map.pg_stat.end());
|
||||
const pg_stat_t *s = &q->second;
|
||||
|
||||
if (s->parent_split_bits)
|
||||
on = s->parent;
|
||||
|
||||
vector<int> up, acting;
|
||||
int up_primary, acting_primary;
|
||||
osd_map.pg_to_up_acting_osds(
|
||||
on,
|
||||
&up,
|
||||
&up_primary,
|
||||
&acting,
|
||||
&acting_primary);
|
||||
|
||||
if (up != s->up ||
|
||||
up_primary != s->up_primary ||
|
||||
acting != s->acting ||
|
||||
acting_primary != s->acting_primary) {
|
||||
pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
|
||||
dout(20) << pgid << " "
|
||||
<< " acting_primary: " << s->acting_primary
|
||||
<< " -> " << acting_primary
|
||||
<< " acting: " << s->acting << " -> " << acting
|
||||
<< " up_primary: " << s->up_primary << " -> " << up_primary
|
||||
<< " up: " << s->up << " -> " << up
|
||||
<< dendl;
|
||||
|
||||
// only initialize if it wasn't already a pending update
|
||||
if (ns->reported_epoch == 0)
|
||||
*ns = *s;
|
||||
|
||||
// note epoch if the target of the create message changed
|
||||
if (acting_primary != ns->acting_primary)
|
||||
ns->mapping_epoch = osd_map.get_epoch();
|
||||
|
||||
ns->up = up;
|
||||
ns->up_primary = up_primary;
|
||||
ns->acting = acting;
|
||||
ns->acting_primary = acting_primary;
|
||||
}
|
||||
}
|
||||
// TODO: Complete the separation of PG state handling so
|
||||
// that a cut-down set of functionality remains in PGMonitor
|
||||
// while the full-blown PGMap lives only here.
|
||||
}
|
||||
|
||||
|
285
src/mon/PGMap.cc
285
src/mon/PGMap.cc
@ -2070,3 +2070,288 @@ void PGMap::dump_object_stat_sum(TextTable &tbl, Formatter *f,
|
||||
}
|
||||
|
||||
|
||||
void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
|
||||
std::set<int> *need_check_down_pg_osds,
|
||||
std::map<int,utime_t> *last_osd_report,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
for (const auto &p : osd_inc.new_weight) {
|
||||
if (p.second == CEPH_OSD_OUT) {
|
||||
dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
|
||||
pending_inc->stat_osd_out(p.first);
|
||||
}
|
||||
}
|
||||
|
||||
// this is conservative: we want to know if any osds (maybe) got marked down.
|
||||
for (const auto &p : osd_inc.new_state) {
|
||||
if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
|
||||
// but we're too lazy to check
|
||||
// which
|
||||
need_check_down_pg_osds->insert(p.first);
|
||||
|
||||
// clear out the last_osd_report for this OSD
|
||||
map<int, utime_t>::iterator report = last_osd_report->find(p.first);
|
||||
if (report != last_osd_report->end()) {
|
||||
last_osd_report->erase(report);
|
||||
}
|
||||
|
||||
// clear out osd_stat slow request histogram
|
||||
dout(20) << __func__ << " clearing osd." << p.first
|
||||
<< " request histogram" << dendl;
|
||||
pending_inc->stat_osd_down_up(p.first, *pg_map);
|
||||
}
|
||||
|
||||
if (p.second & CEPH_OSD_EXISTS) {
|
||||
// whether it was created *or* destroyed, we can safely drop
|
||||
// it's osd_stat_t record.
|
||||
dout(10) << __func__ << " osd." << p.first
|
||||
<< " created or destroyed" << dendl;
|
||||
pending_inc->rm_stat(p.first);
|
||||
|
||||
// and adjust full, nearfull set
|
||||
pg_map->nearfull_osds.erase(p.first);
|
||||
pg_map->full_osds.erase(p.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PGMapUpdater::register_pg(
|
||||
const OSDMap &osd_map,
|
||||
pg_t pgid, epoch_t epoch,
|
||||
bool new_pool,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
pg_t parent;
|
||||
int split_bits = 0;
|
||||
bool parent_found = false;
|
||||
if (!new_pool) {
|
||||
parent = pgid;
|
||||
while (1) {
|
||||
// remove most significant bit
|
||||
int msb = cbits(parent.ps());
|
||||
if (!msb)
|
||||
break;
|
||||
parent.set_ps(parent.ps() & ~(1<<(msb-1)));
|
||||
split_bits++;
|
||||
dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
|
||||
if (pg_map->pg_stat.count(parent) &&
|
||||
pg_map->pg_stat[parent].state != PG_STATE_CREATING) {
|
||||
dout(10) << " parent is " << parent << dendl;
|
||||
parent_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
|
||||
stats.state = PG_STATE_CREATING;
|
||||
stats.created = epoch;
|
||||
stats.parent = parent;
|
||||
stats.parent_split_bits = split_bits;
|
||||
stats.mapping_epoch = epoch;
|
||||
|
||||
if (parent_found) {
|
||||
pg_stat_t &ps = pg_map->pg_stat[parent];
|
||||
stats.last_fresh = ps.last_fresh;
|
||||
stats.last_active = ps.last_active;
|
||||
stats.last_change = ps.last_change;
|
||||
stats.last_peered = ps.last_peered;
|
||||
stats.last_clean = ps.last_clean;
|
||||
stats.last_unstale = ps.last_unstale;
|
||||
stats.last_undegraded = ps.last_undegraded;
|
||||
stats.last_fullsized = ps.last_fullsized;
|
||||
stats.last_scrub_stamp = ps.last_scrub_stamp;
|
||||
stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
|
||||
stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
|
||||
} else {
|
||||
utime_t now = ceph_clock_now(g_ceph_context);
|
||||
stats.last_fresh = now;
|
||||
stats.last_active = now;
|
||||
stats.last_change = now;
|
||||
stats.last_peered = now;
|
||||
stats.last_clean = now;
|
||||
stats.last_unstale = now;
|
||||
stats.last_undegraded = now;
|
||||
stats.last_fullsized = now;
|
||||
stats.last_scrub_stamp = now;
|
||||
stats.last_deep_scrub_stamp = now;
|
||||
stats.last_clean_scrub_stamp = now;
|
||||
}
|
||||
|
||||
osd_map.pg_to_up_acting_osds(
|
||||
pgid,
|
||||
&stats.up,
|
||||
&stats.up_primary,
|
||||
&stats.acting,
|
||||
&stats.acting_primary);
|
||||
|
||||
if (split_bits == 0) {
|
||||
dout(10) << __func__ << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< dendl;
|
||||
} else {
|
||||
dout(10) << __func__ << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< " parent " << parent
|
||||
<< " by " << split_bits << " bits"
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
|
||||
void PGMapUpdater::register_new_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
epoch_t epoch = osd_map.get_epoch();
|
||||
dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
|
||||
<< ", last_pg_scan " << pg_map->last_pg_scan << dendl;
|
||||
|
||||
int created = 0;
|
||||
for (const auto &p : osd_map.pools) {
|
||||
int64_t poolid = p.first;
|
||||
const pg_pool_t &pool = p.second;
|
||||
int ruleno = osd_map.crush->find_rule(pool.get_crush_ruleset(),
|
||||
pool.get_type(), pool.get_size());
|
||||
if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
|
||||
continue;
|
||||
|
||||
if (pool.get_last_change() <= pg_map->last_pg_scan ||
|
||||
pool.get_last_change() <= pending_inc->pg_scan) {
|
||||
dout(10) << " no change in pool " << poolid << " " << pool << dendl;
|
||||
continue;
|
||||
}
|
||||
|
||||
dout(10) << __func__ << " scanning pool " << poolid
|
||||
<< " " << pool << dendl;
|
||||
|
||||
// first pgs in this pool
|
||||
bool new_pool = pg_map->pg_pool_sum.count(poolid) == 0;
|
||||
|
||||
for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
|
||||
pg_t pgid(ps, poolid, -1);
|
||||
if (pg_map->pg_stat.count(pgid)) {
|
||||
dout(20) << "register_new_pgs have " << pgid << dendl;
|
||||
continue;
|
||||
}
|
||||
created++;
|
||||
register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
|
||||
pg_map, pending_inc);
|
||||
}
|
||||
}
|
||||
|
||||
int removed = 0;
|
||||
for (const auto &p : pg_map->creating_pgs) {
|
||||
if (p.preferred() >= 0) {
|
||||
dout(20) << " removing creating_pg " << p
|
||||
<< " because it is localized and obsolete" << dendl;
|
||||
pending_inc->pg_remove.insert(p);
|
||||
removed++;
|
||||
}
|
||||
if (!osd_map.have_pg_pool(p.pool())) {
|
||||
dout(20) << " removing creating_pg " << p
|
||||
<< " because containing pool deleted" << dendl;
|
||||
pending_inc->pg_remove.insert(p);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// deleted pools?
|
||||
for (const auto &p : pg_map->pg_stat) {
|
||||
if (!osd_map.have_pg_pool(p.first.pool())) {
|
||||
dout(20) << " removing pg_stat " << p.first << " because "
|
||||
<< "containing pool deleted" << dendl;
|
||||
pending_inc->pg_remove.insert(p.first);
|
||||
++removed;
|
||||
}
|
||||
if (p.first.preferred() >= 0) {
|
||||
dout(20) << " removing localized pg " << p.first << dendl;
|
||||
pending_inc->pg_remove.insert(p.first);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// we don't want to redo this work if we can avoid it.
|
||||
pending_inc->pg_scan = epoch;
|
||||
|
||||
dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
|
||||
<< removed << " uncreated pgs" << dendl;
|
||||
}
|
||||
|
||||
|
||||
void PGMapUpdater::update_creating_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc)
|
||||
{
|
||||
dout(10) << __func__ << " to " << pg_map->creating_pgs.size()
|
||||
<< " pgs, osdmap epoch " << osd_map.get_epoch()
|
||||
<< dendl;
|
||||
|
||||
unsigned changed = 0;
|
||||
for (set<pg_t>::const_iterator p = pg_map->creating_pgs.begin();
|
||||
p != pg_map->creating_pgs.end();
|
||||
++p) {
|
||||
pg_t pgid = *p;
|
||||
pg_t on = pgid;
|
||||
ceph::unordered_map<pg_t,pg_stat_t>::const_iterator q =
|
||||
pg_map->pg_stat.find(pgid);
|
||||
assert(q != pg_map->pg_stat.end());
|
||||
const pg_stat_t *s = &q->second;
|
||||
|
||||
if (s->parent_split_bits)
|
||||
on = s->parent;
|
||||
|
||||
vector<int> up, acting;
|
||||
int up_primary, acting_primary;
|
||||
osd_map.pg_to_up_acting_osds(
|
||||
on,
|
||||
&up,
|
||||
&up_primary,
|
||||
&acting,
|
||||
&acting_primary);
|
||||
|
||||
if (up != s->up ||
|
||||
up_primary != s->up_primary ||
|
||||
acting != s->acting ||
|
||||
acting_primary != s->acting_primary) {
|
||||
pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
|
||||
if (osd_map.get_epoch() > ns->reported_epoch) {
|
||||
dout(20) << __func__ << " " << pgid << " "
|
||||
<< " acting_primary: " << s->acting_primary
|
||||
<< " -> " << acting_primary
|
||||
<< " acting: " << s->acting << " -> " << acting
|
||||
<< " up_primary: " << s->up_primary << " -> " << up_primary
|
||||
<< " up: " << s->up << " -> " << up
|
||||
<< dendl;
|
||||
|
||||
// only initialize if it wasn't already a pending update
|
||||
if (ns->reported_epoch == 0)
|
||||
*ns = *s;
|
||||
|
||||
// note epoch if the target of the create message changed
|
||||
if (acting_primary != ns->acting_primary)
|
||||
ns->mapping_epoch = osd_map.get_epoch();
|
||||
|
||||
ns->up = up;
|
||||
ns->up_primary = up_primary;
|
||||
ns->acting = acting;
|
||||
ns->acting_primary = acting_primary;
|
||||
|
||||
++changed;
|
||||
} else {
|
||||
dout(20) << __func__ << " " << pgid << " has pending update from newer"
|
||||
<< " epoch " << ns->reported_epoch
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (changed) {
|
||||
dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,10 @@
|
||||
#include "osd/osd_types.h"
|
||||
#include <sstream>
|
||||
|
||||
// FIXME: don't like including this here to get OSDMap::Incremental, maybe
|
||||
// PGMapUpdater needs its own header.
|
||||
#include "osd/OSDMap.h"
|
||||
|
||||
namespace ceph { class Formatter; }
|
||||
|
||||
class PGMap {
|
||||
@ -382,4 +386,39 @@ inline ostream& operator<<(ostream& out, const PGMap& m) {
|
||||
return out;
|
||||
}
|
||||
|
||||
class PGMapUpdater
|
||||
{
|
||||
public:
|
||||
static void check_osd_map(
|
||||
const OSDMap::Incremental &osd_inc,
|
||||
std::set<int> *need_check_down_pg_osds,
|
||||
std::map<int,utime_t> *last_osd_report,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc);
|
||||
|
||||
/**
|
||||
* check latest osdmap for new pgs to register
|
||||
*/
|
||||
static void register_new_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc);
|
||||
|
||||
/**
|
||||
* recalculate creating pg mappings
|
||||
*/
|
||||
static void update_creating_pgs(
|
||||
const OSDMap &osd_map,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc);
|
||||
|
||||
protected:
|
||||
static void register_pg(
|
||||
const OSDMap &osd_map,
|
||||
pg_t pgid, epoch_t epoch,
|
||||
bool new_pool,
|
||||
PGMap *pg_map,
|
||||
PGMap::Incremental *pending_inc);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -67,7 +67,7 @@ void PGMonitor::on_restart()
|
||||
void PGMonitor::on_active()
|
||||
{
|
||||
if (mon->is_leader()) {
|
||||
check_osd_map(mon->osdmon()->osdmap.epoch);
|
||||
check_osd_map(mon->osdmon()->osdmap.get_epoch());
|
||||
need_check_down_pgs = true;
|
||||
}
|
||||
|
||||
@ -911,54 +911,16 @@ void PGMonitor::check_osd_map(epoch_t epoch)
|
||||
|
||||
assert(bl.length());
|
||||
OSDMap::Incremental inc(bl);
|
||||
for (map<int32_t,uint32_t>::iterator p = inc.new_weight.begin();
|
||||
p != inc.new_weight.end();
|
||||
++p)
|
||||
if (p->second == CEPH_OSD_OUT) {
|
||||
dout(10) << __func__ << " osd." << p->first << " went OUT" << dendl;
|
||||
pending_inc.stat_osd_out(p->first);
|
||||
}
|
||||
|
||||
|
||||
// this is conservative: we want to know if any osds (maybe) got marked down.
|
||||
for (map<int32_t,uint8_t>::iterator p = inc.new_state.begin();
|
||||
p != inc.new_state.end();
|
||||
++p) {
|
||||
if (p->second & CEPH_OSD_UP) { // true if marked up OR down,
|
||||
// but we're too lazy to check
|
||||
// which
|
||||
need_check_down_pg_osds.insert(p->first);
|
||||
|
||||
// clear out the last_osd_report for this OSD
|
||||
map<int, utime_t>::iterator report = last_osd_report.find(p->first);
|
||||
if (report != last_osd_report.end()) {
|
||||
last_osd_report.erase(report);
|
||||
}
|
||||
|
||||
// clear out osd_stat slow request histogram
|
||||
dout(20) << __func__ << " clearing osd." << p->first
|
||||
<< " request histogram" << dendl;
|
||||
pending_inc.stat_osd_down_up(p->first, pg_map);
|
||||
}
|
||||
|
||||
if (p->second & CEPH_OSD_EXISTS) {
|
||||
// whether it was created *or* destroyed, we can safely drop
|
||||
// it's osd_stat_t record.
|
||||
dout(10) << __func__ << " osd." << p->first
|
||||
<< " created or destroyed" << dendl;
|
||||
pending_inc.rm_stat(p->first);
|
||||
|
||||
// and adjust full, nearfull set
|
||||
pg_map.nearfull_osds.erase(p->first);
|
||||
pg_map.full_osds.erase(p->first);
|
||||
}
|
||||
}
|
||||
PGMapUpdater::check_osd_map(inc, &need_check_down_pg_osds,
|
||||
&last_osd_report, &pg_map, &pending_inc);
|
||||
}
|
||||
|
||||
assert(pg_map.last_osdmap_epoch < epoch);
|
||||
pending_inc.osdmap_epoch = epoch;
|
||||
map_pg_creates();
|
||||
register_new_pgs();
|
||||
PGMapUpdater::update_creating_pgs(mon->osdmon()->osdmap,
|
||||
&pg_map, &pending_inc);
|
||||
PGMapUpdater::register_new_pgs(mon->osdmon()->osdmap, &pg_map, &pending_inc);
|
||||
|
||||
if (need_check_down_pgs || !need_check_down_pg_osds.empty())
|
||||
check_down_pgs();
|
||||
@ -966,244 +928,6 @@ void PGMonitor::check_osd_map(epoch_t epoch)
|
||||
propose_pending();
|
||||
}
|
||||
|
||||
void PGMonitor::register_pg(OSDMap *osdmap,
|
||||
pg_pool_t& pool, pg_t pgid, epoch_t epoch,
|
||||
bool new_pool)
|
||||
{
|
||||
pg_t parent;
|
||||
int split_bits = 0;
|
||||
bool parent_found = false;
|
||||
if (!new_pool) {
|
||||
parent = pgid;
|
||||
while (1) {
|
||||
// remove most significant bit
|
||||
int msb = cbits(parent.ps());
|
||||
if (!msb)
|
||||
break;
|
||||
parent.set_ps(parent.ps() & ~(1<<(msb-1)));
|
||||
split_bits++;
|
||||
dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
|
||||
if (pg_map.pg_stat.count(parent) &&
|
||||
pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
|
||||
dout(10) << " parent is " << parent << dendl;
|
||||
parent_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pg_stat_t &stats = pending_inc.pg_stat_updates[pgid];
|
||||
stats.state = PG_STATE_CREATING;
|
||||
stats.created = epoch;
|
||||
stats.parent = parent;
|
||||
stats.parent_split_bits = split_bits;
|
||||
stats.mapping_epoch = epoch;
|
||||
|
||||
if (parent_found) {
|
||||
pg_stat_t &ps = pg_map.pg_stat[parent];
|
||||
stats.last_fresh = ps.last_fresh;
|
||||
stats.last_active = ps.last_active;
|
||||
stats.last_change = ps.last_change;
|
||||
stats.last_peered = ps.last_peered;
|
||||
stats.last_clean = ps.last_clean;
|
||||
stats.last_unstale = ps.last_unstale;
|
||||
stats.last_undegraded = ps.last_undegraded;
|
||||
stats.last_fullsized = ps.last_fullsized;
|
||||
stats.last_scrub_stamp = ps.last_scrub_stamp;
|
||||
stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
|
||||
stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
|
||||
} else {
|
||||
utime_t now = ceph_clock_now(g_ceph_context);
|
||||
stats.last_fresh = now;
|
||||
stats.last_active = now;
|
||||
stats.last_change = now;
|
||||
stats.last_peered = now;
|
||||
stats.last_clean = now;
|
||||
stats.last_unstale = now;
|
||||
stats.last_undegraded = now;
|
||||
stats.last_fullsized = now;
|
||||
stats.last_scrub_stamp = now;
|
||||
stats.last_deep_scrub_stamp = now;
|
||||
stats.last_clean_scrub_stamp = now;
|
||||
}
|
||||
|
||||
osdmap->pg_to_up_acting_osds(
|
||||
pgid,
|
||||
&stats.up,
|
||||
&stats.up_primary,
|
||||
&stats.acting,
|
||||
&stats.acting_primary);
|
||||
|
||||
if (split_bits == 0) {
|
||||
dout(10) << __func__ << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< dendl;
|
||||
} else {
|
||||
dout(10) << __func__ << " will create " << pgid
|
||||
<< " primary " << stats.acting_primary
|
||||
<< " acting " << stats.acting
|
||||
<< " parent " << parent
|
||||
<< " by " << split_bits << " bits"
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
|
||||
void PGMonitor::register_new_pgs()
|
||||
{
|
||||
// iterate over crush mapspace
|
||||
OSDMap *osdmap = &mon->osdmon()->osdmap;
|
||||
epoch_t epoch = osdmap->get_epoch();
|
||||
dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
|
||||
<< ", last_pg_scan " << pg_map.last_pg_scan << dendl;
|
||||
|
||||
int created = 0;
|
||||
for (map<int64_t,pg_pool_t>::iterator p = osdmap->pools.begin();
|
||||
p != osdmap->pools.end();
|
||||
++p) {
|
||||
int64_t poolid = p->first;
|
||||
pg_pool_t &pool = p->second;
|
||||
int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(),
|
||||
pool.get_type(), pool.get_size());
|
||||
if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
|
||||
continue;
|
||||
|
||||
if (pool.get_last_change() <= pg_map.last_pg_scan ||
|
||||
pool.get_last_change() <= pending_inc.pg_scan) {
|
||||
dout(10) << " no change in pool " << p->first << " " << pool << dendl;
|
||||
continue;
|
||||
}
|
||||
|
||||
dout(10) << __func__ << " scanning pool " << p->first
|
||||
<< " " << pool << dendl;
|
||||
|
||||
// first pgs in this pool
|
||||
bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
|
||||
|
||||
for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
|
||||
pg_t pgid(ps, poolid, -1);
|
||||
if (pg_map.pg_stat.count(pgid)) {
|
||||
dout(20) << "register_new_pgs have " << pgid << dendl;
|
||||
continue;
|
||||
}
|
||||
created++;
|
||||
register_pg(osdmap, pool, pgid, pool.get_last_change(), new_pool);
|
||||
}
|
||||
}
|
||||
|
||||
int removed = 0;
|
||||
for (set<pg_t>::iterator p = pg_map.creating_pgs.begin();
|
||||
p != pg_map.creating_pgs.end();
|
||||
++p) {
|
||||
if (p->preferred() >= 0) {
|
||||
dout(20) << " removing creating_pg " << *p
|
||||
<< " because it is localized and obsolete" << dendl;
|
||||
pending_inc.pg_remove.insert(*p);
|
||||
removed++;
|
||||
}
|
||||
if (!osdmap->have_pg_pool(p->pool())) {
|
||||
dout(20) << " removing creating_pg " << *p
|
||||
<< " because containing pool deleted" << dendl;
|
||||
pending_inc.pg_remove.insert(*p);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// deleted pools?
|
||||
for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p =
|
||||
pg_map.pg_stat.begin();
|
||||
p != pg_map.pg_stat.end(); ++p) {
|
||||
if (!osdmap->have_pg_pool(p->first.pool())) {
|
||||
dout(20) << " removing pg_stat " << p->first << " because "
|
||||
<< "containing pool deleted" << dendl;
|
||||
pending_inc.pg_remove.insert(p->first);
|
||||
++removed;
|
||||
}
|
||||
if (p->first.preferred() >= 0) {
|
||||
dout(20) << " removing localized pg " << p->first << dendl;
|
||||
pending_inc.pg_remove.insert(p->first);
|
||||
++removed;
|
||||
}
|
||||
}
|
||||
|
||||
// we don't want to redo this work if we can avoid it.
|
||||
pending_inc.pg_scan = epoch;
|
||||
|
||||
dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
|
||||
<< removed << " uncreated pgs" << dendl;
|
||||
}
|
||||
|
||||
void PGMonitor::map_pg_creates()
|
||||
{
|
||||
OSDMap *osdmap = &mon->osdmon()->osdmap;
|
||||
|
||||
dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
|
||||
<< " pgs, osdmap epoch " << osdmap->get_epoch()
|
||||
<< dendl;
|
||||
|
||||
unsigned changed = 0;
|
||||
for (set<pg_t>::const_iterator p = pg_map.creating_pgs.begin();
|
||||
p != pg_map.creating_pgs.end();
|
||||
++p) {
|
||||
pg_t pgid = *p;
|
||||
pg_t on = pgid;
|
||||
ceph::unordered_map<pg_t,pg_stat_t>::const_iterator q =
|
||||
pg_map.pg_stat.find(pgid);
|
||||
assert(q != pg_map.pg_stat.end());
|
||||
const pg_stat_t *s = &q->second;
|
||||
|
||||
if (s->parent_split_bits)
|
||||
on = s->parent;
|
||||
|
||||
vector<int> up, acting;
|
||||
int up_primary, acting_primary;
|
||||
osdmap->pg_to_up_acting_osds(
|
||||
on,
|
||||
&up,
|
||||
&up_primary,
|
||||
&acting,
|
||||
&acting_primary);
|
||||
|
||||
if (up != s->up ||
|
||||
up_primary != s->up_primary ||
|
||||
acting != s->acting ||
|
||||
acting_primary != s->acting_primary) {
|
||||
pg_stat_t *ns = &pending_inc.pg_stat_updates[pgid];
|
||||
if (osdmap->get_epoch() > ns->reported_epoch) {
|
||||
dout(20) << __func__ << " " << pgid << " "
|
||||
<< " acting_primary: " << s->acting_primary
|
||||
<< " -> " << acting_primary
|
||||
<< " acting: " << s->acting << " -> " << acting
|
||||
<< " up_primary: " << s->up_primary << " -> " << up_primary
|
||||
<< " up: " << s->up << " -> " << up
|
||||
<< dendl;
|
||||
|
||||
// only initialize if it wasn't already a pending update
|
||||
if (ns->reported_epoch == 0)
|
||||
*ns = *s;
|
||||
|
||||
// note epoch if the target of the create message changed
|
||||
if (acting_primary != ns->acting_primary)
|
||||
ns->mapping_epoch = osdmap->get_epoch();
|
||||
|
||||
ns->up = up;
|
||||
ns->up_primary = up_primary;
|
||||
ns->acting = acting;
|
||||
ns->acting_primary = acting_primary;
|
||||
|
||||
++changed;
|
||||
} else {
|
||||
dout(20) << __func__ << " " << pgid << " has pending update from newer"
|
||||
<< " epoch " << ns->reported_epoch
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (changed) {
|
||||
dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
|
||||
}
|
||||
}
|
||||
|
||||
void PGMonitor::send_pg_creates()
|
||||
{
|
||||
// We only need to do this old, spammy way of broadcasting create messages
|
||||
|
@ -91,19 +91,8 @@ private:
|
||||
// when we last received PG stats from each osd
|
||||
map<int,utime_t> last_osd_report;
|
||||
|
||||
void register_pg(OSDMap *osdmap, pg_pool_t& pool, pg_t pgid,
|
||||
epoch_t epoch, bool new_pool);
|
||||
|
||||
/**
|
||||
* check latest osdmap for new pgs to register
|
||||
*/
|
||||
void register_new_pgs();
|
||||
|
||||
/**
|
||||
* recalculate creating pg mappings
|
||||
*/
|
||||
void map_pg_creates();
|
||||
|
||||
void send_pg_creates();
|
||||
epoch_t send_pg_creates(int osd, Connection *con, epoch_t next);
|
||||
|
||||
|
@ -256,8 +256,7 @@ private:
|
||||
|
||||
friend class OSDMonitor;
|
||||
// FIXME: the elements required for PGMap updates should be exposed properly
|
||||
friend class PGMonitor;
|
||||
friend class ClusterState;
|
||||
friend class PGMapUpdater;
|
||||
|
||||
public:
|
||||
OSDMap() : epoch(0),
|
||||
|
Loading…
Reference in New Issue
Block a user