osd: partially refactor pg split

This partially refactors the OSD split code to do the split synchronously
when processing a new OSDMap.  It is incomplete in that it does not yet
do anything useful for the PG.  The full solution needs to:

- Do the split synchronously when applying the map update.
- Reset the parent pg so that it repeers.  This will cause problems until
  we consistently consider this a new interval when looking backwards in
  time; this needs to be fixed.  Anybody doing generate_past_intervals()
  or similar will need to consider a split/merge event as an interval
  boundary.
- The recovery state machine should trigger appropriately when this
  happens.
- The old PG that was split should probably be handle identically to the
  new children.  That means deleting the old PG instance and creating a new
  PG object for the newly-split child.  Ditto for merge.

Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
Sage Weil 2012-02-27 14:04:22 -08:00
parent d9cf33223e
commit fc7b11a9ee
2 changed files with 82 additions and 98 deletions

View File

@ -3377,6 +3377,8 @@ void OSD::handle_osd_map(MOSDMap *m)
// finally, take map_lock _after_ we do this flush, to avoid deadlock
map_lock.get_write();
C_Contexts *fin = new C_Contexts(g_ceph_context);
// advance through the new maps
for (epoch_t cur = start; cur <= superblock.newest_map; cur++) {
dout(10) << " advance to epoch " << cur << " (<= newest " << superblock.newest_map << ")" << dendl;
@ -3396,11 +3398,10 @@ void OSD::handle_osd_map(MOSDMap *m)
osdmap = newmap;
superblock.current_epoch = cur;
advance_map(t);
advance_map(t, fin);
had_map_since = ceph_clock_now(g_ceph_context);
}
C_Contexts *fin = new C_Contexts(g_ceph_context);
if (osdmap->is_up(whoami) &&
osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
bind_epoch < osdmap->get_up_from(whoami)) {
@ -3555,7 +3556,7 @@ void OSD::handle_osd_map(MOSDMap *m)
* scan placement groups, initiate any replication
* activities.
*/
void OSD::advance_map(ObjectStore::Transaction& t)
void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
{
assert(osd_lock.is_locked());
@ -3574,6 +3575,8 @@ void OSD::advance_map(ObjectStore::Transaction& t)
}
}
map<int64_t, int> pool_resize; // poolid -> old size
// update pools
for (map<int, PGPool*>::iterator p = pool_map.begin();
p != pool_map.end();
@ -3584,9 +3587,17 @@ void OSD::advance_map(ObjectStore::Transaction& t)
continue;
}
PGPool *pool = p->second;
bool changed = false;
// make sure auid stays up to date
pool->auid = pi->auid;
// split?
if (pool->info.pg_num != pi->pg_num) {
dout(1) << " pool " << p->first << " pg_num " << pool->info.pg_num << " -> " << pi->pg_num << dendl;
pool_resize[p->first] = pool->info.pg_num;
changed = true;
}
if (pi->get_snap_epoch() == osdmap->get_epoch()) {
pi->build_removed_snaps(pool->newly_removed_snaps);
@ -3595,13 +3606,15 @@ void OSD::advance_map(ObjectStore::Transaction& t)
dout(10) << " pool " << p->first << " removed_snaps " << pool->cached_removed_snaps
<< ", newly so are " << pool->newly_removed_snaps << ")"
<< dendl;
pool->info = *pi;
pool->snapc = pi->get_snap_context();
changed = true;
} else {
dout(10) << " pool " << p->first << " removed snaps " << pool->cached_removed_snaps
<< ", unchanged (snap_epoch = " << pi->get_snap_epoch() << ")" << dendl;
pool->newly_removed_snaps.clear();
}
if (changed)
pool->info = *pi;
}
@ -3627,6 +3640,21 @@ void OSD::advance_map(ObjectStore::Transaction& t)
}
}
// do splits?
for (map<int64_t,int>::iterator p = pool_resize.begin(); p != pool_resize.end(); p++) {
dout(10) << " processing pool " << p->first << " resize" << dendl;
for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
it++) {
pg_t pgid = it->first;
PG *pg = it->second;
set<pg_t> children;
if (pgid.is_split(p->second, pg->pool->info.pg_num, children)) {
do_split(pg, children, t, tfin);
}
}
}
// if we skipped a discontinuity and are the first epoch, we won't have a previous map.
OSDMapRef lastmap;
if (osdmap->get_epoch() > superblock.oldest_map)
@ -4007,8 +4035,7 @@ bool OSD::can_create_pg(pg_t pgid)
}
if (creating_pgs[pgid].split_bits) {
dout(10) << "can_create_pg " << pgid << " - queueing for split" << dendl;
pg_split_ready[creating_pgs[pgid].parent].insert(pgid);
dout(10) << "can_create_pg " << pgid << " - split" << dendl;
return false;
}
@ -4016,94 +4043,52 @@ bool OSD::can_create_pg(pg_t pgid)
return true;
}
void OSD::kick_pg_split_queue()
void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction& t,
C_Contexts *tfin)
{
map< int, map<pg_t,pg_query_t> > query_map;
map<int, MOSDPGInfo*> info_map;
int created = 0;
dout(10) << "do_split to " << childpgids << " on " << *parent << dendl;
dout(10) << "kick_pg_split_queue" << dendl;
map<pg_t, set<pg_t> >::iterator n = pg_split_ready.begin();
while (n != pg_split_ready.end()) {
map<pg_t, set<pg_t> >::iterator p = n++;
// how many children should this parent have?
unsigned nchildren = (1 << (creating_pgs[*p->second.begin()].split_bits - 1)) - 1;
if (p->second.size() < nchildren) {
dout(15) << " parent " << p->first << " children " << p->second
<< " ... waiting for " << nchildren << " children" << dendl;
continue;
}
PG *parent = _lookup_lock_pg(p->first);
assert(parent);
if (!parent->is_clean()) {
dout(10) << "kick_pg_split_queue parent " << p->first << " not clean" << dendl;
parent->unlock();
continue;
}
dout(15) << " parent " << p->first << " children " << p->second
<< " ready" << dendl;
// FIXME: this should be done in a separate thread, eventually
// create and lock children
ObjectStore::Transaction *t = new ObjectStore::Transaction;
C_Contexts *fin = new C_Contexts(g_ceph_context);
map<pg_t,PG*> children;
for (set<pg_t>::iterator q = p->second.begin();
q != p->second.end();
q++) {
pg_history_t history;
history.epoch_created = history.same_up_since =
history.same_interval_since = history.same_primary_since =
osdmap->get_epoch();
PG *pg = _create_lock_pg(*q, true,
0, creating_pgs[*q].acting, creating_pgs[*q].acting, history,
*t);
children[*q] = pg;
}
// split
split_pg(parent, children, *t);
parent->update_stats();
parent->write_info(*t);
// unlock parent, children
parent->unlock();
for (map<pg_t,PG*>::iterator q = children.begin(); q != children.end(); q++) {
PG *pg = q->second;
// fix up pg metadata
pg->info.last_complete = pg->info.last_update;
pg->write_info(*t);
pg->write_log(*t);
wake_pg_waiters(pg->info.pgid);
PG::RecoveryCtx rctx(&query_map, &info_map, 0, &fin->contexts, t);
pg->handle_create(&rctx);
pg->update_stats();
pg->unlock();
created++;
}
int tr = store->queue_transaction(NULL, t, new ObjectStore::C_DeleteTransaction(t), fin);
assert(tr == 0);
// remove from queue
pg_split_ready.erase(p);
parent->lock_with_map_lock_held();
// create and lock children
map<pg_t,PG*> children;
for (set<pg_t>::iterator q = childpgids.begin();
q != childpgids.end();
q++) {
pg_history_t history;
history.epoch_created = history.same_up_since =
history.same_interval_since = history.same_primary_since =
osdmap->get_epoch();
PG *pg = _create_lock_pg(*q, true,
0, parent->up, parent->acting, history, t);
children[*q] = pg;
dout(10) << " child " << *pg << dendl;
}
split_pg(parent, children, t);
// reset pg
map< int, vector<pg_info_t> > notify_list; // primary -> list
map< int, map<pg_t,pg_query_t> > query_map; // peer -> PG -> get_summary_since
map<int,MOSDPGInfo*> info_map; // peer -> message
PG::RecoveryCtx rctx(&query_map, &info_map, &notify_list, &tfin->contexts, &t);
// FIXME: this breaks if we have a map discontinuity
//parent->handle_split(osdmap, get_map(osdmap->get_epoch() - 1), &rctx);
// unlock parent, children
parent->unlock();
for (map<pg_t,PG*>::iterator q = children.begin(); q != children.end(); q++) {
PG *pg = q->second;
pg->handle_create(&rctx);
wake_pg_waiters(pg->info.pgid);
pg->unlock();
}
do_notifies(notify_list, osdmap->get_epoch());
do_queries(query_map);
do_infos(info_map);
if (created)
update_heartbeat_peers();
}
void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t)
@ -4168,8 +4153,11 @@ void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction
list<pg_log_entry_t>::iterator cur = p;
p++;
hobject_t& poid = cur->soid;
ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.pool(), parentid.preferred());
pg_t pgid = osdmap->raw_pg_to_pg(pg_t(l.ol_pgid));
object_locator_t oloc(parentid.pool(), parentid.preferred());
if (poid.get_key().size())
oloc.key = poid.get_key();
pg_t rawpg = osdmap->object_locator_to_pg(poid.oid, oloc);
pg_t pgid = osdmap->raw_pg_to_pg(rawpg);
if (pgid != parentid) {
dout(20) << " moving " << *cur << " from " << parentid << " -> " << pgid << dendl;
PG *child = children[pgid];
@ -4195,8 +4183,8 @@ void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction
child->log.index();
}
child->info.last_update = child->log.head;
child->info.last_complete = parent->info.last_complete;
child->info.log_tail = parent->log.tail;
child->info.last_complete = child->info.last_update;
child->info.log_tail = parent->log.tail;
child->info.history.last_epoch_split = osdmap->get_epoch();
child->snap_trimq = parent->snap_trimq;
@ -4330,7 +4318,6 @@ void OSD::handle_pg_create(OpRequest *op)
do_queries(query_map);
do_infos(info_map);
kick_pg_split_queue();
if (num_created)
update_heartbeat_peers();
op->put();
@ -4455,8 +4442,6 @@ void OSD::handle_pg_notify(OpRequest *op)
do_queries(query_map);
do_infos(info_map);
kick_pg_split_queue();
if (created)
update_heartbeat_peers();

View File

@ -404,7 +404,7 @@ private:
void note_down_osd(int osd);
void note_up_osd(int osd);
void advance_map(ObjectStore::Transaction& t);
void advance_map(ObjectStore::Transaction& t, C_Contexts *tfin);
void activate_map(ObjectStore::Transaction& t, list<Context*>& tfin);
// osd map cache (past osd maps)
@ -487,12 +487,11 @@ protected:
int split_bits;
};
hash_map<pg_t, create_pg_info> creating_pgs;
map<pg_t, set<pg_t> > pg_split_ready; // children ready to be split to, by parent
bool can_create_pg(pg_t pgid);
void handle_pg_create(OpRequest *op);
void kick_pg_split_queue();
void do_split(PG *parent, set<pg_t>& children, ObjectStore::Transaction &t, C_Contexts *tfin);
void split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t);