// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include "include/types.h" #include "OSD.h" #include "OSDMap.h" #ifdef USE_OBFS # include "OBFSStore.h" #else # include "FakeStore.h" #endif #include "ebofs/Ebofs.h" #include "Ager.h" #include "msg/Messenger.h" #include "msg/Message.h" #include "messages/MGenericMessage.h" #include "messages/MPing.h" #include "messages/MPingAck.h" #include "messages/MOSDPing.h" #include "messages/MOSDFailure.h" #include "messages/MOSDOp.h" #include "messages/MOSDOpReply.h" #include "messages/MOSDBoot.h" #include "messages/MOSDIn.h" #include "messages/MOSDOut.h" #include "messages/MOSDMap.h" #include "messages/MOSDGetMap.h" #include "messages/MOSDPGNotify.h" #include "messages/MOSDPGQuery.h" #include "messages/MOSDPGLog.h" #include "messages/MOSDPGRemove.h" #include "common/Logger.h" #include "common/LogType.h" #include "common/Timer.h" #include "common/ThreadPool.h" #include #include #include #include #include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " #define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " char *osd_base_path = "./osddata"; char *ebofs_base_path = "./dev"; object_t SUPERBLOCK_OBJECT(0,0); // force remount hack for performance testing FakeStore class C_Remount : public Context { OSD *osd; public: C_Remount(OSD *o) : osd(o) {} void finish(int) { osd->force_remount(); } }; void OSD::force_remount() { dout(0) << "forcing remount" << endl; osd_lock.Lock(); { store->umount(); store->mount(); } osd_lock.Unlock(); dout(0) << "finished remount" << endl; } // // cons/des LogType osd_logtype; OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock) { whoami = id; messenger = m; monmap = mm; osdmap = 0; boot_epoch = 0; last_tid = 0; num_pulling = 0; state = STATE_BOOTING; hb_stat_ops = 0; hb_stat_qlen = 0; pending_ops = 0; waiting_for_no_ops = false; if (g_conf.osd_remount_at) timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); // init object store // try in this order: // dev/osd$num // dev/osd.$hostname // dev/osd.all if (dev) { strcpy(dev_path,dev); } else { char hostname[100]; hostname[0] = 0; gethostname(hostname,100); sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); struct stat sta; if (::lstat(dev_path, &sta) != 0) sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); if (::lstat(dev_path, &sta) != 0) sprintf(dev_path, "%s/osd.all", ebofs_base_path); } if (g_conf.ebofs) { store = new Ebofs(dev_path); //store->_fake_writes(true); } #ifdef USE_OBFS else if (g_conf.uofs) { store = new OBFSStore(whoami, NULL, dev_path); } #endif else { store = new FakeStore(osd_base_path, whoami); } } OSD::~OSD() { if (threadpool) { delete threadpool; threadpool = 0; } if (osdmap) { delete osdmap; osdmap = 0; } //if (monitor) { delete monitor; monitor = 0; } if (messenger) { delete messenger; messenger = 0; } if (logger) { delete logger; logger = 0; } if (store) { delete store; store = 0; } } int OSD::init() { osd_lock.Lock(); { // mkfs? 
// ---- OSD::init(), continued (inside the osd_lock critical section) ----
    if (g_conf.osd_mkfs) {
      dout(2) << "mkfs" << endl;
      store->mkfs();

      // make up a superblock
      //superblock.fsid = ???;
      superblock.whoami = whoami;
    }

    // mount.
    dout(2) << "mounting " << dev_path << endl;
    int r = store->mount();
    assert(r>=0);

    if (g_conf.osd_mkfs) {
      // age?
      // a negative osd_age_time only loads a freelist; a positive one runs
      // the ager for that long.
      if (g_conf.osd_age_time != 0) {
        dout(2) << "age" << endl;
        Ager ager(store);
        if (g_conf.osd_age_time < 0)
          ager.load_freelist();
        else
          ager.age(g_conf.osd_age_time,
                   g_conf.osd_age,
                   g_conf.osd_age - .05,
                   50000,
                   g_conf.osd_age - .05);
      }
    }
    else {
      dout(2) << "boot" << endl;

      // read superblock
      // NOTE(review): the return value is ignored.  read_superblock()
      // returns -1 before creating osdmap when the read is short, and
      // load_pgs() below dereferences osdmap.
      read_superblock();

      // load up pgs (as they previously existed)
      load_pgs();

      dout(2) << "superblock: i am osd" << superblock.whoami << endl;
      assert(whoami == superblock.whoami);
    }


    // log
    char name[80];
    sprintf(name, "osd%02d", whoami);
    logger = new Logger(name, (LogType*)&osd_logtype);
    osd_logtype.add_set("opq");
    osd_logtype.add_inc("op");
    osd_logtype.add_inc("c_rd");
    osd_logtype.add_inc("c_rdb");
    osd_logtype.add_inc("c_wr");
    osd_logtype.add_inc("c_wrb");

    osd_logtype.add_inc("r_push");
    osd_logtype.add_inc("r_pushb");
    osd_logtype.add_inc("r_wr");
    osd_logtype.add_inc("r_wrb");

    osd_logtype.add_inc("rlnum");

    osd_logtype.add_set("numpg");
    osd_logtype.add_set("pingset");

    osd_logtype.add_set("buf");

    osd_logtype.add_inc("map");
    osd_logtype.add_inc("mapi");
    osd_logtype.add_inc("mapidup");
    osd_logtype.add_inc("mapf");
    osd_logtype.add_inc("mapfdup");

    // request thread pool
    {
      char name[80];
      sprintf(name,"osd%d.threadpool", whoami);
      threadpool = new ThreadPool(name, g_conf.osd_maxthreads,
                                  static_dequeueop,
                                  this);
    }

    // i'm ready!
    messenger->set_dispatcher(this);

    // announce to monitor i exist and have booted.
    int mon = monmap->pick_mon();
    messenger->send_message(new MOSDBoot(superblock), monmap->get_inst(mon));

    // start the heart
    timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this));
  }
  osd_lock.Unlock();

  //dout(0) << "osd_rep " << g_conf.osd_rep << endl;

  return 0;
}

/**
 * Stop the OSD: cancel and join the timer, wait for in-flight ops, destroy
 * the thread pool, free every PG, shut the messenger down and unmount the
 * store.
 *
 * osd_lock is released around store->umount() and re-taken afterwards, so
 * this expects to be entered with osd_lock held (dispatch() calls it that
 * way).  Returns the result of store->umount().
 */
int OSD::shutdown()
{
  dout(1) << "shutdown" << endl;

  state = STATE_STOPPING;

  // cancel timers
  timer.cancel_all();
  timer.join();

  // finish ops
  wait_for_no_ops();

  // stop threads
  delete threadpool;
  threadpool = 0;

  // close pgs
  for (hash_map::iterator p = pg_map.begin();
       p != pg_map.end();
       p++) {
    delete p->second;
  }
  pg_map.clear();

  // shut everything else down
  //monitor->shutdown();
  messenger->shutdown();

  osd_lock.Unlock();
  int r = store->umount();
  osd_lock.Lock();
  return r;
}



/**
 * Queue a write of the in-memory superblock (raw bytes of the struct) to
 * SUPERBLOCK_OBJECT in transaction t.  Nothing hits the store until the
 * caller applies t.
 */
void OSD::write_superblock(ObjectStore::Transaction& t)
{
  dout(10) << "write_superblock " << superblock << endl;

  bufferlist bl;
  bl.append((char*)&superblock, sizeof(superblock));
  t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
}

/**
 * Load the superblock from SUPERBLOCK_OBJECT and decode the osdmap for
 * superblock.current_epoch into a freshly allocated OSDMap.
 *
 * @return 0 on success, -1 if the store did not return exactly
 *         sizeof(superblock) bytes (osdmap is left untouched in that case).
 *
 * NOTE(review): the result of get_map_bl() is not checked before decoding.
 */
int OSD::read_superblock()
{
  bufferlist bl;
  int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
  if (bl.length() != sizeof(superblock)) {
    dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl;
    return -1;
  }

  bl.copy(0, sizeof(superblock), (char*)&superblock);

  dout(10) << "read_superblock " << superblock << endl;

  // load up "current" osdmap
  assert(!osdmap);
  osdmap = new OSDMap;
  bl.clear();
  get_map_bl(superblock.current_epoch, bl);
  osdmap->decode(bl);

  assert(whoami == superblock.whoami); // fixme!
  return 0;
}


// object locks

/**
 * Take the per-PG lock for pgid, acquiring and releasing osd_lock around
 * _lock_pg().  Returns the PG.
 */
PG *OSD::lock_pg(pg_t pgid)
{
  osd_lock.Lock();
  PG *pg = _lock_pg(pgid);
  osd_lock.Unlock();
  return pg;
}

/**
 * Take the per-PG lock for pgid; the PG must already be in pg_map.
 *
 * If the PG is already locked, the caller queues a Cond on
 * pg_lock_waiters[pgid] and blocks on it (via osd_lock) until the PG is
 * free and this waiter is at the head of the queue.
 * NOTE(review): presumably the caller must hold osd_lock, since
 * c.Wait(osd_lock) is used -- confirm.
 */
PG *OSD::_lock_pg(pg_t pgid)
{
  assert(pg_map.count(pgid));

  if (pg_lock.count(pgid)) {
    Cond c;
    dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl;
    //cerr << "lock_pg " << pgid << " waiting as " << &c << endl;

    list& ls = pg_lock_waiters[pgid]; // this is commit, right?
// ---- OSD::_lock_pg(), continued (inside the "already locked" branch) ----
    ls.push_back(&c);

    // sleep until the pg is unlocked AND we are first in line
    while (pg_lock.count(pgid) ||
           ls.front() != &c)
      c.Wait(osd_lock);

    assert(ls.front() == &c);
    ls.pop_front();
    // drop the (now empty) waiter list so _unlock_pg() sees "nobody waiting"
    if (ls.empty())
      pg_lock_waiters.erase(pgid);
  }

  dout(15) << "lock_pg " << pgid << endl;
  pg_lock.insert(pgid);

  return pg_map[pgid];
}

/**
 * Release the per-PG lock for pgid, acquiring and releasing osd_lock
 * around _unlock_pg().
 */
void OSD::unlock_pg(pg_t pgid)
{
  osd_lock.Lock();
  _unlock_pg(pgid);
  osd_lock.Unlock();
}

/**
 * Release the per-PG lock for pgid (which must currently be locked) and
 * signal the first queued waiter, if any.  The waiter removes itself from
 * the queue in _lock_pg().
 */
void OSD::_unlock_pg(pg_t pgid)
{
  // unlock
  assert(pg_lock.count(pgid));
  pg_lock.erase(pgid);

  if (pg_lock_waiters.count(pgid)) {
    // someone is in line
    Cond *c = pg_lock_waiters[pgid].front();
    assert(c);
    dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << endl;
    c->Signal();
  } else {
    // nobody waiting
    dout(15) << "unlock_pg " << pgid << endl;
  }
}

/**
 * Remove a PG entirely: every object listed in its collection, the
 * collection itself and object_t(1,pgid) (the log) are removed from the
 * store in one transaction, then the in-memory PG is deleted and erased
 * from pg_map.
 */
void OSD::_remove_pg(pg_t pgid)
{
  dout(10) << "_remove_pg " << pgid << endl;

  // remove from store
  list olist;
  store->collection_list(pgid, olist);

  ObjectStore::Transaction t;
  {
    for (list::iterator p = olist.begin();
         p != olist.end();
         p++)
      t.remove(*p);
    t.remove_collection(pgid);
    t.remove(object_t(1,pgid)); // log too
  }
  store->apply_transaction(t);

  // hose from memory
  delete pg_map[pgid];
  pg_map.erase(pgid);
}


/**
 * Activate pgid if we still have it and it is crashed, in replay, we are
 * role 0, and its primary has not changed since `epoch`.  Afterwards any
 * messages on the `finished` list are re-dispatched with osd_lock released.
 */
void OSD::activate_pg(pg_t pgid, epoch_t epoch)
{
  osd_lock.Lock();
  {
    if (pg_map.count(pgid)) {
      PG *pg = _lock_pg(pgid);
      if (pg->is_crashed() &&
          pg->is_replay() &&
          pg->get_role() == 0 &&
          pg->info.history.same_primary_since <= epoch) {
        ObjectStore::Transaction t;
        pg->activate(t);
        store->apply_transaction(t);
      }
      _unlock_pg(pgid);
    }
  }

  // finishers?
  if (finished.empty()) {
    osd_lock.Unlock();
  } else {
    // move the pending messages to a local list so they can be dispatched
    // after osd_lock is dropped (dispatch() takes the lock itself)
    list waiting;
    waiting.splice(waiting.begin(), finished);

    osd_lock.Unlock();

    for (list::iterator it = waiting.begin();
         it != waiting.end();
         it++) {
      dispatch(*it);
    }
  }
}


// -------------------------------------

/**
 * Periodic heartbeat: log and reset the op/queue-length stats, ping the
 * primary (acting[0]) of every PG in which our role is > 0 and that has not
 * been pinged within the last heartbeat interval, optionally send a fake
 * MOSDIn to a monitor, and re-arm the timer.
 */
void OSD::heartbeat()
{
  utime_t now = g_clock.now();
  utime_t since = now;
  since.sec_ref() -= g_conf.osd_heartbeat_interval;

  // calc my stats
  float avg_qlen = 0;
  if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops;

  dout(5) << "heartbeat " << now
          << ": ops " << hb_stat_ops
          << ", avg qlen " << avg_qlen
          << endl;

  // reset until next time around
  hb_stat_ops = 0;
  hb_stat_qlen = 0;

  // send pings
  set pingset;
  for (hash_map::iterator i = pg_map.begin();
       i != pg_map.end();
       i++) {
    PG *pg = i->second;

    // we want to ping the primary.
    if (pg->get_role() <= 0) continue;
    if (pg->acting.size() < 1) continue;

    if (pg->last_heartbeat < since) {
      pg->last_heartbeat = now;
      pingset.insert(pg->acting[0]);
    }
  }
  for (set::iterator i = pingset.begin();
       i != pingset.end();
       i++) {
    _share_map_outgoing( osdmap->get_inst(*i) );
    messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen),
                            osdmap->get_inst(*i));
  }

  if (logger) logger->set("pingset", pingset.size());

  // hack: fake reorg?
  if (osdmap && g_conf.fake_osdmap_updates) {
    int mon = monmap->pick_mon();
    if ((rand() % g_conf.fake_osdmap_updates) == 0) {
      //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) {
      messenger->send_message(new MOSDIn(osdmap->get_epoch()),
                              monmap->get_inst(mon));
    }
    /*
      if (osdmap->is_out(whoami)) {
      messenger->send_message(new MOSDIn(osdmap->get_epoch()),
      MSG_ADDR_MON(mon), monmap->get_inst(mon));
      }
      else if ((rand() % g_conf.fake_osdmap_updates) == 0) {
      //messenger->send_message(new MOSDOut(osdmap->get_epoch()),
      //MSG_ADDR_MON(mon), monmap->get_inst(mon));
      }
      }
    */
  }

  // schedule next! randomly.
// ---- OSD::heartbeat(), continued ----
  // next beat fires after 0.5s plus 0..90% of osd_heartbeat_interval
  float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval;
  timer.add_event_after(wait, new C_Heartbeat(this));
}



// --------------------------------------
// dispatch

/**
 * Called for an incoming message that carried the sender's map epoch.
 * If the sender (client or OSD) is behind our osdmap, send it the maps it
 * is missing.  For OSD peers the best-known epoch is also remembered in
 * peer_map_epoch.
 *
 * @return true if a map message was sent.
 *
 * NOTE(review): in the OSD branch the maps are sent starting from `epoch`
 * (the value in this message), not from the possibly newer
 * peer_map_epoch[inst.name].
 */
bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch)
{
  bool shared = false;

  // does client have old map?
  if (inst.name.is_client()) {
    if (epoch < osdmap->get_epoch()) {
      dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
      send_incremental_map(epoch, inst, true);
      shared = true;
    }
  }

  // does peer have old map?
  if (inst.name.is_osd()) {
    // remember
    if (peer_map_epoch[inst.name] < epoch)
      peer_map_epoch[inst.name] = epoch;

    // older?
    if (peer_map_epoch[inst.name] < osdmap->get_epoch()) {
      dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
      send_incremental_map(epoch, inst, true);
      peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again.
      shared = true;
    }
  }

  return shared;
}


/**
 * Called before sending to another OSD: if we have a recorded epoch for
 * that peer and it is older than ours, send the missing maps first and
 * record the peer as current.  Peers with no recorded epoch get nothing.
 */
void OSD::_share_map_outgoing(const entity_inst_t& inst)
{
  assert(inst.name.is_osd());

  if (inst.name.is_osd()) {
    // send map?
    if (peer_map_epoch.count(inst.name)) {
      epoch_t pe = peer_map_epoch[inst.name];
      if (pe < osdmap->get_epoch()) {
        send_incremental_map(pe, inst, true);
        peer_map_epoch[inst.name] = osdmap->get_epoch();
      }
    } else {
      // no idea about peer's epoch.
      // ??? send recent ???
      // do nothing.
    }
  }
}



/**
 * Messenger entry point.  Takes osd_lock, routes the message by type, then
 * (with the lock released) re-dispatches anything that handlers moved onto
 * the `finished` list.
 *
 * Messages that need an OSDMap are queued on waiting_for_osdmap while we
 * have none, and dropped while the map marks this OSD down.
 */
void OSD::dispatch(Message *m)
{
  // lock!
  osd_lock.Lock();

  switch (m->get_type()) {

    // -- don't need lock --
  case MSG_PING:
    dout(10) << "ping from " << m->get_source() << endl;
    delete m;
    break;

    // -- don't need OSDMap --

    /*
    // host monitor
    case MSG_PING_ACK:
    case MSG_FAILURE_ACK:
    monitor->proc_message(m);
    break;
    */

    // map and replication
  case MSG_OSD_MAP:
    handle_osd_map((MOSDMap*)m);
    break;

    // osd
  case MSG_SHUTDOWN:
    shutdown();
    delete m;
    break;



    // -- need OSDMap --

  default:
    {
      // no map? starting up?
      if (!osdmap) {
        dout(7) << "no OSDMap, not booted" << endl;
        waiting_for_osdmap.push_back(m);
        break;
      }

      // down?
      if (osdmap->is_down(whoami)) {
        dout(7) << "i am marked down, dropping " << *m << endl;
        delete m;
        break;
      }




      // need OSDMap
      switch (m->get_type()) {

      case MSG_OSD_PING:
        // take note.
        handle_osd_ping((MOSDPing*)m);
        break;

      case MSG_OSD_PG_NOTIFY:
        handle_pg_notify((MOSDPGNotify*)m);
        break;
      case MSG_OSD_PG_QUERY:
        handle_pg_query((MOSDPGQuery*)m);
        break;
      case MSG_OSD_PG_LOG:
        handle_pg_log((MOSDPGLog*)m);
        break;
      case MSG_OSD_PG_REMOVE:
        handle_pg_remove((MOSDPGRemove*)m);
        break;

      case MSG_OSD_OP:
        handle_op((MOSDOp*)m);
        break;

        // for replication etc.
      case MSG_OSD_OPREPLY:
        handle_op_reply((MOSDOpReply*)m);
        break;


      default:
        dout(1) << " got unknown message " << m->get_type() << endl;
        assert(0);
      }
    }
  }

  // finishers?
  if (!finished.empty()) {
    // take the list while locked, dispatch after unlocking (each nested
    // dispatch() call locks for itself)
    list waiting;
    waiting.splice(waiting.begin(), finished);

    osd_lock.Unlock();

    for (list::iterator it = waiting.begin();
         it != waiting.end();
         it++) {
      dispatch(*it);
    }
    return;
  }

  osd_lock.Unlock();
}


/**
 * Messenger callback for a message that could not be delivered to `inst`.
 *  - if g_conf.ms_die_on_failure is set: exit(0).
 *  - destination is an OSD: drop m and report the failure to a monitor.
 *  - destination is a monitor: resend m to a monitor chosen by
 *    pick_mon(true).
 *  - anything else: drop m.
 */
void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst)
{
  entity_name_t dest = inst.name;

  if (g_conf.ms_die_on_failure) {
    exit(0);
  }

  if (dest.is_osd()) {
    // failed osd. drop message, report to mon.
    int mon = monmap->pick_mon();
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
            << ", dropping and reporting to mon" << mon
            << endl;
    messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
                            monmap->get_inst(mon));
    delete m;
  } else if (dest.is_mon()) {
    // resend to a different monitor.
    int mon = monmap->pick_mon(true);
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
            << ", resending to mon" << mon
            << endl;
    messenger->send_message(m, monmap->get_inst(mon));
  }
  else {
    // client?
// ---- OSD::ms_handle_failure(), continued (final else branch) ----
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
            << ", dropping" << endl;
    delete m;
  }
}




/**
 * Handle a ping from another OSD: share our map if the sender's epoch is
 * old, record the sender's reported average queue length in peer_qlen, and
 * free the message.
 */
void OSD::handle_osd_ping(MOSDPing *m)
{
  dout(20) << "osdping from " << m->get_source() << endl;
  _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch);

  int from = m->get_source().num();
  peer_qlen[from] = m->avg_qlen;

  //if (!m->ack)
  //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true),
  //m->get_source());

  delete m;
}




// =====================================================
// MAP

/**
 * Park m on waiting_for_osdmap until a newer map arrives.  A MOSDGetMap
 * request is sent to a monitor only when the wait list was empty, i.e. at
 * most one request per batch of waiters.
 */
void OSD::wait_for_new_map(Message *m)
{
  // ask
  if (waiting_for_osdmap.empty()) {
    int mon = monmap->pick_mon();
    messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
                            monmap->get_inst(mon));
  }

  waiting_for_osdmap.push_back(m);
}


/** update_map
 * assimilate new OSDMap(s). scan pgs, etc.
 *
 * Steps, in order:
 *  1. wait for in-flight ops to drain (osd_lock must be held).
 *  2. write every full and incremental map in the message that is not
 *     already in the store, updating superblock.oldest_map/newest_map.
 *  3. advance one epoch at a time from superblock.current_epoch, using an
 *     incremental if available, else a full map; if neither exists, ask a
 *     monitor and stop.  advance_map() runs after every step.
 *  4. if we reached newest_map, activate_map() and release waiters.
 *  5. persist pg info and the superblock, apply the transaction, free m.
 */
void OSD::handle_osd_map(MOSDMap *m)
{
  wait_for_no_ops();

  assert(osd_lock.is_locked());

  ObjectStore::Transaction t;

  if (osdmap) {
    dout(3) << "handle_osd_map epochs ["
            << m->get_first() << "," << m->get_last()
            << "], i have " << osdmap->get_epoch()
            << endl;
  } else {
    dout(3) << "handle_osd_map epochs ["
            << m->get_first() << "," << m->get_last()
            << "], i have none"
            << endl;
    osdmap = new OSDMap;
    boot_epoch = m->get_last(); // hrm...?
  }

  // NOTE(review): "mapmsg" is not among the counters registered on
  // osd_logtype in OSD::init() (which registers "map", "mapi", "mapidup",
  // "mapf", "mapfdup") -- confirm how Logger treats an unregistered key.
  logger->inc("mapmsg");

  // store them?
  for (map::iterator p = m->maps.begin();
       p != m->maps.end();
       p++) {
    object_t oid = get_osdmap_object_name(p->first);
    if (store->exists(oid)) {
      dout(10) << "handle_osd_map already had full map epoch " << p->first << endl;
      logger->inc("mapfdup");
      bufferlist bl;
      get_map_bl(p->first, bl);
      dout(10) << " .. it is " << bl.length() << " bytes" << endl;
      continue;
    }

    dout(10) << "handle_osd_map got full map epoch " << p->first << endl;
    //t.write(oid, 0, p->second.length(), p->second);
    // written directly (not via t) so that the exists()/read calls in the
    // advance loop below can see it
    store->write(oid, 0, p->second.length(), p->second, 0);

    if (p->first > superblock.newest_map)
      superblock.newest_map = p->first;
    if (p->first < superblock.oldest_map ||
        superblock.oldest_map == 0)
      superblock.oldest_map = p->first;

    logger->inc("mapf");
  }
  for (map::iterator p = m->incremental_maps.begin();
       p != m->incremental_maps.end();
       p++) {
    object_t oid = get_inc_osdmap_object_name(p->first);
    if (store->exists(oid)) {
      dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl;
      logger->inc("mapidup");
      bufferlist bl;
      get_inc_map_bl(p->first, bl);
      dout(10) << " .. it is " << bl.length() << " bytes" << endl;
      continue;
    }

    dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl;
    //t.write(oid, 0, p->second.length(), p->second);
    store->write(oid, 0, p->second.length(), p->second, 0);

    if (p->first > superblock.newest_map)
      superblock.newest_map = p->first;
    if (p->first < superblock.oldest_map ||
        superblock.oldest_map == 0)
      superblock.oldest_map = p->first;

    logger->inc("mapi");
  }

  // advance if we can
  bool advanced = false;

  // a map from a monitor while booting counts as progress even if the
  // epoch does not move
  if (m->get_source().is_mon() && is_booting())
    advanced = true;

  epoch_t cur = superblock.current_epoch;
  while (cur < superblock.newest_map) {
    // NOTE(review): this bl is unused; both branches below declare their own.
    bufferlist bl;
    if (m->incremental_maps.count(cur+1) ||
        store->exists(get_inc_osdmap_object_name(cur+1))) {
      dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl;

      bufferlist bl;
      if (m->incremental_maps.count(cur+1))
        bl = m->incremental_maps[cur+1];
      else
        get_inc_map_bl(cur+1, bl);

      OSDMap::Incremental inc;
      int off = 0;
      inc.decode(bl, off);

      osdmap->apply_incremental(inc);

      // archive the full map
      bl.clear();
      osdmap->encode(bl);
      t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl);

      // notify messenger
      for (map::iterator i = inc.new_down.begin();
           i != inc.new_down.end();
           i++) {
        int osd = i->first;
        if (osd == whoami) continue;
        messenger->mark_down(i->second.addr);
        peer_map_epoch.erase(MSG_ADDR_OSD(osd));

        // kick any replica ops
        for (hash_map::iterator it = pg_map.begin();
             it != pg_map.end();
             it++) {
          PG *pg = it->second;

          _lock_pg(pg->info.pgid);
          {
            list ls; // do async; repop_ack() may modify pg->repop_gather
            for (map::iterator p = pg->repop_gather.begin();
                 p != pg->repop_gather.end();
                 p++) {
              //dout(-1) << "checking repop tid " << p->first << endl;
              if (p->second->waitfor_ack.count(osd) ||
                  p->second->waitfor_commit.count(osd))
                ls.push_back(p->second);
            }
            for (list::iterator p = ls.begin();
                 p != ls.end();
                 p++)
              repop_ack(pg, *p, -1, true, osd);
          }
          _unlock_pg(pg->info.pgid);
        }
      }
      // forget what we knew about the map epoch of OSDs that just came up
      for (map::iterator i = inc.new_up.begin();
           i != inc.new_up.end();
           i++) {
        if (i->first == whoami) continue;
        peer_map_epoch.erase(MSG_ADDR_OSD(i->first));
      }
    }
    else if (m->maps.count(cur+1) ||
             store->exists(get_osdmap_object_name(cur+1))) {
      dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl;
      bufferlist bl;
      if (m->maps.count(cur+1))
        bl = m->maps[cur+1];
      else
        get_map_bl(cur+1, bl);
      osdmap->decode(bl);

      // FIXME BUG: need to notify messenger of ups/downs!!
    }
    else {
      dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
      int mon = monmap->pick_mon();
      messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon));
      break;
    }

    cur++;
    superblock.current_epoch = cur;
    advance_map(t);
    advanced = true;
  }

  // all the way?
  if (advanced && cur == superblock.newest_map) {
    // yay!
// ---- OSD::handle_osd_map(), continued (inside "all the way?" branch) ----
    activate_map(t);

    // process waiters
    take_waiters(waiting_for_osdmap);
  }

  // write updated pg state to store
  for (hash_map::iterator i = pg_map.begin();
       i != pg_map.end();
       i++) {
    pg_t pgid = i->first;
    PG *pg = i->second;
    t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info));
  }

  // superblock and commit
  write_superblock(t);
  store->apply_transaction(t);

  //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah

  delete m;
}


/**
 * scan placement groups, initiate any replication
 * activities.
 *
 * Called once per epoch step with osdmap already at the new epoch.
 *  - mkfs map: create and activate every PG (regular and localized) in
 *    which this OSD has a role.
 *  - otherwise: for each existing PG whose acting set changed, update
 *    acting/role/history, deactivate it, flush in-progress repops if we
 *    were the acker, requeue waiters if we stopped being primary, and
 *    set/clear STRAY, CRASHED, CLEAN and REPLAY state bits accordingly.
 * Store updates are queued on t.
 */
void OSD::advance_map(ObjectStore::Transaction& t)
{
  dout(7) << "advance_map epoch " << osdmap->get_epoch()
          << " " << pg_map.size() << " pgs"
          << endl;

  if (osdmap->is_mkfs()) {
    ps_t maxps = 1ULL << osdmap->get_pg_bits();
    ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
    dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
    assert(osdmap->get_epoch() == 1);

    //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl;
    logger->set_start( osdmap->get_ctime() );

    // create PGs
    for (int nrep = 1;
         nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh
         nrep++) {
      for (ps_t ps = 0; ps < maxps; ++ps) {
        vector acting;
        pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);
        // NOTE(review): this nrep shadows the loop variable; from here to
        // the end of the inner body it is the acting-set size returned by
        // pg_to_acting_osds(), not the replication level being iterated.
        int nrep = osdmap->pg_to_acting_osds(pgid, acting);
        int role = osdmap->calc_pg_role(whoami, acting, nrep);
        if (role < 0) continue;

        PG *pg = create_pg(pgid, t);
        pg->set_role(role);
        pg->acting.swap(acting);
        pg->last_epoch_started_any =
          pg->info.last_epoch_started =
          pg->info.history.same_since =
          pg->info.history.same_primary_since =
          pg->info.history.same_acker_since = osdmap->get_epoch();
        pg->activate(t);

        dout(7) << "created " << *pg << endl;
      }

      for (ps_t ps = 0; ps < maxlps; ++ps) {
        // local PG too
        // (unlike the loop above, there is no "role < 0" skip here)
        vector acting;
        pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep);
        int nrep = osdmap->pg_to_acting_osds(pgid, acting);
        int role = osdmap->calc_pg_role(whoami, acting, nrep);

        PG *pg = create_pg(pgid, t);
        pg->acting.swap(acting);
        pg->set_role(role);
        pg->last_epoch_started_any =
          pg->info.last_epoch_started =
          pg->info.history.same_primary_since =
          pg->info.history.same_acker_since =
          pg->info.history.same_since = osdmap->get_epoch();
        pg->activate(t);

        dout(7) << "created " << *pg << endl;
      }
    }

    dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;

  } else {
    // scan existing pg's
    for (hash_map::iterator it = pg_map.begin();
         it != pg_map.end();
         it++) {
      pg_t pgid = it->first;
      PG *pg = it->second;

      // did i finish this epoch?
      if (pg->is_active()) {
        pg->info.last_epoch_finished = osdmap->get_epoch()-1;
      }

      // get new acting set
      vector tacting;
      int nrep = osdmap->pg_to_acting_osds(pgid, tacting);
      int role = osdmap->calc_pg_role(whoami, tacting, nrep);

      // no change?
      if (tacting == pg->acting)
        continue;

      // -- there was a change! --
      _lock_pg(pgid);

      // snapshot the old view before overwriting it
      int oldrole = pg->get_role();
      int oldprimary = pg->get_primary();
      int oldacker = pg->get_acker();
      vector oldacting = pg->acting;

      // update PG
      pg->acting.swap(tacting);
      pg->set_role(role);

      // did primary|acker change?
      pg->info.history.same_since = osdmap->get_epoch();
      if (oldprimary != pg->get_primary()) {
        pg->info.history.same_primary_since = osdmap->get_epoch();
        pg->cancel_recovery();
      }
      if (oldacker != pg->get_acker()) {
        pg->info.history.same_acker_since = osdmap->get_epoch();
      }

      // deactivate.
      pg->state_clear(PG::STATE_ACTIVE);

      // reset primary state?
      if (oldrole == 0 || pg->get_role() == 0)
        pg->clear_primary_state();

      // apply any repops in progress.
      if (oldacker == whoami) {
        // apply repops
        // each gather (and the op it holds) is freed here after being
        // applied if it had not been already
        for (map::iterator p = pg->repop_gather.begin();
             p != pg->repop_gather.end();
             p++) {
          if (!p->second->applied)
            apply_repop(pg, p->second);
          delete p->second->op;
          delete p->second;
        }
        pg->repop_gather.clear();

        // and repop waiters
        // (these messages are deleted, not requeued)
        for (map >::iterator p = pg->waiting_for_repop.begin();
             p != pg->waiting_for_repop.end();
             p++)
          for (list::iterator pm = p->second.begin();
               pm != p->second.end();
               pm++)
            delete *pm;
        pg->waiting_for_repop.clear();
      }

      if (role != oldrole) {
        // old primary?
        if (oldrole == 0) {
          pg->state_clear(PG::STATE_CLEAN);

          // take replay queue waiters
          list ls;
          for (map::iterator it = pg->replay_queue.begin();
               it != pg->replay_queue.end();
               it++)
            ls.push_back(it->second);
          pg->replay_queue.clear();
          take_waiters(ls);

          // take active waiters
          take_waiters(pg->waiting_for_active);

          // take object waiters
          for (hash_map >::iterator it = pg->waiting_for_missing_object.begin();
               it != pg->waiting_for_missing_object.end();
               it++)
            take_waiters(it->second);
          pg->waiting_for_missing_object.clear();
        }

        // new primary?
        if (role == 0) {
          // i am new primary
          pg->state_clear(PG::STATE_STRAY);
        } else {
          // i am now replica|stray. we need to send a notify.
          pg->state_set(PG::STATE_STRAY);

          // an empty acting set marks the pg crashed
          if (nrep == 0) {
            pg->state_set(PG::STATE_CRASHED);
            dout(1) << *pg << " is crashed" << endl;
          }
        }

        // my role changed.
        dout(10) << *pg << " " << oldacting << " -> " << pg->acting
                 << ", role " << oldrole << " -> " << role << endl;

      } else {
        // no role change.
        // did primary change?
        if (pg->get_primary() != oldprimary) {
          // we need to announce
          pg->state_set(PG::STATE_STRAY);

          dout(10) << *pg << " " << oldacting << " -> " << pg->acting
                   << ", acting primary "
                   << oldprimary << " -> " << pg->get_primary()
                   << endl;
        } else {
          // primary is the same.
          if (role == 0) {
            // i am (still) primary. but my replica set changed.
            pg->state_clear(PG::STATE_CLEAN);
            pg->state_clear(PG::STATE_REPLAY);

            dout(10) << *pg << " " << oldacting << " -> " << pg->acting
                     << ", replicas changed" << endl;
          }
        }
      }

      _unlock_pg(pgid);
    }
  }
}

/**
 * Run once the osdmap has caught up to the newest epoch we hold.
 * Active PGs get last_epoch_started bumped; inactive PGs we are primary
 * for start peering; stray PGs with a known primary are collected into a
 * notify list.  Unless this is the mkfs map, the notifies and queries are
 * then sent and the "numpg" log value is updated.
 */
void OSD::activate_map(ObjectStore::Transaction& t)
{
  dout(7) << "activate_map version " << osdmap->get_epoch() << endl;

  map< int, list > notify_list; // primary -> list
  map< int, map > query_map; // peer -> PG -> get_summary_since

  // scan pg's
  for (hash_map::iterator it = pg_map.begin();
       it != pg_map.end();
       it++) {
    //pg_t pgid = it->first;
    PG *pg = it->second;

    if (pg->is_active()) {
      // update started counter
      pg->info.last_epoch_started = osdmap->get_epoch();
    }
    else if (pg->get_role() == 0 && !pg->is_active()) {
      // i am (inactive) primary
      pg->build_prior();
      pg->peer(t, query_map);
    }
    else if (pg->is_stray() &&
             pg->get_primary() >= 0) {
      // i am residual|replica
      notify_list[pg->get_primary()].push_back(pg->info);
    }

  }

  if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs
    return;

  // notify? (residual|replica)
  do_notifies(notify_list);

  // do queries.
  do_queries(query_map);

  logger->set("numpg", pg_map.size());
}


/**
 * Send `inst` the maps for epochs (since, current], walking from newest to
 * oldest.  An incremental is used when stored, otherwise the full map for
 * that epoch; if `full` is false the walk stops after the first full map.
 * Asserts if neither form of an epoch is stored.
 */
void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full)
{
  dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
           << " to " << inst << endl;

  MOSDMap *m = new MOSDMap;

  for (epoch_t e = osdmap->get_epoch();
       e > since;
       e--) {
    bufferlist bl;
    if (get_inc_map_bl(e,bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e,bl)) {
      m->maps[e].claim(bl);
      if (!full) break;
    }
    else {
      assert(0); // we should have all maps.
// ---- OSD::send_incremental_map(), continued (closing the epoch loop) ----
    }
  }

  messenger->send_message(m, inst);
}

/**
 * Read the stored full map for epoch e into bl.
 * @return true if store->read() returned >= 0.
 */
bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
{
  return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0;
}

/**
 * Read the stored incremental map for epoch e into bl.
 * @return true if store->read() returned >= 0.
 */
bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0;
}

/**
 * Reconstruct the OSDMap for `epoch` into m: walk backwards to the nearest
 * stored full map, collecting incrementals on the way, then apply them in
 * order.  Asserts if an epoch has neither a full nor an incremental map,
 * or if no full map is found at any epoch > 0.
 */
void OSD::get_map(epoch_t epoch, OSDMap &m)
{
  // find a complete map
  list incs;
  epoch_t e;
  for (e = epoch; e > 0; e--) {
    bufferlist bl;
    if (get_map_bl(e, bl)) {
      //dout(10) << "get_map " << epoch << " full " << e << endl;
      m.decode(bl);
      break;
    } else {
      OSDMap::Incremental inc;
      bool got = get_inc_map(e, inc);
      assert(got);
      // push_front keeps incs ordered oldest-first
      incs.push_front(inc);
    }
  }
  assert(e > 0);

  // apply incrementals
  for (e++; e <= epoch; e++) {
    //dout(10) << "get_map " << epoch << " inc " << e << endl;
    m.apply_incremental( incs.front() );
    incs.pop_front();
  }
}


/**
 * Load and decode the stored incremental map for epoch e.
 * @return false if it could not be read, true otherwise.
 */
bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc)
{
  bufferlist bl;
  if (!get_inc_map_bl(e, bl))
    return false;
  int off = 0;
  inc.decode(bl, off);
  return true;
}





/**
 * Require that message m was sent under exactly our current map epoch.
 *  - ep older than ours: m is deleted, returns false.
 *  - ep newer than ours: m is parked via wait_for_new_map(), returns false.
 *  - equal: returns true and m is left to the caller.
 */
bool OSD::require_current_map(Message *m, epoch_t ep)
{
  // older map?
  if (ep < osdmap->get_epoch()) {
    dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl;
    delete m; // discard and ignore.
    return false;
  }

  // newer map?
  if (ep > osdmap->get_epoch()) {
    dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl;
    wait_for_new_map(m);
    return false;
  }

  assert(ep == osdmap->get_epoch());
  return true;
}


/*
 * require that we have same (or newer) map than the sender.
 *  - epoch newer than ours: m is parked via wait_for_new_map(), false.
 *  - epoch older than boot_epoch: m is deleted, false.
 *  - otherwise true, and m is left to the caller.
 * (no check of the message source is made here.)
 */
bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
{
  dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl;

  // newer map?
  if (epoch > osdmap->get_epoch()) {
    dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl;
    wait_for_new_map(m);
    return false;
  }

  if (epoch < boot_epoch) {
    dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl;
    delete m;
    return false;
  }

  return true;
}




// ======================================================
// REPLICATION

// PG

/** True if the store has a collection for pgid. */
bool OSD::pg_exists(pg_t pgid)
{
  return store->collection_exists(pgid);
}

/**
 * Create a new in-memory PG for pgid, register it in pg_map and queue the
 * creation of its collection on t.  Asserts that the PG exists neither in
 * pg_map nor in the store.
 */
PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t)
{
  if (pg_map.count(pgid)) {
    dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << endl;
  }

  assert(pg_map.count(pgid) == 0);
  assert(!pg_exists(pgid));

  PG *pg = new PG(this, pgid);
  pg_map[pgid] = pg;

  t.create_collection(pgid);

  return pg;
}

/** Look up pgid in pg_map; returns 0 if we do not have it. */
PG *OSD::get_pg(pg_t pgid)
{
  if (pg_map.count(pgid))
    return pg_map[pgid];
  return 0;
}

/**
 * Rebuild pg_map from the collections listed by the store: for each one,
 * read the "info" attribute and the pg log, then compute the acting set
 * and our role from the current osdmap (which must already be loaded).
 */
void OSD::load_pgs()
{
  dout(10) << "load_pgs" << endl;
  assert(pg_map.empty());

  list ls;
  store->list_collections(ls);

  for (list::iterator it = ls.begin();
       it != ls.end();
       it++) {
    pg_t pgid = *it;

    PG *pg = new PG(this, pgid);
    pg_map[pgid] = pg;

    // read pg info
    store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info));

    // read pg log
    pg->read_log(store);

    // generate state for current mapping
    int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting);
    int role = osdmap->calc_pg_role(whoami, pg->acting, nrep);
    pg->set_role(role);

    dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl;
  }
}



/**
 * check epochs starting from start to verify the pg acting set hasn't changed
 * up until now
 *
 * Walks epochs from current-1 down to `from`, comparing each older epoch's
 * acting set with the CURRENT acting set, and raises h.same_since,
 * h.same_primary_since and h.same_acker_since to e+1 where they differ.
 * Stops early once all three are newer than the epoch being examined.
 *
 * NOTE(review): if epoch_t is unsigned and from == 0, "e >= from" can never
 * become false; termination then depends on the early break -- confirm.
 */
void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from)
{
  dout(15) << "project_pg_history " << pgid
           << " from " << from << " to " << osdmap->get_epoch()
           << ", start " << h
           << endl;

  vector last;
  osdmap->pg_to_acting_osds(pgid, last);

  for (epoch_t e = osdmap->get_epoch()-1;
       e >= from;
       e--) {
    // verify during intermediate epoch
    OSDMap oldmap;
    get_map(e, oldmap);

    vector acting;
    oldmap.pg_to_acting_osds(pgid, acting);

    // acting set change?
    if (acting != last &&
        e <= h.same_since) {
      dout(15) << "project_pg_history " << pgid << " changed in " << e+1
               << " from " << acting << " -> " << last << endl;
      h.same_since = e+1;
    }

    // primary change?
    // (an empty acting set on either side counts as a change)
    if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
        e <= h.same_primary_since) {
      dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl;
      h.same_primary_since = e+1;

      if (g_conf.osd_rep == OSD_REP_PRIMARY)
        h.same_acker_since = h.same_primary_since;
    }

    // acker change?
    // (when osd_rep != OSD_REP_PRIMARY the last acting osd is compared)
    if (g_conf.osd_rep != OSD_REP_PRIMARY) {
      if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) &&
          e <= h.same_acker_since) {
        dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl;
        h.same_acker_since = e+1;
      }
    }

    if (h.same_since > e &&
        h.same_primary_since > e &&
        h.same_acker_since > e) break;
  }

  dout(15) << "project_pg_history end " << h << endl;
}


/** do_notifies
 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
 * content for, and they are primary for.
*/
// One message is sent per primary in notify_list; an entry for this OSD
// itself is skipped.  Our map is shared with the peer first if it is
// known to be behind.
void OSD::do_notifies(map< int, list >& notify_list)
{
  for (map< int, list >::iterator it = notify_list.begin();
       it != notify_list.end();
       it++) {
    if (it->first == whoami) {
      dout(7) << "do_notify osd" << it->first << " is self, skipping" << endl;
      continue;
    }
    dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl;
    MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second);
    _share_map_outgoing(osdmap->get_inst(it->first));
    messenger->send_message(m, osdmap->get_inst(it->first));
  }
}


/** do_queries
 * send out pending queries for info | summaries
 *
 * One MOSDPGQuery is sent per peer in query_map, after sharing our map
 * with that peer if it is known to be behind.
 */
void OSD::do_queries(map< int, map >& query_map)
{
  for (map< int, map >::iterator pit = query_map.begin();
       pit != query_map.end();
       pit++) {
    int who = pit->first;
    dout(7) << "do_queries querying osd" << who
            << " on " << pit->second.size() << " PGs" << endl;

    MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(),
                                     pit->second);
    _share_map_outgoing(osdmap->get_inst(who));
    messenger->send_message(m, osdmap->get_inst(who));
  }
}




/** PGNotify
 * from non-primary to primary
 * includes PG::Info.
 * NOTE: called with opqueue active.
 *
 * For each PG in the message: create it if we do not have it (unless the
 * primary changed after the message's epoch), record the sender's info,
 * mark stray content, and either update clean state (repeat notify) or
 * start/continue peering (first notify).  Resulting store changes are
 * applied in one transaction and any queries generated are sent.
 */
void OSD::handle_pg_notify(MOSDPGNotify *m)
{
  dout(7) << "handle_pg_notify from " << m->get_source() << endl;
  int from = m->get_source().num();

  // on failure the helper has already queued or deleted m
  if (!require_same_or_newer_map(m, m->get_epoch())) return;

  ObjectStore::Transaction t;

  // look for unknown PGs i'm primary for
  map< int, map > query_map;

  for (list::iterator it = m->get_pg_list().begin();
       it != m->get_pg_list().end();
       it++) {
    pg_t pgid = it->pgid;
    PG *pg;

    if (pg_map.count(pgid) == 0) {
      // same primary?
      PG::Info::History history = it->history;
      project_pg_history(pgid, history, m->get_epoch());

      if (m->get_epoch() < history.same_primary_since) {
        dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in "
                 << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
        continue;
      }

      // ok, create PG!
      pg = create_pg(pgid, t);
      osdmap->pg_to_acting_osds(pgid, pg->acting);
      pg->set_role(0);
      pg->info.history = history;

      pg->last_epoch_started_any = it->last_epoch_started;
      pg->build_prior();

      t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));

      dout(10) << *pg << " is new" << endl;

      // kick any waiters
      if (waiting_for_pg.count(pgid)) {
        take_waiters(waiting_for_pg[pgid]);
        waiting_for_pg.erase(pgid);
      }

      // lock it, matching the else branch; unlocked at the end of the
      // loop body
      _lock_pg(pgid);
    } else {
      // already had it. am i (still) the primary?
      pg = _lock_pg(pgid);
      if (m->get_epoch() < pg->info.history.same_primary_since) {
        dout(10) << *pg << " handle_pg_notify primary changed in "
                 << pg->info.history.same_primary_since
                 << " (msg from " << m->get_epoch() << ")" << endl;
        _unlock_pg(pgid);
        continue;
      }
    }

    // ok!

    // stray?
    bool acting = pg->is_acting(from);
    if (!acting && (*it).last_epoch_started > 0) {
      dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl;
      pg->stray_set.insert(from);
      pg->state_clear(PG::STATE_CLEAN);
    }

    // save info.
    // (remember whether we had info before overwriting it)
    bool had = pg->peer_info.count(from);
    pg->peer_info[from] = *it;

    if (had) {
      if (pg->is_active() &&
          (*it).is_clean() && acting) {
        pg->clean_set.insert(from);
        dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set
                 << "): " << *it << endl;
        if (pg->is_all_clean()) {
          dout(-10) << *pg << " now clean on all replicas" << endl;
          pg->state_set(PG::STATE_CLEAN);
          pg->clean_replicas();
        }
      } else {
        // hmm, maybe keep an eye out for cases where we see this, but peer should happen.
        dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl;
      }
    } else {
      // adjust prior?
      if (it->last_epoch_started > pg->last_epoch_started_any)
        pg->adjust_prior();

      // peer
      pg->peer(t, query_map);
    }

    _unlock_pg(pgid);
  }

  unsigned tr = store->apply_transaction(t);
  assert(tr == 0);

  do_queries(query_map);

  delete m;
}



/** PGLog
 * from non-primary to primary
 * includes log and info
 * from primary to non-primary
 * includes log for use in recovery
 * NOTE: called with opqueue active.
*/ void OSD::handle_pg_log(MOSDPGLog *m) { int from = m->get_source().num(); const pg_t pgid = m->get_pgid(); if (!require_same_or_newer_map(m, m->get_epoch())) return; if (pg_map.count(pgid) == 0) { dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl; assert(m->get_epoch() < osdmap->get_epoch()); delete m; return; } PG *pg = _lock_pg(pgid); assert(pg); if (m->get_epoch() < pg->info.history.same_since) { dout(10) << "handle_pg_log " << *pg << " from " << m->get_source() << " is old, discarding" << endl; delete m; return; } dout(7) << "handle_pg_log " << *pg << " got " << m->log << " " << m->missing << " from " << m->get_source() << endl; //m->log.print(cout); ObjectStore::Transaction t; if (pg->is_primary()) { // i am PRIMARY assert(pg->peer_log_requested.count(from) || pg->peer_summary_requested.count(from)); pg->proc_replica_log(m->log, m->missing, from); // peer map< int, map > query_map; pg->peer(t, query_map); do_queries(query_map); } else { // i am REPLICA dout(10) << *pg << " got " << m->log << " " << m->missing << endl; // merge log pg->merge_log(m->log, m->missing, from); pg->proc_missing(m->log, m->missing, from); assert(pg->missing.num_lost() == 0); // ok activate! pg->activate(t); } unsigned tr = store->apply_transaction(t); assert(tr == 0); _unlock_pg(pgid); delete m; } /** PGQuery * from primary to replica | stray * NOTE: called with opqueue active. */ void OSD::handle_pg_query(MOSDPGQuery *m) { dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl; int from = m->get_source().num(); if (!require_same_or_newer_map(m, m->get_epoch())) return; map< int, list > notify_list; for (map::iterator it = m->pg_list.begin(); it != m->pg_list.end(); it++) { pg_t pgid = it->first; PG *pg = 0; if (pg_map.count(pgid) == 0) { // same primary? 
PG::Info::History history = it->second.history; project_pg_history(pgid, history, m->get_epoch()); if (m->get_epoch() < history.same_since) { dout(10) << " pg " << pgid << " dne, and pg has changed in " << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl; continue; } // get active rush mapping vector acting; int nrep = osdmap->pg_to_acting_osds(pgid, acting); int role = osdmap->calc_pg_role(whoami, acting, nrep); if (role < 0) { dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl; PG::Info empty(pgid); notify_list[from].push_back(empty); continue; } assert(role > 0); ObjectStore::Transaction t; pg = create_pg(pgid, t); pg->acting.swap( acting ); pg->set_role(role); pg->info.history = history; t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); store->apply_transaction(t); dout(10) << *pg << " dne (before), but i am role " << role << endl; _lock_pg(pgid); } else { pg = _lock_pg(pgid); // same primary? if (m->get_epoch() < pg->info.history.same_since) { dout(10) << *pg << " handle_pg_query primary changed in " << pg->info.history.same_since << " (msg from " << m->get_epoch() << ")" << endl; _unlock_pg(pgid); continue; } } // ok, process query! 
assert(!pg->acting.empty()); assert(from == pg->acting[0]); if (it->second.type == PG::Query::INFO) { // info dout(10) << *pg << " sending info" << endl; notify_list[from].push_back(pg->info); } else { MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid()); m->info = pg->info; m->missing = pg->missing; if (it->second.type == PG::Query::LOG) { dout(10) << *pg << " sending info+missing+log since split " << it->second.split << " from floor " << it->second.floor << endl; if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { dout(10) << *pg << " divergent, sending backlog" << endl; it->second.type = PG::Query::BACKLOG; } } if (it->second.type == PG::Query::BACKLOG) { dout(10) << *pg << " sending info+missing+backlog" << endl; if (pg->log.backlog) { m->log = pg->log; } else { pg->generate_backlog(); m->log = pg->log; pg->drop_backlog(); } } else if (it->second.type == PG::Query::FULLLOG) { dout(10) << *pg << " sending info+missing+full log" << endl; m->log.copy_non_backlog(pg->log); } dout(10) << *pg << " sending " << m->log << " " << m->missing << endl; //m->log.print(cout); _share_map_outgoing(osdmap->get_inst(from)); messenger->send_message(m, osdmap->get_inst(from)); } _unlock_pg(pgid); } do_notifies(notify_list); delete m; } void OSD::handle_pg_remove(MOSDPGRemove *m) { dout(7) << "handle_pg_remove from " << m->get_source() << endl; if (!require_same_or_newer_map(m, m->get_epoch())) return; for (set::iterator it = m->pg_list.begin(); it != m->pg_list.end(); it++) { pg_t pgid = *it; PG *pg; if (pg_map.count(pgid) == 0) { dout(10) << " don't have pg " << pgid << endl; continue; } pg = _lock_pg(pgid); dout(10) << *pg << " removing." << endl; assert(pg->get_role() == -1); _remove_pg(pgid); // unlock. there shouldn't be any waiters, since we're a stray, and pg is presumably clean0. 
assert(pg_lock_waiters.count(pgid) == 0); _unlock_pg(pgid); } delete m; } /*** RECOVERY ***/ /** pull - request object from a peer */ void OSD::pull(PG *pg, object_t oid) { assert(pg->missing.loc.count(oid)); eversion_t v = pg->missing.missing[oid]; int osd = pg->missing.loc[oid]; dout(7) << *pg << " pull " << oid << " v " << v << " from osd" << osd << endl; // send op tid_t tid = ++last_tid; MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid, oid, pg->get_pgid(), osdmap->get_epoch(), OSD_OP_PULL); op->set_version(v); messenger->send_message(op, osdmap->get_inst(osd)); // take note assert(pg->objects_pulling.count(oid) == 0); num_pulling++; pg->objects_pulling[oid] = v; } /** push - send object to a peer */ void OSD::push(PG *pg, object_t oid, int dest) { // read data+attrs bufferlist bl; eversion_t v; int vlen = sizeof(v); map attrset; ObjectStore::Transaction t; t.read(oid, 0, 0, &bl); t.getattr(oid, "version", &v, &vlen); t.getattrs(oid, attrset); unsigned tr = store->apply_transaction(t); assert(tr == 0); // !!! // ok dout(7) << *pg << " push " << oid << " v " << v << " size " << bl.length() << " to osd" << dest << endl; logger->inc("r_push"); logger->inc("r_pushb", bl.length()); // send MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid, oid, pg->info.pgid, osdmap->get_epoch(), OSD_OP_PUSH); op->set_offset(0); op->set_length(bl.length()); op->set_data(bl); // note: claims bl, set length above here! op->set_version(v); op->set_attrset(attrset); messenger->send_message(op, osdmap->get_inst(dest)); } /** op_pull * process request to pull an entire object. * NOTE: called from opqueue. */ void OSD::op_pull(MOSDOp *op, PG *pg) { const object_t oid = op->get_oid(); const eversion_t v = op->get_version(); int from = op->get_source().num(); dout(7) << *pg << " op_pull " << oid << " v " << op->get_version() << " from " << op->get_source() << endl; // is a replica asking? are they missing it? 
if (pg->is_primary()) { // primary assert(pg->peer_missing.count(from)); // we had better know this, from the peering process. if (!pg->peer_missing[from].is_missing(oid)) { dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl; delete op; return; } // do we have it yet? if (waitfor_missing_object(op, pg)) return; } else { // non-primary if (pg->missing.is_missing(oid)) { dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl; delete op; return; } } // push it back! push(pg, oid, op->get_source().num()); } /** op_push * NOTE: called from opqueue. */ void OSD::op_push(MOSDOp *op, PG *pg) { object_t oid = op->get_oid(); eversion_t v = op->get_version(); if (!pg->missing.is_missing(oid)) { dout(7) << *pg << " op_push not missing " << oid << endl; return; } dout(7) << *pg << " op_push " << oid << " v " << v << " size " << op->get_length() << " " << op->get_data().length() << endl; assert(op->get_data().length() == op->get_length()); // write object and add it to the PG ObjectStore::Transaction t; t.remove(oid); // in case old version exists t.write(oid, 0, op->get_length(), op->get_data()); t.setattrs(oid, op->get_attrset()); t.collection_add(pg->info.pgid, oid); // close out pull op? num_pulling--; if (pg->objects_pulling.count(oid)) pg->objects_pulling.erase(oid); pg->missing.got(oid, v); // raise last_complete? assert(pg->log.complete_to != pg->log.log.end()); while (pg->log.complete_to != pg->log.log.end()) { if (pg->missing.missing.count(pg->log.complete_to->oid)) break; if (pg->info.last_complete < pg->log.complete_to->version) pg->info.last_complete = pg->log.complete_to->version; pg->log.complete_to++; } dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl; // apply to disk! t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info)); unsigned r = store->apply_transaction(t); assert(r == 0); // am i primary? are others missing this too? 
if (pg->is_primary()) { for (unsigned i=1; iacting.size(); i++) { int peer = pg->acting[i]; assert(pg->peer_missing.count(peer)); if (pg->peer_missing[peer].is_missing(oid)) { // ok, push it, and they (will) have it now. pg->peer_missing[peer].got(oid, v); push(pg, oid, peer); } } } // continue recovery pg->do_recovery(); // kick waiters if (pg->waiting_for_missing_object.count(oid)) take_waiters(pg->waiting_for_missing_object[oid]); delete op; } // op_rep_modify // commit (to disk) callback class C_OSD_RepModifyCommit : public Context { public: OSD *osd; MOSDOp *op; int destosd; eversion_t pg_last_complete; Mutex lock; Cond cond; bool acked; bool waiting; C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) : osd(o), op(oo), destosd(dosd), pg_last_complete(lc), acked(false), waiting(false) { } void finish(int r) { lock.Lock(); assert(!waiting); while (!acked) { waiting = true; cond.Wait(lock); } assert(acked); lock.Unlock(); osd->op_rep_modify_commit(op, destosd, pg_last_complete); } void ack() { lock.Lock(); assert(!acked); acked = true; if (waiting) cond.Signal(); // discard my reference to buffer op->get_data().clear(); lock.Unlock(); } }; void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) { // send commit. dout(10) << "rep_modify_commit on op " << *op << ", sending commit to osd" << ackerosd << endl; MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); commit->set_pg_complete_thru(last_complete); messenger->send_message(commit, osdmap->get_inst(ackerosd)); delete op; } // process a modification operation class C_OSD_WriteCommit : public Context { public: OSD *osd; pg_t pgid; tid_t rep_tid; eversion_t pg_last_complete; C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc) : osd(o), pgid(p), rep_tid(rt), pg_last_complete(lc) {} void finish(int r) { osd->op_modify_commit(pgid, rep_tid, pg_last_complete); } }; /** op_rep_modify * process a replicated modify. * NOTE: called from opqueue. 
*/ void OSD::op_rep_modify(MOSDOp *op, PG *pg) { object_t oid = op->get_oid(); eversion_t nv = op->get_version(); const char *opname = MOSDOp::get_opname(op->get_op()); // check crev objectrev_t crev = 0; store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); dout(10) << "op_rep_modify " << opname << " " << oid << " v " << nv << " " << op->get_offset() << "~" << op->get_length() << " in " << *pg << endl; // we better not be missing this. assert(!pg->missing.is_missing(oid)); // prepare our transaction ObjectStore::Transaction t; // am i acker? PG::RepOpGather *repop = 0; int ackerosd = pg->acting[0]; if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { ackerosd = pg->get_acker(); if (pg->is_acker()) { // i am tail acker. if (pg->repop_gather.count(op->get_rep_tid())) { repop = pg->repop_gather[ op->get_rep_tid() ]; } else { repop = new_repop_gather(pg, op); } // infer ack from source int fromosd = op->get_source().num(); get_repop_gather(repop); { //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. repop->waitfor_ack.erase(fromosd); } put_repop_gather(pg, repop); // prepare dest socket //messenger->prepare_send_message(op->get_client()); } // chain? forward? if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) { // chain rep, not at the tail yet. int myrank = osdmap->calc_pg_rank(whoami, pg->acting); int next = myrank+1; if (next == (int)pg->acting.size()) next = 1; issue_repop(pg, op, pg->acting[next]); } } // do op? C_OSD_RepModifyCommit *oncommit = 0; logger->inc("r_wr"); logger->inc("r_wrb", op->get_length()); if (repop) { // acker. we'll apply later. if (op->get_op() != OSD_OP_WRNOOP) { prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); } } else { // middle|replica. 
if (op->get_op() != OSD_OP_WRNOOP) { prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); } oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete); // apply log update. and possibly update itself. unsigned tr = store->apply_transaction(t, oncommit); if (tr != 0 && // no errors tr != 2) { // or error on collection_add cerr << "error applying transaction: r = " << tr << endl; assert(tr == 0); } } // ack? if (repop) { // (logical) local ack. this may induce the actual update. get_repop_gather(repop); { assert(repop->waitfor_ack.count(whoami)); repop->waitfor_ack.erase(whoami); } put_repop_gather(pg, repop); } else { // send ack to acker? if (g_conf.osd_rep != OSD_REP_CHAIN) { MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false); messenger->send_message(ack, osdmap->get_inst(ackerosd)); } // ack myself. assert(oncommit); oncommit->ack(); } } // ========================================================= // OPS void OSD::handle_op(MOSDOp *op) { const pg_t pgid = op->get_pg(); PG *pg = get_pg(pgid); logger->set("buf", buffer_total_alloc); // update qlen stats hb_stat_ops++; hb_stat_qlen += pending_ops; // require same or newer map if (!require_same_or_newer_map(op, op->get_map_epoch())) return; // share our map with sender, if they're old _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); // what kind of op? bool read = op->get_op() < 10; // read, stat. but not pull. if (!op->get_source().is_osd()) { // REGULAR OP (non-replication) // note original source op->set_client_inst( op->get_source_inst() ); op->clear_payload(); // and hose encoded payload (in case we forward) // have pg? if (!pg) { dout(7) << "hit non-existent pg " << pgid << ", waiting" << endl; waiting_for_pg[pgid].push_back(op); return; } if (read) { // read. am i the (same) acker? 
if (//pg->get_acker() != whoami || op->get_map_epoch() < pg->info.history.same_acker_since) { dout(7) << "acting acker is osd" << pg->get_acker() << " since " << pg->info.history.same_acker_since << ", dropping" << endl; assert(op->get_map_epoch() < osdmap->get_epoch()); delete op; return; } } else { // write. am i the (same) primary? if (pg->get_primary() != whoami || op->get_map_epoch() < pg->info.history.same_primary_since) { dout(7) << "acting primary is osd" << pg->get_primary() << " since " << pg->info.history.same_primary_since << ", dropping" << endl; assert(op->get_map_epoch() < osdmap->get_epoch()); delete op; return; } } // must be active. if (!pg->is_active()) { // replay? if (op->get_version().version > 0) { if (op->get_version() > pg->info.last_update) { dout(7) << *pg << " queueing replay at " << op->get_version() << " for " << *op << endl; pg->replay_queue[op->get_version()] = op; return; } else { dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update << " for " << *op << ", will queue for WRNOOP" << endl; } } dout(7) << *pg << " not active (yet)" << endl; pg->waiting_for_active.push_back(op); return; } // missing object? if (read && op->get_oid().rev > 0) { // versioned read. hrm. // are we missing a revision that we might need? object_t moid = op->get_oid(); if (pick_missing_object_rev(moid, pg)) { // is there a local revision we might use instead? object_t loid = op->get_oid(); if (store->pick_object_revision_lt(loid) && moid <= loid) { // we need moid. pull it. dout(10) << "handle_op read on " << op->get_oid() << ", have " << loid << ", but need missing " << moid << ", pulling" << endl; pull(pg, moid); pg->waiting_for_missing_object[moid].push_back(op); return; } dout(10) << "handle_op read on " << op->get_oid() << ", have " << loid << ", don't need missing " << moid << endl; } } else { // live revision. easy. 
if (op->get_op() != OSD_OP_PUSH && waitfor_missing_object(op, pg)) return; } dout(7) << "handle_op " << *op << " in " << *pg << endl; // balance reads? if (read && g_conf.osd_balance_reads && pg->get_acker() == whoami) { // test if (false) { if (pg->acting.size() > 1) { int peer = pg->acting[1]; dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl; messenger->send_message(op, osdmap->get_inst(peer)); return; } } // am i above my average? float my_avg = hb_stat_qlen / hb_stat_ops; if (pending_ops > my_avg) { // is there a peer who is below my average? for (unsigned i=1; iacting.size(); ++i) { int peer = pg->acting[i]; if (peer_qlen.count(peer) && peer_qlen[peer] < my_avg) { // calculate a probability that we should redirect float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb. if (drand48() <= p) { // take the first one dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg << ", p=" << p << ", fwd to peer w/ qlen " << peer_qlen[peer] << " osd" << peer << endl; messenger->send_message(op, osdmap->get_inst(peer)); return; } } } } } } else { // REPLICATION OP (it's from another OSD) // have pg? if (!pg) { derr(-7) << "handle_rep_op " << *op << " pgid " << pgid << " dne" << endl; delete op; //assert(0); // wtf, shouldn't happen. return; } // check osd map: same set, or primary+acker? 
if (g_conf.osd_rep == OSD_REP_CHAIN && op->get_map_epoch() < pg->info.history.same_since) { dout(10) << "handle_rep_op pg changed " << pg->info.history << " after " << op->get_map_epoch() << ", dropping" << endl; delete op; return; } if (g_conf.osd_rep != OSD_REP_CHAIN && (op->get_map_epoch() < pg->info.history.same_primary_since || op->get_map_epoch() < pg->info.history.same_acker_since)) { dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history << " after " << op->get_map_epoch() << ", dropping" << endl; delete op; return; } assert(pg->get_role() >= 0); dout(7) << "handle_rep_op " << op << " in " << *pg << endl; } if (g_conf.osd_maxthreads < 1) { _lock_pg(pgid); do_op(op, pg); // do it now _unlock_pg(pgid); } else { // queue for worker threads if (read) enqueue_op(0, op); // no locking needed for reads else enqueue_op(pgid, op); } } void OSD::handle_op_reply(MOSDOpReply *op) { if (op->get_map_epoch() < boot_epoch) { dout(3) << "replica op reply from before boot" << endl; delete op; return; } // must be a rep op. assert(op->get_source().is_osd()); // make sure we have the pg const pg_t pgid = op->get_pg(); PG *pg = get_pg(pgid); // require same or newer map if (!require_same_or_newer_map(op, op->get_map_epoch())) return; // share our map with sender, if they're old _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); if (!pg) { // hmm. 
delete op; } if (g_conf.osd_maxthreads < 1) { _lock_pg(pgid); do_op(op, pg); // do it now _unlock_pg(pgid); } else { enqueue_op(pgid, op); // queue for worker threads } } /* * enqueue called with osd_lock held */ void OSD::enqueue_op(pg_t pgid, Message *op) { while (pending_ops > g_conf.osd_max_opq) { dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl; op_queue_cond.Wait(osd_lock); } op_queue[pgid].push_back(op); pending_ops++; logger->set("opq", pending_ops); threadpool->put_op(pgid); } /* * NOTE: dequeue called in worker thread, without osd_lock */ void OSD::dequeue_op(pg_t pgid) { Message *op = 0; PG *pg = 0; osd_lock.Lock(); { if (pgid) { // lock pg pg = _lock_pg(pgid); } // get pending op list &ls = op_queue[pgid]; assert(!ls.empty()); op = ls.front(); ls.pop_front(); if (pgid) { dout(10) << "dequeue_op " << op << " write pg " << pgid << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; } else { dout(10) << "dequeue_op " << op << " read " << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; } if (ls.empty()) op_queue.erase(pgid); } osd_lock.Unlock(); // do it do_op(op, pg); // finish osd_lock.Lock(); { if (pgid) { // unlock pg _unlock_pg(pgid); } dout(10) << "dequeue_op " << op << " finish" << endl; assert(pending_ops > 0); if (pending_ops > g_conf.osd_max_opq) op_queue_cond.Signal(); pending_ops--; logger->set("opq", pending_ops); if (pending_ops == 0 && waiting_for_no_ops) no_pending_ops.Signal(); } osd_lock.Unlock(); } /** do_op - do an op * object lock will be held (if multithreaded) * osd_lock NOT held. 
*/ void OSD::do_op(Message *m, PG *pg) { //dout(15) << "do_op " << *m << endl; if (m->get_type() == MSG_OSD_OP) { MOSDOp *op = (MOSDOp*)m; logger->inc("op"); switch (op->get_op()) { // reads case OSD_OP_READ: op_read(op);//, pg); break; case OSD_OP_STAT: op_stat(op);//, pg); break; // rep stuff case OSD_OP_PULL: op_pull(op, pg); break; case OSD_OP_PUSH: op_push(op, pg); break; // writes case OSD_OP_WRNOOP: case OSD_OP_WRITE: case OSD_OP_ZERO: case OSD_OP_DELETE: case OSD_OP_TRUNCATE: case OSD_OP_WRLOCK: case OSD_OP_WRUNLOCK: case OSD_OP_RDLOCK: case OSD_OP_RDUNLOCK: case OSD_OP_UPLOCK: case OSD_OP_DNLOCK: if (op->get_source().is_osd()) op_rep_modify(op, pg); else op_modify(op, pg); break; default: assert(0); } } else if (m->get_type() == MSG_OSD_OPREPLY) { // must be replication. MOSDOpReply *r = (MOSDOpReply*)m; tid_t rep_tid = r->get_rep_tid(); if (pg->repop_gather.count(rep_tid)) { // oh, good. int fromosd = r->get_source().num(); repop_ack(pg, pg->repop_gather[rep_tid], r->get_result(), r->get_commit(), fromosd, r->get_pg_complete_thru()); delete m; } else { // early ack. 
pg->waiting_for_repop[rep_tid].push_back(r); } } else assert(0); } void OSD::wait_for_no_ops() { if (pending_ops > 0) { dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl; waiting_for_no_ops = true; while (pending_ops > 0) no_pending_ops.Wait(osd_lock); waiting_for_no_ops = false; assert(pending_ops == 0); } dout(7) << "wait_for_no_ops - none" << endl; } // ============================== // Object locking // // If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock // returns true if object was locked, otherwise returns false // bool OSD::block_if_wrlocked(MOSDOp* op) { object_t oid = op->get_oid(); entity_name_t source; int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); //cout << "getattr returns " << len << " on " << oid << endl; if (len == sizeof(source) && source != op->get_client()) { //the object is locked for writing by someone else -- add the op to the waiting queue waiting_for_wr_unlock[oid].push_back(op); return true; } return false; //the object wasn't locked, so the operation can be handled right away } // =============================== // OPS /* int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) { int c = 0; oid.rev = 0; map::iterator p = pg->missing.missing.lower_bound(oid); if (p == pg->missing.missing.end()) return 0; // clearly not while (p->first.ino == oid.ino && p->first.bno == oid.bno) { revs.insert(p->first); c++; } return c; }*/ bool OSD::pick_missing_object_rev(object_t& oid, PG *pg) { map::iterator p = pg->missing.missing.upper_bound(oid); if (p == pg->missing.missing.end()) return false; // clearly no candidate if (p->first.ino == oid.ino && p->first.bno == oid.bno) { oid = p->first; // yes! it's an upper bound revision for me. 
return true; } return false; } bool OSD::pick_object_rev(object_t& oid) { object_t t = oid; if (!store->pick_object_revision_lt(t)) return false; // we have no revisions of this object! objectrev_t crev; int r = store->getattr(t, "crev", &crev, sizeof(crev)); assert(r >= 0); if (crev <= oid.rev) { dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << endl; oid = t; return true; } return false; } bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg) { const object_t oid = op->get_oid(); // are we missing the object? if (pg->missing.missing.count(oid)) { // we don't have it (yet). eversion_t v = pg->missing.missing[oid]; if (pg->objects_pulling.count(oid)) { dout(7) << "missing " << oid << " v " << v << " in " << *pg << ", already pulling" << endl; } else { dout(7) << "missing " << oid << " v " << v << " in " << *pg << ", pulling" << endl; pull(pg, oid); } pg->waiting_for_missing_object[oid].push_back(op); return true; } return false; } // READ OPS /** op_read * client read op * NOTE: called from opqueue. */ void OSD::op_read(MOSDOp *op)//, PG *pg) { object_t oid = op->get_oid(); // if the target object is locked for writing by another client, put 'op' to the waiting queue // for _any_ op type -- eg only the locker can unlock! if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks dout(10) << "op_read " << oid << " " << op->get_offset() << "~" << op->get_length() //<< " in " << *pg << endl; long r = 0; bufferlist bl; if (oid.rev && !pick_object_rev(oid)) { // we have no revision for this request. 
r = -EEXIST; } else { // read into a buffer r = store->read(oid, op->get_offset(), op->get_length(), bl); } // set up reply MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); if (r >= 0) { reply->set_result(0); reply->set_data(bl); reply->set_length(r); logger->inc("c_rd"); logger->inc("c_rdb", r); } else { reply->set_result(r); // error reply->set_length(0); } dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl; logger->inc("rd"); if (r >= 0) logger->inc("rdb", r); // send it messenger->send_message(reply, op->get_client_inst()); delete op; } /** op_stat * client stat * NOTE: called from opqueue */ void OSD::op_stat(MOSDOp *op)//, PG *pg) { object_t oid = op->get_oid(); // if the target object is locked for writing by another client, put 'op' to the waiting queue if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks struct stat st; memset(&st, sizeof(st), 0); int r = 0; if (oid.rev && !pick_object_rev(oid)) { // we have no revision for this request. 
r = -EEXIST; } else { r = store->stat(oid, &st); } dout(3) << "op_stat on " << oid << " r = " << r << " size = " << st.st_size //<< " in " << *pg << endl; MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true); reply->set_object_size(st.st_size); messenger->send_message(reply, op->get_client_inst()); logger->inc("stat"); delete op; } /********* * new repops */ void OSD::get_repop_gather(PG::RepOpGather *repop) { //repop->lock.Lock(); dout(10) << "get_repop " << *repop << endl; } void OSD::apply_repop(PG *pg, PG::RepOpGather *repop) { dout(10) << "apply_repop applying update on " << *repop << endl; assert(!repop->applied); Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete); unsigned r = store->apply_transaction(repop->t, oncommit); if (r) dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << endl; // discard my reference to buffer repop->op->get_data().clear(); repop->applied = true; } void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) { dout(10) << "put_repop " << *repop << endl; // commit? if (repop->can_send_commit() && repop->op->wants_commit()) { // send commit. MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true); dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl; messenger->send_message(reply, repop->op->get_client_inst()); repop->sent_commit = true; } // ack? else if (repop->can_send_ack() && repop->op->wants_ack()) { // apply apply_repop(pg, repop); // send ack MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false); dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl; messenger->send_message(reply, repop->op->get_client_inst()); repop->sent_ack = true; utime_t now = g_clock.now(); now -= repop->start; logger->finc("rlsum", now); logger->inc("rlnum", 1); } // done. 
if (repop->can_delete()) { // adjust peers_complete_thru if (!repop->pg_complete_thru.empty()) { eversion_t min = pg->info.last_complete; // hrm.... for (unsigned i=0; iacting.size(); i++) { if (repop->pg_complete_thru[pg->acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. min = repop->pg_complete_thru[pg->acting[i]]; } if (min > pg->peers_complete_thru) { dout(10) << "put_repop peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << endl; pg->peers_complete_thru = min; } } dout(10) << "put_repop deleting " << *repop << endl; //repop->lock.Unlock(); assert(pg->repop_gather.count(repop->rep_tid)); pg->repop_gather.erase(repop->rep_tid); delete repop->op; delete repop; } else { //repop->lock.Unlock(); } } void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) { object_t oid = op->get_oid(); dout(7) << " issue_repop rep_tid " << op->get_rep_tid() << " in " << *pg << " o " << oid << " to osd" << osd << endl; // forward the write/update/whatever MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, oid, pg->get_pgid(), osdmap->get_epoch(), op->get_op()); wr->get_data() = op->get_data(); // _copy_ bufferlist wr->set_length(op->get_length()); wr->set_offset(op->get_offset()); wr->set_version(op->get_version()); wr->set_rep_tid(op->get_rep_tid()); wr->set_pg_trim_to(pg->peers_complete_thru); messenger->send_message(wr, osdmap->get_inst(osd)); } PG::RepOpGather *OSD::new_repop_gather(PG *pg, MOSDOp *op) { dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl; PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(), op->get_version(), pg->info.last_complete); // osds. commits all come to me. for (unsigned i=0; iacting.size(); i++) { int osd = pg->acting[i]; repop->osds.insert(osd); repop->waitfor_commit.insert(osd); } // acks vary: if (g_conf.osd_rep == OSD_REP_CHAIN) { // chain rep. // there's my local ack... 
repop->osds.insert(whoami); repop->waitfor_ack.insert(whoami); repop->waitfor_commit.insert(whoami); // also, the previous guy will ack to me int myrank = osdmap->calc_pg_rank(whoami, pg->acting); if (myrank > 0) { int osd = pg->acting[ myrank-1 ]; repop->osds.insert(osd); repop->waitfor_ack.insert(osd); repop->waitfor_commit.insert(osd); } } else { // primary, splay. all osds ack to me. for (unsigned i=0; iacting.size(); i++) { int osd = pg->acting[i]; repop->waitfor_ack.insert(osd); } } repop->start = g_clock.now(); pg->repop_gather[ repop->rep_tid ] = repop; // anyone waiting? (acks that got here before the op did) if (pg->waiting_for_repop.count(repop->rep_tid)) { take_waiters(pg->waiting_for_repop[repop->rep_tid]); pg->waiting_for_repop.erase(repop->rep_tid); } return repop; } void OSD::repop_ack(PG *pg, PG::RepOpGather *repop, int result, bool commit, int fromosd, eversion_t pg_complete_thru) { MOSDOp *op = repop->op; dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op << " result " << result << " commit " << commit << " from osd" << fromosd << " in " << *pg << endl; get_repop_gather(repop); { if (commit) { // commit assert(repop->waitfor_commit.count(fromosd)); repop->waitfor_commit.erase(fromosd); repop->waitfor_ack.erase(fromosd); repop->pg_complete_thru[fromosd] = pg_complete_thru; } else { // ack repop->waitfor_ack.erase(fromosd); } } put_repop_gather(pg, repop); } /** op_modify_commit * transaction commit on the acker. 
*/ void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru) { PG *pg = lock_pg(pgid); if (pg) { if (pg->repop_gather.count(rep_tid)) { PG::RepOpGather *repop = pg->repop_gather[rep_tid]; dout(10) << "op_modify_commit " << *repop->op << endl; get_repop_gather(repop); { assert(repop->waitfor_commit.count(whoami)); repop->waitfor_commit.erase(whoami); repop->pg_complete_thru[whoami] = pg_complete_thru; } put_repop_gather(pg, repop); dout(10) << "op_modify_commit done on " << repop << endl; } else { dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl; } unlock_pg(pgid); } else { dout(10) << "op_modify_commit pg " << pgid << " dne" << endl; } } /** op_modify * process client modify op * NOTE: called from opqueue. */ void OSD::op_modify(MOSDOp *op, PG *pg) { object_t oid = op->get_oid(); const char *opname = MOSDOp::get_opname(op->get_op()); // are any peers missing this? for (unsigned i=1; iacting.size(); i++) { int peer = pg->acting[i]; if (pg->peer_missing.count(peer) && pg->peer_missing[peer].is_missing(oid)) { // push it before this update. // FIXME, this is probably extra much work (eg if we're about to overwrite) pg->peer_missing[peer].got(oid); push(pg, oid, peer); } } // dup op? if (pg->log.logged_req(op->get_reqid())) { dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() << ", doing WRNOOP" << endl; op->set_op(OSD_OP_WRNOOP); opname = MOSDOp::get_opname(op->get_op()); } // locked by someone else? // for _any_ op type -- eg only the locker can unlock! 
if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks // check crev objectrev_t crev = 0; store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); // assign version eversion_t clone_version; eversion_t nv = pg->log.top; if (op->get_op() != OSD_OP_WRNOOP) { nv.epoch = osdmap->get_epoch(); nv.version++; assert(nv > pg->info.last_update); assert(nv > pg->log.top); // will clone? if (crev && op->get_rev() && op->get_rev() > crev) { clone_version = nv; nv.version++; } if (op->get_version().version) { // replay! if (nv.version < op->get_version().version) { nv.version = op->get_version().version; // clone? if (crev && op->get_rev() && op->get_rev() > crev) { // backstep clone clone_version = nv; clone_version.version--; } } } } // set version in op, for benefit of client and our eventual reply op->set_version(nv); dout(10) << "op_modify " << opname << " " << oid << " v " << nv << " crev " << crev << " rev " << op->get_rev() << " " << op->get_offset() << "~" << op->get_length() << endl; if (op->get_op() == OSD_OP_WRITE) { logger->inc("c_wr"); logger->inc("c_wrb", op->get_length()); } // share latest osd map? osd_lock.Lock(); { for (unsigned i=1; iacting.size(); i++) { int osd = pg->acting[i]; _share_map_outgoing( osdmap->get_inst(osd) ); } } osd_lock.Unlock(); // issue replica writes PG::RepOpGather *repop = 0; bool alone = (pg->acting.size() == 1); tid_t rep_tid = ++last_tid; op->set_rep_tid(rep_tid); if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { // chain rep. send to #2 only. int next = pg->acting[1]; if (pg->acting.size() > 2) next = pg->acting[2]; issue_repop(pg, op, next); } else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { // splay rep. send to rest. for (unsigned i=1; iacting.size(); ++i) //for (unsigned i=pg->acting.size()-1; i>=1; --i) issue_repop(pg, op, pg->acting[i]); } else { // primary rep, or alone. 
repop = new_repop_gather(pg, op); // send to rest. if (!alone) for (unsigned i=1; iacting.size(); i++) issue_repop(pg, op, pg->acting[i]); } if (repop) { // we are acker. if (op->get_op() != OSD_OP_WRNOOP) { // log and update later. prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); } // (logical) local ack. // (if alone, this will apply the update.) get_repop_gather(repop); { assert(repop->waitfor_ack.count(whoami)); repop->waitfor_ack.erase(whoami); } put_repop_gather(pg, repop); } else { // chain or splay. apply. ObjectStore::Transaction t; prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(), pg->info.last_complete); unsigned r = store->apply_transaction(t, oncommit); if (r != 0 && // no errors r != 2) { // or error on collection_add cerr << "error applying transaction: r = " << r << endl; assert(r == 0); } oncommit->ack(); } } void OSD::prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp *op, eversion_t& version, objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to) { const object_t oid = op->get_oid(); // clone entry? 
if (crev && rev && rev > crev) { eversion_t cv = version; cv.version--; PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); pg->log.add(cloneentry); dout(10) << "prepare_log_transaction " << op->get_op() << " " << cloneentry << " in " << *pg << endl; } // actual op int opcode = PG::Log::Entry::MODIFY; if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; PG::Log::Entry logentry(opcode, oid, version, op->get_reqid()); dout(10) << "prepare_log_transaction " << op->get_op() << " " << logentry << " in " << *pg << endl; // append to log assert(version > pg->log.top); pg->log.add(logentry); assert(pg->log.top == version); dout(10) << "prepare_log_transaction appended to " << *pg << endl; // write to pg log on disk pg->append_log(t, logentry, trim_to); } /** prepare_op_transaction * apply an op to the store wrapped in a transaction. */ void OSD::prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp *op, eversion_t& version, objectrev_t crev, objectrev_t rev, PG *pg) { const object_t oid = op->get_oid(); const pg_t pgid = op->get_pg(); bool did_clone = false; dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) << " " << oid << " v " << version << " crev " << crev << " rev " << rev << " in " << *pg << endl; // WRNOOP does nothing. if (op->get_op() == OSD_OP_WRNOOP) return; // raise last_complete? if (pg->info.last_complete == pg->info.last_update) pg->info.last_complete = version; // raise last_update. assert(version > pg->info.last_update); pg->info.last_update = version; // write pg info t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info)); // clone? 
if (crev && rev && rev > crev) { object_t noid = oid; noid.rev = rev; dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl; t.clone(oid, noid); did_clone = true; } // apply the op switch (op->get_op()) { case OSD_OP_WRLOCK: { // lock object //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit); t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); } break; case OSD_OP_WRUNLOCK: { // unlock objects //r = store->rmattr(oid, "wrlock", oncommit); t.rmattr(oid, "wrlock"); // unblock all operations that were waiting for this object to become unlocked if (waiting_for_wr_unlock.count(oid)) { take_waiters(waiting_for_wr_unlock[oid]); waiting_for_wr_unlock.erase(oid); } } break; case OSD_OP_WRITE: { // write assert(op->get_data().length() == op->get_length()); bufferlist bl; bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! //if (oid < 100000000000000ULL) // hack hack-- don't write client data t.write( oid, op->get_offset(), op->get_length(), bl ); } break; case OSD_OP_ZERO: { assert(0); // are you sure this is what you want? // zero, remove, or truncate? struct stat st; int r = store->stat(oid, &st); if (r >= 0) { if (op->get_offset() + op->get_length() >= st.st_size) { if (op->get_offset()) t.truncate(oid, op->get_length() + op->get_offset()); else t.remove(oid); } else { // zero. the dumb way. FIXME. bufferptr bp(op->get_length()); bp.zero(); bufferlist bl; bl.push_back(bp); t.write(oid, op->get_offset(), op->get_length(), bl); } } else { // noop? dout(10) << "apply_transaction zero on " << oid << ", but dne? 
stat returns " << r << endl; } } break; case OSD_OP_TRUNCATE: { // truncate //r = store->truncate(oid, op->get_offset()); t.truncate(oid, op->get_length() ); } break; case OSD_OP_DELETE: { // delete //r = store->remove(oid); t.remove(oid); } break; default: assert(0); } // object collection, version if (op->get_op() == OSD_OP_DELETE) { // remove object from c t.collection_remove(pgid, oid); } else { // add object to c t.collection_add(pgid, oid); // object version t.setattr(oid, "version", &version, sizeof(version)); // set object crev if (crev == 0 || // new object did_clone) // we cloned t.setattr(oid, "crev", &rev, sizeof(rev)); } }