// ceph/branches/riccardo/monitor2/osd/OSD.cc


// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "include/types.h"
#include "OSD.h"
#include "OSDMap.h"
#ifdef USE_OBFS
# include "OBFSStore.h"
#else
# include "FakeStore.h"
#endif
#include "ebofs/Ebofs.h"
#include "Ager.h"
#include "msg/Messenger.h"
#include "msg/Message.h"
#include "messages/MGenericMessage.h"
#include "messages/MPing.h"
#include "messages/MPingAck.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDIn.h"
#include "messages/MOSDOut.h"
#include "messages/MOSDMap.h"
#include "messages/MOSDGetMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "common/Logger.h"
#include "common/LogType.h"
#include "common/Timer.h"
#include "common/ThreadPool.h"
#include <iostream>
#include <cassert>
#include <errno.h>
#include <sys/stat.h>
#include "config.h"
#undef dout
#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
char *osd_base_path = "./osddata";
char *ebofs_base_path = "./dev";
object_t SUPERBLOCK_OBJECT(0,0);
// <hack> force remount hack for performance testing FakeStore
class C_Remount : public Context {
OSD *osd;
public:
C_Remount(OSD *o) : osd(o) {}
void finish(int) {
osd->force_remount();
}
};
void OSD::force_remount()
{
dout(0) << "forcing remount" << endl;
osd_lock.Lock();
{
store->umount();
store->mount();
}
osd_lock.Unlock();
dout(0) << "finished remount" << endl;
}
// </hack>
// cons/des
LogType osd_logtype;
OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock)
{
whoami = id;
messenger = m;
monmap = mm;
osdmap = 0;
boot_epoch = 0;
last_tid = 0;
num_pulling = 0;
state = STATE_BOOTING;
hb_stat_ops = 0;
hb_stat_qlen = 0;
pending_ops = 0;
waiting_for_no_ops = false;
if (g_conf.osd_remount_at)
timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this));
// init object store
// try in this order:
// dev/osd$num
// dev/osd.$hostname
// dev/osd.all
if (dev) {
strcpy(dev_path,dev);
} else {
char hostname[100];
hostname[0] = 0;
gethostname(hostname,100);
sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami);
struct stat sta;
if (::lstat(dev_path, &sta) != 0)
sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname);
if (::lstat(dev_path, &sta) != 0)
sprintf(dev_path, "%s/osd.all", ebofs_base_path);
}
if (g_conf.ebofs) {
store = new Ebofs(dev_path);
//store->_fake_writes(true);
}
#ifdef USE_OBFS
else if (g_conf.uofs) {
store = new OBFSStore(whoami, NULL, dev_path);
}
#endif
else {
store = new FakeStore(osd_base_path, whoami);
}
}
OSD::~OSD()
{
if (threadpool) { delete threadpool; threadpool = 0; }
if (osdmap) { delete osdmap; osdmap = 0; }
//if (monitor) { delete monitor; monitor = 0; }
if (messenger) { delete messenger; messenger = 0; }
if (logger) { delete logger; logger = 0; }
if (store) { delete store; store = 0; }
}
int OSD::init()
{
osd_lock.Lock();
{
// mkfs?
if (g_conf.osd_mkfs) {
dout(2) << "mkfs" << endl;
store->mkfs();
// make up a superblock
//superblock.fsid = ???;
superblock.whoami = whoami;
}
// mount.
dout(2) << "mounting " << dev_path << endl;
int r = store->mount();
assert(r>=0);
if (g_conf.osd_mkfs) {
// age?
if (g_conf.osd_age_time != 0) {
dout(2) << "age" << endl;
Ager ager(store);
if (g_conf.osd_age_time < 0)
ager.load_freelist();
else
ager.age(g_conf.osd_age_time,
g_conf.osd_age,
g_conf.osd_age - .05,
50000,
g_conf.osd_age - .05);
}
}
else {
dout(2) << "boot" << endl;
// read superblock
read_superblock();
// load up pgs (as they previously existed)
load_pgs();
dout(2) << "superblock: i am osd" << superblock.whoami << endl;
assert(whoami == superblock.whoami);
}
// log
char name[80];
sprintf(name, "osd%02d", whoami);
logger = new Logger(name, (LogType*)&osd_logtype);
osd_logtype.add_set("opq");
osd_logtype.add_inc("op");
osd_logtype.add_inc("c_rd");
osd_logtype.add_inc("c_rdb");
osd_logtype.add_inc("c_wr");
osd_logtype.add_inc("c_wrb");
osd_logtype.add_inc("r_push");
osd_logtype.add_inc("r_pushb");
osd_logtype.add_inc("r_wr");
osd_logtype.add_inc("r_wrb");
osd_logtype.add_inc("rlnum");
osd_logtype.add_set("numpg");
osd_logtype.add_set("pingset");
osd_logtype.add_set("buf");
osd_logtype.add_inc("map");
osd_logtype.add_inc("mapi");
osd_logtype.add_inc("mapidup");
osd_logtype.add_inc("mapf");
osd_logtype.add_inc("mapfdup");
// request thread pool
{
char name[80];
sprintf(name,"osd%d.threadpool", whoami);
threadpool = new ThreadPool<OSD*, pg_t>(name, g_conf.osd_maxthreads,
static_dequeueop,
this);
}
// i'm ready!
messenger->set_dispatcher(this);
// announce to monitor i exist and have booted.
int mon = monmap->pick_mon();
messenger->send_message(new MOSDBoot(superblock), monmap->get_inst(mon));
// start the heart
timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this));
}
osd_lock.Unlock();
//dout(0) << "osd_rep " << g_conf.osd_rep << endl;
return 0;
}
int OSD::shutdown()
{
dout(1) << "shutdown" << endl;
state = STATE_STOPPING;
// cancel timers
timer.cancel_all();
timer.join();
// finish ops
wait_for_no_ops();
// stop threads
delete threadpool;
threadpool = 0;
// close pgs
for (hash_map<pg_t, PG*>::iterator p = pg_map.begin();
p != pg_map.end();
p++) {
delete p->second;
}
pg_map.clear();
// shut everything else down
//monitor->shutdown();
messenger->shutdown();
osd_lock.Unlock();
int r = store->umount();
osd_lock.Lock();
return r;
}
void OSD::write_superblock(ObjectStore::Transaction& t)
{
dout(10) << "write_superblock " << superblock << endl;
bufferlist bl;
bl.append((char*)&superblock, sizeof(superblock));
t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
}
int OSD::read_superblock()
{
bufferlist bl;
int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
if (bl.length() != sizeof(superblock)) {
dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl;
return -1;
}
bl.copy(0, sizeof(superblock), (char*)&superblock);
dout(10) << "read_superblock " << superblock << endl;
// load up "current" osdmap
assert(!osdmap);
osdmap = new OSDMap;
bl.clear();
get_map_bl(superblock.current_epoch, bl);
osdmap->decode(bl);
assert(whoami == superblock.whoami); // fixme!
return 0;
}
// object locks
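// Per-PG locking scheme: pg_lock is the set of currently-locked pgids and
// pg_lock_waiters keeps a FIFO of Conds per pgid, so lockers are woken in
// order.  _lock_pg/_unlock_pg assume osd_lock is already held; the
// lock_pg/unlock_pg wrappers take and drop osd_lock themselves.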
PG *OSD::lock_pg(pg_t pgid)
{
osd_lock.Lock();
PG *pg = _lock_pg(pgid);
osd_lock.Unlock();
return pg;
}
PG *OSD::_lock_pg(pg_t pgid)
{
assert(pg_map.count(pgid));
if (pg_lock.count(pgid)) {
Cond c;
dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl;
//cerr << "lock_pg " << pgid << " waiting as " << &c << endl;
list<Cond*>& ls = pg_lock_waiters[pgid]; // FIFO of waiters for this pg
ls.push_back(&c);
while (pg_lock.count(pgid) ||
ls.front() != &c)
c.Wait(osd_lock);
assert(ls.front() == &c);
ls.pop_front();
if (ls.empty())
pg_lock_waiters.erase(pgid);
}
dout(15) << "lock_pg " << pgid << endl;
pg_lock.insert(pgid);
return pg_map[pgid];
}
void OSD::unlock_pg(pg_t pgid)
{
osd_lock.Lock();
_unlock_pg(pgid);
osd_lock.Unlock();
}
void OSD::_unlock_pg(pg_t pgid)
{
// unlock
assert(pg_lock.count(pgid));
pg_lock.erase(pgid);
if (pg_lock_waiters.count(pgid)) {
// someone is in line
Cond *c = pg_lock_waiters[pgid].front();
assert(c);
dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << endl;
c->Signal();
} else {
// nobody waiting
dout(15) << "unlock_pg " << pgid << endl;
}
}
void OSD::_remove_pg(pg_t pgid)
{
dout(10) << "_remove_pg " << pgid << endl;
// remove from store
list<object_t> olist;
store->collection_list(pgid, olist);
ObjectStore::Transaction t;
{
for (list<object_t>::iterator p = olist.begin();
p != olist.end();
p++)
t.remove(*p);
t.remove_collection(pgid);
t.remove(object_t(1,pgid)); // log too
}
store->apply_transaction(t);
// hose from memory
delete pg_map[pgid];
pg_map.erase(pgid);
}
void OSD::activate_pg(pg_t pgid, epoch_t epoch)
{
osd_lock.Lock();
{
if (pg_map.count(pgid)) {
PG *pg = _lock_pg(pgid);
if (pg->is_crashed() &&
pg->is_replay() &&
pg->get_role() == 0 &&
pg->info.history.same_primary_since <= epoch) {
ObjectStore::Transaction t;
pg->activate(t);
store->apply_transaction(t);
}
_unlock_pg(pgid);
}
}
// finishers?
if (finished.empty()) {
osd_lock.Unlock();
} else {
list<Message*> waiting;
waiting.splice(waiting.begin(), finished);
osd_lock.Unlock();
for (list<Message*>::iterator it = waiting.begin();
it != waiting.end();
it++) {
dispatch(*it);
}
}
}
// -------------------------------------
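// heartbeat: runs periodically (rescheduled at the bottom with random
// jitter).  It computes the average op queue length since the last beat,
// pings the primary of each PG this OSD replicates, and can inject fake
// map updates for testing.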
void OSD::heartbeat()
{
utime_t now = g_clock.now();
utime_t since = now;
since.sec_ref() -= g_conf.osd_heartbeat_interval;
// calc my stats
float avg_qlen = 0;
if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops;
dout(5) << "heartbeat " << now
<< ": ops " << hb_stat_ops
<< ", avg qlen " << avg_qlen
<< endl;
// reset until next time around
hb_stat_ops = 0;
hb_stat_qlen = 0;
// send pings
set<int> pingset;
for (hash_map<pg_t, PG*>::iterator i = pg_map.begin();
i != pg_map.end();
i++) {
PG *pg = i->second;
// we want to ping the primary.
if (pg->get_role() <= 0) continue;
if (pg->acting.size() < 1) continue;
if (pg->last_heartbeat < since) {
pg->last_heartbeat = now;
pingset.insert(pg->acting[0]);
}
}
for (set<int>::iterator i = pingset.begin();
i != pingset.end();
i++) {
_share_map_outgoing( osdmap->get_inst(*i) );
messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen),
osdmap->get_inst(*i));
}
if (logger) logger->set("pingset", pingset.size());
// hack: fake reorg?
if (osdmap && g_conf.fake_osdmap_updates) {
int mon = monmap->pick_mon();
if ((rand() % g_conf.fake_osdmap_updates) == 0) {
//if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) {
messenger->send_message(new MOSDIn(osdmap->get_epoch()),
monmap->get_inst(mon));
}
/*
if (osdmap->is_out(whoami)) {
messenger->send_message(new MOSDIn(osdmap->get_epoch()),
MSG_ADDR_MON(mon), monmap->get_inst(mon));
}
else if ((rand() % g_conf.fake_osdmap_updates) == 0) {
//messenger->send_message(new MOSDOut(osdmap->get_epoch()),
//MSG_ADDR_MON(mon), monmap->get_inst(mon));
}
}
*/
}
// schedule next! randomly.
float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval;
timer.add_event_after(wait, new C_Heartbeat(this));
}
// --------------------------------------
// dispatch
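// Map sharing: when a message arrives from a client or peer OSD carrying an
// older map epoch, we send that peer an incremental update.  peer_map_epoch
// records the newest epoch we believe each peer OSD has, so we avoid
// resending the same maps repeatedly.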
bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch)
{
bool shared = false;
// does client have old map?
if (inst.name.is_client()) {
if (epoch < osdmap->get_epoch()) {
dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
send_incremental_map(epoch, inst, true);
shared = true;
}
}
// does peer have old map?
if (inst.name.is_osd()) {
// remember
if (peer_map_epoch[inst.name] < epoch)
peer_map_epoch[inst.name] = epoch;
// older?
if (peer_map_epoch[inst.name] < osdmap->get_epoch()) {
dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
send_incremental_map(epoch, inst, true);
peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again.
shared = true;
}
}
return shared;
}
void OSD::_share_map_outgoing(const entity_inst_t& inst)
{
assert(inst.name.is_osd());
if (inst.name.is_osd()) {
// send map?
if (peer_map_epoch.count(inst.name)) {
epoch_t pe = peer_map_epoch[inst.name];
if (pe < osdmap->get_epoch()) {
send_incremental_map(pe, inst, true);
peer_map_epoch[inst.name] = osdmap->get_epoch();
}
} else {
// no idea about peer's epoch.
// ??? send recent ???
// do nothing.
}
}
}
void OSD::dispatch(Message *m)
{
// lock!
osd_lock.Lock();
switch (m->get_type()) {
// -- don't need lock --
case MSG_PING:
dout(10) << "ping from " << m->get_source() << endl;
delete m;
break;
// -- don't need OSDMap --
/*
// host monitor
case MSG_PING_ACK:
case MSG_FAILURE_ACK:
monitor->proc_message(m);
break;
*/
// map and replication
case MSG_OSD_MAP:
handle_osd_map((MOSDMap*)m);
break;
// osd
case MSG_SHUTDOWN:
shutdown();
delete m;
break;
// -- need OSDMap --
default:
{
// no map? starting up?
if (!osdmap) {
dout(7) << "no OSDMap, not booted" << endl;
waiting_for_osdmap.push_back(m);
break;
}
// down?
if (osdmap->is_down(whoami)) {
dout(7) << "i am marked down, dropping " << *m << endl;
delete m;
break;
}
// need OSDMap
switch (m->get_type()) {
case MSG_OSD_PING:
// take note.
handle_osd_ping((MOSDPing*)m);
break;
case MSG_OSD_PG_NOTIFY:
handle_pg_notify((MOSDPGNotify*)m);
break;
case MSG_OSD_PG_QUERY:
handle_pg_query((MOSDPGQuery*)m);
break;
case MSG_OSD_PG_LOG:
handle_pg_log((MOSDPGLog*)m);
break;
case MSG_OSD_PG_REMOVE:
handle_pg_remove((MOSDPGRemove*)m);
break;
case MSG_OSD_OP:
handle_op((MOSDOp*)m);
break;
// for replication etc.
case MSG_OSD_OPREPLY:
handle_op_reply((MOSDOpReply*)m);
break;
default:
dout(1) << " got unknown message " << m->get_type() << endl;
assert(0);
}
}
}
// finishers?
if (!finished.empty()) {
list<Message*> waiting;
waiting.splice(waiting.begin(), finished);
osd_lock.Unlock();
for (list<Message*>::iterator it = waiting.begin();
it != waiting.end();
it++) {
dispatch(*it);
}
return;
}
osd_lock.Unlock();
}
void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst)
{
entity_name_t dest = inst.name;
if (g_conf.ms_die_on_failure) {
exit(0);
}
if (dest.is_osd()) {
// failed osd. drop message, report to mon.
int mon = monmap->pick_mon();
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", dropping and reporting to mon" << mon
<< endl;
messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
monmap->get_inst(mon));
delete m;
} else if (dest.is_mon()) {
// resend to a different monitor.
int mon = monmap->pick_mon(true);
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", resending to mon" << mon
<< endl;
messenger->send_message(m, monmap->get_inst(mon));
}
else {
// client?
dout(0) << "ms_handle_failure " << dest << " inst " << inst
<< ", dropping" << endl;
delete m;
}
}
void OSD::handle_osd_ping(MOSDPing *m)
{
dout(20) << "osdping from " << m->get_source() << endl;
_share_map_incoming(m->get_source_inst(), m->map_epoch);
int from = m->get_source().num();
peer_qlen[from] = m->avg_qlen;
//if (!m->ack)
//messenger->send_message(new MOSDPing(osdmap->get_epoch(), true),
//m->get_source());
delete m;
}
// =====================================================
// MAP
void OSD::wait_for_new_map(Message *m)
{
// ask
if (waiting_for_osdmap.empty()) {
int mon = monmap->pick_mon();
messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
monmap->get_inst(mon));
}
waiting_for_osdmap.push_back(m);
}
/** handle_osd_map
* assimilate new OSDMap(s). scan pgs, etc.
*/
void OSD::handle_osd_map(MOSDMap *m)
{
wait_for_no_ops();
assert(osd_lock.is_locked());
ObjectStore::Transaction t;
if (osdmap) {
dout(3) << "handle_osd_map epochs ["
<< m->get_first() << "," << m->get_last()
<< "], i have " << osdmap->get_epoch()
<< endl;
} else {
dout(3) << "handle_osd_map epochs ["
<< m->get_first() << "," << m->get_last()
<< "], i have none"
<< endl;
osdmap = new OSDMap;
boot_epoch = m->get_last(); // hrm...?
}
logger->inc("mapmsg");
// store them?
for (map<epoch_t,bufferlist>::iterator p = m->maps.begin();
p != m->maps.end();
p++) {
object_t oid = get_osdmap_object_name(p->first);
if (store->exists(oid)) {
dout(10) << "handle_osd_map already had full map epoch " << p->first << endl;
logger->inc("mapfdup");
bufferlist bl;
get_map_bl(p->first, bl);
dout(10) << " .. it is " << bl.length() << " bytes" << endl;
continue;
}
dout(10) << "handle_osd_map got full map epoch " << p->first << endl;
//t.write(oid, 0, p->second.length(), p->second);
store->write(oid, 0, p->second.length(), p->second, 0);
if (p->first > superblock.newest_map)
superblock.newest_map = p->first;
if (p->first < superblock.oldest_map ||
superblock.oldest_map == 0)
superblock.oldest_map = p->first;
logger->inc("mapf");
}
for (map<epoch_t,bufferlist>::iterator p = m->incremental_maps.begin();
p != m->incremental_maps.end();
p++) {
object_t oid = get_inc_osdmap_object_name(p->first);
if (store->exists(oid)) {
dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl;
logger->inc("mapidup");
bufferlist bl;
get_inc_map_bl(p->first, bl);
dout(10) << " .. it is " << bl.length() << " bytes" << endl;
continue;
}
dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl;
//t.write(oid, 0, p->second.length(), p->second);
store->write(oid, 0, p->second.length(), p->second, 0);
if (p->first > superblock.newest_map)
superblock.newest_map = p->first;
if (p->first < superblock.oldest_map ||
superblock.oldest_map == 0)
superblock.oldest_map = p->first;
logger->inc("mapi");
}
// advance if we can
bool advanced = false;
if (m->get_source().is_mon() && is_booting())
advanced = true;
epoch_t cur = superblock.current_epoch;
while (cur < superblock.newest_map) {
if (m->incremental_maps.count(cur+1) ||
store->exists(get_inc_osdmap_object_name(cur+1))) {
dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl;
bufferlist bl;
if (m->incremental_maps.count(cur+1))
bl = m->incremental_maps[cur+1];
else
get_inc_map_bl(cur+1, bl);
OSDMap::Incremental inc;
int off = 0;
inc.decode(bl, off);
osdmap->apply_incremental(inc);
// archive the full map
bl.clear();
osdmap->encode(bl);
t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl);
// notify messenger
for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
i != inc.new_down.end();
i++) {
int osd = i->first;
if (osd == whoami) continue;
messenger->mark_down(i->second.addr);
peer_map_epoch.erase(MSG_ADDR_OSD(osd));
// kick any replica ops
for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
it++) {
PG *pg = it->second;
_lock_pg(pg->info.pgid);
{
list<PG::RepOpGather*> ls; // do async; repop_ack() may modify pg->repop_gather
for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
p != pg->repop_gather.end();
p++) {
//dout(-1) << "checking repop tid " << p->first << endl;
if (p->second->waitfor_ack.count(osd) ||
p->second->waitfor_commit.count(osd))
ls.push_back(p->second);
}
for (list<PG::RepOpGather*>::iterator p = ls.begin();
p != ls.end();
p++)
repop_ack(pg, *p, -1, true, osd);
}
_unlock_pg(pg->info.pgid);
}
}
for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
i != inc.new_up.end();
i++) {
if (i->first == whoami) continue;
peer_map_epoch.erase(MSG_ADDR_OSD(i->first));
}
}
else if (m->maps.count(cur+1) ||
store->exists(get_osdmap_object_name(cur+1))) {
dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl;
bufferlist bl;
if (m->maps.count(cur+1))
bl = m->maps[cur+1];
else
get_map_bl(cur+1, bl);
osdmap->decode(bl);
// FIXME BUG: need to notify messenger of ups/downs!!
}
else {
dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
int mon = monmap->pick_mon();
messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon));
break;
}
cur++;
superblock.current_epoch = cur;
advance_map(t);
advanced = true;
}
// all the way?
if (advanced && cur == superblock.newest_map) {
// yay!
activate_map(t);
// process waiters
take_waiters(waiting_for_osdmap);
}
// write updated pg state to store
for (hash_map<pg_t,PG*>::iterator i = pg_map.begin();
i != pg_map.end();
i++) {
pg_t pgid = i->first;
PG *pg = i->second;
t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info));
}
// superblock and commit
write_superblock(t);
store->apply_transaction(t);
//if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah
delete m;
}
/**
* scan placement groups, initiate any replication
* activities.
*/
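// Two cases: on a brand-new (mkfs) map, create every PG this OSD
// participates in and activate it immediately; otherwise walk the existing
// PGs, recompute their acting sets under the new map, and record
// primary/acker/role changes in each PG's history.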
void OSD::advance_map(ObjectStore::Transaction& t)
{
dout(7) << "advance_map epoch " << osdmap->get_epoch()
<< " " << pg_map.size() << " pgs"
<< endl;
if (osdmap->is_mkfs()) {
ps_t maxps = 1ULL << osdmap->get_pg_bits();
ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
assert(osdmap->get_epoch() == 1);
//cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl;
logger->set_start( osdmap->get_ctime() );
// create PGs
for (int nrep = 1;
nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh
nrep++) {
for (ps_t ps = 0; ps < maxps; ++ps) {
vector<int> acting;
pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role < 0) continue;
PG *pg = create_pg(pgid, t);
pg->set_role(role);
pg->acting.swap(acting);
pg->last_epoch_started_any =
pg->info.last_epoch_started =
pg->info.history.same_since =
pg->info.history.same_primary_since =
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->activate(t);
dout(7) << "created " << *pg << endl;
}
for (ps_t ps = 0; ps < maxlps; ++ps) {
// local PG too
vector<int> acting;
pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep);
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
PG *pg = create_pg(pgid, t);
pg->acting.swap(acting);
pg->set_role(role);
pg->last_epoch_started_any =
pg->info.last_epoch_started =
pg->info.history.same_primary_since =
pg->info.history.same_acker_since =
pg->info.history.same_since = osdmap->get_epoch();
pg->activate(t);
dout(7) << "created " << *pg << endl;
}
}
dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;
} else {
// scan existing pg's
for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
it++) {
pg_t pgid = it->first;
PG *pg = it->second;
// did i finish this epoch?
if (pg->is_active()) {
pg->info.last_epoch_finished = osdmap->get_epoch()-1;
}
// get new acting set
vector<int> tacting;
int nrep = osdmap->pg_to_acting_osds(pgid, tacting);
int role = osdmap->calc_pg_role(whoami, tacting, nrep);
// no change?
if (tacting == pg->acting)
continue;
// -- there was a change! --
_lock_pg(pgid);
int oldrole = pg->get_role();
int oldprimary = pg->get_primary();
int oldacker = pg->get_acker();
vector<int> oldacting = pg->acting;
// update PG
pg->acting.swap(tacting);
pg->set_role(role);
// did primary|acker change?
pg->info.history.same_since = osdmap->get_epoch();
if (oldprimary != pg->get_primary()) {
pg->info.history.same_primary_since = osdmap->get_epoch();
pg->cancel_recovery();
}
if (oldacker != pg->get_acker()) {
pg->info.history.same_acker_since = osdmap->get_epoch();
}
// deactivate.
pg->state_clear(PG::STATE_ACTIVE);
// reset primary state?
if (oldrole == 0 || pg->get_role() == 0)
pg->clear_primary_state();
// apply any repops in progress.
if (oldacker == whoami) {
// apply repops
for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
p != pg->repop_gather.end();
p++) {
if (!p->second->applied)
apply_repop(pg, p->second);
delete p->second->op;
delete p->second;
}
pg->repop_gather.clear();
// and repop waiters
for (map<tid_t, list<Message*> >::iterator p = pg->waiting_for_repop.begin();
p != pg->waiting_for_repop.end();
p++)
for (list<Message*>::iterator pm = p->second.begin();
pm != p->second.end();
pm++)
delete *pm;
pg->waiting_for_repop.clear();
}
if (role != oldrole) {
// old primary?
if (oldrole == 0) {
pg->state_clear(PG::STATE_CLEAN);
// take replay queue waiters
list<Message*> ls;
for (map<eversion_t,MOSDOp*>::iterator it = pg->replay_queue.begin();
it != pg->replay_queue.end();
it++)
ls.push_back(it->second);
pg->replay_queue.clear();
take_waiters(ls);
// take active waiters
take_waiters(pg->waiting_for_active);
// take object waiters
for (hash_map<object_t, list<Message*> >::iterator it = pg->waiting_for_missing_object.begin();
it != pg->waiting_for_missing_object.end();
it++)
take_waiters(it->second);
pg->waiting_for_missing_object.clear();
}
// new primary?
if (role == 0) {
// i am new primary
pg->state_clear(PG::STATE_STRAY);
} else {
// i am now replica|stray. we need to send a notify.
pg->state_set(PG::STATE_STRAY);
if (nrep == 0) {
pg->state_set(PG::STATE_CRASHED);
dout(1) << *pg << " is crashed" << endl;
}
}
// my role changed.
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
<< ", role " << oldrole << " -> " << role << endl;
} else {
// no role change.
// did primary change?
if (pg->get_primary() != oldprimary) {
// we need to announce
pg->state_set(PG::STATE_STRAY);
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
<< ", acting primary "
<< oldprimary << " -> " << pg->get_primary()
<< endl;
} else {
// primary is the same.
if (role == 0) {
// i am (still) primary. but my replica set changed.
pg->state_clear(PG::STATE_CLEAN);
pg->state_clear(PG::STATE_REPLAY);
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
<< ", replicas changed" << endl;
}
}
}
_unlock_pg(pgid);
}
}
}
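// activate_map: once the map has been advanced all the way, either bump
// last_epoch_started on active PGs, start peering on PGs where we are the
// (inactive) primary, or queue a notify to the primary for PGs where we are
// a stray/replica.  Notifies and queries are skipped on mkfs.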
void OSD::activate_map(ObjectStore::Transaction& t)
{
dout(7) << "activate_map version " << osdmap->get_epoch() << endl;
map< int, list<PG::Info> > notify_list; // primary -> list
map< int, map<pg_t,PG::Query> > query_map; // peer -> PG -> get_summary_since
// scan pg's
for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
it++) {
//pg_t pgid = it->first;
PG *pg = it->second;
if (pg->is_active()) {
// update started counter
pg->info.last_epoch_started = osdmap->get_epoch();
}
else if (pg->get_role() == 0 && !pg->is_active()) {
// i am (inactive) primary
pg->build_prior();
pg->peer(t, query_map);
}
else if (pg->is_stray() &&
pg->get_primary() >= 0) {
// i am residual|replica
notify_list[pg->get_primary()].push_back(pg->info);
}
}
if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs
return;
// notify? (residual|replica)
do_notifies(notify_list);
// do queries.
do_queries(query_map);
logger->set("numpg", pg_map.size());
}
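// Send a peer the maps it is missing: walk backwards from our current epoch
// down to 'since', preferring incremental maps and falling back to a full
// map; unless 'full' is set, stop at the first full map.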
void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full)
{
dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
<< " to " << inst << endl;
MOSDMap *m = new MOSDMap;
for (epoch_t e = osdmap->get_epoch();
e > since;
e--) {
bufferlist bl;
if (get_inc_map_bl(e,bl)) {
m->incremental_maps[e].claim(bl);
} else if (get_map_bl(e,bl)) {
m->maps[e].claim(bl);
if (!full) break;
}
else {
assert(0); // we should have all maps.
}
}
messenger->send_message(m, inst);
}
bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
{
return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0;
}
bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0;
}
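// Reconstruct the OSDMap for an arbitrary historical epoch: scan backwards
// for the newest stored full map at or below 'epoch', collecting
// incrementals along the way, then apply them forward in order.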
void OSD::get_map(epoch_t epoch, OSDMap &m)
{
// find a complete map
list<OSDMap::Incremental> incs;
epoch_t e;
for (e = epoch; e > 0; e--) {
bufferlist bl;
if (get_map_bl(e, bl)) {
//dout(10) << "get_map " << epoch << " full " << e << endl;
m.decode(bl);
break;
} else {
OSDMap::Incremental inc;
bool got = get_inc_map(e, inc);
assert(got);
incs.push_front(inc);
}
}
assert(e > 0);
// apply incrementals
for (e++; e <= epoch; e++) {
//dout(10) << "get_map " << epoch << " inc " << e << endl;
m.apply_incremental( incs.front() );
incs.pop_front();
}
}
bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc)
{
bufferlist bl;
if (!get_inc_map_bl(e, bl))
return false;
int off = 0;
inc.decode(bl, off);
return true;
}
bool OSD::require_current_map(Message *m, epoch_t ep)
{
// older map?
if (ep < osdmap->get_epoch()) {
dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl;
delete m; // discard and ignore.
return false;
}
// newer map?
if (ep > osdmap->get_epoch()) {
dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl;
wait_for_new_map(m);
return false;
}
assert(ep == osdmap->get_epoch());
return true;
}
/*
* require that we have same (or newer) map, and that
* the source is the pg primary.
*/
bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
{
dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl;
// newer map?
if (epoch > osdmap->get_epoch()) {
dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl;
wait_for_new_map(m);
return false;
}
if (epoch < boot_epoch) {
dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl;
delete m;
return false;
}
return true;
}
// ======================================================
// REPLICATION
// PG
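// Each PG is stored as its own collection (named by pgid) in the object
// store, with the PG log kept in a separate object (see _remove_pg, which
// drops both).  load_pgs rebuilds the in-memory pg_map from these
// collections at boot.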
bool OSD::pg_exists(pg_t pgid)
{
return store->collection_exists(pgid);
}
PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t)
{
if (pg_map.count(pgid)) {
dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << endl;
}
assert(pg_map.count(pgid) == 0);
assert(!pg_exists(pgid));
PG *pg = new PG(this, pgid);
pg_map[pgid] = pg;
t.create_collection(pgid);
return pg;
}
PG *OSD::get_pg(pg_t pgid)
{
if (pg_map.count(pgid))
return pg_map[pgid];
return 0;
}
void OSD::load_pgs()
{
dout(10) << "load_pgs" << endl;
assert(pg_map.empty());
list<coll_t> ls;
store->list_collections(ls);
for (list<coll_t>::iterator it = ls.begin();
it != ls.end();
it++) {
pg_t pgid = *it;
PG *pg = new PG(this, pgid);
pg_map[pgid] = pg;
// read pg info
store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info));
// read pg log
pg->read_log(store);
// generate state for current mapping
int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting);
int role = osdmap->calc_pg_role(whoami, pg->acting, nrep);
pg->set_role(role);
dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl;
}
}
/**
* walk backwards from the current epoch to 'from' to determine when the
* pg acting set, primary, and acker last changed, and record that in h.
*/
void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from)
{
dout(15) << "project_pg_history " << pgid
<< " from " << from << " to " << osdmap->get_epoch()
<< ", start " << h
<< endl;
vector<int> last;
osdmap->pg_to_acting_osds(pgid, last);
for (epoch_t e = osdmap->get_epoch()-1;
e >= from;
e--) {
// verify during intermediate epoch
OSDMap oldmap;
get_map(e, oldmap);
vector<int> acting;
oldmap.pg_to_acting_osds(pgid, acting);
// acting set change?
if (acting != last &&
e <= h.same_since) {
dout(15) << "project_pg_history " << pgid << " changed in " << e+1
<< " from " << acting << " -> " << last << endl;
h.same_since = e+1;
}
// primary change?
if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
e <= h.same_primary_since) {
dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl;
h.same_primary_since = e+1;
if (g_conf.osd_rep == OSD_REP_PRIMARY)
h.same_acker_since = h.same_primary_since;
}
// acker change?
if (g_conf.osd_rep != OSD_REP_PRIMARY) {
if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) &&
e <= h.same_acker_since) {
dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl;
h.same_acker_since = e+1;
}
}
if (h.same_since > e &&
h.same_primary_since > e &&
h.same_acker_since > e) break;
}
dout(15) << "project_pg_history end " << h << endl;
}
/** do_notifies
* Send an MOSDPGNotify to a primary, with a list of PGs that I have
* content for, and they are primary for.
*/
void OSD::do_notifies(map< int, list<PG::Info> >& notify_list)
{
for (map< int, list<PG::Info> >::iterator it = notify_list.begin();
it != notify_list.end();
it++) {
if (it->first == whoami) {
dout(7) << "do_notify osd" << it->first << " is self, skipping" << endl;
continue;
}
dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl;
MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second);
_share_map_outgoing(osdmap->get_inst(it->first));
messenger->send_message(m, osdmap->get_inst(it->first));
}
}
/** do_queries
* send out pending queries for info | summaries
*/
void OSD::do_queries(map< int, map<pg_t,PG::Query> >& query_map)
{
for (map< int, map<pg_t,PG::Query> >::iterator pit = query_map.begin();
pit != query_map.end();
pit++) {
int who = pit->first;
dout(7) << "do_queries querying osd" << who
<< " on " << pit->second.size() << " PGs" << endl;
MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(),
pit->second);
_share_map_outgoing(osdmap->get_inst(who));
messenger->send_message(m, osdmap->get_inst(who));
}
}
/** PGNotify
* from non-primary to primary
* includes PG::Info.
* NOTE: called with opqueue active.
*/
void OSD::handle_pg_notify(MOSDPGNotify *m)
{
dout(7) << "handle_pg_notify from " << m->get_source() << endl;
int from = m->get_source().num();
if (!require_same_or_newer_map(m, m->get_epoch())) return;
ObjectStore::Transaction t;
// look for unknown PGs i'm primary for
map< int, map<pg_t,PG::Query> > query_map;
for (list<PG::Info>::iterator it = m->get_pg_list().begin();
it != m->get_pg_list().end();
it++) {
pg_t pgid = it->pgid;
PG *pg;
if (pg_map.count(pgid) == 0) {
// same primary?
PG::Info::History history = it->history;
project_pg_history(pgid, history, m->get_epoch());
if (m->get_epoch() < history.same_primary_since) {
dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in "
<< history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
continue;
}
// ok, create PG!
pg = create_pg(pgid, t);
osdmap->pg_to_acting_osds(pgid, pg->acting);
pg->set_role(0);
pg->info.history = history;
pg->last_epoch_started_any = it->last_epoch_started;
pg->build_prior();
t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
dout(10) << *pg << " is new" << endl;
// kick any waiters
if (waiting_for_pg.count(pgid)) {
take_waiters(waiting_for_pg[pgid]);
waiting_for_pg.erase(pgid);
}
_lock_pg(pgid);
} else {
// already had it. am i (still) the primary?
pg = _lock_pg(pgid);
if (m->get_epoch() < pg->info.history.same_primary_since) {
dout(10) << *pg << " handle_pg_notify primary changed in "
<< pg->info.history.same_primary_since
<< " (msg from " << m->get_epoch() << ")" << endl;
_unlock_pg(pgid);
continue;
}
}
// ok!
// stray?
bool acting = pg->is_acting(from);
if (!acting && (*it).last_epoch_started > 0) {
dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl;
pg->stray_set.insert(from);
pg->state_clear(PG::STATE_CLEAN);
}
// save info.
bool had = pg->peer_info.count(from);
pg->peer_info[from] = *it;
if (had) {
if (pg->is_active() &&
(*it).is_clean() && acting) {
pg->clean_set.insert(from);
dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set
<< "): " << *it << endl;
if (pg->is_all_clean()) {
dout(-10) << *pg << " now clean on all replicas" << endl;
pg->state_set(PG::STATE_CLEAN);
pg->clean_replicas();
}
} else {
// hmm, maybe keep an eye out for cases where we see this, but peer should happen.
dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl;
}
} else {
// adjust prior?
if (it->last_epoch_started > pg->last_epoch_started_any)
pg->adjust_prior();
// peer
pg->peer(t, query_map);
}
_unlock_pg(pgid);
}
unsigned tr = store->apply_transaction(t);
assert(tr == 0);
do_queries(query_map);
delete m;
}
/** PGLog
* from non-primary to primary
* includes log and info
* from primary to non-primary
* includes log for use in recovery
* NOTE: called with opqueue active.
*/
void OSD::handle_pg_log(MOSDPGLog *m)
{
int from = m->get_source().num();
const pg_t pgid = m->get_pgid();
if (!require_same_or_newer_map(m, m->get_epoch())) return;
if (pg_map.count(pgid) == 0) {
dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl;
assert(m->get_epoch() < osdmap->get_epoch());
delete m;
return;
}
PG *pg = _lock_pg(pgid);
assert(pg);
if (m->get_epoch() < pg->info.history.same_since) {
dout(10) << "handle_pg_log " << *pg
<< " from " << m->get_source()
<< " is old, discarding"
<< endl;
_unlock_pg(pgid); // drop the pg lock taken above before bailing out
delete m;
return;
}
dout(7) << "handle_pg_log " << *pg
<< " got " << m->log << " " << m->missing
<< " from " << m->get_source() << endl;
//m->log.print(cout);
ObjectStore::Transaction t;
if (pg->is_primary()) {
// i am PRIMARY
assert(pg->peer_log_requested.count(from) ||
pg->peer_summary_requested.count(from));
pg->proc_replica_log(m->log, m->missing, from);
// peer
map< int, map<pg_t,PG::Query> > query_map;
pg->peer(t, query_map);
do_queries(query_map);
} else {
// i am REPLICA
dout(10) << *pg << " got " << m->log << " " << m->missing << endl;
// merge log
pg->merge_log(m->log, m->missing, from);
pg->proc_missing(m->log, m->missing, from);
assert(pg->missing.num_lost() == 0);
// ok activate!
pg->activate(t);
}
unsigned tr = store->apply_transaction(t);
assert(tr == 0);
_unlock_pg(pgid);
delete m;
}
/** PGQuery
* from primary to replica | stray
* NOTE: called with opqueue active.
*/
void OSD::handle_pg_query(MOSDPGQuery *m)
{
dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl;
int from = m->get_source().num();
if (!require_same_or_newer_map(m, m->get_epoch())) return;
map< int, list<PG::Info> > notify_list;
for (map<pg_t,PG::Query>::iterator it = m->pg_list.begin();
it != m->pg_list.end();
it++) {
pg_t pgid = it->first;
PG *pg = 0;
if (pg_map.count(pgid) == 0) {
// same primary?
PG::Info::History history = it->second.history;
project_pg_history(pgid, history, m->get_epoch());
if (m->get_epoch() < history.same_since) {
dout(10) << " pg " << pgid << " dne, and pg has changed in "
<< history.same_since << " (msg from " << m->get_epoch() << ")" << endl;
continue;
}
// get active rush mapping
vector<int> acting;
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role < 0) {
dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl;
PG::Info empty(pgid);
notify_list[from].push_back(empty);
continue;
}
assert(role > 0);
ObjectStore::Transaction t;
pg = create_pg(pgid, t);
pg->acting.swap( acting );
pg->set_role(role);
pg->info.history = history;
t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
store->apply_transaction(t);
dout(10) << *pg << " dne (before), but i am role " << role << endl;
_lock_pg(pgid);
} else {
pg = _lock_pg(pgid);
// same primary?
if (m->get_epoch() < pg->info.history.same_since) {
dout(10) << *pg << " handle_pg_query primary changed in "
<< pg->info.history.same_since
<< " (msg from " << m->get_epoch() << ")" << endl;
_unlock_pg(pgid);
continue;
}
}
// ok, process query!
assert(!pg->acting.empty());
assert(from == pg->acting[0]);
if (it->second.type == PG::Query::INFO) {
// info
dout(10) << *pg << " sending info" << endl;
notify_list[from].push_back(pg->info);
} else {
MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid());
m->info = pg->info;
m->missing = pg->missing;
if (it->second.type == PG::Query::LOG) {
dout(10) << *pg << " sending info+missing+log since split " << it->second.split
<< " from floor " << it->second.floor
<< endl;
if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) {
dout(10) << *pg << " divergent, sending backlog" << endl;
it->second.type = PG::Query::BACKLOG;
}
}
if (it->second.type == PG::Query::BACKLOG) {
dout(10) << *pg << " sending info+missing+backlog" << endl;
if (pg->log.backlog) {
m->log = pg->log;
} else {
pg->generate_backlog();
m->log = pg->log;
pg->drop_backlog();
}
}
else if (it->second.type == PG::Query::FULLLOG) {
dout(10) << *pg << " sending info+missing+full log" << endl;
m->log.copy_non_backlog(pg->log);
}
dout(10) << *pg << " sending " << m->log << " " << m->missing << endl;
//m->log.print(cout);
_share_map_outgoing(osdmap->get_inst(from));
messenger->send_message(m, osdmap->get_inst(from));
}
_unlock_pg(pgid);
}
do_notifies(notify_list);
delete m;
}
void OSD::handle_pg_remove(MOSDPGRemove *m)
{
dout(7) << "handle_pg_remove from " << m->get_source() << endl;
if (!require_same_or_newer_map(m, m->get_epoch())) return;
for (set<pg_t>::iterator it = m->pg_list.begin();
it != m->pg_list.end();
it++) {
pg_t pgid = *it;
PG *pg;
if (pg_map.count(pgid) == 0) {
dout(10) << " don't have pg " << pgid << endl;
continue;
}
pg = _lock_pg(pgid);
dout(10) << *pg << " removing." << endl;
assert(pg->get_role() == -1);
_remove_pg(pgid);
// unlock. there shouldn't be any waiters, since we're a stray, and the pg is presumably clean.
assert(pg_lock_waiters.count(pgid) == 0);
_unlock_pg(pgid);
}
delete m;
}
/*** RECOVERY ***/
/** pull - request object from a peer
*/
void OSD::pull(PG *pg, object_t oid)
{
assert(pg->missing.loc.count(oid));
eversion_t v = pg->missing.missing[oid];
int osd = pg->missing.loc[oid];
dout(7) << *pg << " pull " << oid
<< " v " << v
<< " from osd" << osd
<< endl;
// send op
tid_t tid = ++last_tid;
MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid,
oid, pg->get_pgid(),
osdmap->get_epoch(),
OSD_OP_PULL);
op->set_version(v);
messenger->send_message(op, osdmap->get_inst(osd));
// take note
assert(pg->objects_pulling.count(oid) == 0);
num_pulling++;
pg->objects_pulling[oid] = v;
}
/** push - send object to a peer
*/
void OSD::push(PG *pg, object_t oid, int dest)
{
// read data+attrs
bufferlist bl;
eversion_t v;
int vlen = sizeof(v);
map<string,bufferptr> attrset;
ObjectStore::Transaction t;
t.read(oid, 0, 0, &bl);
t.getattr(oid, "version", &v, &vlen);
t.getattrs(oid, attrset);
unsigned tr = store->apply_transaction(t);
assert(tr == 0); // !!!
// ok
dout(7) << *pg << " push " << oid << " v " << v
<< " size " << bl.length()
<< " to osd" << dest
<< endl;
logger->inc("r_push");
logger->inc("r_pushb", bl.length());
// send
MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid,
oid, pg->info.pgid, osdmap->get_epoch(),
OSD_OP_PUSH);
op->set_offset(0);
op->set_length(bl.length());
op->set_data(bl); // note: claims bl, set length above here!
op->set_version(v);
op->set_attrset(attrset);
messenger->send_message(op, osdmap->get_inst(dest));
}
/** op_pull
* process request to pull an entire object.
* NOTE: called from opqueue.
*/
void OSD::op_pull(MOSDOp *op, PG *pg)
{
const object_t oid = op->get_oid();
const eversion_t v = op->get_version();
int from = op->get_source().num();
dout(7) << *pg << " op_pull " << oid << " v " << op->get_version()
<< " from " << op->get_source()
<< endl;
// is a replica asking? are they missing it?
if (pg->is_primary()) {
// primary
assert(pg->peer_missing.count(from)); // we had better know this, from the peering process.
if (!pg->peer_missing[from].is_missing(oid)) {
dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl;
delete op;
return;
}
// do we have it yet?
if (waitfor_missing_object(op, pg))
return;
} else {
// non-primary
if (pg->missing.is_missing(oid)) {
dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl;
delete op;
return;
}
}
// push it back!
push(pg, oid, op->get_source().num());
}
/** op_push
* NOTE: called from opqueue.
*/
void OSD::op_push(MOSDOp *op, PG *pg)
{
object_t oid = op->get_oid();
eversion_t v = op->get_version();
if (!pg->missing.is_missing(oid)) {
dout(7) << *pg << " op_push not missing " << oid << endl;
return;
}
dout(7) << *pg << " op_push "
<< oid
<< " v " << v
<< " size " << op->get_length() << " " << op->get_data().length()
<< endl;
assert(op->get_data().length() == op->get_length());
// write object and add it to the PG
ObjectStore::Transaction t;
t.remove(oid); // in case old version exists
t.write(oid, 0, op->get_length(), op->get_data());
t.setattrs(oid, op->get_attrset());
t.collection_add(pg->info.pgid, oid);
// close out pull op?
num_pulling--;
if (pg->objects_pulling.count(oid))
pg->objects_pulling.erase(oid);
pg->missing.got(oid, v);
// raise last_complete?
assert(pg->log.complete_to != pg->log.log.end());
while (pg->log.complete_to != pg->log.log.end()) {
if (pg->missing.missing.count(pg->log.complete_to->oid)) break;
if (pg->info.last_complete < pg->log.complete_to->version)
pg->info.last_complete = pg->log.complete_to->version;
pg->log.complete_to++;
}
dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl;
// apply to disk!
t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info));
unsigned r = store->apply_transaction(t);
assert(r == 0);
// am i primary? are others missing this too?
if (pg->is_primary()) {
for (unsigned i=1; i<pg->acting.size(); i++) {
int peer = pg->acting[i];
assert(pg->peer_missing.count(peer));
if (pg->peer_missing[peer].is_missing(oid)) {
// ok, push it, and they (will) have it now.
pg->peer_missing[peer].got(oid, v);
push(pg, oid, peer);
}
}
}
// continue recovery
pg->do_recovery();
// kick waiters
if (pg->waiting_for_missing_object.count(oid))
take_waiters(pg->waiting_for_missing_object[oid]);
delete op;
}
// op_rep_modify
// commit (to disk) callback
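// finish() (the on-commit callback) blocks on 'cond' until ack() has been
// called from the op path, so the commit reply is never sent to the acker
// before the ack-side work (including dropping the data buffer) is done.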
class C_OSD_RepModifyCommit : public Context {
public:
OSD *osd;
MOSDOp *op;
int destosd;
eversion_t pg_last_complete;
Mutex lock;
Cond cond;
bool acked;
bool waiting;
C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) :
osd(o), op(oo), destosd(dosd), pg_last_complete(lc),
acked(false), waiting(false) { }
void finish(int r) {
lock.Lock();
assert(!waiting);
while (!acked) {
waiting = true;
cond.Wait(lock);
}
assert(acked);
lock.Unlock();
osd->op_rep_modify_commit(op, destosd, pg_last_complete);
}
void ack() {
lock.Lock();
assert(!acked);
acked = true;
if (waiting) cond.Signal();
// discard my reference to buffer
op->get_data().clear();
lock.Unlock();
}
};
void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete)
{
// send commit.
dout(10) << "rep_modify_commit on op " << *op
<< ", sending commit to osd" << ackerosd
<< endl;
MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
commit->set_pg_complete_thru(last_complete);
messenger->send_message(commit, osdmap->get_inst(ackerosd));
delete op;
}
// process a modification operation
class C_OSD_WriteCommit : public Context {
public:
OSD *osd;
pg_t pgid;
tid_t rep_tid;
eversion_t pg_last_complete;
C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc) : osd(o), pgid(p), rep_tid(rt), pg_last_complete(lc) {}
void finish(int r) {
osd->op_modify_commit(pgid, rep_tid, pg_last_complete);
}
};
/** op_rep_modify
* process a replicated modify.
* NOTE: called from opqueue.
*/
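// Under chain/splay replication the tail ("acker") gathers acks itself via
// a RepOpGather, while plain primary replication just acks back to
// acting[0].  A chain replica that is not the tail forwards the op to the
// next OSD in the acting set.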
void OSD::op_rep_modify(MOSDOp *op, PG *pg)
{
object_t oid = op->get_oid();
eversion_t nv = op->get_version();
const char *opname = MOSDOp::get_opname(op->get_op());
// check crev
objectrev_t crev = 0;
store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
dout(10) << "op_rep_modify " << opname
<< " " << oid
<< " v " << nv
<< " " << op->get_offset() << "~" << op->get_length()
<< " in " << *pg
<< endl;
// we better not be missing this.
assert(!pg->missing.is_missing(oid));
// prepare our transaction
ObjectStore::Transaction t;
// am i acker?
PG::RepOpGather *repop = 0;
int ackerosd = pg->acting[0];
if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) {
ackerosd = pg->get_acker();
if (pg->is_acker()) {
// i am tail acker.
if (pg->repop_gather.count(op->get_rep_tid())) {
repop = pg->repop_gather[ op->get_rep_tid() ];
} else {
repop = new_repop_gather(pg, op);
}
// infer ack from source
int fromosd = op->get_source().num();
get_repop_gather(repop);
{
//assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice.
repop->waitfor_ack.erase(fromosd);
}
put_repop_gather(pg, repop);
// prepare dest socket
//messenger->prepare_send_message(op->get_client());
}
// chain? forward?
if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) {
// chain rep, not at the tail yet.
int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
int next = myrank+1;
if (next == (int)pg->acting.size())
next = 1;
issue_repop(pg, op, pg->acting[next]);
}
}
// do op?
C_OSD_RepModifyCommit *oncommit = 0;
logger->inc("r_wr");
logger->inc("r_wrb", op->get_length());
if (repop) {
// acker. we'll apply later.
if (op->get_op() != OSD_OP_WRNOOP) {
prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
}
} else {
// middle|replica.
if (op->get_op() != OSD_OP_WRNOOP) {
prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
}
oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete);
// apply log update. and possibly update itself.
unsigned tr = store->apply_transaction(t, oncommit);
if (tr != 0 && // no errors
tr != 2) { // or error on collection_add
cerr << "error applying transaction: r = " << tr << endl;
assert(tr == 0);
}
}
// ack?
if (repop) {
// (logical) local ack. this may induce the actual update.
get_repop_gather(repop);
{
assert(repop->waitfor_ack.count(whoami));
repop->waitfor_ack.erase(whoami);
}
put_repop_gather(pg, repop);
}
else {
// send ack to acker?
if (g_conf.osd_rep != OSD_REP_CHAIN) {
MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false);
messenger->send_message(ack, osdmap->get_inst(ackerosd));
}
// ack myself.
assert(oncommit);
oncommit->ack();
}
}
// =========================================================
// OPS
void OSD::handle_op(MOSDOp *op)
{
const pg_t pgid = op->get_pg();
PG *pg = get_pg(pgid);
logger->set("buf", buffer_total_alloc);
// update qlen stats
hb_stat_ops++;
hb_stat_qlen += pending_ops;
// require same or newer map
if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
// share our map with sender, if they're old
_share_map_incoming(op->get_source_inst(), op->get_map_epoch());
// what kind of op?
bool read = op->get_op() < 10; // read, stat. but not pull.
if (!op->get_source().is_osd()) {
// REGULAR OP (non-replication)
// note original source
op->set_client_inst( op->get_source_inst() );
op->clear_payload(); // and hose encoded payload (in case we forward)
// have pg?
if (!pg) {
dout(7) << "hit non-existent pg "
<< pgid
<< ", waiting" << endl;
waiting_for_pg[pgid].push_back(op);
return;
}
if (read) {
// read. am i the (same) acker?
if (//pg->get_acker() != whoami ||
op->get_map_epoch() < pg->info.history.same_acker_since) {
dout(7) << "acting acker is osd" << pg->get_acker()
<< " since " << pg->info.history.same_acker_since
<< ", dropping" << endl;
assert(op->get_map_epoch() < osdmap->get_epoch());
delete op;
return;
}
} else {
// write. am i the (same) primary?
if (pg->get_primary() != whoami ||
op->get_map_epoch() < pg->info.history.same_primary_since) {
dout(7) << "acting primary is osd" << pg->get_primary()
<< " since " << pg->info.history.same_primary_since
<< ", dropping" << endl;
assert(op->get_map_epoch() < osdmap->get_epoch());
delete op;
return;
}
}
// must be active.
if (!pg->is_active()) {
// replay?
if (op->get_version().version > 0) {
if (op->get_version() > pg->info.last_update) {
dout(7) << *pg << " queueing replay at " << op->get_version()
<< " for " << *op << endl;
pg->replay_queue[op->get_version()] = op;
return;
} else {
dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update
<< " for " << *op
<< ", will queue for WRNOOP" << endl;
}
}
dout(7) << *pg << " not active (yet)" << endl;
pg->waiting_for_active.push_back(op);
return;
}
// missing object?
if (read && op->get_oid().rev > 0) {
// versioned read. hrm.
// are we missing a revision that we might need?
object_t moid = op->get_oid();
if (pick_missing_object_rev(moid, pg)) {
// is there a local revision we might use instead?
object_t loid = op->get_oid();
if (store->pick_object_revision_lt(loid) &&
moid <= loid) {
// we need moid. pull it.
dout(10) << "handle_op read on " << op->get_oid()
<< ", have " << loid
<< ", but need missing " << moid
<< ", pulling" << endl;
pull(pg, moid);
pg->waiting_for_missing_object[moid].push_back(op);
return;
}
dout(10) << "handle_op read on " << op->get_oid()
<< ", have " << loid
<< ", don't need missing " << moid
<< endl;
}
} else {
// live revision. easy.
if (op->get_op() != OSD_OP_PUSH &&
waitfor_missing_object(op, pg)) return;
}
dout(7) << "handle_op " << *op << " in " << *pg << endl;
// balance reads?
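// Read shedding: if our pending-op count is above our own recent average
// queue length, probabilistically forward the read to a replica whose
// advertised queue length (from its heartbeat pings, see peer_qlen) is
// shorter.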
if (read &&
g_conf.osd_balance_reads &&
pg->get_acker() == whoami) {
// test
if (false) {
if (pg->acting.size() > 1) {
int peer = pg->acting[1];
dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl;
messenger->send_message(op, osdmap->get_inst(peer));
return;
}
}
// am i above my average?
float my_avg = (float)hb_stat_qlen / (float)hb_stat_ops;
if (pending_ops > my_avg) {
// is there a peer who is below my average?
for (unsigned i=1; i<pg->acting.size(); ++i) {
int peer = pg->acting[i];
if (peer_qlen.count(peer) &&
peer_qlen[peer] < my_avg) {
// calculate a probability that we should redirect
float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb.
if (drand48() <= p) {
// take the first one
dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg
<< ", p=" << p
<< ", fwd to peer w/ qlen " << peer_qlen[peer]
<< " osd" << peer
<< endl;
messenger->send_message(op, osdmap->get_inst(peer));
return;
}
}
}
}
}
} else {
// REPLICATION OP (it's from another OSD)
// have pg?
if (!pg) {
derr(-7) << "handle_rep_op " << *op
<< " pgid " << pgid << " dne" << endl;
delete op;
//assert(0); // wtf, shouldn't happen.
return;
}
// check osd map: same set, or primary+acker?
if (g_conf.osd_rep == OSD_REP_CHAIN &&
op->get_map_epoch() < pg->info.history.same_since) {
dout(10) << "handle_rep_op pg changed " << pg->info.history
<< " after " << op->get_map_epoch()
<< ", dropping" << endl;
delete op;
return;
}
if (g_conf.osd_rep != OSD_REP_CHAIN &&
(op->get_map_epoch() < pg->info.history.same_primary_since ||
op->get_map_epoch() < pg->info.history.same_acker_since)) {
dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history
<< " after " << op->get_map_epoch()
<< ", dropping" << endl;
delete op;
return;
}
assert(pg->get_role() >= 0);
dout(7) << "handle_rep_op " << op << " in " << *pg << endl;
}
if (g_conf.osd_maxthreads < 1) {
_lock_pg(pgid);
do_op(op, pg); // do it now
_unlock_pg(pgid);
} else {
// queue for worker threads
if (read)
enqueue_op(0, op); // no locking needed for reads
else
enqueue_op(pgid, op);
}
}
void OSD::handle_op_reply(MOSDOpReply *op)
{
if (op->get_map_epoch() < boot_epoch) {
dout(3) << "replica op reply from before boot" << endl;
delete op;
return;
}
// must be a rep op.
assert(op->get_source().is_osd());
// make sure we have the pg
const pg_t pgid = op->get_pg();
PG *pg = get_pg(pgid);
// require same or newer map
if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
// share our map with sender, if they're old
_share_map_incoming(op->get_source_inst(), op->get_map_epoch());
if (!pg) {
// hmm, no pg. drop the reply.
delete op;
return;
}
if (g_conf.osd_maxthreads < 1) {
_lock_pg(pgid);
do_op(op, pg); // do it now
_unlock_pg(pgid);
} else {
enqueue_op(pgid, op); // queue for worker threads
}
}
/*
* enqueue called with osd_lock held
*/
void OSD::enqueue_op(pg_t pgid, Message *op)
{
while (pending_ops > g_conf.osd_max_opq) {
dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl;
op_queue_cond.Wait(osd_lock);
}
op_queue[pgid].push_back(op);
pending_ops++;
logger->set("opq", pending_ops);
threadpool->put_op(pgid);
}
/*
* NOTE: dequeue called in worker thread, without osd_lock
*/
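// A pgid of 0 marks the shared read queue: reads are dequeued without
// taking the PG lock, while write/replication ops hold their PG lock for
// the duration of do_op.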
void OSD::dequeue_op(pg_t pgid)
{
Message *op = 0;
PG *pg = 0;
osd_lock.Lock();
{
if (pgid) {
// lock pg
pg = _lock_pg(pgid);
}
// get pending op
list<Message*> &ls = op_queue[pgid];
assert(!ls.empty());
op = ls.front();
ls.pop_front();
if (pgid) {
dout(10) << "dequeue_op " << op << " write pg " << pgid
<< ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
} else {
dout(10) << "dequeue_op " << op << " read "
<< ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
}
if (ls.empty())
op_queue.erase(pgid);
}
osd_lock.Unlock();
// do it
do_op(op, pg);
// finish
osd_lock.Lock();
{
if (pgid) {
// unlock pg
_unlock_pg(pgid);
}
dout(10) << "dequeue_op " << op << " finish" << endl;
assert(pending_ops > 0);
if (pending_ops > g_conf.osd_max_opq)
op_queue_cond.Signal();
pending_ops--;
logger->set("opq", pending_ops);
if (pending_ops == 0 && waiting_for_no_ops)
no_pending_ops.Signal();
}
osd_lock.Unlock();
}
/** do_op - do an op
* object lock will be held (if multithreaded)
* osd_lock NOT held.
*/
void OSD::do_op(Message *m, PG *pg)
{
//dout(15) << "do_op " << *m << endl;
if (m->get_type() == MSG_OSD_OP) {
MOSDOp *op = (MOSDOp*)m;
logger->inc("op");
switch (op->get_op()) {
// reads
case OSD_OP_READ:
op_read(op);//, pg);
break;
case OSD_OP_STAT:
op_stat(op);//, pg);
break;
// rep stuff
case OSD_OP_PULL:
op_pull(op, pg);
break;
case OSD_OP_PUSH:
op_push(op, pg);
break;
// writes
case OSD_OP_WRNOOP:
case OSD_OP_WRITE:
case OSD_OP_ZERO:
case OSD_OP_DELETE:
case OSD_OP_TRUNCATE:
case OSD_OP_WRLOCK:
case OSD_OP_WRUNLOCK:
case OSD_OP_RDLOCK:
case OSD_OP_RDUNLOCK:
case OSD_OP_UPLOCK:
case OSD_OP_DNLOCK:
if (op->get_source().is_osd())
op_rep_modify(op, pg);
else
op_modify(op, pg);
break;
default:
assert(0);
}
}
else if (m->get_type() == MSG_OSD_OPREPLY) {
// must be replication.
MOSDOpReply *r = (MOSDOpReply*)m;
tid_t rep_tid = r->get_rep_tid();
if (pg->repop_gather.count(rep_tid)) {
// oh, good.
int fromosd = r->get_source().num();
repop_ack(pg, pg->repop_gather[rep_tid],
r->get_result(), r->get_commit(),
fromosd,
r->get_pg_complete_thru());
delete m;
} else {
// early ack.
pg->waiting_for_repop[rep_tid].push_back(r);
}
} else
assert(0);
}
void OSD::wait_for_no_ops()
{
if (pending_ops > 0) {
dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl;
waiting_for_no_ops = true;
while (pending_ops > 0)
no_pending_ops.Wait(osd_lock);
waiting_for_no_ops = false;
assert(pending_ops == 0);
}
dout(7) << "wait_for_no_ops - none" << endl;
}
// ==============================
// Object locking
//
// If the target object of op is locked for writing by another client, block_if_wrlocked()
// puts op on the waiting_for_wr_unlock queue.
// Returns true if the object was locked (op queued), false otherwise.
//
bool OSD::block_if_wrlocked(MOSDOp* op)
{
object_t oid = op->get_oid();
entity_name_t source;
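  // a write lock, if any, is recorded as a "wrlock" attr holding the locker's entity_name_t.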
int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t));
//cout << "getattr returns " << len << " on " << oid << endl;
if (len == sizeof(source) &&
source != op->get_client()) {
//the object is locked for writing by someone else -- add the op to the waiting queue
waiting_for_wr_unlock[oid].push_back(op);
return true;
}
return false; //the object wasn't locked, so the operation can be handled right away
}
// ===============================
// OPS
/*
int OSD::list_missing_revs(object_t oid, set<object_t>& revs, PG *pg)
{
int c = 0;
oid.rev = 0;
map<object_t,eversion_t>::iterator p = pg->missing.missing.lower_bound(oid);
if (p == pg->missing.missing.end())
return 0; // clearly not
while (p->first.ino == oid.ino &&
p->first.bno == oid.bno) {
revs.insert(p->first);
c++;
}
return c;
}*/
bool OSD::pick_missing_object_rev(object_t& oid, PG *pg)
{
map<object_t,eversion_t>::iterator p = pg->missing.missing.upper_bound(oid);
if (p == pg->missing.missing.end())
return false; // clearly no candidate
if (p->first.ino == oid.ino && p->first.bno == oid.bno) {
oid = p->first; // yes! it's an upper bound revision for me.
return true;
}
return false;
}
bool OSD::pick_object_rev(object_t& oid)
{
object_t t = oid;
if (!store->pick_object_revision_lt(t))
return false; // we have no revisions of this object!
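  // the store gave us a candidate revision; use it only if its clone revision
  // (crev) is at or before the rev the caller asked for.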
objectrev_t crev;
int r = store->getattr(t, "crev", &crev, sizeof(crev));
assert(r >= 0);
if (crev <= oid.rev) {
dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << endl;
oid = t;
return true;
}
return false;
}
bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg)
{
const object_t oid = op->get_oid();
// are we missing the object?
if (pg->missing.missing.count(oid)) {
// we don't have it (yet).
eversion_t v = pg->missing.missing[oid];
if (pg->objects_pulling.count(oid)) {
dout(7) << "missing "
<< oid
<< " v " << v
<< " in " << *pg
<< ", already pulling"
<< endl;
} else {
dout(7) << "missing "
<< oid
<< " v " << v
<< " in " << *pg
<< ", pulling"
<< endl;
pull(pg, oid);
}
pg->waiting_for_missing_object[oid].push_back(op);
return true;
}
return false;
}
// READ OPS
/** op_read
* client read op
* NOTE: called from opqueue.
*/
void OSD::op_read(MOSDOp *op)//, PG *pg)
{
object_t oid = op->get_oid();
  // if the target object is locked for writing by another client, put 'op' on the waiting queue.
  // this applies to _any_ op type -- only the locker itself gets through!
if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks
dout(10) << "op_read " << oid
<< " " << op->get_offset() << "~" << op->get_length()
//<< " in " << *pg
<< endl;
long r = 0;
bufferlist bl;
if (oid.rev && !pick_object_rev(oid)) {
// we have no revision for this request.
r = -EEXIST;
} else {
// read into a buffer
r = store->read(oid,
op->get_offset(), op->get_length(),
bl);
}
// set up reply
MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
if (r >= 0) {
reply->set_result(0);
reply->set_data(bl);
reply->set_length(r);
logger->inc("c_rd");
logger->inc("c_rdb", r);
} else {
reply->set_result(r); // error
reply->set_length(0);
}
dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl;
logger->inc("rd");
if (r >= 0) logger->inc("rdb", r);
// send it
messenger->send_message(reply, op->get_client_inst());
delete op;
}
/** op_stat
* client stat
* NOTE: called from opqueue
*/
void OSD::op_stat(MOSDOp *op)//, PG *pg)
{
object_t oid = op->get_oid();
  // if the target object is locked for writing by another client, put 'op' on the waiting queue
if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks
struct stat st;
  memset(&st, 0, sizeof(st));
int r = 0;
if (oid.rev && !pick_object_rev(oid)) {
// we have no revision for this request.
r = -EEXIST;
} else {
r = store->stat(oid, &st);
}
dout(3) << "op_stat on " << oid
<< " r = " << r
<< " size = " << st.st_size
//<< " in " << *pg
<< endl;
MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true);
reply->set_object_size(st.st_size);
messenger->send_message(reply, op->get_client_inst());
logger->inc("stat");
delete op;
}
/*********
* new repops
*/
void OSD::get_repop_gather(PG::RepOpGather *repop)
{
//repop->lock.Lock();
dout(10) << "get_repop " << *repop << endl;
}
void OSD::apply_repop(PG *pg, PG::RepOpGather *repop)
{
dout(10) << "apply_repop applying update on " << *repop << endl;
assert(!repop->applied);
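  // hand the accumulated transaction to the store; oncommit fires once it is
  // durable and records our local commit (see op_modify_commit).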
Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete);
unsigned r = store->apply_transaction(repop->t, oncommit);
if (r)
dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << endl;
// discard my reference to buffer
repop->op->get_data().clear();
repop->applied = true;
}
void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop)
{
dout(10) << "put_repop " << *repop << endl;
// commit?
if (repop->can_send_commit() &&
repop->op->wants_commit()) {
// send commit.
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true);
dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl;
messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_commit = true;
}
// ack?
else if (repop->can_send_ack() &&
repop->op->wants_ack()) {
// apply
apply_repop(pg, repop);
// send ack
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false);
dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl;
messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_ack = true;
utime_t now = g_clock.now();
now -= repop->start;
logger->finc("rlsum", now);
logger->inc("rlnum", 1);
}
// done.
if (repop->can_delete()) {
// adjust peers_complete_thru
if (!repop->pg_complete_thru.empty()) {
eversion_t min = pg->info.last_complete; // hrm....
for (unsigned i=0; i<pg->acting.size(); i++) {
if (repop->pg_complete_thru[pg->acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want.
min = repop->pg_complete_thru[pg->acting[i]];
}
if (min > pg->peers_complete_thru) {
dout(10) << "put_repop peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << endl;
pg->peers_complete_thru = min;
}
}
dout(10) << "put_repop deleting " << *repop << endl;
//repop->lock.Unlock();
assert(pg->repop_gather.count(repop->rep_tid));
pg->repop_gather.erase(repop->rep_tid);
delete repop->op;
delete repop;
} else {
//repop->lock.Unlock();
}
}
void OSD::issue_repop(PG *pg, MOSDOp *op, int osd)
{
object_t oid = op->get_oid();
dout(7) << " issue_repop rep_tid " << op->get_rep_tid()
<< " in " << *pg
<< " o " << oid
<< " to osd" << osd
<< endl;
// forward the write/update/whatever
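  // replicate the client's op as-is: same object, offset/length, data and
  // version, tagged with our rep_tid.  peers_complete_thru rides along as the
  // point through which the replica may trim its pg log.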
MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid,
oid,
pg->get_pgid(),
osdmap->get_epoch(),
op->get_op());
wr->get_data() = op->get_data(); // _copy_ bufferlist
wr->set_length(op->get_length());
wr->set_offset(op->get_offset());
wr->set_version(op->get_version());
wr->set_rep_tid(op->get_rep_tid());
wr->set_pg_trim_to(pg->peers_complete_thru);
messenger->send_message(wr, osdmap->get_inst(osd));
}
PG::RepOpGather *OSD::new_repop_gather(PG *pg,
MOSDOp *op)
{
dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl;
PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(),
op->get_version(),
pg->info.last_complete);
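  // track which osds we still need an ack and a commit from for this update.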
// osds. commits all come to me.
for (unsigned i=0; i<pg->acting.size(); i++) {
int osd = pg->acting[i];
repop->osds.insert(osd);
repop->waitfor_commit.insert(osd);
}
// acks vary:
if (g_conf.osd_rep == OSD_REP_CHAIN) {
// chain rep.
// there's my local ack...
repop->osds.insert(whoami);
repop->waitfor_ack.insert(whoami);
repop->waitfor_commit.insert(whoami);
// also, the previous guy will ack to me
int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
if (myrank > 0) {
int osd = pg->acting[ myrank-1 ];
repop->osds.insert(osd);
repop->waitfor_ack.insert(osd);
repop->waitfor_commit.insert(osd);
}
} else {
// primary, splay. all osds ack to me.
for (unsigned i=0; i<pg->acting.size(); i++) {
int osd = pg->acting[i];
repop->waitfor_ack.insert(osd);
}
}
repop->start = g_clock.now();
pg->repop_gather[ repop->rep_tid ] = repop;
// anyone waiting? (acks that got here before the op did)
if (pg->waiting_for_repop.count(repop->rep_tid)) {
take_waiters(pg->waiting_for_repop[repop->rep_tid]);
pg->waiting_for_repop.erase(repop->rep_tid);
}
return repop;
}
void OSD::repop_ack(PG *pg, PG::RepOpGather *repop,
int result, bool commit,
int fromosd, eversion_t pg_complete_thru)
{
MOSDOp *op = repop->op;
dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op
<< " result " << result << " commit " << commit << " from osd" << fromosd
<< " in " << *pg
<< endl;
get_repop_gather(repop);
{
if (commit) {
// commit
assert(repop->waitfor_commit.count(fromosd));
repop->waitfor_commit.erase(fromosd);
repop->waitfor_ack.erase(fromosd);
repop->pg_complete_thru[fromosd] = pg_complete_thru;
} else {
// ack
repop->waitfor_ack.erase(fromosd);
}
}
put_repop_gather(pg, repop);
}
/** op_modify_commit
* transaction commit on the acker.
*/
void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru)
{
PG *pg = lock_pg(pgid);
if (pg) {
if (pg->repop_gather.count(rep_tid)) {
PG::RepOpGather *repop = pg->repop_gather[rep_tid];
dout(10) << "op_modify_commit " << *repop->op << endl;
get_repop_gather(repop);
{
assert(repop->waitfor_commit.count(whoami));
repop->waitfor_commit.erase(whoami);
repop->pg_complete_thru[whoami] = pg_complete_thru;
}
put_repop_gather(pg, repop);
dout(10) << "op_modify_commit done on " << repop << endl;
} else {
dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl;
}
unlock_pg(pgid);
} else {
dout(10) << "op_modify_commit pg " << pgid << " dne" << endl;
}
}
/** op_modify
* process client modify op
* NOTE: called from opqueue.
*/
void OSD::op_modify(MOSDOp *op, PG *pg)
{
object_t oid = op->get_oid();
const char *opname = MOSDOp::get_opname(op->get_op());
// are any peers missing this?
for (unsigned i=1; i<pg->acting.size(); i++) {
int peer = pg->acting[i];
if (pg->peer_missing.count(peer) &&
pg->peer_missing[peer].is_missing(oid)) {
// push it before this update.
      // FIXME: this is probably more work than necessary (eg if we're about to overwrite)
pg->peer_missing[peer].got(oid);
push(pg, oid, peer);
}
}
// dup op?
if (pg->log.logged_req(op->get_reqid())) {
dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid()
<< ", doing WRNOOP" << endl;
op->set_op(OSD_OP_WRNOOP);
opname = MOSDOp::get_opname(op->get_op());
}
// locked by someone else?
// for _any_ op type -- eg only the locker can unlock!
if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush
block_if_wrlocked(op))
return; // op will be handled later, after the object unlocks
// check crev
objectrev_t crev = 0;
store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
// assign version
eversion_t clone_version;
eversion_t nv = pg->log.top;
if (op->get_op() != OSD_OP_WRNOOP) {
nv.epoch = osdmap->get_epoch();
nv.version++;
assert(nv > pg->info.last_update);
assert(nv > pg->log.top);
// will clone?
if (crev && op->get_rev() && op->get_rev() > crev) {
clone_version = nv;
nv.version++;
}
if (op->get_version().version) {
// replay!
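      // the op carries the version assigned on an earlier attempt; never
      // assign a lower one.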
if (nv.version < op->get_version().version) {
nv.version = op->get_version().version;
// clone?
if (crev && op->get_rev() && op->get_rev() > crev) {
// backstep clone
clone_version = nv;
clone_version.version--;
}
}
}
}
// set version in op, for benefit of client and our eventual reply
op->set_version(nv);
dout(10) << "op_modify " << opname
<< " " << oid
<< " v " << nv
<< " crev " << crev
<< " rev " << op->get_rev()
<< " " << op->get_offset() << "~" << op->get_length()
<< endl;
if (op->get_op() == OSD_OP_WRITE) {
logger->inc("c_wr");
logger->inc("c_wrb", op->get_length());
}
// share latest osd map?
osd_lock.Lock();
{
for (unsigned i=1; i<pg->acting.size(); i++) {
int osd = pg->acting[i];
_share_map_outgoing( osdmap->get_inst(osd) );
}
}
osd_lock.Unlock();
// issue replica writes
PG::RepOpGather *repop = 0;
bool alone = (pg->acting.size() == 1);
tid_t rep_tid = ++last_tid;
op->set_rep_tid(rep_tid);
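  // rep_tid ties the replica writes to the acks/commits that come back for
  // this update.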
if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) {
// chain rep. send to #2 only.
int next = pg->acting[1];
if (pg->acting.size() > 2)
next = pg->acting[2];
issue_repop(pg, op, next);
}
else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) {
// splay rep. send to rest.
for (unsigned i=1; i<pg->acting.size(); ++i)
//for (unsigned i=pg->acting.size()-1; i>=1; --i)
issue_repop(pg, op, pg->acting[i]);
} else {
// primary rep, or alone.
repop = new_repop_gather(pg, op);
// send to rest.
if (!alone)
for (unsigned i=1; i<pg->acting.size(); i++)
issue_repop(pg, op, pg->acting[i]);
}
if (repop) {
// we are acker.
if (op->get_op() != OSD_OP_WRNOOP) {
// log and update later.
prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
}
// (logical) local ack.
// (if alone, this will apply the update.)
get_repop_gather(repop);
{
assert(repop->waitfor_ack.count(whoami));
repop->waitfor_ack.erase(whoami);
}
put_repop_gather(pg, repop);
} else {
// chain or splay. apply.
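    // we are not the gather point: the forward to the other replicas happened
    // above, so apply the update locally now.  the RepModifyCommit context
    // (targeting pg->get_acker()) covers the ack and the eventual commit.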
ObjectStore::Transaction t;
prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(),
pg->info.last_complete);
unsigned r = store->apply_transaction(t, oncommit);
  if (r != 0 &&   // anything other than success...
      r != 2) {   // ...or the tolerated collection_add error code is fatal
cerr << "error applying transaction: r = " << r << endl;
assert(r == 0);
}
oncommit->ack();
}
}
void OSD::prepare_log_transaction(ObjectStore::Transaction& t,
MOSDOp *op, eversion_t& version,
objectrev_t crev, objectrev_t rev,
PG *pg,
eversion_t trim_to)
{
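  // build the pg log entry (plus a CLONE entry if this update clones the
  // object), append it to the in-memory log, and write it to the on-disk pg
  // log, trimming through trim_to.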
const object_t oid = op->get_oid();
// clone entry?
if (crev && rev && rev > crev) {
eversion_t cv = version;
cv.version--;
PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid());
pg->log.add(cloneentry);
dout(10) << "prepare_log_transaction " << op->get_op()
<< " " << cloneentry
<< " in " << *pg << endl;
}
// actual op
int opcode = PG::Log::Entry::MODIFY;
if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE;
PG::Log::Entry logentry(opcode, oid, version, op->get_reqid());
dout(10) << "prepare_log_transaction " << op->get_op()
<< " " << logentry
<< " in " << *pg << endl;
// append to log
assert(version > pg->log.top);
pg->log.add(logentry);
assert(pg->log.top == version);
dout(10) << "prepare_log_transaction appended to " << *pg << endl;
// write to pg log on disk
pg->append_log(t, logentry, trim_to);
}
/** prepare_op_transaction
* apply an op to the store wrapped in a transaction.
*/
void OSD::prepare_op_transaction(ObjectStore::Transaction& t,
MOSDOp *op, eversion_t& version,
objectrev_t crev, objectrev_t rev,
PG *pg)
{
const object_t oid = op->get_oid();
const pg_t pgid = op->get_pg();
bool did_clone = false;
dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() )
<< " " << oid
<< " v " << version
<< " crev " << crev
<< " rev " << rev
<< " in " << *pg << endl;
// WRNOOP does nothing.
if (op->get_op() == OSD_OP_WRNOOP)
return;
// raise last_complete?
if (pg->info.last_complete == pg->info.last_update)
pg->info.last_complete = version;
// raise last_update.
assert(version > pg->info.last_update);
pg->info.last_update = version;
// write pg info
t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info));
// clone?
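  // "crev" records the revision this object was last cloned at; if the op
  // targets a newer rev, preserve the current contents under a rev-stamped
  // object id before applying the update.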
if (crev && rev && rev > crev) {
object_t noid = oid;
noid.rev = rev;
dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl;
t.clone(oid, noid);
did_clone = true;
}
// apply the op
switch (op->get_op()) {
case OSD_OP_WRLOCK:
{ // lock object
//r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit);
t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t));
}
break;
case OSD_OP_WRUNLOCK:
{ // unlock objects
//r = store->rmattr(oid, "wrlock", oncommit);
t.rmattr(oid, "wrlock");
// unblock all operations that were waiting for this object to become unlocked
if (waiting_for_wr_unlock.count(oid)) {
take_waiters(waiting_for_wr_unlock[oid]);
waiting_for_wr_unlock.erase(oid);
}
}
break;
case OSD_OP_WRITE:
{ // write
assert(op->get_data().length() == op->get_length());
bufferlist bl;
bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time!
//if (oid < 100000000000000ULL) // hack hack-- don't write client data
t.write( oid, op->get_offset(), op->get_length(), bl );
}
break;
case OSD_OP_ZERO:
{
assert(0); // are you sure this is what you want?
// zero, remove, or truncate?
struct stat st;
int r = store->stat(oid, &st);
if (r >= 0) {
if (op->get_offset() + op->get_length() >= st.st_size) {
	if (op->get_offset())
	  t.truncate(oid, op->get_offset());  // zeroing through the tail: just drop it
else
t.remove(oid);
} else {
// zero. the dumb way. FIXME.
bufferptr bp(op->get_length());
bp.zero();
bufferlist bl;
bl.push_back(bp);
t.write(oid, op->get_offset(), op->get_length(), bl);
}
} else {
// noop?
dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << endl;
}
}
break;
case OSD_OP_TRUNCATE:
{ // truncate
//r = store->truncate(oid, op->get_offset());
t.truncate(oid, op->get_length() );
}
break;
case OSD_OP_DELETE:
{ // delete
//r = store->remove(oid);
t.remove(oid);
}
break;
default:
assert(0);
}
// object collection, version
if (op->get_op() == OSD_OP_DELETE) {
// remove object from c
t.collection_remove(pgid, oid);
} else {
// add object to c
t.collection_add(pgid, oid);
// object version
t.setattr(oid, "version", &version, sizeof(version));
// set object crev
if (crev == 0 || // new object
did_clone) // we cloned
t.setattr(oid, "crev", &rev, sizeof(rev));
}
}