mirror of
https://github.com/ceph/ceph
synced 2025-01-08 20:21:33 +00:00
284a61df43
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1175 29311d96-e01e-0410-9327-a35deaab8ce9
831 lines
22 KiB
C++
831 lines
22 KiB
C++
|
|
#include "Objecter.h"
|
|
#include "osd/OSDMap.h"
|
|
#include "mon/MonMap.h"
|
|
|
|
#include "msg/Messenger.h"
|
|
#include "msg/Message.h"
|
|
|
|
#include "messages/MOSDOp.h"
|
|
#include "messages/MOSDOpReply.h"
|
|
#include "messages/MOSDMap.h"
|
|
#include "messages/MOSDGetMap.h"
|
|
|
|
#include "messages/MOSDFailure.h"
|
|
|
|
#include <errno.h>
|
|
|
|
#include "config.h"
|
|
#undef dout
// Debug-output helpers for this file.  Each expands to an unbraced `if` that
// guards a stream expression, printing a prefix made of the current clock
// value, this messenger's address and ".objecter ".  Output is produced when
// the level is <= g_conf.debug or <= g_conf.debug_objecter.
//
// The level argument is parenthesized so that compound expressions (e.g. a
// conditional expression) compare as a whole.  Because the expansion is a bare
// `if`, do not use these directly as the body of an `if` that has an `else`.
#define dout(x) if ((x) <= g_conf.debug || (x) <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
#define derr(x) if ((x) <= g_conf.debug || (x) <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
|
|
|
|
|
|
// messages ------------------------------
|
|
|
|
// Route an incoming message to its handler according to the message type.
// OSD op replies and OSD map updates are the only types handled here; any
// other type is logged and trips an assert.
void Objecter::dispatch(Message *m)
{
  if (m->get_type() == MSG_OSD_OPREPLY) {
    handle_osd_op_reply(static_cast<MOSDOpReply*>(m));
    return;
  }

  if (m->get_type() == MSG_OSD_MAP) {
    handle_osd_map(static_cast<MOSDMap*>(m));
    return;
  }

  // unknown type
  dout(1) << "don't know message type " << m->get_type() << endl;
  assert(0);
}
|
|
|
|
// Apply the OSD map epochs carried by `m` on top of our current osdmap.
//
// Epochs are applied one at a time, starting at current epoch + 1.  For each
// epoch an incremental map is preferred, then a full map; if neither is
// present a MOSDGetMap request is sent to a monitor and processing stops.
// After each applied epoch the open PGs are rescanned, and once the loop ends
// any requests in PGs that changed are kicked.  The message is deleted here.
void Objecter::handle_osd_map(MOSDMap *m)
{
  assert(osdmap);

  // Nothing newer than what we already have: log, drop the message, done.
  if (m->get_last() <= osdmap->get_epoch()) {
    dout(3) << "handle_osd_map ignoring epochs ["
            << m->get_first() << "," << m->get_last()
            << "] <= " << osdmap->get_epoch() << endl;
    delete m;
    return;
  }

  dout(3) << "handle_osd_map got epochs ["
          << m->get_first() << "," << m->get_last()
          << "] > " << osdmap->get_epoch()
          << endl;

  set<pg_t> changed_pgs;

  epoch_t e = osdmap->get_epoch() + 1;
  while (e <= m->get_last()) {
    // Neither form of this epoch is in the message: ask a monitor and stop.
    if (m->incremental_maps.count(e) == 0 && m->maps.count(e) == 0) {
      dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl;
      int mon = monmap->pick_mon();
      messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
                              monmap->get_inst(mon));
      break;
    }

    if (m->incremental_maps.count(e)) {
      dout(3) << "handle_osd_map decoding incremental epoch " << e << endl;
      OSDMap::Incremental inc;
      int off = 0;
      inc.decode(m->incremental_maps[e], off);
      osdmap->apply_incremental(inc);

      // tell the messenger about every address this increment marks down
      map<int,entity_inst_t>::iterator down = inc.new_down.begin();
      while (down != inc.new_down.end()) {
        messenger->mark_down(down->second.addr);
        ++down;
      }
    }
    else {
      dout(3) << "handle_osd_map decoding full epoch " << e << endl;
      osdmap->decode(m->maps[e]);
    }

    // collect PGs whose mapping changed under the new epoch
    scan_pgs(changed_pgs);

    assert(e == osdmap->get_epoch());
    ++e;
  }

  // resubmit requests that may now be waiting on the wrong osds
  if (!changed_pgs.empty())
    kick_requests(changed_pgs);

  delete m;
}
|
|
|
|
// Recompute the acting set of every PG in pg_map against the current osdmap
// and add to `changed_pgs` those whose mapping changed significantly.
//
// pg.acting is always updated when the mapping differs.  Whether the change
// counts as significant depends on g_conf.osd_rep:
//   OSD_REP_PRIMARY - only if the first osd differs (or either set is empty)
//   OSD_REP_SPLAY   - only if the first osd or the "acker" slot differs
//                     (slot 1 when the set has more than one osd, else slot 0)
//   otherwise       - any difference counts (includes OSD_REP_CHAIN)
void Objecter::scan_pgs(set<pg_t>& changed_pgs)
{
  dout(10) << "scan_pgs" << endl;

  hash_map<pg_t,PG>::iterator it = pg_map.begin();
  for ( ; it != pg_map.end(); ++it) {
    pg_t pgid = it->first;
    PG& pg = it->second;

    // Filled with the new mapping first; after the swap below it holds the
    // previous one.
    vector<int> previous;
    osdmap->pg_to_acting_osds(pgid, previous);

    if (previous == pg.acting)
      continue;  // mapping is identical

    previous.swap(pg.acting);

    const bool both_nonempty = !previous.empty() && !pg.acting.empty();
    bool insignificant = false;

    if (g_conf.osd_rep == OSD_REP_PRIMARY) {
      // primary unchanged?
      insignificant = both_nonempty &&
                      previous[0] == pg.acting[0];
    }
    else if (g_conf.osd_rep == OSD_REP_SPLAY) {
      // primary and acker both unchanged?
      insignificant = both_nonempty &&
                      previous[0] == pg.acting[0] &&
                      previous[previous.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0];
    }
    // any other replication mode: every change matters

    if (insignificant)
      continue;

    dout(10) << "scan_pgs pg " << pgid
             << " (" << pg.active_tids << ")"
             << " " << previous << " -> " << pg.acting
             << endl;
    changed_pgs.insert(pgid);
  }
}
|
|
|
|
// Resubmit every in-flight operation belonging to the PGs in `changed_pgs`.
//
// For each PG the set of active tids is taken over and the PG is closed; the
// submit helpers called below look the PG up again via get_pg().  Each tid is
// then looked up in op_modify, op_read and op_stat (in that order) and
// removed from the table it is found in:
//   - modify with a recorded version (already acked): replayed under the same
//     tid, unless it is a plain write and objecter_buffer_uncommitted is off,
//     in which case it is only logged;
//   - modify still waiting for an ack: resubmitted under the same tid;
//   - read / stat: resubmitted (the submit helpers allocate a new tid).
// A tid found in none of the tables trips an assert.
//
// NOTE(review): modifyx_submit() increments num_unacked/num_uncommitted on
// every call, including these resubmissions -- confirm the counters are meant
// to tolerate that.
void Objecter::kick_requests(set<pg_t>& changed_pgs)
{
  dout(10) << "kick_requests in pgs " << changed_pgs << endl;

  set<pg_t>::iterator pgit = changed_pgs.begin();
  for ( ; pgit != changed_pgs.end(); ++pgit) {
    pg_t pgid = *pgit;

    // grab the outstanding tids before the pg goes away
    set<tid_t> pending;
    pending.swap( pg_map[pgid].active_tids );
    close_pg( pgid );

    set<tid_t>::iterator t = pending.begin();
    for ( ; t != pending.end(); ++t) {
      tid_t tid = *t;

      // ---- modify ----
      if (op_modify.count(tid)) {
        OSDModify *wr = op_modify[tid];
        op_modify.erase(tid);

        if (wr->tid_version.count(tid)) {
          // acked earlier, commit still outstanding
          if (wr->op != OSD_OP_WRITE ||
              g_conf.objecter_buffer_uncommitted) {
            dout(0) << "kick_requests missing commit, replay write " << tid
                    << " v " << wr->tid_version[tid] << endl;
            modifyx_submit(wr, wr->waitfor_commit[tid], tid);
          } else {
            dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl;
          }
        }
        else if (wr->waitfor_ack.count(tid)) {
          dout(0) << "kick_requests missing ack, resub write " << tid << endl;
          modifyx_submit(wr, wr->waitfor_ack[tid], tid);
        }
        continue;
      }

      // ---- read ----
      if (op_read.count(tid)) {
        OSDRead *rd = op_read[tid];
        op_read.erase(tid);
        dout(0) << "kick_requests resub read " << tid << endl;

        // resubmit under a new tid, then forget the old one
        readx_submit(rd, rd->ops[tid]);
        rd->ops.erase(tid);
        continue;
      }

      // ---- stat ----
      if (op_stat.count(tid)) {
        OSDStat *st = op_stat[tid];
        op_stat.erase(tid);

        dout(0) << "kick_requests resub stat " << tid << endl;

        stat_submit(st);
        continue;
      }

      // an active tid must belong to one of the tables above
      assert(0);
    }
  }
}
|
|
|
|
|
|
|
|
// Forward an OSD op reply to the read, stat or modify reply handler based on
// the op code carried in the reply.  An unrecognized op code trips an assert.
void Objecter::handle_osd_op_reply(MOSDOpReply *m)
{
  switch (m->get_op()) {
  case OSD_OP_READ:
    handle_osd_read_reply(m);
    return;

  case OSD_OP_STAT:
    handle_osd_stat_reply(m);
    return;

  // everything below is handled as a modify
  case OSD_OP_WRNOOP:
  case OSD_OP_WRITE:
  case OSD_OP_ZERO:
  case OSD_OP_DELETE:
  case OSD_OP_WRUNLOCK:
  case OSD_OP_WRLOCK:
  case OSD_OP_RDLOCK:
  case OSD_OP_RDUNLOCK:
  case OSD_OP_UPLOCK:
  case OSD_OP_DNLOCK:
    break;

  default:
    assert(0);
    return;
  }

  handle_osd_modify_reply(m);
}
|
|
|
|
|
|
|
|
// stat -----------------------------------
|
|
|
|
// Start a stat of object `oid`.
//
// Allocates an OSDStat that writes its result through `size`, gives it a
// single zero-length extent carrying the layout `ol` and revision `rev`,
// records `onfinish` as its completion, and submits it.
// Returns the tid reported by stat_submit().
tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout& ol, Context *onfinish,
                     objectrev_t rev)
{
  OSDStat *st = new OSDStat(size);
  st->onfinish = onfinish;

  st->extents.push_back(ObjectExtent(oid, 0, 0));
  ObjectExtent &ex = st->extents.front();
  ex.layout = ol;
  ex.rev = rev;

  return stat_submit(st);
}
|
|
|
|
// Build and send an OSD_OP_STAT request for the first extent of `st`, and
// register the operation as in flight.
//
// A fresh tid is taken from last_tid.  The request is sent to the PG's acker
// if it has one (acker() >= 0).  Whether or not it was sent, the op is
// recorded in op_stat and in the PG's active_tids so that a later
// kick_requests() can resubmit it.
//
// Returns the tid assigned to the request.
tid_t Objecter::stat_submit(OSDStat *st)
{
  // find OSD
  ObjectExtent &ex = st->extents.front();
  PG &pg = get_pg( ex.layout.pgid );

  // send
  last_tid++;
  assert(client_inc >= 0);
  MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
                         ex.oid, ex.layout, osdmap->get_epoch(),
                         OSD_OP_STAT);

  dout(10) << "stat_submit " << st << " tid " << last_tid
           << " oid " << ex.oid
           << " " << ex.layout
           << " osd" << pg.acker()
           << endl;

  if (pg.acker() >= 0)
    messenger->send_message(m, osdmap->get_inst(pg.acker()));
  else
    delete m;  // never handed to the messenger; a resubmit builds a new message

  // add to gather set
  st->tid = last_tid;
  op_stat[last_tid] = st;

  pg.active_tids.insert(last_tid);

  return last_tid;
}
|
|
|
|
// Process the reply to a stat request.
//
// Replies for tids not present in op_stat are logged as stray and dropped.
// Otherwise the op is removed from op_stat and from its PG's active tids
// (closing the PG when that leaves it empty).  A result of -EAGAIN causes the
// stat to be resubmitted.  Any other result completes the op: *size is set to
// -1 for a negative result or to the reported object size, the OSDStat is
// freed, and the completion (if any) is invoked with the result and deleted.
// The reply message is always deleted before returning.
void Objecter::handle_osd_stat_reply(MOSDOpReply *m)
{
  tid_t tid = m->get_tid();

  // not one of ours (any more)?
  if (op_stat.count(tid) == 0) {
    dout(7) << "handle_osd_stat_reply " << tid << " ... stray" << endl;
    delete m;
    return;
  }

  dout(7) << "handle_osd_stat_reply " << tid
          << " r=" << m->get_result()
          << " size=" << m->get_object_size()
          << endl;

  OSDStat *st = op_stat[ tid ];
  op_stat.erase( tid );

  // drop the tid from its pg; close the pg once nothing is left in it
  PG& pg = get_pg( m->get_pg() );
  assert(pg.active_tids.count(tid));
  pg.active_tids.erase(tid);
  if (pg.active_tids.empty())
    close_pg( m->get_pg() );

  // asked to retry?
  if (m->get_result() == -EAGAIN) {
    dout(7) << " got -EAGAIN, resubmitting" << endl;
    stat_submit(st);
    delete m;
    return;
  }

  // publish the result
  if (m->get_result() >= 0) {
    *st->size = m->get_object_size();
  } else {
    *st->size = -1;
  }

  // the completion outlives the op record
  Context *fin = st->onfinish;
  delete st;

  if (fin) {
    fin->finish(m->get_result());
    delete fin;
  }

  delete m;
}
|
|
|
|
|
|
// read -----------------------------------
|
|
|
|
|
|
// Read `len` bytes at offset `off` of object `oid` into `bl`.
//
// Wraps the request in an OSDRead with a single extent carrying layout `ol`
// and revision `rev`, and hands it to readx() with `onfinish` as completion.
// Returns last_tid as it stands after the submission.
tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout& ol, bufferlist *bl,
                     Context *onfinish,
                     objectrev_t rev)
{
  OSDRead *rd = new OSDRead(bl);

  rd->extents.push_back(ObjectExtent(oid, off, len));
  ObjectExtent &ex = rd->extents.front();
  ex.layout = ol;
  ex.rev = rev;

  readx(rd, onfinish);
  return last_tid;
}
|
|
|
|
|
|
// Submit every extent of a prepared OSDRead.
//
// Stores `onfinish` on the read, issues one request per extent through
// readx_submit(), and returns last_tid as it stands afterwards.
tid_t Objecter::readx(OSDRead *rd, Context *onfinish)
{
  rd->onfinish = onfinish;

  // one request per extent
  list<ObjectExtent>::iterator ex = rd->extents.begin();
  while (ex != rd->extents.end()) {
    readx_submit(rd, *ex);
    ++ex;
  }

  return last_tid;
}
|
|
|
|
// Build and send an OSD_OP_READ request for extent `ex` of read `rd`, and
// register the operation as in flight.
//
// A fresh tid is taken from last_tid.  The request is sent to the PG's acker
// if it has one (acker() >= 0).  Whether or not it was sent, the extent is
// recorded in rd->ops under the new tid, and the read is recorded in op_read
// and in the PG's active_tids so that a later kick_requests() can resubmit it.
//
// Returns the tid assigned to the request.
tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex)
{
  // find OSD
  PG &pg = get_pg( ex.layout.pgid );

  // send
  last_tid++;
  assert(client_inc >= 0);
  MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
                         ex.oid, ex.layout, osdmap->get_epoch(),
                         OSD_OP_READ);
  m->set_length(ex.length);
  m->set_offset(ex.start);
  dout(10) << "readx_submit " << rd << " tid " << last_tid
           << " oid " << ex.oid << " " << ex.start << "~" << ex.length
           << " (" << ex.buffer_extents.size() << " buffer fragments)"
           << " " << ex.layout
           << " osd" << pg.acker()
           << endl;

  if (pg.acker() >= 0)
    messenger->send_message(m, osdmap->get_inst(pg.acker()));
  else
    delete m;  // never handed to the messenger; a resubmit builds a new message

  // add to gather set
  rd->ops[last_tid] = ex;
  op_read[last_tid] = rd;

  pg.active_tids.insert(last_tid);

  return last_tid;
}
|
|
|
|
|
|
// Process the reply to one extent of a read.
//
// Replies for tids not present in op_read are logged as stray and dropped.
// Otherwise the tid is removed from op_read and from its PG's active tids
// (closing the PG when that leaves it empty).
//
// A result of -EAGAIN resubmits the same extent.  The extent is copied out of
// rd->ops before its entry is erased, so the retry carries the real extent
// rather than a default-constructed one, and no stale entry is left behind
// under the old tid.
//
// For any other result the extent is considered finished.  If other extents
// are still outstanding, the returned data is stashed in rd->read_data keyed
// by object id.  When the last extent completes, the result is assembled into
// rd->bl (directly if this is the only fragment, otherwise by mapping each
// object extent back onto its buffer extents and zero-filling what was not
// returned), the OSDRead is freed, and the completion (if any) is invoked with
// the number of bytes read and deleted.
//
// The reply message is always deleted before returning.
void Objecter::handle_osd_read_reply(MOSDOpReply *m)
{
  // get pio
  tid_t tid = m->get_tid();

  if (op_read.count(tid) == 0) {
    dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl;
    delete m;
    return;
  }

  dout(7) << "handle_osd_read_reply " << tid << endl;
  OSDRead *rd = op_read[ tid ];
  op_read.erase( tid );

  // remove from osd/tid maps
  PG& pg = get_pg( m->get_pg() );
  assert(pg.active_tids.count(tid));
  pg.active_tids.erase(tid);
  if (pg.active_tids.empty()) close_pg( m->get_pg() );

  // success?
  if (m->get_result() == -EAGAIN) {
    dout(7) << " got -EAGAIN, resubmitting" << endl;
    // Copy the extent first: readx_submit() re-registers it under a new tid,
    // and the entry for the old tid must not linger in rd->ops.
    ObjectExtent ex = rd->ops[tid];
    rd->ops.erase(tid);
    readx_submit(rd, ex);
    delete m;
    return;
  }
  //assert(m->get_result() >= 0);

  // our op finished
  rd->ops.erase(tid);

  // what buffer offset are we?
  dout(7) << " got frag from " << m->get_oid() << " "
          << m->get_offset() << "~" << m->get_length()
          << ", still have " << rd->ops.size() << " more ops" << endl;

  if (rd->ops.empty()) {
    // all done
    size_t bytes_read = 0;

    if (rd->read_data.size()) {
      dout(15) << " assembling frags" << endl;

      /** FIXME This doesn't handle holes efficiently.
       * It allocates zero buffers to fill whole buffer, and
       * then discards trailing ones at the end.
       *
       * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over
       * the heap.
       */

      // we have other fragments, assemble them all... blech!
      rd->read_data[m->get_oid()] = new bufferlist;
      rd->read_data[m->get_oid()]->claim( m->get_data() );

      // map extents back into buffer
      map<off_t, bufferlist*> by_off;  // buffer offset -> bufferlist

      // for each object extent...
      for (list<ObjectExtent>::iterator eit = rd->extents.begin();
           eit != rd->extents.end();
           eit++) {
        bufferlist *ox_buf = rd->read_data[eit->oid];
        unsigned ox_len = ox_buf->length();
        unsigned ox_off = 0;
        assert(ox_len <= eit->length);

        // for each buffer extent we're mapping into...
        for (map<size_t,size_t>::iterator bit = eit->buffer_extents.begin();
             bit != eit->buffer_extents.end();
             bit++) {
          dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl;
          by_off[bit->first] = new bufferlist;

          if (ox_off + bit->second <= ox_len) {
            // we got the whole bx
            by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second);
            if (bytes_read < bit->first + bit->second)
              bytes_read = bit->first + bit->second;
          } else if (ox_off + bit->second > ox_len && ox_off < ox_len) {
            // we got part of this bx
            by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off));
            if (bytes_read < bit->first + ox_len-ox_off)
              bytes_read = bit->first + ox_len-ox_off;

            // zero end of bx
            dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl;
            bufferptr z(ox_off + bit->second - ox_len);
            z.zero();
            by_off[bit->first]->append( z );
          } else {
            // we got none of this bx. zero whole thing.
            assert(ox_off >= ox_len);
            dout(21) << " adding all zeros for this bit " << bit->second << endl;
            bufferptr z(bit->second);
            z.zero();
            by_off[bit->first]->append( z );
          }
          ox_off += bit->second;
        }
        assert(ox_off == eit->length);
      }

      // sort and string bits together
      // (by_off is an ordered map, so this walks fragments by buffer offset;
      //  fragments starting at or past bytes_read are all zero fill and are
      //  dropped instead of appended)
      for (map<off_t, bufferlist*>::iterator it = by_off.begin();
           it != by_off.end();
           it++) {
        assert(it->second->length());
        if (it->first < (off_t)bytes_read) {
          dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl;
          rd->bl->claim_append(*(it->second));
        } else {
          dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl;
        }
        delete it->second;
      }

      // trim trailing zeros?
      if (rd->bl->length() > bytes_read) {
        dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read
                 << " len=" << rd->bl->length() << endl;
        rd->bl->splice(bytes_read, rd->bl->length() - bytes_read);
        assert(bytes_read == rd->bl->length());
      }

      // hose p->read_data bufferlist*'s
      for (map<object_t, bufferlist*>::iterator it = rd->read_data.begin();
           it != rd->read_data.end();
           it++) {
        delete it->second;
      }
    } else {
      dout(15) << " only one frag" << endl;

      // only one fragment, easy
      rd->bl->claim( m->get_data() );
      bytes_read = rd->bl->length();
    }

    // finish, clean up
    Context *onfinish = rd->onfinish;

    dout(7) << " " << bytes_read << " bytes "
            << rd->bl->length()
            << endl;

    // done
    delete rd;
    if (onfinish) {
      onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result());
      delete onfinish;
    }
  } else {
    // store my bufferlist for later assembling
    rd->read_data[m->get_oid()] = new bufferlist;
    rd->read_data[m->get_oid()]->claim( m->get_data() );
  }

  delete m;
}
|
|
|
|
|
|
|
|
// write ------------------------------------
|
|
|
|
// Write `len` bytes from `bl` at offset `off` of object `oid`.
//
// Wraps the request in an OSDWrite with a single extent carrying layout `ol`
// and revision `rev`; the extent maps buffer offset 0 for `len` bytes.  The
// write is handed to modifyx() with the given ack/commit completions.
// Returns last_tid as it stands after the submission.
tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout& ol, bufferlist &bl,
                      Context *onack, Context *oncommit,
                      objectrev_t rev)
{
  OSDWrite *wr = new OSDWrite(bl);

  wr->extents.push_back(ObjectExtent(oid, off, len));
  ObjectExtent &ex = wr->extents.front();
  ex.layout = ol;
  ex.buffer_extents[0] = len;
  ex.rev = rev;

  modifyx(wr, onack, oncommit);
  return last_tid;
}
|
|
|
|
|
|
// zero
|
|
|
|
// Issue an OSD_OP_ZERO over `len` bytes at offset `off` of object `oid`.
//
// Wraps the request in an OSDModify with a single extent carrying layout `ol`
// and revision `rev`, and hands it to modifyx() with the given completions.
// Returns last_tid as it stands after the submission.
tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout& ol,
                     Context *onack, Context *oncommit,
                     objectrev_t rev)
{
  OSDModify *z = new OSDModify(OSD_OP_ZERO);

  z->extents.push_back(ObjectExtent(oid, off, len));
  ObjectExtent &ex = z->extents.front();
  ex.layout = ol;
  ex.rev = rev;

  modifyx(z, onack, oncommit);
  return last_tid;
}
|
|
|
|
|
|
// lock ops
|
|
|
|
// Issue the lock-type operation `op` against object `oid`.
//
// Wraps the request in an OSDModify with a single zero-length extent carrying
// layout `ol`, and hands it to modifyx() with the given completions.
// Returns last_tid as it stands after the submission.
tid_t Objecter::lock(int op, object_t oid, ObjectLayout& ol,
                     Context *onack, Context *oncommit)
{
  OSDModify *l = new OSDModify(op);

  l->extents.push_back(ObjectExtent(oid, 0, 0));
  ObjectExtent &ex = l->extents.front();
  ex.layout = ol;

  modifyx(l, onack, oncommit);
  return last_tid;
}
|
|
|
|
|
|
|
|
// generic modify -----------------------------------
|
|
|
|
// Submit every extent of a prepared modify operation.
//
// Stores the ack and commit completions on `wr`, issues one request per
// extent through modifyx_submit(), and returns last_tid as it stands
// afterwards.
tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit)
{
  wr->onack = onack;
  wr->oncommit = oncommit;

  // one request per extent
  list<ObjectExtent>::iterator ex = wr->extents.begin();
  while (ex != wr->extents.end()) {
    modifyx_submit(wr, *ex);
    ++ex;
  }

  return last_tid;
}
|
|
|
|
|
|
// Build and send the request for extent `ex` of modify operation `wr`, and
// register the operation as in flight.
//
// If `usetid` is > 0 the request reuses that tid (used when resubmitting);
// otherwise a fresh tid is taken from last_tid.  When a version is already
// recorded for the tid in wr->tid_version, it is set on the request so the
// op is sent as a replay.  For OSD_OP_WRITE the payload is gathered from the
// write's buffer according to ex.buffer_extents.
//
// The extent is recorded in both waitfor_ack and waitfor_commit, the op is
// recorded in op_modify and the PG's active_tids, and num_unacked /
// num_uncommitted are incremented.  The request is sent to the PG's primary
// if it has one (primary() >= 0).
//
// Returns the tid used for the request.
tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid)
{
  // find
  PG &pg = get_pg( ex.layout.pgid );

  // send
  tid_t tid;
  if (usetid > 0)
    tid = usetid;
  else
    tid = ++last_tid;

  MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
                         ex.oid, ex.layout, osdmap->get_epoch(),
                         wr->op);
  m->set_length(ex.length);
  m->set_offset(ex.start);
  m->set_rev(ex.rev);

  if (wr->tid_version.count(tid))
    m->set_version(wr->tid_version[tid]);  // we're replaying this op!

  // what type of op?
  switch (wr->op) {
  case OSD_OP_WRITE:
    {
      // map buffer segments into this extent
      // (may be fragmented bc of striping)
      bufferlist cur;
      for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
           bit != ex.buffer_extents.end();
           bit++) {
        bufferlist thisbit;
        thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
        cur.claim_append(thisbit);
      }
      assert(cur.length() == ex.length);
      m->set_data(cur);//.claim(cur);
    }
    break;
  }

  // add to gather set
  wr->waitfor_ack[tid] = ex;
  wr->waitfor_commit[tid] = ex;
  op_modify[tid] = wr;
  pg.active_tids.insert(tid);

  ++num_unacked;
  ++num_uncommitted;

  // send
  dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid
           << " oid " << ex.oid
           << " " << ex.start << "~" << ex.length
           << " " << ex.layout
           << " osd" << pg.primary()
           << endl;
  if (pg.primary() >= 0)
    messenger->send_message(m, osdmap->get_inst(pg.primary()));
  else
    delete m;  // never handed to the messenger; a resubmit builds a new message

  dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl;

  return tid;
}
|
|
|
|
|
|
|
|
// Process an ack or commit reply to a modify operation.
//
// Replies for tids not present in op_modify are logged as stray and dropped,
// as are replies whose source is not the PG's current acker.
//
// Commit: the tid is removed from the PG's active tids (closing the PG when
// that leaves it empty), from op_modify, and from both waitfor_ack and
// waitfor_commit.  If the tid was still waiting for its ack, the commit
// stands in for it and num_unacked is decremented along with
// num_uncommitted.  Once no commits remain outstanding, the OSDModify is
// freed and both of its completions are fired.
//
// Ack: the tid is removed from waitfor_ack, num_unacked is decremented and
// the reply's version is recorded in tid_version (warning if a replay came
// back with a different version).  Once no acks remain outstanding, the ack
// completion is fired exactly once and, for plain writes with
// objecter_buffer_uncommitted off, the write buffer is discarded.
//
// Completions are invoked with 0 and deleted.  The reply message is always
// deleted before returning.
void Objecter::handle_osd_modify_reply(MOSDOpReply *m)
{
  // get pio
  tid_t tid = m->get_tid();

  if (op_modify.count(tid) == 0) {
    dout(7) << "handle_osd_modify_reply " << tid
            << (m->get_commit() ? " commit":" ack")
            << " ... stray" << endl;
    delete m;
    return;
  }

  dout(7) << "handle_osd_modify_reply " << tid
          << (m->get_commit() ? " commit":" ack")
          << " v " << m->get_version()
          << endl;
  OSDModify *wr = op_modify[ tid ];

  Context *onack = 0;
  Context *oncommit = 0;

  PG &pg = get_pg( m->get_pg() );

  // ignore?
  if (pg.acker() != m->get_source().num()) {
    dout(7) << " ignoring ack|commit from non-acker" << endl;
    delete m;
    return;
  }

  assert(m->get_result() >= 0);

  // ack or commit?
  if (m->get_commit()) {
    //dout(15) << " handle_osd_write_reply commit on " << tid << endl;
    assert(wr->tid_version.count(tid) == 0 ||
           m->get_version() == wr->tid_version[tid]);

    // remove from tid/osd maps
    assert(pg.active_tids.count(tid));
    pg.active_tids.erase(tid);
    if (pg.active_tids.empty()) close_pg( m->get_pg() );

    // commit.
    op_modify.erase( tid );
    // erase() reports how many entries it removed: non-zero means the ack for
    // this tid never arrived on its own, so the commit accounts for it here.
    if (wr->waitfor_ack.erase(tid))
      num_unacked--;
    wr->waitfor_commit.erase(tid);

    num_uncommitted--;

    if (wr->waitfor_commit.empty()) {
      onack = wr->onack;
      oncommit = wr->oncommit;
      delete wr;
    }
  } else {
    // ack.
    //dout(15) << " handle_osd_write_reply ack on " << tid << endl;
    assert(wr->waitfor_ack.count(tid));
    wr->waitfor_ack.erase(tid);

    num_unacked--;

    if (wr->tid_version.count(tid) &&
        wr->tid_version[tid].version != m->get_version().version) {
      dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid
                << " did not achieve previous ordering" << endl;
    }
    wr->tid_version[tid] = m->get_version();

    if (wr->waitfor_ack.empty()) {
      onack = wr->onack;
      wr->onack = 0;  // only do callback once

      // buffer uncommitted?
      if (!g_conf.objecter_buffer_uncommitted &&
          wr->op == OSD_OP_WRITE) {
        // discard buffer!
        ((OSDWrite*)wr)->bl.clear();
      }
    }
  }

  // do callbacks
  if (onack) {
    onack->finish(0);
    delete onack;
  }
  if (oncommit) {
    oncommit->finish(0);
    delete oncommit;
  }

  delete m;
}
|
|
|
|
|
|
|
|
// React to a failure to deliver message `m` to `dest` at `inst`.
//
//   monitor destination - pick a monitor via pick_mon(true) and resend `m`
//                         to it;
//   osd destination     - send a MOSDFailure naming `inst` (with our current
//                         osdmap epoch) to a monitor, then delete `m`;
//   anything else       - log and delete `m`.
void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst)
{
  if (dest.is_mon()) {
    // try a new mon
    int mon = monmap->pick_mon(true);
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
            << ", resending to mon" << mon
            << endl;
    messenger->send_message(m, monmap->get_inst(mon));
    return;
  }

  if (dest.is_osd()) {
    // report the osd to a monitor and give up on the message
    int mon = monmap->pick_mon();
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
            << ", dropping and reporting to mon" << mon
            << endl;
    messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
                            monmap->get_inst(mon));
    delete m;
    return;
  }

  // neither a monitor nor an osd
  dout(0) << "ms_handle_failure " << dest << " inst " << inst
          << ", dropping" << endl;
  delete m;
}
|