// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "NewerMessenger.h"
|
|
|
|
#include <errno.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
|
|
#include "config.h"
|
|
|
|
#include "messages/MGenericMessage.h"
|
|
#include "messages/MNSConnect.h"
|
|
#include "messages/MNSConnectAck.h"
|
|
#include "messages/MNSRegister.h"
|
|
#include "messages/MNSRegisterAck.h"
|
|
#include "messages/MNSLookup.h"
|
|
#include "messages/MNSLookupReply.h"
|
|
#include "messages/MNSFailure.h"
|
|
|
|
//#include "messages/MFailure.h"
|
|
|
|
#include <netdb.h>
|
|
|
|
|
|
#undef dout
#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " "
#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " "

#include "tcp.cc"

Rank rank;

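// Overview: this file implements a TCP-based messenger from five
// cooperating pieces.  The Namer (rank 0 only) hands out ranks and entity
// addresses and answers lookups; the Accepter listens for incoming TCP
// connections; a Pipe wraps one socket with a reader and a writer thread;
// Rank owns the directory (entity_map, rank_pipe, local) and routes
// messages; an EntityMessenger is the per-entity send/receive endpoint
// handed to a Dispatcher.
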
/********************************************
 * Namer
 */

Rank::Namer::Namer(EntityMessenger *msgr) :
  messenger(msgr),
  nrank(0), nclient(0), nmds(0), nosd(0), nmon(0)
{
  assert(rank.my_rank == 0);
  nrank = g_conf.num_mon;

  // announce myself
  /*
  cerr << "ceph ns is " << rank.accepter.listen_addr << endl;
  cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl;
  int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT);
  ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t));
  ::fchmod(fd, 0755);
  ::close(fd);
  */

  // ok
  messenger->set_dispatcher(this);
}

Rank::Namer::~Namer()
{
  //::unlink(".ceph_ns");
}

void Rank::Namer::dispatch(Message *m)
{
  rank.lock.Lock();
  int type = m->get_type();
  switch (type) {
  case MSG_NS_CONNECT:
    handle_connect((class MNSConnect*)m);
    break;
  case MSG_NS_REGISTER:
    handle_register((class MNSRegister*)m);
    break;
  case MSG_NS_STARTED:
    handle_started(m);
    break;
  case MSG_NS_UNREGISTER:
    handle_unregister(m);
    break;
  case MSG_NS_LOOKUP:
    handle_lookup((class MNSLookup*)m);
    break;
  case MSG_NS_FAILURE:
    handle_failure((class MNSFailure*)m);
    break;

  case MSG_FAILURE_ACK:
    delete m;
    break;

  default:
    assert(0);
  }
  rank.lock.Unlock();
}

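// A new process connects with MNSConnect; the namer assigns it the next
// free rank, records its address, and replies with MNSConnectAck.  Note
// that dispatch() already holds rank.lock when these handlers run.
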
void Rank::Namer::handle_connect(MNSConnect *m)
{
  int newrank = nrank++;
  dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl;

  rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr();
  rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank;
  rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank));

  messenger->send_message(new MNSConnectAck(newrank),
                          MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]);
  delete m;
}

void Rank::Namer::manual_insert_inst(const entity_inst_t &inst)
{
  rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst;
}

void Rank::Namer::handle_register(MNSRegister *m)
{
  dout(10) << "namer.handle_register from rank " << m->get_rank()
           << " addr " << m->get_entity() << endl;

  // pick id
  entity_name_t entity = m->get_entity();

  if (entity.is_new()) {
    // make up a new address!
    switch (entity.type()) {
    case MSG_ADDR_MDS_BASE:
      entity = MSG_ADDR_MDS(nmds++);
      break;

    case MSG_ADDR_OSD_BASE:
      entity = MSG_ADDR_OSD(nosd++);
      break;

    case MSG_ADDR_CLIENT_BASE:
      entity = MSG_ADDR_CLIENT(nclient++);
      break;

    default:
      assert(0);
    }
  } else {
    // specific address!
  }

  // register
  if (rank.entity_map.count(entity)) {
    dout(1) << "namer.handle_register re-registering " << entity
            << " inst " << m->get_source_inst()
            << " (was " << rank.entity_map[entity] << ")"
            << endl;
  } else {
    dout(1) << "namer.handle_register registering " << entity
            << " inst " << m->get_source_inst()
            << endl;
  }
  rank.entity_map[entity] = m->get_source_inst();
  rank.entity_unstarted.insert(entity);

  // reply w/ new id
  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity),
                          m->get_source(), rank.entity_map[entity]);

  delete m;
}

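// An entity is "unstarted" from registration until it sends MSG_NS_STARTED
// (see EntityMessenger::ready()); lookups that arrive in between are parked
// in `waiting` and replayed here.
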
void Rank::Namer::handle_started(Message *m)
{
  entity_name_t who = m->get_source();
  dout(10) << "namer.handle_started from entity " << who << endl;

  assert(rank.entity_unstarted.count(who));
  rank.entity_unstarted.erase(who);

  // anybody waiting?
  if (waiting.count(who)) {
    list<Message*> ls;
    ls.swap(waiting[who]);
    waiting.erase(who);

    dout(10) << "doing waiters on " << who << endl;
    for (list<Message*>::iterator it = ls.begin();
         it != ls.end();
         it++)
      dispatch(*it);
  }

  delete m;
}

void Rank::Namer::handle_unregister(Message *m)
{
  entity_name_t who = m->get_source();
  dout(1) << "namer.handle_unregister entity " << who << endl;

  rank.show_dir();

  assert(rank.entity_map.count(who));
  rank.entity_map.erase(who);

  rank.show_dir();

  // shut myself down?  kick watcher.
  if (rank.entity_map.size() == 2) {
    dout(10) << "namer.handle_unregister stopping namer" << endl;
    rank.lock.Unlock();
    messenger->shutdown();
    delete messenger;
    rank.lock.Lock();
  }

  delete m;
}

void Rank::Namer::handle_lookup(MNSLookup *m)
{
  // have it?
  if (rank.entity_map.count(m->get_entity()) == 0) {
    dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl;
    waiting[m->get_entity()].push_back(m);
    return;
  }

  if (rank.entity_unstarted.count(m->get_entity())) {
    dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl;
    waiting[m->get_entity()].push_back(m);
    return;
  }

  // look it up!
  MNSLookupReply *reply = new MNSLookupReply(m);

  reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()];

  dout(10) << "namer " << m->get_source()
           << " lookup '" << m->get_entity()
           << "' -> " << rank.entity_map[m->get_entity()] << endl;

  messenger->send_message(reply, m->get_source(), m->get_source_inst());
  delete m;
}

void Rank::Namer::handle_failure(MNSFailure *m)
{
  dout(10) << "namer.handle_failure inst " << m->get_inst()
           << endl;

  // search for entities on this instance
  list<entity_name_t> rm;
  for (hash_map<entity_name_t,entity_inst_t>::iterator i = rank.entity_map.begin();
       i != rank.entity_map.end();
       i++) {
    if (i->second != m->get_inst()) continue;
    rm.push_back(i->first);
  }
  for (list<entity_name_t>::iterator i = rm.begin();
       i != rm.end();
       i++) {
    dout(10) << "namer.handle_failure inst " << m->get_inst()
             << ", removing " << *i << endl;

    rank.entity_map.erase(*i);
    rank.entity_unstarted.erase(*i);

    /*
    if ((*i).is_osd()) {
      // tell the monitor
      messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0));
    }
    */
  }

  delete m;
}

/********************************************
 * Accepter
 */

int Rank::Accepter::start()
{
  // bind to a socket
  dout(10) << "accepter.start binding to listen " << endl;

  /* socket creation */
  listen_sd = socket(AF_INET, SOCK_STREAM, 0);
  assert(listen_sd > 0);

  /* bind to an ephemeral port */
  memset((char*)&listen_addr, 0, sizeof(listen_addr));
  listen_addr.sin_family = AF_INET;
  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
  listen_addr.sin_port = 0;

  int rc = bind(listen_sd, (struct sockaddr *)&listen_addr, sizeof(listen_addr));
  assert(rc >= 0);

  socklen_t llen = sizeof(listen_addr);
  getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);

  int myport = listen_addr.sin_port;  // network byte order

  // listen!
  rc = ::listen(listen_sd, 1000);
  assert(rc >= 0);

  //dout(10) << "accepter.start listening on " << myport << endl;

  // my address is...
  char host[100];
  bzero(host, 100);
  gethostname(host, 100);
  //dout(10) << "accepter.start my hostname is " << host << endl;

  struct hostent *myhostname = gethostbyname(host);

  struct sockaddr_in my_addr;
  memset(&my_addr, 0, sizeof(my_addr));

  my_addr.sin_family = myhostname->h_addrtype;
  memcpy((char *)&my_addr.sin_addr.s_addr,
         myhostname->h_addr_list[0],
         myhostname->h_length);
  my_addr.sin_port = myport;  // already network byte order

  listen_addr = my_addr;

  dout(10) << "accepter.start listen addr is " << listen_addr << endl;

  // start thread
  create();

  return 0;
}

void *Rank::Accepter::entry()
{
  dout(10) << "accepter starting" << endl;

  while (!done) {
    // accept
    struct sockaddr_in addr;
    socklen_t slen = sizeof(addr);
    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
    if (sd > 0) {
      dout(10) << "accepted incoming on sd " << sd << endl;

      rank.lock.Lock();
      Pipe *p = new Pipe(sd);
      rank.pipes.insert(p);
      rank.lock.Unlock();
    } else {
      dout(10) << "no incoming connection?" << endl;
      break;
    }
  }

  return 0;
}

/**************************************
 * Pipe
 */

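// Connection handshake: the accepting side writes its entity_inst_t first
// and then reads the peer's; the connecting side does the reverse.  If two
// ranks race and open pipes to each other, the pipe initiated by the lower
// rank wins and the other is closed.
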
int Rank::Pipe::accept()
{
  // my creator gave me sd via accept()

  // announce myself.
  int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
  if (rc < 0) {
    ::close(sd);
    done = true;
    return -1;
  }

  // identify peer
  rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst));
  if (rc < 0) {
    dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl;
    ::close(sd);
    done = true;
    return -1;
  }

  // create writer thread.
  writer_running = true;
  writer_thread.create();

  // register pipe.
  if (peer_inst.rank >= 0) {
    rank.lock.Lock();
    {
      if (rank.rank_pipe.count(peer_inst.rank) == 0) {
        // install a pipe!
        dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl;
        rank.rank_pipe[peer_inst.rank] = this;
      } else {
        // low ranks' Pipes "win"
        if (peer_inst.rank < rank.my_inst.rank ||
            rank.my_inst.rank < 0) {
          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
                   << ", already had pipe, but switching to this new one" << endl;
          // switch to this new Pipe
          rank.rank_pipe[peer_inst.rank]->close();  // close old one
          rank.rank_pipe[peer_inst.rank] = this;
        } else {
          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
                   << ", already had pipe, sticking with it" << endl;
        }
      }
    }
    rank.lock.Unlock();
  } else {
    dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl;
  }

  return 0;  // success.
}

int Rank::Pipe::connect()
{
  dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl;

  // create socket?
  sd = socket(AF_INET, SOCK_STREAM, 0);
  assert(sd > 0);

  // bind any port
  struct sockaddr_in myAddr;
  myAddr.sin_family = AF_INET;
  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
  myAddr.sin_port = htons(0);

  int rc = bind(sd, (struct sockaddr *)&myAddr, sizeof(myAddr));
  assert(rc >= 0);

  // connect!
  rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(peer_inst.addr));
  if (rc < 0) return rc;

  // identify peer
  entity_inst_t inst;
  rc = tcp_read(sd, (char*)&inst, sizeof(inst));
  if (rc < 0)
    return -1;
  if (inst.rank < 0)
    inst = peer_inst;  // i know better than they do.
  if (peer_inst != inst && inst.rank > 0) {
    derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl;
    assert(0);
    return -1;
  }

  // identify myself
  rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
  if (rc < 0)
    return -1;

  // register pipe
  rank.lock.Lock();
  {
    if (rank.rank_pipe.count(peer_inst.rank) == 0) {
      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl;
      rank.rank_pipe[peer_inst.rank] = this;
    } else {
      // this is normal.
      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl;
    }
  }
  rank.lock.Unlock();

  // start reader
  reader_running = true;
  reader_thread.create();

  return 0;
}

void Rank::Pipe::close()
{
  if (sent_close) {
    dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl;
    return;
  }
  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl;

  // unreg ourselves
  rank.lock.Lock();
  {
    if (rank.rank_pipe.count(peer_inst.rank) &&
        rank.rank_pipe[peer_inst.rank] == this) {
      dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl;
      rank.rank_pipe.erase(peer_inst.rank);
    }
  }
  rank.lock.Unlock();

  // queue close message.
  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl;
  lock.Lock();
  q.push_back(new MGenericMessage(MSG_CLOSE));
  cond.Signal();
  sent_close = true;
  lock.Unlock();
}

/* read msgs from socket.
 * also, on the accepting (server) side, run the handshake first.
 */
void Rank::Pipe::reader()
{
  if (server)
    accept();

  // loop.
  while (!done) {
    Message *m = read_message();
    if (!m || m->get_type() == 0) {
      if (m) {
        delete m;
        dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl;
      } else {
        derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl;
      }

      if (!sent_close)
        close();

      done = true;
      cond.Signal();  // wake up writer too.
      break;
    }

    dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl;

    EntityMessenger *entity = 0;

    rank.lock.Lock();
    {
      if (rank.entity_map.count(m->get_source()) &&
          rank.entity_map[m->get_source()] > m->get_source_inst()) {
        derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source()
                << " inst " << m->get_source_inst()
                << " > " << rank.entity_map[m->get_source()]
                << ", WATCH OUT " << *m << endl;
        assert(0);
      }

      if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
        // ours.
        rank.dispatch(m);
      } else {
        if (g_conf.ms_single_dispatch) {
          // submit to single dispatch queue
          rank._submit_single_dispatch(m);
        } else {
          if (rank.local.count(m->get_dest())) {
            // find entity
            entity = rank.local[m->get_dest()];
          } else {
            derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
            assert(0);  // FIXME do this differently
            //rank.waiting_for_lookup[m->get_dest()].push_back(m);
          }
        }
      }
    }
    rank.lock.Unlock();

    if (entity)
      entity->queue_message(m);  // queue
  }

  // reap?
  bool reap = false;
  lock.Lock();
  {
    reader_running = false;
    if (!writer_running) reap = true;
  }
  lock.Unlock();

  if (reap) {
    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl;
    ::close(sd);
    rank.lock.Lock();
    {
      rank.pipe_reap_queue.push_back(this);
      rank.wait_cond.Signal();
    }
    rank.lock.Unlock();
  }
}

/* write msgs to socket.
 * also, on the connecting (client) side, initiate the connection first.
 */
void Rank::Pipe::writer()
{
  if (!server) {
    int rc = connect();
    if (rc < 0) {
      derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl;
      done = true;
      list<Message*> out;
      fail(out);
    }
  }

  // loop.
  lock.Lock();
  while (!q.empty() || !done) {

    if (!q.empty()) {
      dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl;

      // grab outgoing list
      list<Message*> out;
      out.swap(q);

      // drop lock while i send these
      lock.Unlock();

      while (!out.empty()) {
        Message *m = out.front();
        out.pop_front();

        dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl;

        // stamp.
        m->set_source_inst(rank.my_inst);

        // marshall
        if (m->empty_payload())
          m->encode_payload();

        if (write_message(m) < 0) {
          // failed!
          derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl;
          out.push_front(m);
          fail(out);
          done = true;
          break;
        }

        // did i just send a close?
        if (m->get_type() == MSG_CLOSE)
          done = true;

        // clean up
        delete m;
      }

      lock.Lock();
      continue;
    }

    // wait
    dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl;
    cond.Wait(lock);
  }
  lock.Unlock();

  dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl;

  // reap?
  bool reap = false;
  lock.Lock();
  {
    writer_running = false;
    if (!reader_running) reap = true;
  }
  lock.Unlock();

  if (reap) {
    dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl;
    ::close(sd);
    rank.lock.Lock();
    {
      rank.pipe_reap_queue.push_back(this);
      rank.wait_cond.Signal();
    }
    rank.lock.Unlock();
  }
}

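// Wire format: a fixed-size msg_envelope_t, then env.nchunks chunks, each
// a length int followed by that many payload bytes.  With TCP_KEEP_CHUNKS
// the sender preserves bufferlist fragment boundaries; otherwise the whole
// payload goes as one chunk.
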
Message *Rank::Pipe::read_message()
{
  // envelope
  //dout(10) << "receiver.read_message from sd " << sd << endl;

  msg_envelope_t env;
  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
    return 0;

  dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type
           << " src " << env.source << " dst " << env.dest
           << " nchunks=" << env.nchunks
           << endl;

  // payload
  bufferlist blist;
  for (int i=0; i<env.nchunks; i++) {
    int size;
    if (!tcp_read( sd, (char*)&size, sizeof(size) )) return 0;

    if (size == 0) continue;

    bufferptr bp(size);

    if (!tcp_read( sd, bp.c_str(), size )) return 0;

    blist.push_back(bp);

    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got frag " << i << " of " << env.nchunks
             << " len " << bp.length() << endl;
  }

  // unmarshall message
  size_t s = blist.length();
  Message *m = decode_message(env, blist);

  dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got " << s << " byte message from "
           << m->get_source() << endl;

  return m;
}

int Rank::Pipe::write_message(Message *m)
{
  // get envelope, buffers
  msg_envelope_t *env = &m->get_envelope();
  bufferlist blist;
  blist.claim( m->get_payload() );

#ifdef TCP_KEEP_CHUNKS
  env->nchunks = blist.buffers().size();
#else
  env->nchunks = 1;
#endif

  dout(20)// << g_clock.now()
          << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m
          << " to " << m->get_dest()
          << endl;

  // send envelope
  int r = tcp_write( sd, (char*)env, sizeof(*env) );
  if (r < 0) {
    derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m
            << " to " << m->get_dest() << endl;
    return -1;
  }

  // payload
#ifdef TCP_KEEP_CHUNKS
  // send chunk-wise
  int i = 0;
  for (list<bufferptr>::const_iterator it = blist.buffers().begin();
       it != blist.buffers().end();
       it++) {
    dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
    int size = (*it).length();
    r = tcp_write( sd, (char*)&size, sizeof(size) );
    if (r < 0) {
      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
      return -1;
    }
    r = tcp_write( sd, (*it).c_str(), size );
    if (r < 0) {
      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl;
      return -1;
    }
    i++;
  }
#else
  // one big chunk
  int size = blist.length();
  r = tcp_write( sd, (char*)&size, sizeof(size) );
  if (r < 0) {
    derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
    return -1;
  }
  dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;

  for (list<bufferptr>::const_iterator it = blist.buffers().begin();
       it != blist.buffers().end();
       it++) {
    if ((*it).length() == 0) continue;  // blank buffer.
    r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() );
    if (r < 0) {
      derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
      return -1;
    }
  }
#endif

  return 0;
}

void Rank::Pipe::fail(list<Message*>& out)
{
  derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl;

  // tell namer
  if (!rank.messenger) {
    derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl;
    assert(0);
  }

  // FIXME: possible race before i reclaim lock here?

  // deactivate myself
  rank.lock.Lock();
  {
    if (rank.rank_pipe.count(peer_inst.rank) &&
        rank.rank_pipe[peer_inst.rank] == this)
      rank.rank_pipe.erase(peer_inst.rank);
  }
  rank.lock.Unlock();

  // what do i do about reader()?  FIXME

  // sort my messages by (source) dispatcher, dest.
  map<Dispatcher*, map<entity_name_t, list<Message*> > > by_dis;
  lock.Lock();
  {
    // include out at front of queue
    q.splice(q.begin(), out);

    // sort
    while (!q.empty()) {
      if (q.front()->get_type() == MSG_CLOSE) {
        delete q.front();
      }
      else if (rank.local.count(q.front()->get_source())) {
        Dispatcher *dis = rank.local[q.front()->get_source()]->get_dispatcher();
        by_dis[dis][q.front()->get_dest()].push_back(q.front());
      }
      else {
        // oh well.  sending entity must have just shut down?
        assert(0);
        delete q.front();
      }
      q.pop_front();
    }
  }
  lock.Unlock();

  // report failure(s) to dispatcher(s)
  for (map<Dispatcher*, map<entity_name_t, list<Message*> > >::iterator i = by_dis.begin();
       i != by_dis.end();
       ++i)
    for (map<entity_name_t, list<Message*> >::iterator j = i->second.begin();
         j != i->second.end();
         ++j)
      for (list<Message*>::iterator k = j->second.begin();
           k != j->second.end();
           ++k) {
        derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl;
        i->first->ms_handle_failure(*k, j->first, peer_inst);
      }
}

/********************************************
 * Rank
 */

Rank::Rank(int r) :
  single_dispatcher(this),
  my_rank(r),
  namer(0) {
}
Rank::~Rank()
{
  //FIXME
  if (namer) delete namer;
}

void Rank::_submit_single_dispatch(Message *m)
{
  assert(lock.is_locked());

  if (local.count(m->get_dest()) &&
      local[m->get_dest()]->is_ready()) {
    rank.single_dispatch_queue.push_back(m);
    rank.single_dispatch_cond.Signal();
  } else {
    waiting_for_ready[m->get_dest()].push_back(m);
  }
}

void Rank::single_dispatcher_entry()
{
  lock.Lock();
  while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
    if (!single_dispatch_queue.empty()) {
      list<Message*> ls;
      ls.swap(single_dispatch_queue);

      lock.Unlock();
      {
        while (!ls.empty()) {
          Message *m = ls.front();
          ls.pop_front();

          dout(1) //<< g_clock.now()
                  << "---- "
                  << m->get_source()// << ':' << m->get_source_port()
                  << " to " << m->get_dest()// << ':' << m->get_dest_port()
                  << " ---- " << m->get_type_name()
                  << " ---- " << m
                  << endl;

          if (m->get_dest().type() == MSG_ADDR_RANK_BASE)
            rank.dispatch(m);
          else {
            assert(local.count(m->get_dest()));
            local[m->get_dest()]->dispatch(m);
          }
        }
      }
      lock.Lock();
      continue;
    }
    single_dispatch_cond.Wait(lock);
  }
  lock.Unlock();
}

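// A Pipe is "reaped" once both its reader and writer threads have exited:
// whichever thread finishes last closes the socket and queues the Pipe on
// pipe_reap_queue; reaper() (under rank.lock) then joins and deletes it.
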
/*
 * note: assumes lock is held
 */
void Rank::reaper()
{
  dout(10) << "reaper" << endl;
  assert(lock.is_locked());

  while (!pipe_reap_queue.empty()) {
    Pipe *p = pipe_reap_queue.front();
    dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl;
    pipe_reap_queue.pop_front();
    assert(pipes.count(p));
    pipes.erase(p);
    p->join();
    dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl;
    delete p;
  }
}

int Rank::start_rank()
{
  dout(10) << "start_rank" << endl;

  // bind to a socket
  if (accepter.start() < 0)
    return -1;

  // start single thread dispatcher?
  if (g_conf.ms_single_dispatch) {
    single_dispatch_stop = false;
    single_dispatcher.create();
  }

  lock.Lock();

  // my_inst
  my_inst.addr = accepter.listen_addr;
  my_inst.rank = my_rank;

  if (my_rank < 0) {
    dout(10) << "start_rank connecting to namer0" << endl;

    // connect to namer
    assert(entity_map.count(MSG_ADDR_NAMER(0)));
    Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]);

    // send
    Message *m = new MNSConnect(accepter.listen_addr);
    m->set_dest(MSG_ADDR_NAMER(0), 0);
    pipe->send(m);

    // wait
    while (my_rank < 0)
      waiting_for_rank.Wait(lock);
    assert(my_rank >= 0);

    dout(10) << "start_rank got rank " << my_rank << endl;

    // create rank entity
    entity_map[MSG_ADDR_RANK(my_rank)] = my_inst;
    local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank));
    messenger->set_dispatcher(this);
  } else {
    // create my rank
    entity_name_t raddr = MSG_ADDR_RANK(my_rank);
    entity_map[raddr] = my_inst;
    entity_unstarted.insert(raddr);
    local[raddr] = messenger = new EntityMessenger(raddr);
    messenger->set_dispatcher(this);

    dout(1) << "start_rank " << my_rank << " at " << my_inst << endl;
  }

  lock.Unlock();
  return 0;
}

void Rank::start_namer()
{
  // create namer0
  entity_name_t naddr = MSG_ADDR_NAMER(0);
  entity_map[naddr] = my_inst;
  local[naddr] = new EntityMessenger(naddr);
  namer = new Namer(local[naddr]);
  namer_inst = my_inst;
}

void Rank::set_namer(const tcpaddr_t& ns)
{
  namer_inst.addr = entity_map[MSG_ADDR_NAMER(0)].addr = ns;
  namer_inst.rank = entity_map[MSG_ADDR_NAMER(0)].rank = 0;
}

/* connect_rank
 * NOTE: assumes rank.lock held.
 */
Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst)
{
  assert(rank.lock.is_locked());
  assert(inst != rank.my_inst);

  dout(10) << "connect_rank to " << inst << endl;

  // create pipe
  Pipe *pipe = new Pipe(inst);
  rank.rank_pipe[inst.rank] = pipe;
  pipes.insert(pipe);

  return pipe;
}

void Rank::show_dir()
{
  dout(10) << "show_dir ---" << endl;

  for (hash_map<entity_name_t, entity_inst_t>::iterator i = entity_map.begin();
       i != entity_map.end();
       i++) {
    if (local.count(i->first)) {
      dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl;
    } else {
      dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl;
    }
  }
}

/* lookup
 * NOTE: assumes directory.lock held
 */
void Rank::lookup(entity_name_t addr)
{
  dout(10) << "lookup " << addr << endl;
  assert(lock.is_locked());

  assert(looking_up.count(addr) == 0);
  looking_up.insert(addr);

  MNSLookup *r = new MNSLookup(addr);
  messenger->send_message(r, MSG_ADDR_NAMER(0), namer_inst);
}

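// Typical startup sequence (a sketch; MSG_ADDR_MDS_NEW and my_dispatcher
// are placeholders, not names defined in this file):
//
//   tcpaddr_t nsa;
//   rank.find_ns_addr(nsa);                    // .ceph_ns file or env var
//   rank.set_namer(nsa);
//   rank.start_rank();                         // namer0 assigns our rank
//   Rank::EntityMessenger *m = rank.register_entity(MSG_ADDR_MDS_NEW);
//   m->set_dispatcher(my_dispatcher);
//   m->ready();                                // tells namer we've started
//   rank.wait();                               // until everything shuts down
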
/* register_entity
 */
Rank::EntityMessenger *Rank::register_entity(entity_name_t addr)
{
  dout(10) << "register_entity " << addr << endl;
  lock.Lock();

  // register with namer
  static long reg_attempt = 0;
  long id = ++reg_attempt;

  Message *reg = new MNSRegister(addr, my_rank, id);
  reg->set_source(MSG_ADDR_RANK(my_rank), 0);
  reg->set_source_inst(my_inst);
  reg->set_dest(MSG_ADDR_DIRECTORY, 0);

  // prepare cond
  Cond cond;
  waiting_for_register_cond[id] = &cond;

  // send request
  lock.Unlock();
  submit_message(reg);
  lock.Lock();

  // wait
  while (!waiting_for_register_result.count(id))
    cond.Wait(lock);

  // grab result
  addr = waiting_for_register_result[id];
  dout(10) << "register_entity got " << addr << endl;

  // clean up
  waiting_for_register_cond.erase(id);
  waiting_for_register_result.erase(id);

  // create messenger
  EntityMessenger *msgr = new EntityMessenger(addr);

  // add to directory
  entity_map[addr] = my_inst;
  local[addr] = msgr;

  // was anyone waiting?
  if (waiting_for_lookup.count(addr)) {
    submit_messages(waiting_for_lookup[addr]);
    waiting_for_lookup.erase(addr);
  }

  lock.Unlock();
  return msgr;
}

void Rank::unregister_entity(EntityMessenger *msgr)
{
  lock.Lock();
  dout(10) << "unregister_entity " << msgr->get_myaddr() << endl;

  // remove from local directory.
  assert(local.count(msgr->get_myaddr()));
  local.erase(msgr->get_myaddr());

  if (my_rank > 0) {
    assert(entity_map.count(msgr->get_myaddr()));
    entity_map.erase(msgr->get_myaddr());
  } // else namer will do it.

  // tell namer.
  if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) &&
      msgr->get_myaddr() != MSG_ADDR_RANK(0))
    msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER),
                       MSG_ADDR_NAMER(0), namer_inst);

  // kick wait()?
  if (local.size() <= 2)
    wait_cond.Signal();

  lock.Unlock();
}

void Rank::submit_messages(list<Message*>& ls)
{
  for (list<Message*>::iterator i = ls.begin(); i != ls.end(); i++)
    submit_message(*i);
  ls.clear();
}

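// Message routing: if the destination entity lives in this process it is
// queued locally (or put on the single dispatch queue); if we know its
// rank we send over the existing Pipe or open one; otherwise the message
// waits in waiting_for_lookup while we ask the namer.
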
void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
{
  const entity_name_t dest = m->get_dest();

  // lookup
  EntityMessenger *entity = 0;
  Pipe *pipe = 0;

  lock.Lock();
  {
    // local?
    if (dest_inst.rank == my_inst.rank) {
      if (local.count(dest)) {
        // local
        dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
        if (g_conf.ms_single_dispatch) {
          _submit_single_dispatch(m);
        } else {
          entity = local[dest];
        }
      } else {
        // mid-register
        dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl;
        assert(0);  // hmpf
        waiting_for_lookup[dest].push_back(m);
      }
    }
    else {
      // remote.
      if (rank_pipe.count( dest_inst.rank )) {
        //&&
        //rank_pipe[dest_inst.rank]->inst == dest_inst) {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl;
        // connected.
        pipe = rank_pipe[ dest_inst.rank ];
      } else {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
        // not connected.
        pipe = connect_rank( dest_inst );
      }
    }
  }
  lock.Unlock();

  // do it
  if (entity) {
    // local!
    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
    entity->queue_message(m);
  }
  else if (pipe) {
    // remote!
    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
    pipe->send(m);
  }
}

void Rank::submit_message(Message *m)
{
  const entity_name_t dest = m->get_dest();

  // lookup
  EntityMessenger *entity = 0;
  Pipe *pipe = 0;

  lock.Lock();
  {
    if (local.count(dest)) {
      dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;

      // local
      if (g_conf.ms_single_dispatch) {
        _submit_single_dispatch(m);
      } else {
        entity = local[dest];
      }
    } else if (entity_map.count( dest )) {
      // remote, known rank addr.
      entity_inst_t inst = entity_map[dest];

      if (inst == my_inst) {
        dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
        waiting_for_lookup[dest].push_back(m);
      }
      else if (rank_pipe.count( inst.rank ) &&
               rank_pipe[inst.rank]->get_peer_inst() == inst) {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl;
        // connected.
        pipe = rank_pipe[ inst.rank ];
      } else {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl;
        // not connected.
        pipe = connect_rank( inst );
      }
    } else {
      // unknown dest rank or rank addr.
      if (looking_up.count(dest) == 0) {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
        lookup(dest);
      } else {
        dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
      }
      waiting_for_lookup[dest].push_back(m);
    }
  }
  lock.Unlock();

  // do it
  if (entity) {
    // local!
    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
    entity->queue_message(m);
  }
  else if (pipe) {
    // remote!
    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
    pipe->send(m);
  }
}

void Rank::dispatch(Message *m)
{
  lock.Lock();

  dout(10) << "dispatching " << *m << endl;

  switch (m->get_type()) {
  case MSG_NS_CONNECTACK:
    handle_connect_ack((MNSConnectAck*)m);
    break;

  case MSG_NS_REGISTERACK:
    handle_register_ack((MNSRegisterAck*)m);
    break;

  case MSG_NS_LOOKUPREPLY:
    handle_lookup_reply((MNSLookupReply*)m);
    break;

  default:
    assert(0);
  }

  lock.Unlock();
}

void Rank::handle_connect_ack(MNSConnectAck *m)
{
  dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl;
  my_rank = m->get_rank();

  my_inst.addr = accepter.listen_addr;
  my_inst.rank = my_rank;

  waiting_for_rank.SignalAll();
  delete m;

  // logger!
  /*dout(10) << "logger" << endl;
  char names[100];
  sprintf(names, "rank%d", my_rank);
  string name = names;

  if (g_conf.tcp_log) {
    logger = new Logger(name, (LogType*)&rank_logtype);
    rank_logtype.add_set("num");
    rank_logtype.add_inc("in");
    rank_logtype.add_inc("inb");
    rank_logtype.add_inc("dis");
    rank_logtype.add_set("inq");
    rank_logtype.add_set("inqb");
    rank_logtype.add_set("outq");
    rank_logtype.add_set("outqb");
  }
  */
}

void Rank::handle_register_ack(MNSRegisterAck *m)
{
  dout(10) << "handle_register_ack " << m->get_entity() << endl;

  long tid = m->get_tid();
  waiting_for_register_result[tid] = m->get_entity();
  waiting_for_register_cond[tid]->Signal();
  delete m;
}

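// entity_inst_t ordering doubles as a freshness check throughout this
// file: a lookup result (or failure report) whose instance compares less
// than what we already have describes an older incarnation and is ignored.
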
void Rank::handle_lookup_reply(MNSLookupReply *m)
{
  list<Message*> waiting;
  dout(10) << "got lookup reply" << endl;

  for (map<entity_name_t,entity_inst_t>::iterator it = m->entity_map.begin();
       it != m->entity_map.end();
       it++) {
    dout(10) << "lookup got " << it->first << " at " << it->second << endl;
    entity_name_t addr = it->first;
    entity_inst_t inst = it->second;

    if (entity_map.count(addr) &&
        entity_map[addr] > inst) {
      dout(10) << "ignoring lookup results for " << addr << ", "
               << entity_map[addr] << " > " << inst << endl;
      continue;
    }

    // update map.
    entity_map[addr] = inst;

    if (inst.rank == my_rank) {
      // local
      dout(10) << "delivering lookup results locally" << endl;
      if (local.count(addr)) {
        if (g_conf.ms_single_dispatch) {
          single_dispatch_queue.splice(single_dispatch_queue.end(),
                                       waiting_for_lookup[addr]);
        } else {
          local[addr]->queue_messages(waiting_for_lookup[addr]);
        }
        waiting_for_lookup.erase(addr);
      } else
        lookup(addr);  // try again!

    } else {
      // remote
      if (rank_pipe.count(inst.rank) == 0)
        connect_rank(inst);
      else if (rank_pipe[inst.rank]->get_peer_inst() != inst) {
        dout(0) << "lookup got rank addr change, WATCH OUT" << endl;
        // FIXME BUG possible message loss weirdness?
        rank_pipe[inst.rank]->close();
        rank_pipe.erase(inst.rank);
        connect_rank(inst);
      }

      // take waiters
      Pipe *pipe = rank_pipe[inst.rank];
      assert(pipe);

      if (waiting_for_lookup.count(addr)) {
        pipe->send(waiting_for_lookup[addr]);
        waiting_for_lookup.erase(addr);
      }
    }
  }

  delete m;
}

void Rank::wait()
{
  lock.Lock();
  while (1) {
    // reap dead pipes
    reaper();

    if (local.size() == 0) {
      dout(10) << "wait: everything stopped" << endl;
      break;  // everything stopped.
    }

    if (local.size() == 1 &&
        !messenger->is_stopped()) {
      dout(10) << "wait: stopping rank" << endl;
      lock.Unlock();
      messenger->shutdown();
      delete messenger;
      lock.Lock();
      continue;
    }

    wait_cond.Wait(lock);
  }
  lock.Unlock();

  // done!  clean up.

  // stop dispatch thread
  if (g_conf.ms_single_dispatch) {
    dout(10) << "wait: stopping dispatch thread" << endl;
    lock.Lock();
    single_dispatch_stop = true;
    single_dispatch_cond.Signal();
    lock.Unlock();
    single_dispatcher.join();
  }

  // reap pipes
  lock.Lock();
  {
    dout(10) << "wait: closing pipes" << endl;
    list<Pipe*> toclose;
    for (hash_map<int,Pipe*>::iterator i = rank_pipe.begin();
         i != rank_pipe.end();
         i++)
      toclose.push_back(i->second);
    for (list<Pipe*>::iterator i = toclose.begin();
         i != toclose.end();
         i++)
      (*i)->close();

    dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl;
    while (!pipes.empty()) {
      wait_cond.Wait(lock);
      reaper();
    }
  }
  lock.Unlock();

  dout(10) << "wait: done." << endl;
}

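// Nameserver discovery: try the .ceph_ns file in the current directory
// first, then the CEPH_NAMESERVER environment variable.
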
int Rank::find_ns_addr(tcpaddr_t &nsa)
{
  // file?
  int fd = ::open(".ceph_ns", O_RDONLY);
  if (fd >= 0) {
    ::read(fd, (void*)&nsa, sizeof(nsa));
    ::close(fd);
    cout << "ceph ns is " << nsa << endl;
    return 0;
  }

  // env var?  (getenv returns just the value, no "CEPH_NAMESERVER=" prefix)
  char *nsaddr = getenv("CEPH_NAMESERVER");
  if (nsaddr) {
    if (tcp_hostlookup(nsaddr, nsa) < 0) {
      cout << "can't resolve " << nsaddr << endl;
      return -1;
    }

    cout << "ceph ns is " << nsa << endl;
    return 0;
  }

  cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
  return -1;
}

/**********************************
 * EntityMessenger
 */

Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) :
  Messenger(myaddr),
  stop(false),
  dispatch_thread(this)
{
}
Rank::EntityMessenger::~EntityMessenger()
{
}

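// Unless ms_single_dispatch is set, every EntityMessenger runs its own
// dispatch thread: queued messages are drained in batches, with the lock
// dropped while each batch is delivered.
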
void Rank::EntityMessenger::dispatch_entry()
{
  lock.Lock();
  while (!stop) {
    if (!dispatch_queue.empty()) {
      list<Message*> ls;
      ls.swap(dispatch_queue);

      lock.Unlock();
      {
        // deliver
        while (!ls.empty()) {
          Message *m = ls.front();
          ls.pop_front();
          dout(1) //<< g_clock.now()
                  << "---- "
                  << m->get_source()// << ':' << m->get_source_port()
                  << " to " << m->get_dest()// << ':' << m->get_dest_port()
                  << " ---- " << m->get_type_name()
                  << " ---- " << m->get_source_inst()
                  << " ---- " << m
                  << endl;
          dispatch(m);
        }
      }
      lock.Lock();
      continue;
    }
    cond.Wait(lock);
  }
  lock.Unlock();
}

void Rank::EntityMessenger::ready()
{
  dout(10) << "ready " << get_myaddr() << endl;

  if (g_conf.ms_single_dispatch) {
    rank.lock.Lock();
    if (rank.waiting_for_ready.count(get_myaddr())) {
      rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
                                        rank.waiting_for_ready[get_myaddr()]);
      rank.waiting_for_ready.erase(get_myaddr());
      rank.single_dispatch_cond.Signal();
    }
    rank.lock.Unlock();
  } else {
    // start my dispatch thread
    dispatch_thread.create();
  }

  // tell namer
  if (get_myaddr() != MSG_ADDR_NAMER(0) &&
      get_myaddr() != MSG_ADDR_RANK(0))
    send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst);
}

int Rank::EntityMessenger::shutdown()
{
  dout(10) << "shutdown " << get_myaddr() << endl;

  // deregister
  rank.unregister_entity(this);

  // stop my dispatch thread
  if (dispatch_thread.am_self()) {
    dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
    stop = true;
  } else {
    dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
    lock.Lock();
    stop = true;
    cond.Signal();
    lock.Unlock();
    dispatch_thread.join();
  }

  return 0;
}

void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst)
{
  rank.lock.Lock();
  {
    if (rank.rank_pipe.count(inst.rank) == 0)
      rank.connect_rank(inst);
  }
  rank.lock.Unlock();
}

int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst,
                                        int port, int fromport)
{
  // set envelope
  m->set_source(get_myaddr(), fromport);
  m->set_dest(dest, port);

  m->set_source_inst(rank.my_inst);

  dout(1) << "--> "
          << m->get_source() //<< ':' << m->get_source_port()
          << " to " << m->get_dest() //<< ':' << m->get_dest_port()
          << " ---- " << m->get_type_name()
          << " ---- " << rank.my_inst << " --> " << inst
          << " ---- " << m
          << endl;

  rank.submit_message(m, inst);

  return 0;
}

int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport)
{
  // set envelope
  m->set_source(get_myaddr(), fromport);
  m->set_dest(dest, port);

  m->set_source_inst(rank.my_inst);

  dout(1) << "--> "
          << m->get_source() //<< ':' << m->get_source_port()
          << " to " << m->get_dest() //<< ':' << m->get_dest_port()
          << " ---- " << m->get_type_name()
          << " ---- " << rank.my_inst << " --> ? (DEPRECATED)"
          << " ---- " << m
          << endl;

  rank.submit_message(m);

  return 0;
}

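// mark_down / mark_up adjust the directory when another entity is declared
// dead or alive; the entity_inst_t comparison again protects against
// reports about an older incarnation.
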
void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i)
{
  assert(a != get_myaddr());
  rank.mark_down(a, i);
}

void Rank::mark_down(entity_name_t a, entity_inst_t& inst)
{
  //if (my_rank == 0) return;  // ugh.. rank0 already handles this stuff in the namer
  lock.Lock();
  if (entity_map.count(a) &&
      entity_map[a] > inst) {
    dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
    derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
    // do nothing!
  } else {
    if (entity_map.count(a) == 0) {
      // don't know it
      dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
      derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;

      waiting_for_lookup.erase(a);
      looking_up.erase(a);
    } else {
      // know it
      assert(entity_map[a] <= inst);
      dout(10) << "mark_down " << a << " inst " << inst << endl;
      derr(10) << "mark_down " << a << " inst " << inst << endl;

      entity_map.erase(a);

      if (rank_pipe.count(inst.rank)) {
        rank_pipe[inst.rank]->close();
        rank_pipe.erase(inst.rank);
      }

      // kill rank# too?  only if i'm the namer.
      if (my_rank == 0) {
        entity_map.erase(MSG_ADDR_RANK(inst.rank));
      }
    }
  }
  lock.Unlock();
}

void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i)
{
  assert(a != get_myaddr());
  rank.mark_up(a, i);
}

void Rank::mark_up(entity_name_t a, entity_inst_t& i)
{
  if (my_rank == 0) return;
  lock.Lock();
  {
    dout(10) << "mark_up " << a << " inst " << i << endl;
    derr(10) << "mark_up " << a << " inst " << i << endl;

    assert(i.rank != my_rank);  // hrm?

    if (entity_map.count(a) == 0 ||
        entity_map[a] < i) {
      entity_map[a] = i;
      connect_rank(i);
    } else if (entity_map[a] == i) {
      dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
      derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
    } else {
      dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
      derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
    }

    //if (waiting_for_lookup.count(a))
    //lookup(a);
  }
  lock.Unlock();
}