mds: name mds daemon instances independent of their current logical rank

This commit is contained in:
Sage Weil 2009-03-11 13:04:03 -07:00
parent 735d89bbc3
commit ddd08fd767
12 changed files with 80 additions and 61 deletions

View File

@ -35,7 +35,7 @@ using namespace std;
void usage()
{
cerr << "usage: cmds [flags] [--mds rank] [--shadow rank]\n";
cerr << "usage: cmds name [flags] [--mds rank] [--shadow rank]\n";
cerr << " -m monitorip:port\n";
cerr << " connect to monitor at given address\n";
cerr << " --debug_mds n\n";
@ -52,22 +52,24 @@ int main(int argc, const char **argv)
common_init(args, "mds");
// mds specific args
const char *monhost = 0;
int whoami = -1;
int shadow = -1;
const char *name = 0;
// Parse the mds-specific arguments:
//   --mds <rank>     sets whoami
//   --shadow <rank>  sets both whoami and shadow
//   first bare word  is stored in monhost, the second in name
// Anything beyond that is rejected with the usage text.
// NOTE(review): the usage text lists "name" as the first positional argument,
// yet the first bare word lands in monhost -- confirm this ordering is intended.
for (unsigned i=0; i<args.size(); i++) {
  const bool takes_rank =
    strcmp(args[i], "--mds") == 0 || strcmp(args[i], "--shadow") == 0;
  if (takes_rank && i + 1 >= args.size()) {
    // the flag is the last argument: there is no value to hand to atoi()
    cerr << "missing rank after " << args[i] << std::endl;
    usage();
    return -1;
  }
  if (strcmp(args[i], "--mds") == 0)
    whoami = atoi(args[++i]);
  else if (strcmp(args[i], "--shadow") == 0)
    whoami = shadow = atoi(args[++i]);
  else if (monhost == 0)
    monhost = args[i];
  else if (!name)
    name = args[i];
  else {
    cerr << "unrecognized arg " << args[i] << std::endl;
    usage();
    return -1;
  }
}
// A daemon name is mandatory. Bail out the same way the unrecognized-arg path
// does, so a null name is never passed on to the MDS constructor.
if (!name) {
  usage();
  return -1;
}
if (g_conf.clock_tare) g_clock.tare();
@ -93,7 +95,7 @@ int main(int argc, const char **argv)
rank.start();
// start mds
MDS *mds = new MDS(whoami, m, &monmap);
MDS *mds = new MDS(name, whoami, m, &monmap);
mds->standby_replay_for = shadow;
mds->init();

View File

@ -29,5 +29,7 @@
host = cosd5
btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0"
[mds0]
[mds.foo]
host = cosd0
[mds.bar]
host = cosd0

View File

@ -44,8 +44,8 @@
#define CEPH_MDS_PROTOCOL 5 /* cluster internal */
#define CEPH_MON_PROTOCOL 4 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 5 /* public/client */
#define CEPH_MDSC_PROTOCOL 7 /* public/client */
#define CEPH_MONC_PROTOCOL 6 /* public/client */
#define CEPH_MDSC_PROTOCOL 8 /* public/client */
#define CEPH_MONC_PROTOCOL 7 /* public/client */
/*

View File

@ -133,7 +133,8 @@ get_name_list "$@"
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
num=`echo $name | cut -c 4-`
id=`echo $name | cut -c 4- | sed 's/\\.//'`
num=$id
sections="$name $type global"
check_host || continue
@ -149,8 +150,12 @@ for name in $what; do
fi
if [[ $name =~ "mds" ]]; then
get_conf mds "" "mds" $sections
get_conf shadow "" "shadow" $sections
module_opt="$mon_addr_arg"
module_bin="$BINDIR/cmds"
module_bin="$BINDIR/cmds $id"
# Append the optional rank flags to the cmds command line. These must be plain
# assignments (no leading '$'): "$module_bin=..." would be expanded and run as
# a command instead of updating module_bin.
[[ $mds != "" ]] && module_bin="$module_bin --mds $mds"
[[ $shadow != "" ]] && module_bin="$module_bin --shadow $shadow"
fi
if [[ $name =~ "osd" ]]; then

View File

@ -71,9 +71,10 @@
// cons/des
MDS::MDS(int whoami_, Messenger *m, MonMap *mm) :
MDS::MDS(const char *n, int whoami_, Messenger *m, MonMap *mm) :
mds_lock("MDS::mds_lock"),
timer(mds_lock),
name(n),
whoami(whoami_), incarnation(0),
standby_replay_for(-1),
messenger(m),
@ -450,7 +451,7 @@ void MDS::beacon_send()
beacon_seq_stamp[beacon_last_seq] = g_clock.now();
messenger->send_message(new MMDSBeacon(monmap->fsid, mdsmap->get_epoch(),
messenger->send_message(new MMDSBeacon(monmap->fsid, name, mdsmap->get_epoch(),
want_state, beacon_last_seq, want_rank),
monmap->get_inst(mon));

View File

@ -117,6 +117,7 @@ class MDS : public Dispatcher {
Mutex mds_lock;
SafeTimer timer;
string name;
int whoami;
int incarnation;
@ -263,7 +264,7 @@ class MDS : public Dispatcher {
private:
virtual bool dispatch_impl(Message *m);
public:
MDS(int whoami, Messenger *m, MonMap *mm);
MDS(const char *n, int whoami, Messenger *m, MonMap *mm);
~MDS();
// who am i etc

View File

@ -39,7 +39,7 @@ void MDSMap::print(ostream& out)
for (map<entity_addr_t,mds_info_t>::iterator p = mds_info.begin();
p != mds_info.end();
p++)
foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.mds, p->second.inc-1), p->first));
foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.rank, p->second.inc-1), p->first));
for (multimap< pair<unsigned,unsigned>, entity_addr_t >::iterator p = foo.begin();
p != foo.end();
@ -47,7 +47,8 @@ void MDSMap::print(ostream& out)
mds_info_t& info = mds_info[p->second];
out << info.addr
<< " mds" << info.mds
<< " '" << info.name << "'"
<< " mds" << info.rank
<< "." << info.inc
<< " " << get_state_name(info.state)
<< " seq " << info.state_seq;

View File

@ -104,22 +104,24 @@ class MDSMap {
}
struct mds_info_t {
int32_t mds;
string name;
int32_t rank;
int32_t inc;
int32_t state;
version_t state_seq;
entity_addr_t addr;
utime_t laggy_since;
mds_info_t() : mds(-1), inc(0), state(STATE_STANDBY), state_seq(0) { }
mds_info_t() : rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { }
bool laggy() const { return !(laggy_since == utime_t()); }
void clear_laggy() { laggy_since = utime_t(); }
entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(mds), addr); }
entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); }
void encode(bufferlist& bl) const {
::encode(mds, bl);
::encode(name, bl);
::encode(rank, bl);
::encode(inc, bl);
::encode(state, bl);
::encode(state_seq, bl);
@ -127,7 +129,8 @@ class MDSMap {
::encode(laggy_since, bl);
}
void decode(bufferlist::iterator& bl) {
::decode(mds, bl);
::decode(name, bl);
::decode(rank, bl);
::decode(inc, bl);
::decode(state, bl);
::decode(state_seq, bl);
@ -240,14 +243,14 @@ class MDSMap {
p != mds_info.end();
p++)
if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING)
s.insert(p->second.mds);
s.insert(p->second.rank);
}
void get_mds_set(set<int>& s, int state) {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++)
if (p->second.state == state)
s.insert(p->second.mds);
s.insert(p->second.rank);
}
int get_random_up_mds() {
@ -262,7 +265,7 @@ class MDSMap {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
if (p->second.mds == mds &&
if (p->second.rank == mds &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
a = p->second.addr;
@ -272,7 +275,7 @@ class MDSMap {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
if (p->second.mds == -1 &&
if (p->second.rank == -1 &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
a = p->second.addr;
@ -355,7 +358,7 @@ class MDSMap {
int get_rank(const entity_addr_t& addr) {
if (mds_info.count(addr))
return mds_info[addr].mds;
return mds_info[addr].rank;
return -1;
}

View File

@ -23,6 +23,7 @@
class MMDSBeacon : public Message {
ceph_fsid_t fsid;
string name;
epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree
__u32 state;
version_t seq;
@ -30,11 +31,12 @@ class MMDSBeacon : public Message {
public:
MMDSBeacon() : Message(MSG_MDS_BEACON) {}
MMDSBeacon(ceph_fsid_t &f, epoch_t les, int st, version_t se, int wr) :
MMDSBeacon(ceph_fsid_t &f, string& n, epoch_t les, int st, version_t se, int wr) :
Message(MSG_MDS_BEACON),
fsid(f), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { }
fsid(f), name(n), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { }
ceph_fsid_t& get_fsid() { return fsid; }
string& get_name() { return name; }
epoch_t get_last_epoch_seen() { return last_epoch_seen; }
int get_state() { return state; }
version_t get_seq() { return seq; }
@ -42,7 +44,7 @@ class MMDSBeacon : public Message {
int get_want_rank() { return want_rank; }
void print(ostream& out) {
out << "mdsbeacon(" << MDSMap::get_state_name(state)
out << "mdsbeacon(" << name << " " << MDSMap::get_state_name(state)
<< " seq " << seq << ")";
}
@ -51,6 +53,7 @@ class MMDSBeacon : public Message {
::encode(last_epoch_seen, payload);
::encode(state, payload);
::encode(seq, payload);
::encode(name, payload);
::encode(want_rank, payload);
}
void decode_payload() {
@ -59,6 +62,7 @@ class MMDSBeacon : public Message {
::decode(last_epoch_seen, p);
::decode(state, p);
::decode(seq, p);
::decode(name, p);
::decode(want_rank, p);
}
};

View File

@ -112,7 +112,8 @@ fi
# create monitors, osds
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $name is 'mon1'
num=`echo $name | cut -c 4-`
id=`echo $name | cut -c 4- | sed 's/\\.//'`
num=$id
sections="$name $type global"
check_host || continue

View File

@ -219,8 +219,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
if (info.state == MDSMap::STATE_STANDBY &&
state == MDSMap::STATE_STANDBY_REPLAY &&
(pending_mdsmap.is_degraded() ||
pending_mdsmap.get_state(info.mds) < MDSMap::STATE_ACTIVE)) {
dout(10) << "mds_beacon can't standby-replay mds" << info.mds << " at this time (cluster degraded, or mds not active)" << dendl;
pending_mdsmap.get_state(info.rank) < MDSMap::STATE_ACTIVE)) {
dout(10) << "mds_beacon can't standby-replay mds" << info.rank << " at this time (cluster degraded, or mds not active)" << dendl;
goto ignore;
}
@ -232,7 +232,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// note time and reply
dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl;
last_beacon[addr] = g_clock.now();
mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid,
mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, m->get_name(),
mdsmap.get_epoch(), state, seq, 0),
m->get_orig_source_inst());
@ -288,7 +288,8 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
// add
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
info.mds = standby_for;
info.name = m->get_name();
info.rank = standby_for;
info.addr = addr;
info.state = MDSMap::STATE_STANDBY;
@ -304,15 +305,15 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
info.clear_laggy();
}
dout(10) << "prepare_beacon mds" << info.mds
dout(10) << "prepare_beacon mds" << info.rank
<< " " << MDSMap::get_state_name(info.state)
<< " -> " << MDSMap::get_state_name(state)
<< dendl;
if (state == MDSMap::STATE_STOPPED) {
pending_mdsmap.up.erase(info.mds);
pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(addr);
pending_mdsmap.stopped.insert(info.mds);
pending_mdsmap.in.erase(info.mds);
pending_mdsmap.stopped.insert(info.rank);
pending_mdsmap.in.erase(info.rank);
} else {
info.state = state;
info.state_seq = seq;
@ -514,7 +515,7 @@ void MDSMonitor::tick()
dout(1) << "adding standby " << addr << " as mds" << mds << dendl;
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
info.mds = mds;
info.rank = mds;
if (pending_mdsmap.stopped.count(mds))
info.state = MDSMap::STATE_STARTING;
else
@ -552,17 +553,17 @@ void MDSMonitor::tick()
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
dout(10) << "no beacon from " << addr << " mds" << info.mds << "." << info.inc
dout(10) << "no beacon from " << addr << " mds" << info.rank << "." << info.inc
<< " " << MDSMap::get_state_name(info.state)
<< " since " << since << dendl;
// are we in?
// and is there a non-laggy standby that can take over for us?
entity_addr_t sa;
if (info.mds >= 0 &&
if (info.rank >= 0 &&
info.state > 0 && //|| info.state == MDSMap::STATE_STANDBY_REPLAY) &&
pending_mdsmap.find_standby_for(info.mds, sa)) {
dout(10) << " replacing " << addr << " mds" << info.mds << "." << info.inc
pending_mdsmap.find_standby_for(info.rank, sa)) {
dout(10) << " replacing " << addr << " mds" << info.rank << "." << info.inc
<< " " << MDSMap::get_state_name(info.state)
<< " with " << sa << dendl;
MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
@ -583,10 +584,10 @@ void MDSMonitor::tick()
default:
assert(0);
}
si.mds = info.mds;
si.rank = info.rank;
if (si.state > 0) {
si.inc = ++pending_mdsmap.inc[info.mds];
pending_mdsmap.up[info.mds] = sa;
si.inc = ++pending_mdsmap.inc[info.rank];
pending_mdsmap.up[info.rank] = sa;
pending_mdsmap.last_failure = pending_mdsmap.epoch;
}
pending_mdsmap.mds_info.erase(addr);
@ -601,14 +602,14 @@ void MDSMonitor::tick()
do_propose = true;
} else if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
dout(10) << " failing " << addr << " mds" << info.mds << "." << info.inc
dout(10) << " failing " << addr << " mds" << info.rank << "." << info.inc
<< " " << MDSMap::get_state_name(info.state)
<< dendl;
pending_mdsmap.mds_info.erase(addr);
do_propose = true;
} else if (!info.laggy()) {
// just mark laggy
dout(10) << " marking " << addr << " mds" << info.mds << "." << info.inc
dout(10) << " marking " << addr << " mds" << info.rank << "." << info.inc
<< " " << MDSMap::get_state_name(info.state)
<< " laggy" << dendl;
info.laggy_since = now;
@ -636,7 +637,7 @@ void MDSMonitor::tick()
dout(0) << " taking over failed mds" << f << " with " << sa << dendl;
MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
si.state = MDSMap::STATE_REPLAY;
si.mds = f;
si.rank = f;
si.inc = ++pending_mdsmap.inc[f];
pending_mdsmap.in.insert(f);
pending_mdsmap.up[f] = sa;
@ -656,10 +657,10 @@ void MDSMonitor::tick()
p != pending_mdsmap.mds_info.end();
p++) {
if (p->second.state == MDSMap::STATE_STANDBY_REPLAY)
shadowed.insert(p->second.mds);
shadowed.insert(p->second.rank);
if (p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy())
avail[p->second.mds].insert(p->first);
avail[p->second.rank].insert(p->first);
}
// find an mds that needs a standby
@ -682,7 +683,7 @@ void MDSMonitor::tick()
dout(10) << "mds" << *p << " will be shadowed by " << s << dendl;
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[s];
info.mds = *p;
info.rank = *p;
info.state = MDSMap::STATE_STANDBY_REPLAY;
do_propose = true;
}
@ -715,21 +716,21 @@ void MDSMonitor::do_stop()
info.state = MDSMap::STATE_STOPPING;
break;
case MDSMap::STATE_STARTING:
pending_mdsmap.stopped.insert(info.mds);
pending_mdsmap.stopped.insert(info.rank);
case MDSMap::STATE_CREATING:
pending_mdsmap.up.erase(info.mds);
pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(info.addr);
pending_mdsmap.in.erase(info.mds);
pending_mdsmap.in.erase(info.rank);
break;
case MDSMap::STATE_REPLAY:
case MDSMap::STATE_RESOLVE:
case MDSMap::STATE_RECONNECT:
case MDSMap::STATE_REJOIN:
// BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they?
pending_mdsmap.failed.insert(info.mds);
pending_mdsmap.up.erase(info.mds);
pending_mdsmap.failed.insert(info.rank);
pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(info.addr);
pending_mdsmap.in.erase(info.mds);
pending_mdsmap.in.erase(info.rank);
break;
}
}

View File

@ -17,10 +17,10 @@
[global]
conf = ceph.conf
restart on core dump = true
pid file = /var/run/ceph/$name.pid
; monitor
[mon]
pid file = /var/run/ceph/mon$mon.pid
[mon0]
host = alpha
@ -39,15 +39,13 @@
; mds
[mds]
pid file = /var/run/ceph/mds$mds.pid
[mds0]
[mds.alpha]
host = alpha
; osd
[osd]
pid file = /var/run/ceph/osd$osd.pid
sudo = true
[osd0]