From ddd08fd767165414d7d2ea30c1b193c33bc41c22 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 11 Mar 2009 13:04:03 -0700 Subject: [PATCH] mds: name mds daemon instances independent of their current logical rank --- src/cmds.cc | 12 +++++---- src/cosd.cluster.conf | 4 ++- src/include/ceph_fs.h | 4 +-- src/init-ceph | 9 +++++-- src/mds/MDS.cc | 5 ++-- src/mds/MDS.h | 3 ++- src/mds/MDSMap.cc | 5 ++-- src/mds/MDSMap.h | 23 +++++++++------- src/messages/MMDSBeacon.h | 10 ++++--- src/mkcephfs | 3 ++- src/mon/MDSMonitor.cc | 57 ++++++++++++++++++++------------------- src/sample.cluster.conf | 6 ++--- 12 files changed, 80 insertions(+), 61 deletions(-) diff --git a/src/cmds.cc b/src/cmds.cc index 05555bc0e34..ca1f242c306 100644 --- a/src/cmds.cc +++ b/src/cmds.cc @@ -35,7 +35,7 @@ using namespace std; void usage() { - cerr << "usage: cmds [flags] [--mds rank] [--shadow rank]\n"; + cerr << "usage: cmds name [flags] [--mds rank] [--shadow rank]\n"; cerr << " -m monitorip:port\n"; cerr << " connect to monitor at given address\n"; cerr << " --debug_mds n\n"; @@ -52,22 +52,24 @@ int main(int argc, const char **argv) common_init(args, "mds"); // mds specific args - const char *monhost = 0; int whoami = -1; int shadow = -1; + const char *name = 0; for (unsigned i=0; istandby_replay_for = shadow; mds->init(); diff --git a/src/cosd.cluster.conf b/src/cosd.cluster.conf index c1dcc4bb731..7a3e639dd33 100644 --- a/src/cosd.cluster.conf +++ b/src/cosd.cluster.conf @@ -29,5 +29,7 @@ host = cosd5 btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0" -[mds0] +[mds.foo] + host = cosd0 +[mds.bar] host = cosd0 \ No newline at end of file diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 060d8d05dda..edb5c45d588 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -44,8 +44,8 @@ #define CEPH_MDS_PROTOCOL 5 /* cluster internal */ #define CEPH_MON_PROTOCOL 4 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 5 /* public/client */ -#define CEPH_MDSC_PROTOCOL 7 /* public/client */ -#define CEPH_MONC_PROTOCOL 6 /* public/client */ +#define CEPH_MDSC_PROTOCOL 8 /* public/client */ +#define CEPH_MONC_PROTOCOL 7 /* public/client */ /* diff --git a/src/init-ceph b/src/init-ceph index 03e42287efd..27c60be4658 100755 --- a/src/init-ceph +++ b/src/init-ceph @@ -133,7 +133,8 @@ get_name_list "$@" for name in $what; do type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' - num=`echo $name | cut -c 4-` + id=`echo $name | cut -c 4- | sed 's/\\.//'` + num=$id sections="$name $type global" check_host || continue @@ -149,8 +150,12 @@ for name in $what; do fi if [[ $name =~ "mds" ]]; then + get_conf mds "" "mds" $sections + get_conf shadow "" "shadow" $sections module_opt="$mon_addr_arg" - module_bin="$BINDIR/cmds" + module_bin="$BINDIR/cmds $id" + [[ $mds != "" ]] && $module_bin="$module_bin --mds $mds" + [[ $shadow != "" ]] && $module_bin="$module_bin --shadow $shadow" fi if [[ $name =~ "osd" ]]; then diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index af9e387d70a..7167291bdfc 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -71,9 +71,10 @@ // cons/des -MDS::MDS(int whoami_, Messenger *m, MonMap *mm) : +MDS::MDS(const char *n, int whoami_, Messenger *m, MonMap *mm) : mds_lock("MDS::mds_lock"), timer(mds_lock), + name(n), whoami(whoami_), incarnation(0), standby_replay_for(-1), messenger(m), @@ -450,7 +451,7 @@ void MDS::beacon_send() beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - messenger->send_message(new MMDSBeacon(monmap->fsid, mdsmap->get_epoch(), + messenger->send_message(new MMDSBeacon(monmap->fsid, name, mdsmap->get_epoch(), want_state, beacon_last_seq, want_rank), monmap->get_inst(mon)); diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 5bfec8ca7b4..19d90f1762f 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -117,6 +117,7 @@ class MDS : public Dispatcher { Mutex mds_lock; SafeTimer timer; + string name; int whoami; int incarnation; @@ -263,7 +264,7 @@ class MDS : public Dispatcher { private: virtual bool dispatch_impl(Message *m); public: - MDS(int whoami, Messenger *m, MonMap *mm); + MDS(const char *n, int whoami, Messenger *m, MonMap *mm); ~MDS(); // who am i etc diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 01242bc6bd4..217a28b6830 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -39,7 +39,7 @@ void MDSMap::print(ostream& out) for (map::iterator p = mds_info.begin(); p != mds_info.end(); p++) - foo.insert(pair,entity_addr_t>(pair(p->second.mds, p->second.inc-1), p->first)); + foo.insert(pair,entity_addr_t>(pair(p->second.rank, p->second.inc-1), p->first)); for (multimap< pair, entity_addr_t >::iterator p = foo.begin(); p != foo.end(); @@ -47,7 +47,8 @@ void MDSMap::print(ostream& out) mds_info_t& info = mds_info[p->second]; out << info.addr - << " mds" << info.mds + << " '" << info.name << "'" + << " mds" << info.rank << "." << info.inc << " " << get_state_name(info.state) << " seq " << info.state_seq; diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 8428c4d77ce..7baf1c00814 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -104,22 +104,24 @@ class MDSMap { } struct mds_info_t { - int32_t mds; + string name; + int32_t rank; int32_t inc; int32_t state; version_t state_seq; entity_addr_t addr; utime_t laggy_since; - mds_info_t() : mds(-1), inc(0), state(STATE_STANDBY), state_seq(0) { } + mds_info_t() : rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { } bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } - entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(mds), addr); } + entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); } void encode(bufferlist& bl) const { - ::encode(mds, bl); + ::encode(name, bl); + ::encode(rank, bl); ::encode(inc, bl); ::encode(state, bl); ::encode(state_seq, bl); @@ -127,7 +129,8 @@ class MDSMap { ::encode(laggy_since, bl); } void decode(bufferlist::iterator& bl) { - ::decode(mds, bl); + ::decode(name, bl); + ::decode(rank, bl); ::decode(inc, bl); ::decode(state, bl); ::decode(state_seq, bl); @@ -240,14 +243,14 @@ class MDSMap { p != mds_info.end(); p++) if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) - s.insert(p->second.mds); + s.insert(p->second.rank); } void get_mds_set(set& s, int state) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) if (p->second.state == state) - s.insert(p->second.mds); + s.insert(p->second.rank); } int get_random_up_mds() { @@ -262,7 +265,7 @@ class MDSMap { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) { - if (p->second.mds == mds && + if (p->second.rank == mds && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { a = p->second.addr; @@ -272,7 +275,7 @@ class MDSMap { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) { - if (p->second.mds == -1 && + if (p->second.rank == -1 && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { a = p->second.addr; @@ -355,7 +358,7 @@ class MDSMap { int get_rank(const entity_addr_t& addr) { if (mds_info.count(addr)) - return mds_info[addr].mds; + return mds_info[addr].rank; return -1; } diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index 168cc2122fe..2228a469257 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -23,6 +23,7 @@ class MMDSBeacon : public Message { ceph_fsid_t fsid; + string name; epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree __u32 state; version_t seq; @@ -30,11 +31,12 @@ class MMDSBeacon : public Message { public: MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(ceph_fsid_t &f, epoch_t les, int st, version_t se, int wr) : + MMDSBeacon(ceph_fsid_t &f, string& n, epoch_t les, int st, version_t se, int wr) : Message(MSG_MDS_BEACON), - fsid(f), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { } + fsid(f), name(n), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { } ceph_fsid_t& get_fsid() { return fsid; } + string& get_name() { return name; } epoch_t get_last_epoch_seen() { return last_epoch_seen; } int get_state() { return state; } version_t get_seq() { return seq; } @@ -42,7 +44,7 @@ class MMDSBeacon : public Message { int get_want_rank() { return want_rank; } void print(ostream& out) { - out << "mdsbeacon(" << MDSMap::get_state_name(state) + out << "mdsbeacon(" << name << " " << MDSMap::get_state_name(state) << " seq " << seq << ")"; } @@ -51,6 +53,7 @@ class MMDSBeacon : public Message { ::encode(last_epoch_seen, payload); ::encode(state, payload); ::encode(seq, payload); + ::encode(name, payload); ::encode(want_rank, payload); } void decode_payload() { @@ -59,6 +62,7 @@ class MMDSBeacon : public Message { ::decode(last_epoch_seen, p); ::decode(state, p); ::decode(seq, p); + ::decode(name, p); ::decode(want_rank, p); } }; diff --git a/src/mkcephfs b/src/mkcephfs index 75c25decdb9..95bfa34291d 100755 --- a/src/mkcephfs +++ b/src/mkcephfs @@ -112,7 +112,8 @@ fi # create monitors, osds for name in $what; do type=`echo $name | cut -c 1-3` # e.g. 'mon', if $name is 'mon1' - num=`echo $name | cut -c 4-` + id=`echo $name | cut -c 4- | sed 's/\\.//'` + num=$id sections="$name $type global" check_host || continue diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 9ea4288c195..b9b35c1ad3b 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -219,8 +219,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) if (info.state == MDSMap::STATE_STANDBY && state == MDSMap::STATE_STANDBY_REPLAY && (pending_mdsmap.is_degraded() || - pending_mdsmap.get_state(info.mds) < MDSMap::STATE_ACTIVE)) { - dout(10) << "mds_beacon can't standby-replay mds" << info.mds << " at this time (cluster degraded, or mds not active)" << dendl; + pending_mdsmap.get_state(info.rank) < MDSMap::STATE_ACTIVE)) { + dout(10) << "mds_beacon can't standby-replay mds" << info.rank << " at this time (cluster degraded, or mds not active)" << dendl; goto ignore; } @@ -232,7 +232,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // note time and reply dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl; last_beacon[addr] = g_clock.now(); - mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, + mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, m->get_name(), mdsmap.get_epoch(), state, seq, 0), m->get_orig_source_inst()); @@ -288,7 +288,8 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) // add MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - info.mds = standby_for; + info.name = m->get_name(); + info.rank = standby_for; info.addr = addr; info.state = MDSMap::STATE_STANDBY; @@ -304,15 +305,15 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) info.clear_laggy(); } - dout(10) << "prepare_beacon mds" << info.mds + dout(10) << "prepare_beacon mds" << info.rank << " " << MDSMap::get_state_name(info.state) << " -> " << MDSMap::get_state_name(state) << dendl; if (state == MDSMap::STATE_STOPPED) { - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(addr); - pending_mdsmap.stopped.insert(info.mds); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.stopped.insert(info.rank); + pending_mdsmap.in.erase(info.rank); } else { info.state = state; info.state_seq = seq; @@ -514,7 +515,7 @@ void MDSMonitor::tick() dout(1) << "adding standby " << addr << " as mds" << mds << dendl; MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - info.mds = mds; + info.rank = mds; if (pending_mdsmap.stopped.count(mds)) info.state = MDSMap::STATE_STARTING; else @@ -552,17 +553,17 @@ void MDSMonitor::tick() MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - dout(10) << "no beacon from " << addr << " mds" << info.mds << "." << info.inc + dout(10) << "no beacon from " << addr << " mds" << info.rank << "." << info.inc << " " << MDSMap::get_state_name(info.state) << " since " << since << dendl; // are we in? // and is there a non-laggy standby that can take over for us? entity_addr_t sa; - if (info.mds >= 0 && + if (info.rank >= 0 && info.state > 0 && //|| info.state == MDSMap::STATE_STANDBY_REPLAY) && - pending_mdsmap.find_standby_for(info.mds, sa)) { - dout(10) << " replacing " << addr << " mds" << info.mds << "." << info.inc + pending_mdsmap.find_standby_for(info.rank, sa)) { + dout(10) << " replacing " << addr << " mds" << info.rank << "." << info.inc << " " << MDSMap::get_state_name(info.state) << " with " << sa << dendl; MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa]; @@ -583,10 +584,10 @@ void MDSMonitor::tick() default: assert(0); } - si.mds = info.mds; + si.rank = info.rank; if (si.state > 0) { - si.inc = ++pending_mdsmap.inc[info.mds]; - pending_mdsmap.up[info.mds] = sa; + si.inc = ++pending_mdsmap.inc[info.rank]; + pending_mdsmap.up[info.rank] = sa; pending_mdsmap.last_failure = pending_mdsmap.epoch; } pending_mdsmap.mds_info.erase(addr); @@ -601,14 +602,14 @@ void MDSMonitor::tick() do_propose = true; } else if (info.state == MDSMap::STATE_STANDBY_REPLAY) { - dout(10) << " failing " << addr << " mds" << info.mds << "." << info.inc + dout(10) << " failing " << addr << " mds" << info.rank << "." << info.inc << " " << MDSMap::get_state_name(info.state) << dendl; pending_mdsmap.mds_info.erase(addr); do_propose = true; } else if (!info.laggy()) { // just mark laggy - dout(10) << " marking " << addr << " mds" << info.mds << "." << info.inc + dout(10) << " marking " << addr << " mds" << info.rank << "." << info.inc << " " << MDSMap::get_state_name(info.state) << " laggy" << dendl; info.laggy_since = now; @@ -636,7 +637,7 @@ void MDSMonitor::tick() dout(0) << " taking over failed mds" << f << " with " << sa << dendl; MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa]; si.state = MDSMap::STATE_REPLAY; - si.mds = f; + si.rank = f; si.inc = ++pending_mdsmap.inc[f]; pending_mdsmap.in.insert(f); pending_mdsmap.up[f] = sa; @@ -656,10 +657,10 @@ void MDSMonitor::tick() p != pending_mdsmap.mds_info.end(); p++) { if (p->second.state == MDSMap::STATE_STANDBY_REPLAY) - shadowed.insert(p->second.mds); + shadowed.insert(p->second.rank); if (p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) - avail[p->second.mds].insert(p->first); + avail[p->second.rank].insert(p->first); } // find an mds that needs a standby @@ -682,7 +683,7 @@ void MDSMonitor::tick() dout(10) << "mds" << *p << " will be shadowed by " << s << dendl; MDSMap::mds_info_t& info = pending_mdsmap.mds_info[s]; - info.mds = *p; + info.rank = *p; info.state = MDSMap::STATE_STANDBY_REPLAY; do_propose = true; } @@ -715,21 +716,21 @@ void MDSMonitor::do_stop() info.state = MDSMap::STATE_STOPPING; break; case MDSMap::STATE_STARTING: - pending_mdsmap.stopped.insert(info.mds); + pending_mdsmap.stopped.insert(info.rank); case MDSMap::STATE_CREATING: - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(info.addr); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.in.erase(info.rank); break; case MDSMap::STATE_REPLAY: case MDSMap::STATE_RESOLVE: case MDSMap::STATE_RECONNECT: case MDSMap::STATE_REJOIN: // BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they? - pending_mdsmap.failed.insert(info.mds); - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.failed.insert(info.rank); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(info.addr); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.in.erase(info.rank); break; } } diff --git a/src/sample.cluster.conf b/src/sample.cluster.conf index 1ab7b59257b..6e8450024e0 100644 --- a/src/sample.cluster.conf +++ b/src/sample.cluster.conf @@ -17,10 +17,10 @@ [global] conf = ceph.conf restart on core dump = true + pid file = /var/run/ceph/$name.pid ; monitor [mon] - pid file = /var/run/ceph/mon$mon.pid [mon0] host = alpha @@ -39,15 +39,13 @@ ; mds [mds] - pid file = /var/run/ceph/mds$mds.pid -[mds0] +[mds.alpha] host = alpha ; osd [osd] - pid file = /var/run/ceph/osd$osd.pid sudo = true [osd0]