diff --git a/doc/cephfs/mantle.rst b/doc/cephfs/mantle.rst index 9d0b6bd97f5..f5f7f303c52 100644 --- a/doc/cephfs/mantle.rst +++ b/doc/cephfs/mantle.rst @@ -51,12 +51,12 @@ Mantle with `vstart.sh` :: - ./vstart.sh -n -l + cd build + ../src/vstart.sh -n -l for i in a b c; do - ./ceph --admin-daemon out/mds.$i.asok config set debug_ms 0 - ./ceph --admin-daemon out/mds.$i.asok config set debug_mds 0 - ./ceph --admin-daemon out/mds.$i.asok config set debug_mds_balancer 2 - ./ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500 + bin/ceph --admin-daemon out/mds.$i.asok config set debug_ms 0 + bin/ceph --admin-daemon out/mds.$i.asok config set debug_mds 2 + bin/ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500 done @@ -64,23 +64,23 @@ Mantle with `vstart.sh` :: - ./rados put --pool=cephfs_metadata_a greedyspill.lua mds/balancers/greedyspill.lua + bin/rados put --pool=cephfs_metadata_a greedyspill.lua ../src/mds/balancers/greedyspill.lua 3. Activate Mantle: :: - ./ceph mds set allow_multimds true --yes-i-really-mean-it - ./ceph mds set max_mds 5 - ./ceph mds set balancer greedyspill.lua + bin/ceph mds set allow_multimds true --yes-i-really-mean-it + bin/ceph mds set max_mds 5 + bin/ceph fs set cephfs_a balancer greedyspill.lua 4. Mount CephFS in another window: :: - ./ceph-fuse /cephfs -o allow_other & + bin/ceph-fuse /cephfs -o allow_other & tail -f out/mds.a.log @@ -89,8 +89,7 @@ Mantle with `vstart.sh` last MDS tries to check the load of its neighbor, which does not exist. 5. Run a simple benchmark. In our case, we use the Docker mdtest image to - create load. Assuming that CephFS is mounted in the first container, we can - share the mount and run 3 clients using: + create load: :: @@ -213,6 +212,27 @@ jargon: a dummy value is pushed onto the stack and the next iterator replaces the top of the stack with a (k, v) pair. After reading each value, pop that value but keep the key for the next call to `lua_next`. +Reading from RADOS +~~~~~~~~~~~~~~~~~~ + +All MDSs will read balancing code from RADOS when the balancer version changes +in the MDS Map. The balancer pulls the Lua code from RADOS synchronously. We do +this with a timeout: if the asynchronous read does not come back within half +the balancing tick interval the operation is cancelled and a Connection Timeout +error is returned. By default, the balancing tick interval is 10 seconds, so +Mantle will use a 5 second second timeout. This design allows Mantle to +immediately return an error if anything RADOS-related goes wrong. + +We use this implementation because we do not want to do a blocking OSD read +from inside the global MDS lock. Doing so would bring down the MDS cluster if +any of the OSDs are not responsive -- this is tested in the ceph-qa-suite by +setting all OSDs to down/out and making sure the MDS cluster stays active. + +One approach would be to asynchronously fire the read when handling the MDS Map +and fill in the Lua code in the background. We cannot do this because the MDS +does not support daemon-local fallbacks and the balancer assumes that all MDSs +come to the same decision at the same time (e.g., importers, exporters, etc.). + Debugging ~~~~~~~~ @@ -228,6 +248,10 @@ It is implemented by passing a function that wraps the `dout` logging framework (`dout_wrapper`) to Lua with the `lua_register()` primitive. The Lua code is actually calling the `dout` function in C++. +Warning and Info messages are centralized using the clog/Beacon. Successful +messages are only sent on version changes by the first MDS to avoid spamming +the `ceph -w` utility. These messages are used for the integration tests. + Testing ~~~~~~ diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index df66226f79d..d8a0b5657a7 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -171,24 +171,45 @@ mds_load_t MDBalancer::get_load(utime_t now) return load; } -int MDBalancer::localize_balancer(string const balancer) +/* + * Read synchronously from RADOS using a timeout. We cannot do daemon-local + * fallbacks (i.e. kick off async read when we are processing the map and + * check status when we get here) with the way the mds is structured. + */ +int MDBalancer::localize_balancer() { - int64_t pool_id = mds->mdsmap->get_metadata_pool(); - string fname = "/tmp/" + balancer; + /* reset everything */ + bool ack = false; + int r = 0; + bufferlist lua_src; + Mutex lock("lock"); + Cond cond; - dout(15) << "looking for balancer=" << balancer << " in RADOS pool_id=" << pool_id << dendl; - object_t oid = object_t(balancer); - object_locator_t oloc(pool_id); - bufferlist data; - C_SaferCond waiter; - mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &data, 0, &waiter); - int r = waiter.wait(); - if (r == 0) { - dout(15) << "write data from RADOS into fname=" << fname << " data=" << data.c_str() << dendl; - data.write_file(fname.c_str()); - } else { - dout(0) << "tick could not find balancer " << balancer - << " in RADOS: " << cpp_strerror(r) << dendl; + /* we assume that balancer is in the metadata pool */ + object_t oid = object_t(mds->mdsmap->get_balancer()); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0, + new C_SafeCond(&lock, &cond, &ack, &r)); + dout(15) << "launched non-blocking read tid=" << tid + << " oid=" << oid << " oloc=" << oloc << dendl; + + /* timeout: if we waste half our time waiting for RADOS, then abort! */ + double t = ceph_clock_now(g_ceph_context) + g_conf->mds_bal_interval/2; + utime_t timeout; + timeout.set_from_double(t); + lock.Lock(); + int ret_t = cond.WaitUntil(lock, timeout); + lock.Unlock(); + + /* success: store the balancer in memory and set the version. */ + if (!r) { + if (ret_t == ETIMEDOUT) { + mds->objecter->op_cancel(tid, -ECANCELED); + return -ETIMEDOUT; + } + bal_code.assign(lua_src.to_str()); + bal_version.assign(oid.name); + dout(0) << "localized balancer, bal_code=" << bal_code << dendl; } return r; } @@ -301,16 +322,14 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) // let's go! //export_empties(); // no! - int r = mantle_prep_rebalance(); - if (!r) { - mds->clog->info() << "mantle succeeded; " - << "balancer=" << mds->mdsmap->get_balancer(); - return; + /* avoid spamming ceph -w if user does not turn mantle on */ + if (mds->mdsmap->get_balancer() != "") { + int r = mantle_prep_rebalance(); + if (!r) return; + mds->clog->warn() << "using old balancer; mantle failed for " + << "balancer=" << mds->mdsmap->get_balancer() + << " : " << cpp_strerror(r); } - - mds->clog->warn() << "mantle failed (falling back to original balancer); " - << "balancer=" << mds->mdsmap->get_balancer() - << " : " << cpp_strerror(r); prep_rebalance(m->get_beat()); } } @@ -629,13 +648,16 @@ void MDBalancer::prep_rebalance(int beat) int MDBalancer::mantle_prep_rebalance() { - /* pull metadata balancer from RADOS */ - string const balancer = mds->mdsmap->get_balancer(); - if (balancer == "" || localize_balancer(balancer)) - return -ENOENT; - ifstream f("/tmp/" + balancer); - string script((istreambuf_iterator(f)), - istreambuf_iterator()); + /* refresh balancer if it has changed */ + if (bal_version != mds->mdsmap->get_balancer()) { + bal_version.assign(""); + int r = localize_balancer(); + if (r) return r; + + /* only spam the cluster log from 1 mds on version changes */ + if (mds->get_nodeid() == 0) + mds->clog->info() << "mantle balancer version changed: " << bal_version; + } /* prepare for balancing */ int cluster_size = mds->get_mds_map()->get_num_in_mds(); @@ -654,17 +676,16 @@ int MDBalancer::mantle_prep_rebalance() std::pair < map::iterator, bool > r(mds_load.insert(val)); mds_load_t &load(r.first->second); - metrics[i].insert(make_pair("auth.meta_load", load.auth.meta_load())); - metrics[i].insert(make_pair("all.meta_load", load.all.meta_load())); - metrics[i].insert(make_pair("req_rate", load.req_rate)); - metrics[i].insert(make_pair("queue_len", load.queue_len)); - metrics[i].insert(make_pair("cpu_load_avg", load.cpu_load_avg)); + metrics[i] = {{"auth.meta_load", load.auth.meta_load()}, + {"all.meta_load", load.all.meta_load()}, + {"req_rate", load.req_rate}, + {"queue_len", load.queue_len}, + {"cpu_load_avg", load.cpu_load_avg}}; } /* execute the balancer */ - Mantle *mantle = new Mantle(); - int ret = mantle->balance(script, mds->get_nodeid(), metrics, my_targets); - delete mantle; + Mantle mantle; + int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, my_targets); dout(2) << " mantle decided that new targets=" << my_targets << dendl; /* mantle doesn't know about cluster size, so check target len here */ diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h index b8ff079bd35..c98974de1c5 100644 --- a/src/mds/MDBalancer.h +++ b/src/mds/MDBalancer.h @@ -24,6 +24,7 @@ using std::map; #include "include/types.h" #include "common/Clock.h" +#include "common/Cond.h" #include "CInode.h" @@ -44,6 +45,8 @@ class MDBalancer { int last_epoch_under; int last_epoch_over; + string bal_code; + string bal_version; utime_t last_heartbeat; utime_t last_fragment; @@ -88,7 +91,7 @@ public: int proc_message(Message *m); - int localize_balancer(string const balancer); + int localize_balancer(); void send_heartbeat(); void handle_heartbeat(MHeartbeat *m); diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index a0d7336e2b6..6a666725b54 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -184,6 +184,7 @@ void MDSMap::dump(Formatter *f) const f->dump_int("metadata_pool", metadata_pool); f->dump_bool("enabled", enabled); f->dump_string("fs_name", fs_name); + f->dump_string("balancer", balancer); } void MDSMap::generate_test_instances(list& ls) @@ -226,6 +227,7 @@ void MDSMap::print(ostream& out) const out << "data_pools\t" << data_pools << "\n"; out << "metadata_pool\t" << metadata_pool << "\n"; out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"; + out << "balancer\t" << balancer << "\n"; multimap< pair, mds_gid_t > foo; for (const auto &p : mds_info) { @@ -487,7 +489,6 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); - ::encode(balancer, bl); __u32 n = mds_info.size(); ::encode(n, bl); for (map::const_iterator i = mds_info.begin(); @@ -516,7 +517,6 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); - ::encode(balancer, bl); __u32 n = mds_info.size(); ::encode(n, bl); for (map::const_iterator i = mds_info.begin(); @@ -544,7 +544,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(6, 4, bl); + ENCODE_START(5, 4, bl); ::encode(epoch, bl); ::encode(flags, bl); ::encode(last_failure, bl); @@ -553,13 +553,12 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); - ::encode(balancer, bl); ::encode(mds_info, bl, features); ::encode(data_pools, bl); ::encode(cas_pool, bl); // kclient ignores everything from here - __u16 ev = 10; + __u16 ev = 11; ::encode(ev, bl); ::encode(compat, bl); ::encode(metadata_pool, bl); @@ -578,6 +577,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(enabled, bl); ::encode(fs_name, bl); ::encode(damaged, bl); + ::encode(balancer, bl); ENCODE_FINISH(bl); } @@ -586,7 +586,7 @@ void MDSMap::decode(bufferlist::iterator& p) std::map inc; // Legacy field, parse and drop cached_up_features = 0; - DECODE_START_LEGACY_COMPAT_LEN_16(6, 4, 4, p); + DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p); ::decode(epoch, p); ::decode(flags, p); ::decode(last_failure, p); @@ -595,7 +595,6 @@ void MDSMap::decode(bufferlist::iterator& p) ::decode(session_autoclose, p); ::decode(max_file_size, p); ::decode(max_mds, p); - ::decode(balancer, p); ::decode(mds_info, p); if (struct_v < 3) { __u32 n; @@ -683,6 +682,10 @@ void MDSMap::decode(bufferlist::iterator& p) if (ev >= 9) { ::decode(damaged, p); } + + if (ev >= 11) { + ::decode(balancer, p); + } DECODE_FINISH(p); } diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 6cdb6b4b520..bde8864b6f2 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -193,7 +193,7 @@ protected: */ mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */ - string balancer; /* The name and version of the metadata load balancer. */ + string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */ std::set in; // currently defined cluster @@ -290,7 +290,7 @@ public: mds_rank_t get_max_mds() const { return max_mds; } void set_max_mds(mds_rank_t m) { max_mds = m; } - std::string get_balancer() const { return balancer; } + const std::string get_balancer() const { return balancer; } void set_balancer(std::string val) { balancer.assign(val); } mds_rank_t get_tableserver() const { return tableserver; } diff --git a/src/mds/Mantle.cc b/src/mds/Mantle.cc index e75ce5ef337..eb4fff5520a 100644 --- a/src/mds/Mantle.cc +++ b/src/mds/Mantle.cc @@ -1,3 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Michael Sevilla + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + #include "mdstypes.h" #include "MDSRank.h" #include "Mantle.h" diff --git a/src/mds/Mantle.h b/src/mds/Mantle.h index d4651798939..7aee990b6a1 100644 --- a/src/mds/Mantle.h +++ b/src/mds/Mantle.h @@ -1,23 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Michael Sevilla + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MANTLE_H +#define CEPH_MANTLE_H + #include #include #include -using std::list; -using std::map; #include "include/types.h" #include "common/Clock.h" #include "CInode.h" - - -class MDSRank; -class Message; -class MHeartbeat; -class CInode; -class CDir; -class Messenger; -class MonClient; - class Mantle { protected: lua_State *L; @@ -32,3 +37,5 @@ class Mantle { vector < map > metrics, map &my_targets); }; + +#endif diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 6832e4643a3..f20edb521a2 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -330,7 +330,7 @@ COMMAND_WITH_FLAG("mds set_max_mds " \ "set max MDS index", "mds", "rw", "cli,rest", FLAG(DEPRECATED)) COMMAND_WITH_FLAG("mds set " \ "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags|balancer " \ + "|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags " \ "name=val,type=CephString " \ "name=confirm,type=CephString,req=false", \ "set mds parameter to ", "mds", "rw", "cli,rest", FLAG(DEPRECATED)) @@ -398,7 +398,7 @@ COMMAND("fs get name=fs_name,type=CephString", \ COMMAND("fs set " \ "name=fs_name,type=CephString " \ "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags " \ + "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer " \ "name=val,type=CephString " \ "name=confirm,type=CephString,req=false", \ "set mds parameter to ", "mds", "rw", "cli,rest")