mirror of
https://github.com/ceph/ceph
synced 2025-03-04 23:40:07 +00:00
mantle: re-design and fixes
- fix legacy encoding in mds map and add balancer to dumps - fix blocking rados read and remove temporary files - fix beacon message spamming Signed-off-by: Michael Sevilla <mikesevilla3@gmail.com>
This commit is contained in:
parent
0829028d1c
commit
5cc43cf0a7
@ -51,12 +51,12 @@ Mantle with `vstart.sh`
|
||||
|
||||
::
|
||||
|
||||
./vstart.sh -n -l
|
||||
cd build
|
||||
../src/vstart.sh -n -l
|
||||
for i in a b c; do
|
||||
./ceph --admin-daemon out/mds.$i.asok config set debug_ms 0
|
||||
./ceph --admin-daemon out/mds.$i.asok config set debug_mds 0
|
||||
./ceph --admin-daemon out/mds.$i.asok config set debug_mds_balancer 2
|
||||
./ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500
|
||||
bin/ceph --admin-daemon out/mds.$i.asok config set debug_ms 0
|
||||
bin/ceph --admin-daemon out/mds.$i.asok config set debug_mds 2
|
||||
bin/ceph --admin-daemon out/mds.$i.asok config set mds_beacon_grace 1500
|
||||
done
|
||||
|
||||
|
||||
@ -64,23 +64,23 @@ Mantle with `vstart.sh`
|
||||
|
||||
::
|
||||
|
||||
./rados put --pool=cephfs_metadata_a greedyspill.lua mds/balancers/greedyspill.lua
|
||||
bin/rados put --pool=cephfs_metadata_a greedyspill.lua ../src/mds/balancers/greedyspill.lua
|
||||
|
||||
|
||||
3. Activate Mantle:
|
||||
|
||||
::
|
||||
|
||||
./ceph mds set allow_multimds true --yes-i-really-mean-it
|
||||
./ceph mds set max_mds 5
|
||||
./ceph mds set balancer greedyspill.lua
|
||||
bin/ceph mds set allow_multimds true --yes-i-really-mean-it
|
||||
bin/ceph mds set max_mds 5
|
||||
bin/ceph fs set cephfs_a balancer greedyspill.lua
|
||||
|
||||
|
||||
4. Mount CephFS in another window:
|
||||
|
||||
::
|
||||
|
||||
./ceph-fuse /cephfs -o allow_other &
|
||||
bin/ceph-fuse /cephfs -o allow_other &
|
||||
tail -f out/mds.a.log
|
||||
|
||||
|
||||
@ -89,8 +89,7 @@ Mantle with `vstart.sh`
|
||||
last MDS tries to check the load of its neighbor, which does not exist.
|
||||
|
||||
5. Run a simple benchmark. In our case, we use the Docker mdtest image to
|
||||
create load. Assuming that CephFS is mounted in the first container, we can
|
||||
share the mount and run 3 clients using:
|
||||
create load:
|
||||
|
||||
::
|
||||
|
||||
@ -213,6 +212,27 @@ jargon: a dummy value is pushed onto the stack and the next iterator replaces
|
||||
the top of the stack with a (k, v) pair. After reading each value, pop that
|
||||
value but keep the key for the next call to `lua_next`.
|
||||
|
||||
Reading from RADOS
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
All MDSs will read balancing code from RADOS when the balancer version changes
|
||||
in the MDS Map. The balancer pulls the Lua code from RADOS synchronously. We do
|
||||
this with a timeout: if the asynchronous read does not come back within half
|
||||
the balancing tick interval the operation is cancelled and a Connection Timeout
|
||||
error is returned. By default, the balancing tick interval is 10 seconds, so
|
||||
Mantle will use a 5 second second timeout. This design allows Mantle to
|
||||
immediately return an error if anything RADOS-related goes wrong.
|
||||
|
||||
We use this implementation because we do not want to do a blocking OSD read
|
||||
from inside the global MDS lock. Doing so would bring down the MDS cluster if
|
||||
any of the OSDs are not responsive -- this is tested in the ceph-qa-suite by
|
||||
setting all OSDs to down/out and making sure the MDS cluster stays active.
|
||||
|
||||
One approach would be to asynchronously fire the read when handling the MDS Map
|
||||
and fill in the Lua code in the background. We cannot do this because the MDS
|
||||
does not support daemon-local fallbacks and the balancer assumes that all MDSs
|
||||
come to the same decision at the same time (e.g., importers, exporters, etc.).
|
||||
|
||||
Debugging
|
||||
~~~~~~~~
|
||||
|
||||
@ -228,6 +248,10 @@ It is implemented by passing a function that wraps the `dout` logging framework
|
||||
(`dout_wrapper`) to Lua with the `lua_register()` primitive. The Lua code is
|
||||
actually calling the `dout` function in C++.
|
||||
|
||||
Warning and Info messages are centralized using the clog/Beacon. Successful
|
||||
messages are only sent on version changes by the first MDS to avoid spamming
|
||||
the `ceph -w` utility. These messages are used for the integration tests.
|
||||
|
||||
Testing
|
||||
~~~~~~
|
||||
|
||||
|
@ -171,24 +171,45 @@ mds_load_t MDBalancer::get_load(utime_t now)
|
||||
return load;
|
||||
}
|
||||
|
||||
int MDBalancer::localize_balancer(string const balancer)
|
||||
/*
|
||||
* Read synchronously from RADOS using a timeout. We cannot do daemon-local
|
||||
* fallbacks (i.e. kick off async read when we are processing the map and
|
||||
* check status when we get here) with the way the mds is structured.
|
||||
*/
|
||||
int MDBalancer::localize_balancer()
|
||||
{
|
||||
int64_t pool_id = mds->mdsmap->get_metadata_pool();
|
||||
string fname = "/tmp/" + balancer;
|
||||
/* reset everything */
|
||||
bool ack = false;
|
||||
int r = 0;
|
||||
bufferlist lua_src;
|
||||
Mutex lock("lock");
|
||||
Cond cond;
|
||||
|
||||
dout(15) << "looking for balancer=" << balancer << " in RADOS pool_id=" << pool_id << dendl;
|
||||
object_t oid = object_t(balancer);
|
||||
object_locator_t oloc(pool_id);
|
||||
bufferlist data;
|
||||
C_SaferCond waiter;
|
||||
mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &data, 0, &waiter);
|
||||
int r = waiter.wait();
|
||||
if (r == 0) {
|
||||
dout(15) << "write data from RADOS into fname=" << fname << " data=" << data.c_str() << dendl;
|
||||
data.write_file(fname.c_str());
|
||||
} else {
|
||||
dout(0) << "tick could not find balancer " << balancer
|
||||
<< " in RADOS: " << cpp_strerror(r) << dendl;
|
||||
/* we assume that balancer is in the metadata pool */
|
||||
object_t oid = object_t(mds->mdsmap->get_balancer());
|
||||
object_locator_t oloc(mds->mdsmap->get_metadata_pool());
|
||||
ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
|
||||
new C_SafeCond(&lock, &cond, &ack, &r));
|
||||
dout(15) << "launched non-blocking read tid=" << tid
|
||||
<< " oid=" << oid << " oloc=" << oloc << dendl;
|
||||
|
||||
/* timeout: if we waste half our time waiting for RADOS, then abort! */
|
||||
double t = ceph_clock_now(g_ceph_context) + g_conf->mds_bal_interval/2;
|
||||
utime_t timeout;
|
||||
timeout.set_from_double(t);
|
||||
lock.Lock();
|
||||
int ret_t = cond.WaitUntil(lock, timeout);
|
||||
lock.Unlock();
|
||||
|
||||
/* success: store the balancer in memory and set the version. */
|
||||
if (!r) {
|
||||
if (ret_t == ETIMEDOUT) {
|
||||
mds->objecter->op_cancel(tid, -ECANCELED);
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
bal_code.assign(lua_src.to_str());
|
||||
bal_version.assign(oid.name);
|
||||
dout(0) << "localized balancer, bal_code=" << bal_code << dendl;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
@ -301,16 +322,14 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m)
|
||||
// let's go!
|
||||
//export_empties(); // no!
|
||||
|
||||
int r = mantle_prep_rebalance();
|
||||
if (!r) {
|
||||
mds->clog->info() << "mantle succeeded; "
|
||||
<< "balancer=" << mds->mdsmap->get_balancer();
|
||||
return;
|
||||
/* avoid spamming ceph -w if user does not turn mantle on */
|
||||
if (mds->mdsmap->get_balancer() != "") {
|
||||
int r = mantle_prep_rebalance();
|
||||
if (!r) return;
|
||||
mds->clog->warn() << "using old balancer; mantle failed for "
|
||||
<< "balancer=" << mds->mdsmap->get_balancer()
|
||||
<< " : " << cpp_strerror(r);
|
||||
}
|
||||
|
||||
mds->clog->warn() << "mantle failed (falling back to original balancer); "
|
||||
<< "balancer=" << mds->mdsmap->get_balancer()
|
||||
<< " : " << cpp_strerror(r);
|
||||
prep_rebalance(m->get_beat());
|
||||
}
|
||||
}
|
||||
@ -629,13 +648,16 @@ void MDBalancer::prep_rebalance(int beat)
|
||||
|
||||
int MDBalancer::mantle_prep_rebalance()
|
||||
{
|
||||
/* pull metadata balancer from RADOS */
|
||||
string const balancer = mds->mdsmap->get_balancer();
|
||||
if (balancer == "" || localize_balancer(balancer))
|
||||
return -ENOENT;
|
||||
ifstream f("/tmp/" + balancer);
|
||||
string script((istreambuf_iterator<char>(f)),
|
||||
istreambuf_iterator<char>());
|
||||
/* refresh balancer if it has changed */
|
||||
if (bal_version != mds->mdsmap->get_balancer()) {
|
||||
bal_version.assign("");
|
||||
int r = localize_balancer();
|
||||
if (r) return r;
|
||||
|
||||
/* only spam the cluster log from 1 mds on version changes */
|
||||
if (mds->get_nodeid() == 0)
|
||||
mds->clog->info() << "mantle balancer version changed: " << bal_version;
|
||||
}
|
||||
|
||||
/* prepare for balancing */
|
||||
int cluster_size = mds->get_mds_map()->get_num_in_mds();
|
||||
@ -654,17 +676,16 @@ int MDBalancer::mantle_prep_rebalance()
|
||||
std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
|
||||
mds_load_t &load(r.first->second);
|
||||
|
||||
metrics[i].insert(make_pair("auth.meta_load", load.auth.meta_load()));
|
||||
metrics[i].insert(make_pair("all.meta_load", load.all.meta_load()));
|
||||
metrics[i].insert(make_pair("req_rate", load.req_rate));
|
||||
metrics[i].insert(make_pair("queue_len", load.queue_len));
|
||||
metrics[i].insert(make_pair("cpu_load_avg", load.cpu_load_avg));
|
||||
metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
|
||||
{"all.meta_load", load.all.meta_load()},
|
||||
{"req_rate", load.req_rate},
|
||||
{"queue_len", load.queue_len},
|
||||
{"cpu_load_avg", load.cpu_load_avg}};
|
||||
}
|
||||
|
||||
/* execute the balancer */
|
||||
Mantle *mantle = new Mantle();
|
||||
int ret = mantle->balance(script, mds->get_nodeid(), metrics, my_targets);
|
||||
delete mantle;
|
||||
Mantle mantle;
|
||||
int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, my_targets);
|
||||
dout(2) << " mantle decided that new targets=" << my_targets << dendl;
|
||||
|
||||
/* mantle doesn't know about cluster size, so check target len here */
|
||||
|
@ -24,6 +24,7 @@ using std::map;
|
||||
|
||||
#include "include/types.h"
|
||||
#include "common/Clock.h"
|
||||
#include "common/Cond.h"
|
||||
#include "CInode.h"
|
||||
|
||||
|
||||
@ -44,6 +45,8 @@ class MDBalancer {
|
||||
|
||||
int last_epoch_under;
|
||||
int last_epoch_over;
|
||||
string bal_code;
|
||||
string bal_version;
|
||||
|
||||
utime_t last_heartbeat;
|
||||
utime_t last_fragment;
|
||||
@ -88,7 +91,7 @@ public:
|
||||
|
||||
int proc_message(Message *m);
|
||||
|
||||
int localize_balancer(string const balancer);
|
||||
int localize_balancer();
|
||||
void send_heartbeat();
|
||||
void handle_heartbeat(MHeartbeat *m);
|
||||
|
||||
|
@ -184,6 +184,7 @@ void MDSMap::dump(Formatter *f) const
|
||||
f->dump_int("metadata_pool", metadata_pool);
|
||||
f->dump_bool("enabled", enabled);
|
||||
f->dump_string("fs_name", fs_name);
|
||||
f->dump_string("balancer", balancer);
|
||||
}
|
||||
|
||||
void MDSMap::generate_test_instances(list<MDSMap*>& ls)
|
||||
@ -226,6 +227,7 @@ void MDSMap::print(ostream& out) const
|
||||
out << "data_pools\t" << data_pools << "\n";
|
||||
out << "metadata_pool\t" << metadata_pool << "\n";
|
||||
out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
|
||||
out << "balancer\t" << balancer << "\n";
|
||||
|
||||
multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
|
||||
for (const auto &p : mds_info) {
|
||||
@ -487,7 +489,6 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
|
||||
::encode(session_autoclose, bl);
|
||||
::encode(max_file_size, bl);
|
||||
::encode(max_mds, bl);
|
||||
::encode(balancer, bl);
|
||||
__u32 n = mds_info.size();
|
||||
::encode(n, bl);
|
||||
for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
|
||||
@ -516,7 +517,6 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
|
||||
::encode(session_autoclose, bl);
|
||||
::encode(max_file_size, bl);
|
||||
::encode(max_mds, bl);
|
||||
::encode(balancer, bl);
|
||||
__u32 n = mds_info.size();
|
||||
::encode(n, bl);
|
||||
for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
|
||||
@ -544,7 +544,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
|
||||
return;
|
||||
}
|
||||
|
||||
ENCODE_START(6, 4, bl);
|
||||
ENCODE_START(5, 4, bl);
|
||||
::encode(epoch, bl);
|
||||
::encode(flags, bl);
|
||||
::encode(last_failure, bl);
|
||||
@ -553,13 +553,12 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
|
||||
::encode(session_autoclose, bl);
|
||||
::encode(max_file_size, bl);
|
||||
::encode(max_mds, bl);
|
||||
::encode(balancer, bl);
|
||||
::encode(mds_info, bl, features);
|
||||
::encode(data_pools, bl);
|
||||
::encode(cas_pool, bl);
|
||||
|
||||
// kclient ignores everything from here
|
||||
__u16 ev = 10;
|
||||
__u16 ev = 11;
|
||||
::encode(ev, bl);
|
||||
::encode(compat, bl);
|
||||
::encode(metadata_pool, bl);
|
||||
@ -578,6 +577,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
|
||||
::encode(enabled, bl);
|
||||
::encode(fs_name, bl);
|
||||
::encode(damaged, bl);
|
||||
::encode(balancer, bl);
|
||||
ENCODE_FINISH(bl);
|
||||
}
|
||||
|
||||
@ -586,7 +586,7 @@ void MDSMap::decode(bufferlist::iterator& p)
|
||||
std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
|
||||
|
||||
cached_up_features = 0;
|
||||
DECODE_START_LEGACY_COMPAT_LEN_16(6, 4, 4, p);
|
||||
DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
|
||||
::decode(epoch, p);
|
||||
::decode(flags, p);
|
||||
::decode(last_failure, p);
|
||||
@ -595,7 +595,6 @@ void MDSMap::decode(bufferlist::iterator& p)
|
||||
::decode(session_autoclose, p);
|
||||
::decode(max_file_size, p);
|
||||
::decode(max_mds, p);
|
||||
::decode(balancer, p);
|
||||
::decode(mds_info, p);
|
||||
if (struct_v < 3) {
|
||||
__u32 n;
|
||||
@ -683,6 +682,10 @@ void MDSMap::decode(bufferlist::iterator& p)
|
||||
if (ev >= 9) {
|
||||
::decode(damaged, p);
|
||||
}
|
||||
|
||||
if (ev >= 11) {
|
||||
::decode(balancer, p);
|
||||
}
|
||||
DECODE_FINISH(p);
|
||||
}
|
||||
|
||||
|
@ -193,7 +193,7 @@ protected:
|
||||
*/
|
||||
|
||||
mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */
|
||||
string balancer; /* The name and version of the metadata load balancer. */
|
||||
string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
|
||||
|
||||
std::set<mds_rank_t> in; // currently defined cluster
|
||||
|
||||
@ -290,7 +290,7 @@ public:
|
||||
mds_rank_t get_max_mds() const { return max_mds; }
|
||||
void set_max_mds(mds_rank_t m) { max_mds = m; }
|
||||
|
||||
std::string get_balancer() const { return balancer; }
|
||||
const std::string get_balancer() const { return balancer; }
|
||||
void set_balancer(std::string val) { balancer.assign(val); }
|
||||
|
||||
mds_rank_t get_tableserver() const { return tableserver; }
|
||||
|
@ -1,3 +1,17 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mdstypes.h"
|
||||
#include "MDSRank.h"
|
||||
#include "Mantle.h"
|
||||
|
@ -1,23 +1,28 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef CEPH_MANTLE_H
|
||||
#define CEPH_MANTLE_H
|
||||
|
||||
#include <lua.hpp>
|
||||
#include <list>
|
||||
#include <map>
|
||||
using std::list;
|
||||
using std::map;
|
||||
|
||||
#include "include/types.h"
|
||||
#include "common/Clock.h"
|
||||
#include "CInode.h"
|
||||
|
||||
|
||||
|
||||
class MDSRank;
|
||||
class Message;
|
||||
class MHeartbeat;
|
||||
class CInode;
|
||||
class CDir;
|
||||
class Messenger;
|
||||
class MonClient;
|
||||
|
||||
class Mantle {
|
||||
protected:
|
||||
lua_State *L;
|
||||
@ -32,3 +37,5 @@ class Mantle {
|
||||
vector < map<string, double> > metrics,
|
||||
map<mds_rank_t,double> &my_targets);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -330,7 +330,7 @@ COMMAND_WITH_FLAG("mds set_max_mds " \
|
||||
"set max MDS index", "mds", "rw", "cli,rest", FLAG(DEPRECATED))
|
||||
COMMAND_WITH_FLAG("mds set " \
|
||||
"name=var,type=CephChoices,strings=max_mds|max_file_size"
|
||||
"|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags|balancer " \
|
||||
"|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags " \
|
||||
"name=val,type=CephString " \
|
||||
"name=confirm,type=CephString,req=false", \
|
||||
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest", FLAG(DEPRECATED))
|
||||
@ -398,7 +398,7 @@ COMMAND("fs get name=fs_name,type=CephString", \
|
||||
COMMAND("fs set " \
|
||||
"name=fs_name,type=CephString " \
|
||||
"name=var,type=CephChoices,strings=max_mds|max_file_size"
|
||||
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags " \
|
||||
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer " \
|
||||
"name=val,type=CephString " \
|
||||
"name=confirm,type=CephString,req=false", \
|
||||
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
|
||||
|
Loading…
Reference in New Issue
Block a user