mds: optimize MDBalancer code path config access

The new way of setting config is the `ceph config` command or the admin socket (asok)
interface, rather than editing ceph.conf and restarting the daemons. Update the
MDBalancer code paths accordingly: mark the mds_bal_* options as runtime-changeable,
cache their values in MDBalancer members that handle_conf_change() refreshes, and have
callers such as CDir and CInode read the cached values through cheap getters instead of
querying g_conf() on every call.

Signed-off-by: Sidharth Anupkrishnan <sanupkri@redhat.com>
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
Sidharth Anupkrishnan 2020-10-01 21:04:56 +05:30 committed by Patrick Donnelly
parent 8fbefb00ba
commit ddc57388ce
8 changed files with 186 additions and 67 deletions
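
The overall pattern of the change: MDBalancer keeps cached copies of the mds_bal_* options, primes them in its constructor, refreshes only the keys reported as changed in handle_conf_change(), and exposes cheap getters to hot paths such as CDir::should_split(). The following minimal, self-contained C++ sketch illustrates that shape; the ConfStore type, the Balancer class, and the two-option subset are illustrative assumptions, not Ceph code.

// Sketch of the caching config-observer pattern this commit applies.
// ConfStore and Balancer are hypothetical stand-ins, not Ceph classes.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>

using ConfStore = std::map<std::string, int64_t>;

class Balancer {
public:
  explicit Balancer(const ConfStore& conf) {
    // Prime the cache once at construction, as MDBalancer's constructor now does.
    bal_split_size = conf.at("mds_bal_split_size");
    bal_merge_size = conf.at("mds_bal_merge_size");
  }

  // Called when options change at runtime (ceph config set / asok);
  // only the keys listed in `changed` are re-read.
  void handle_conf_change(const std::set<std::string>& changed,
                          const ConfStore& conf) {
    if (changed.count("mds_bal_split_size"))
      bal_split_size = conf.at("mds_bal_split_size");
    if (changed.count("mds_bal_merge_size"))
      bal_merge_size = conf.at("mds_bal_merge_size");
  }

  // Cheap accessors for hot paths (cf. CDir::should_split()/should_merge()).
  int64_t get_bal_split_size() const { return bal_split_size; }
  int64_t get_bal_merge_size() const { return bal_merge_size; }

private:
  int64_t bal_split_size = 0;
  int64_t bal_merge_size = 0;
};

int main() {
  ConfStore conf{{"mds_bal_split_size", 10000}, {"mds_bal_merge_size", 50}};
  Balancer bal(conf);

  conf["mds_bal_split_size"] = 20000;  // e.g. an admin runs `ceph config set mds ...`
  bal.handle_conf_change({"mds_bal_split_size"}, conf);

  std::cout << bal.get_bal_split_size() << std::endl;  // 20000
  return 0;
}

The real change follows the same shape but covers the full set of mds_bal_* options shown in the MDBalancer.cc and MDBalancer.h hunks below.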

src/common/options/mds.yaml.in

@ -633,7 +633,8 @@ options:
default: true
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_export_ephemeral_random
type: bool
level: advanced
@ -690,7 +691,8 @@ options:
default: 3
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_replicate_threshold
type: float
level: advanced
@ -700,7 +702,8 @@ options:
default: 8000
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_unreplicate_threshold
type: float
level: advanced
@ -710,7 +713,8 @@ options:
default: 0
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_split_size
type: int
level: advanced
@ -720,7 +724,8 @@ options:
default: 10000
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_split_rd
type: float
level: advanced
@ -730,7 +735,8 @@ options:
default: 25000
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_split_wr
type: float
level: advanced
@ -740,7 +746,8 @@ options:
default: 10000
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_split_bits
type: int
level: advanced
@ -749,9 +756,10 @@ options:
default: 3
services:
- mds
flags:
- runtime
min: 1
max: 24
with_legacy: true
- name: mds_bal_merge_size
type: int
level: advanced
@ -761,7 +769,8 @@ options:
default: 50
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_interval
type: int
level: advanced
@ -770,6 +779,8 @@ options:
default: 10
services:
- mds
flags:
- runtime
- name: mds_bal_fragment_interval
type: int
level: advanced
@ -779,6 +790,8 @@ options:
default: 5
services:
- mds
flags:
- runtime
# order of magnitude higher than split size
- name: mds_bal_fragment_size_max
type: int
@ -800,7 +813,8 @@ options:
default: 1.5
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_fragment_dirs
type: bool
level: advanced
@ -813,6 +827,8 @@ options:
default: true
services:
- mds
flags:
- runtime
- name: mds_bal_idle_threshold
type: float
level: advanced
@ -822,7 +838,8 @@ options:
default: 0
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_bal_max
type: int
level: dev
@ -831,7 +848,8 @@ options:
- mds
fmt_desc: The number of iterations to run balancer before Ceph stops.
(used for testing purposes only)
with_legacy: true
flags:
- runtime
- name: mds_bal_max_until
type: int
level: dev
@ -840,7 +858,8 @@ options:
- mds
fmt_desc: The number of seconds to run balancer before Ceph stops.
(used for testing purposes only)
with_legacy: true
flags:
- runtime
- name: mds_bal_mode
type: int
level: dev
@ -853,7 +872,8 @@ options:
- ``0`` = Hybrid.
- ``1`` = Request rate and latency.
- ``2`` = CPU load.
with_legacy: true
flags:
- runtime
# must be this much above average before we export anything
- name: mds_bal_min_rebalance
type: float
@ -863,7 +883,8 @@ options:
default: 0.1
services:
- mds
with_legacy: true
flags:
- runtime
# must be overloaded for more than these epochs before we export anything
- name: mds_bal_overload_epochs
type: int
@ -882,7 +903,8 @@ options:
services:
- mds
fmt_desc: The minimum subtree temperature before Ceph searches a subtree.
with_legacy: true
flags:
- runtime
# take within this range of what we need
- name: mds_bal_need_min
type: float
@ -891,7 +913,8 @@ options:
services:
- mds
fmt_desc: The minimum fraction of target subtree size to accept.
with_legacy: true
flags:
- runtime
- name: mds_bal_need_max
type: float
level: dev
@ -899,7 +922,8 @@ options:
services:
- mds
fmt_desc: The maximum fraction of target subtree size to accept.
with_legacy: true
flags:
- runtime
# any sub bigger than this taken in full
- name: mds_bal_midchunk
type: float
@ -909,7 +933,8 @@ options:
- mds
fmt_desc: Ceph will migrate any subtree that is larger than this fraction
of the target subtree size.
with_legacy: true
flags:
- runtime
# never take anything smaller than this
- name: mds_bal_minchunk
type: float
@ -919,7 +944,8 @@ options:
- mds
fmt_desc: Ceph will ignore any subtree that is smaller than this fraction
of the target subtree size.
with_legacy: true
flags:
- runtime
# target decay half-life in MDSMap (2x larger is approx. 2x slower)
- name: mds_bal_target_decay
type: float
@ -928,7 +954,8 @@ options:
default: 10
services:
- mds
with_legacy: true
flags:
- runtime
- name: mds_oft_prefetch_dirfrags
type: bool
level: advanced

src/mds/CDir.cc

@ -1018,6 +1018,12 @@ void CDir::init_fragment_pins()
get(PIN_SUBTREE);
}
bool CDir::should_split() const {
uint64_t split_size = mdcache->mds->balancer->get_bal_split_size();
uint64_t items = get_frag_size() + get_num_snap_items();
return split_size > 0 && items > split_size;
}
void CDir::split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay)
{
dout(10) << "split by " << bits << " bits on " << *this << dendl;
@ -3774,7 +3780,10 @@ std::string CDir::get_path() const
bool CDir::should_split_fast() const
{
// Max size a fragment can be before trigger fast splitting
int fast_limit = g_conf()->mds_bal_split_size * g_conf()->mds_bal_fragment_fast_factor;
auto&& balancer = mdcache->mds->balancer;
auto split_size = balancer->get_bal_split_size();
auto fragment_fast_factor = balancer->get_bal_fragment_fast_factor();
int64_t fast_limit = split_size * fragment_fast_factor;
// Fast path: the sum of accounted size and null dentries does not
// exceed threshold: we definitely are not over it.
@ -3811,7 +3820,9 @@ bool CDir::should_merge() const
return false;
}
return ((int)get_frag_size() + (int)get_num_snap_items()) < g_conf()->mds_bal_merge_size;
uint64_t merge_size = mdcache->mds->balancer->get_bal_merge_size();
uint64_t items = get_frag_size() + get_num_snap_items();
return items < merge_size;
}
MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
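
As a quick sanity check of the should_split_fast() threshold above, here is a worked example using the defaults visible in the option hunks earlier in this commit (mds_bal_split_size = 10000, mds_bal_fragment_fast_factor = 1.5); this is illustrative arithmetic only, not code from the change.

// Worked example of the fast-split limit computed in CDir::should_split_fast().
#include <cstdint>
#include <iostream>

int main() {
  int64_t split_size = 10000;   // mds_bal_split_size default
  double fast_factor = 1.5;     // mds_bal_fragment_fast_factor default
  // Same arithmetic as the patched should_split_fast(): the product is
  // truncated back to an integer item count.
  int64_t fast_limit = split_size * fast_factor;
  std::cout << fast_limit << std::endl;  // 15000: a dirfrag this large splits immediately
  return 0;
}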

src/mds/CDir.h

@ -403,10 +403,7 @@ public:
void split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay);
void merge(const std::vector<CDir*>& subs, MDSContext::vec& waiters, bool replay);
bool should_split() const {
return g_conf()->mds_bal_split_size > 0 &&
((int)get_frag_size() + (int)get_num_snap_items()) > g_conf()->mds_bal_split_size;
}
bool should_split() const;
bool should_split_fast() const;
bool should_merge() const;

src/mds/CInode.cc

@ -26,6 +26,7 @@
#include "MDLog.h"
#include "Locker.h"
#include "Mutation.h"
#include "MDBalancer.h"
#include "events/EUpdate.h"
@ -5384,7 +5385,8 @@ void CInode::queue_export_pin(mds_rank_t export_pin)
void CInode::maybe_export_pin(bool update)
{
if (!g_conf()->mds_bal_export_pin)
auto&& balancer = mdcache->mds->balancer;
if (!balancer->get_bal_export_pin())
return;
if (!is_dir() || !is_normal())
return;
@ -5499,7 +5501,9 @@ void CInode::set_export_pin(mds_rank_t rank)
mds_rank_t CInode::get_export_pin(bool inherit) const
{
if (!g_conf()->mds_bal_export_pin)
auto&& balancer = mdcache->mds->balancer;
auto export_pin = balancer->get_bal_export_pin();
if (!export_pin)
return MDS_RANK_NONE;
/* An inode that is export pinned may not necessarily be a subtree root, we

src/mds/MDBalancer.cc

@ -81,18 +81,58 @@ int MDBalancer::proc_message(const cref_t<Message> &m)
MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
mds(m), messenger(msgr), mon_client(monc)
{
bal_export_pin = g_conf().get_val<bool>("mds_bal_export_pin");
bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
bal_fragment_fast_factor = g_conf().get_val<double>("mds_bal_fragment_fast_factor");
bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
bal_merge_size = g_conf().get_val<int64_t>("mds_bal_merge_size");
bal_mode = g_conf().get_val<int64_t>("mds_bal_mode");
bal_replicate_threshold = g_conf().get_val<double>("mds_bal_replicate_threshold");
bal_sample_interval = g_conf().get_val<double>("mds_bal_sample_interval");
bal_split_rd = g_conf().get_val<double>("mds_bal_split_rd");
bal_split_bits = g_conf().get_val<int64_t>("mds_bal_split_bits");
bal_split_size = g_conf().get_val<int64_t>("mds_bal_split_size");
bal_split_wr = g_conf().get_val<double>("mds_bal_split_wr");
bal_unreplicate_threshold = g_conf().get_val<double>("mds_bal_unreplicate_threshold");
num_bal_times = g_conf().get_val<int64_t>("mds_bal_max");
}
void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
if (changed.count("mds_bal_fragment_dirs")) {
if (changed.count("mds_bal_export_pin"))
bal_export_pin = g_conf().get_val<bool>("mds_bal_export_pin");
if (changed.count("mds_bal_fragment_dirs"))
bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
}
if (changed.count("mds_bal_fragment_interval")) {
if (changed.count("mds_bal_fragment_fast_factor"))
bal_fragment_fast_factor = g_conf().get_val<double>("mds_bal_fragment_fast_factor");
if (changed.count("mds_bal_fragment_interval"))
bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}
if (changed.count("mds_bal_interval"))
bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
if (changed.count("mds_bal_max_until"))
bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
if (changed.count("mds_bal_merge_size"))
bal_merge_size = g_conf().get_val<int64_t>("mds_bal_merge_size");
if (changed.count("mds_bal_mode"))
bal_mode = g_conf().get_val<int64_t>("mds_bal_mode");
if (changed.count("mds_bal_replicate_threshold"))
bal_replicate_threshold = g_conf().get_val<double>("mds_bal_replicate_threshold");
if (changed.count("mds_bal_sample_interval"))
bal_sample_interval = g_conf().get_val<double>("mds_bal_sample_interval");
if (changed.count("mds_bal_split_rd"))
bal_split_rd = g_conf().get_val<double>("mds_bal_split_rd");
if (changed.count("mds_bal_split_bits"))
bal_split_bits = g_conf().get_val<int64_t>("mds_bal_split_bits");
if (changed.count("mds_bal_split_size"))
bal_split_size = g_conf().get_val<int64_t>("mds_bal_split_size");
if (changed.count("mds_bal_split_wr"))
bal_split_wr = g_conf().get_val<double>("mds_bal_split_wr");
if (changed.count("mds_bal_unreplicate_threshold"))
bal_unreplicate_threshold = g_conf().get_val<double>("mds_bal_unreplicate_threshold");
if (changed.count("mds_bal_max"))
num_bal_times = g_conf().get_val<int64_t>("mds_bal_max");
}
bool MDBalancer::test_rank_mask(mds_rank_t rank)
@ -229,19 +269,16 @@ void MDBalancer::handle_export_pins(void)
void MDBalancer::tick()
{
static int num_bal_times = g_conf()->mds_bal_max;
bool balance_automate = mds->mdsmap->allows_balance_automate();
auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
time now = clock::now();
if (g_conf()->mds_bal_export_pin) {
if (bal_export_pin) {
handle_export_pins();
}
// sample?
if (chrono::duration<double>(now-last_sample).count() >
g_conf()->mds_bal_sample_interval) {
bal_sample_interval) {
dout(15) << "tick last_sample now " << now << dendl;
last_sample = now;
}
@ -275,9 +312,9 @@ public:
};
double mds_load_t::mds_load() const
double mds_load_t::mds_load(int64_t bal_mode) const
{
switch(g_conf()->mds_bal_mode) {
switch(bal_mode) {
case 0:
return
.8 * auth.meta_load() +
@ -397,7 +434,6 @@ int MDBalancer::localize_balancer()
/* timeout: if we waste half our time waiting for RADOS, then abort! */
std::cv_status ret_t = [&] {
auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
std::unique_lock locker{lock};
return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
}();
@ -434,7 +470,7 @@ void MDBalancer::send_heartbeat()
// my load
mds_load_t load = get_load();
mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
mds->logger->set(l_mds_load_cent, 100 * load.mds_load(bal_mode));
mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
@ -607,7 +643,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast)
// Pass on to MDCache: note that the split might still not
// happen if the checks in MDCache::can_fragment fail.
dout(10) << __func__ << " splitting " << *dir << dendl;
int bits = g_conf()->mds_bal_split_bits;
int bits = bal_split_bits;
if (dir->inode->is_ephemeral_dist()) {
unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
if (df.frag.bits() + bits < min_frag_bits)
@ -741,9 +777,9 @@ void MDBalancer::prep_rebalance(int beat)
// rescale! turn my mds_load back into meta_load units
double load_fac = 1.0;
map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
if ((m != mds_load.end()) && (m->second.mds_load(bal_mode) > 0)) {
double metald = m->second.auth.meta_load();
double mdsld = m->second.mds_load();
double mdsld = m->second.mds_load(bal_mode);
load_fac = metald / mdsld;
dout(7) << " load_fac is " << load_fac
<< " <- " << m->second.auth << " " << metald
@ -758,13 +794,13 @@ void MDBalancer::prep_rebalance(int beat)
for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
mds_load_t& load = mds_load.at(i);
double l = load.mds_load() * load_fac;
double l = load.mds_load(bal_mode) * load_fac;
mds_meta_load[i] = l;
if (whoami == 0)
dout(7) << " mds." << i
<< " " << load
<< " = " << load.mds_load()
<< " = " << load.mds_load(bal_mode)
<< " ~ " << l << dendl;
if (whoami == i) my_load = l;
@ -781,9 +817,9 @@ void MDBalancer::prep_rebalance(int beat)
<< dendl;
// under or over?
auto bal_min_rebalance = g_conf().get_val<double>("mds_bal_min_rebalance");
for (const auto& [load, rank] : load_map) {
if (test_rank_mask(rank) &&
load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
if (test_rank_mask(rank) && load < target_load * (1.0 + bal_min_rebalance)) {
dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
mds_last_epoch_under_map[rank] = beat_epoch;
}
@ -962,8 +998,9 @@ void MDBalancer::try_rebalance(balance_state_t& state)
mds_rank_t from = diri->authority().first;
double pop = dir->pop_auth_subtree.meta_load();
if (g_conf()->mds_bal_idle_threshold > 0 &&
pop < g_conf()->mds_bal_idle_threshold &&
const auto bal_idle_threshold = g_conf().get_val<double>("mds_bal_idle_threshold");
if (bal_idle_threshold > 0 &&
pop < bal_idle_threshold &&
diri != mds->mdcache->get_root() &&
from != mds->get_nodeid()) {
dout(5) << " exporting idle (" << pop << ") import " << *dir
@ -1125,13 +1162,14 @@ void MDBalancer::find_exports(CDir *dir,
ceph_assert(dir->is_auth());
double need = amount - have;
if (need < amount * g_conf()->mds_bal_min_start)
const auto bal_min_start = g_conf().get_val<double>("mds_bal_min_start");
if (need < amount * bal_min_start)
return; // good enough!
double needmax = need * g_conf()->mds_bal_need_max;
double needmin = need * g_conf()->mds_bal_need_min;
double midchunk = need * g_conf()->mds_bal_midchunk;
double minchunk = need * g_conf()->mds_bal_minchunk;
double needmax = need * g_conf().get_val<double>("mds_bal_need_max");
double needmin = need * g_conf().get_val<double>("mds_bal_need_min");
double midchunk = need * g_conf().get_val<double>("mds_bal_midchunk");
double minchunk = need * g_conf().get_val<double>("mds_bal_minchunk");
std::vector<CDir*> bigger_rep, bigger_unrep;
multimap<double, CDir*> smaller;
@ -1285,8 +1323,8 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
// hit me
double v = dir->pop_me.get(type).hit(amount);
const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
(v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
const bool hot = (v > bal_split_rd && type == META_POP_IRD) ||
(v > bal_split_wr && type == META_POP_IWR);
dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
<< " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
@ -1303,7 +1341,7 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
dout(20) << type << " pop " << dir_pop << " spread in " << *dir << dendl;
if (dir->is_auth() && !dir->is_ambiguous_auth() && dir->can_rep()) {
if (dir_pop >= g_conf()->mds_bal_replicate_threshold) {
if (dir_pop >= bal_replicate_threshold) {
// replicate
double rdp = dir->pop_me.get(META_POP_IRD).get();
rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
@ -1321,7 +1359,7 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
if (dir->ino() != 1 &&
dir->is_rep() &&
dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
dir_pop < bal_unreplicate_threshold) {
// unreplicate
dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
@ -1488,12 +1526,12 @@ int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
f->open_object_section("mds_load");
{
auto dump_mds_load = [f](const mds_load_t& load) {
auto dump_mds_load = [f](const mds_load_t& load, int64_t bal_mode) {
f->dump_float("request_rate", load.req_rate);
f->dump_float("cache_hit_rate", load.cache_hit_rate);
f->dump_float("queue_length", load.queue_len);
f->dump_float("cpu_load", load.cpu_load_avg);
f->dump_float("mds_load", load.mds_load());
f->dump_float("mds_load", load.mds_load(bal_mode));
f->open_object_section("auth_dirfrags");
load.auth.dump(f);
@ -1507,7 +1545,7 @@ int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
CachedStackStringStream css;
*css << "mds." << rank;
f->open_object_section(css->strv());
dump_mds_load(load);
dump_mds_load(load, bal_mode);
f->close_section();
}
}

src/mds/MDBalancer.h

@ -76,6 +76,19 @@ public:
int dump_loads(Formatter *f, int64_t depth = -1) const;
bool get_bal_export_pin() const {
return bal_export_pin;
}
int64_t get_bal_merge_size() const {
return bal_merge_size;
}
int64_t get_bal_split_size() const {
return bal_split_size;
}
double get_bal_fragment_fast_factor() const {
return bal_fragment_fast_factor;
}
private:
typedef struct {
std::map<mds_rank_t, double> targets;
@ -83,6 +96,8 @@ private:
std::map<mds_rank_t, double> exported;
} balance_state_t;
static const unsigned int AUTH_TREES_THRESHOLD = 5;
//set up the rebalancing targets for export and do one if the
//MDSMap is up to date
void prep_rebalance(int beat);
@ -121,7 +136,20 @@ private:
bool bal_fragment_dirs;
int64_t bal_fragment_interval;
static const unsigned int AUTH_TREES_THRESHOLD = 5;
int64_t bal_interval;
int64_t bal_max_until;
int64_t bal_mode;
bool bal_export_pin;
double bal_sample_interval;
double bal_split_rd;
double bal_split_wr;
double bal_replicate_threshold;
double bal_unreplicate_threshold;
double bal_fragment_fast_factor;
int64_t bal_split_bits;
int64_t bal_split_size;
int64_t bal_merge_size;
int64_t num_bal_times;
MDSRank *mds;
Messenger *messenger;

src/mds/MDSRank.cc

@ -673,9 +673,9 @@ void MDSRank::update_targets()
void MDSRank::hit_export_target(mds_rank_t rank, double amount)
{
double rate = g_conf()->mds_bal_target_decay;
double rate = g_conf().get_val<double>("mds_bal_target_decay");
if (amount < 0.0) {
amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */
amount = 100.0/rate; /* a good default for "i am trying to keep this export_target active" */
}
auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate)));
auto &counter = em.first->second;
@ -4042,9 +4042,23 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
"fsid",
"host",
"mds_alternate_name_max",
"mds_bal_export_pin",
"mds_bal_fragment_dirs",
"mds_bal_fragment_fast_factor",
"mds_bal_fragment_interval",
"mds_bal_fragment_size_max",
"mds_bal_interval",
"mds_bal_max",
"mds_bal_max_until",
"mds_bal_merge_size",
"mds_bal_mode",
"mds_bal_replicate_threshold",
"mds_bal_sample_interval",
"mds_bal_split_bits",
"mds_bal_split_rd",
"mds_bal_split_size",
"mds_bal_split_wr",
"mds_bal_unreplicate_threshold",
"mds_cache_memory_limit",
"mds_cache_mid",
"mds_cache_reservation",

src/mds/mdstypes.h

@ -973,7 +973,7 @@ struct mds_load_t {
double cpu_load_avg = 0.0;
double mds_load() const; // defined in MDBalancer.cc
double mds_load(int64_t bal_mode) const; // defined in MDBalancer.cc
void encode(ceph::buffer::list& bl) const;
void decode(ceph::buffer::list::const_iterator& bl);
void dump(ceph::Formatter *f) const;