mirror of
https://github.com/ceph/ceph
synced 2025-02-24 11:37:37 +00:00
mds: optimize MDBalancer code path config access
This change is necessary because configuration is now set at runtime with the `ceph config` command or through the admin socket (asok) interface, rather than the old way of editing ceph.conf and restarting the daemons for the changes to take effect. The MDBalancer code has been updated to support runtime config changes. Signed-off-by: Sidharth Anupkrishnan <sanupkri@redhat.com> Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
parent
8fbefb00ba
commit
ddc57388ce
@ -633,7 +633,8 @@ options:
|
||||
default: true
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_export_ephemeral_random
|
||||
type: bool
|
||||
level: advanced
|
||||
@ -690,7 +691,8 @@ options:
|
||||
default: 3
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_replicate_threshold
|
||||
type: float
|
||||
level: advanced
|
||||
@ -700,7 +702,8 @@ options:
|
||||
default: 8000
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_unreplicate_threshold
|
||||
type: float
|
||||
level: advanced
|
||||
@ -710,7 +713,8 @@ options:
|
||||
default: 0
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_split_size
|
||||
type: int
|
||||
level: advanced
|
||||
@ -720,7 +724,8 @@ options:
|
||||
default: 10000
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_split_rd
|
||||
type: float
|
||||
level: advanced
|
||||
@ -730,7 +735,8 @@ options:
|
||||
default: 25000
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_split_wr
|
||||
type: float
|
||||
level: advanced
|
||||
@ -740,7 +746,8 @@ options:
|
||||
default: 10000
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_split_bits
|
||||
type: int
|
||||
level: advanced
|
||||
@ -749,9 +756,10 @@ options:
|
||||
default: 3
|
||||
services:
|
||||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
min: 1
|
||||
max: 24
|
||||
with_legacy: true
|
||||
- name: mds_bal_merge_size
|
||||
type: int
|
||||
level: advanced
|
||||
@ -761,7 +769,8 @@ options:
|
||||
default: 50
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_interval
|
||||
type: int
|
||||
level: advanced
|
||||
@ -770,6 +779,8 @@ options:
|
||||
default: 10
|
||||
services:
|
||||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_fragment_interval
|
||||
type: int
|
||||
level: advanced
|
||||
@ -779,6 +790,8 @@ options:
|
||||
default: 5
|
||||
services:
|
||||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
# order of magnitude higher than split size
|
||||
- name: mds_bal_fragment_size_max
|
||||
type: int
|
||||
@ -800,7 +813,8 @@ options:
|
||||
default: 1.5
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_fragment_dirs
|
||||
type: bool
|
||||
level: advanced
|
||||
@ -813,6 +827,8 @@ options:
|
||||
default: true
|
||||
services:
|
||||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_idle_threshold
|
||||
type: float
|
||||
level: advanced
|
||||
@ -822,7 +838,8 @@ options:
|
||||
default: 0
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_max
|
||||
type: int
|
||||
level: dev
|
||||
@ -831,7 +848,8 @@ options:
|
||||
- mds
|
||||
fmt_desc: The number of iterations to run balancer before Ceph stops.
|
||||
(used for testing purposes only)
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_max_until
|
||||
type: int
|
||||
level: dev
|
||||
@ -840,7 +858,8 @@ options:
|
||||
- mds
|
||||
fmt_desc: The number of seconds to run balancer before Ceph stops.
|
||||
(used for testing purposes only)
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_mode
|
||||
type: int
|
||||
level: dev
|
||||
@ -853,7 +872,8 @@ options:
|
||||
- ``0`` = Hybrid.
|
||||
- ``1`` = Request rate and latency.
|
||||
- ``2`` = CPU load.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# must be this much above average before we export anything
|
||||
- name: mds_bal_min_rebalance
|
||||
type: float
|
||||
@ -863,7 +883,8 @@ options:
|
||||
default: 0.1
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# must be overloaded for more than these epochs before we export anything
|
||||
- name: mds_bal_overload_epochs
|
||||
type: int
|
||||
@ -882,7 +903,8 @@ options:
|
||||
services:
|
||||
- mds
|
||||
fmt_desc: The minimum subtree temperature before Ceph searches a subtree.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# take within this range of what we need
|
||||
- name: mds_bal_need_min
|
||||
type: float
|
||||
@ -891,7 +913,8 @@ options:
|
||||
services:
|
||||
- mds
|
||||
fmt_desc: The minimum fraction of target subtree size to accept.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_bal_need_max
|
||||
type: float
|
||||
level: dev
|
||||
@ -899,7 +922,8 @@ options:
|
||||
services:
|
||||
- mds
|
||||
fmt_desc: The maximum fraction of target subtree size to accept.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# any sub bigger than this taken in full
|
||||
- name: mds_bal_midchunk
|
||||
type: float
|
||||
@ -909,7 +933,8 @@ options:
|
||||
- mds
|
||||
fmt_desc: Ceph will migrate any subtree that is larger than this fraction
|
||||
of the target subtree size.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# never take anything smaller than this
|
||||
- name: mds_bal_minchunk
|
||||
type: float
|
||||
@ -919,7 +944,8 @@ options:
|
||||
- mds
|
||||
fmt_desc: Ceph will ignore any subtree that is smaller than this fraction
|
||||
of the target subtree size.
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
# target decay half-life in MDSMap (2x larger is approx. 2x slower)
|
||||
- name: mds_bal_target_decay
|
||||
type: float
|
||||
@ -928,7 +954,8 @@ options:
|
||||
default: 10
|
||||
services:
|
||||
- mds
|
||||
with_legacy: true
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_oft_prefetch_dirfrags
|
||||
type: bool
|
||||
level: advanced
|
||||
|
@ -1018,6 +1018,12 @@ void CDir::init_fragment_pins()
|
||||
get(PIN_SUBTREE);
|
||||
}
|
||||
|
||||
bool CDir::should_split() const {
|
||||
uint64_t split_size = mdcache->mds->balancer->get_bal_split_size();
|
||||
uint64_t items = get_frag_size() + get_num_snap_items();
|
||||
return split_size > 0 && items > split_size;
|
||||
}
|
||||
|
||||
void CDir::split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay)
|
||||
{
|
||||
dout(10) << "split by " << bits << " bits on " << *this << dendl;
|
||||
@ -3774,7 +3780,10 @@ std::string CDir::get_path() const
|
||||
bool CDir::should_split_fast() const
|
||||
{
|
||||
// Max size a fragment can be before trigger fast splitting
|
||||
int fast_limit = g_conf()->mds_bal_split_size * g_conf()->mds_bal_fragment_fast_factor;
|
||||
auto&& balancer = mdcache->mds->balancer;
|
||||
auto split_size = balancer->get_bal_split_size();
|
||||
auto fragment_fast_factor = balancer->get_bal_fragment_fast_factor();
|
||||
int64_t fast_limit = split_size * fragment_fast_factor;
|
||||
|
||||
// Fast path: the sum of accounted size and null dentries does not
|
||||
// exceed threshold: we definitely are not over it.
|
||||
@ -3811,7 +3820,9 @@ bool CDir::should_merge() const
|
||||
return false;
|
||||
}
|
||||
|
||||
return ((int)get_frag_size() + (int)get_num_snap_items()) < g_conf()->mds_bal_merge_size;
|
||||
uint64_t merge_size = mdcache->mds->balancer->get_bal_merge_size();
|
||||
uint64_t items = get_frag_size() + get_num_snap_items();
|
||||
return items < merge_size;
|
||||
}
|
||||
|
||||
MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
|
||||
|
@ -403,10 +403,7 @@ public:
|
||||
void split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay);
|
||||
void merge(const std::vector<CDir*>& subs, MDSContext::vec& waiters, bool replay);
|
||||
|
||||
bool should_split() const {
|
||||
return g_conf()->mds_bal_split_size > 0 &&
|
||||
((int)get_frag_size() + (int)get_num_snap_items()) > g_conf()->mds_bal_split_size;
|
||||
}
|
||||
bool should_split() const;
|
||||
bool should_split_fast() const;
|
||||
bool should_merge() const;
|
||||
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "MDLog.h"
|
||||
#include "Locker.h"
|
||||
#include "Mutation.h"
|
||||
#include "MDBalancer.h"
|
||||
|
||||
#include "events/EUpdate.h"
|
||||
|
||||
@ -5384,7 +5385,8 @@ void CInode::queue_export_pin(mds_rank_t export_pin)
|
||||
|
||||
void CInode::maybe_export_pin(bool update)
|
||||
{
|
||||
if (!g_conf()->mds_bal_export_pin)
|
||||
auto&& balancer = mdcache->mds->balancer;
|
||||
if (!balancer->get_bal_export_pin())
|
||||
return;
|
||||
if (!is_dir() || !is_normal())
|
||||
return;
|
||||
@ -5499,7 +5501,9 @@ void CInode::set_export_pin(mds_rank_t rank)
|
||||
|
||||
mds_rank_t CInode::get_export_pin(bool inherit) const
|
||||
{
|
||||
if (!g_conf()->mds_bal_export_pin)
|
||||
auto&& balancer = mdcache->mds->balancer;
|
||||
auto export_pin = balancer->get_bal_export_pin();
|
||||
if (!export_pin)
|
||||
return MDS_RANK_NONE;
|
||||
|
||||
/* An inode that is export pinned may not necessarily be a subtree root, we
|
||||
|
@ -81,18 +81,58 @@ int MDBalancer::proc_message(const cref_t<Message> &m)
|
||||
// Construct the balancer for the given rank and take an initial copy of
// every mds_bal_* option that the balancer keeps cached in a member.
// handle_conf_change() re-reads these same options when they change.
MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
  mds(m), messenger(msgr), mon_client(monc)
{
  auto&& conf = g_conf();

  // on/off switches
  bal_export_pin = conf.get_val<bool>("mds_bal_export_pin");
  bal_fragment_dirs = conf.get_val<bool>("mds_bal_fragment_dirs");

  // integer-valued options
  bal_fragment_interval = conf.get_val<int64_t>("mds_bal_fragment_interval");
  bal_interval = conf.get_val<int64_t>("mds_bal_interval");
  bal_max_until = conf.get_val<int64_t>("mds_bal_max_until");
  bal_merge_size = conf.get_val<int64_t>("mds_bal_merge_size");
  bal_mode = conf.get_val<int64_t>("mds_bal_mode");
  bal_split_bits = conf.get_val<int64_t>("mds_bal_split_bits");
  bal_split_size = conf.get_val<int64_t>("mds_bal_split_size");
  num_bal_times = conf.get_val<int64_t>("mds_bal_max");

  // floating-point options
  bal_fragment_fast_factor = conf.get_val<double>("mds_bal_fragment_fast_factor");
  bal_replicate_threshold = conf.get_val<double>("mds_bal_replicate_threshold");
  bal_sample_interval = conf.get_val<double>("mds_bal_sample_interval");
  bal_split_rd = conf.get_val<double>("mds_bal_split_rd");
  bal_split_wr = conf.get_val<double>("mds_bal_split_wr");
  bal_unreplicate_threshold = conf.get_val<double>("mds_bal_unreplicate_threshold");
}
|
||||
|
||||
void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
|
||||
{
|
||||
if (changed.count("mds_bal_fragment_dirs")) {
|
||||
if (changed.count("mds_bal_export_pin"))
|
||||
bal_export_pin = g_conf().get_val<bool>("mds_bal_export_pin");
|
||||
if (changed.count("mds_bal_fragment_dirs"))
|
||||
bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
|
||||
}
|
||||
if (changed.count("mds_bal_fragment_interval")) {
|
||||
if (changed.count("mds_bal_fragment_fast_factor"))
|
||||
bal_fragment_fast_factor = g_conf().get_val<double>("mds_bal_fragment_fast_factor");
|
||||
if (changed.count("mds_bal_fragment_interval"))
|
||||
bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
|
||||
}
|
||||
if (changed.count("mds_bal_interval"))
|
||||
bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
|
||||
if (changed.count("mds_bal_max_until"))
|
||||
bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
|
||||
if (changed.count("mds_bal_merge_size"))
|
||||
bal_merge_size = g_conf().get_val<int64_t>("mds_bal_merge_size");
|
||||
if (changed.count("mds_bal_mode"))
|
||||
bal_mode = g_conf().get_val<int64_t>("mds_bal_mode");
|
||||
if (changed.count("mds_bal_replicate_threshold"))
|
||||
bal_replicate_threshold = g_conf().get_val<double>("mds_bal_replicate_threshold");
|
||||
if (changed.count("mds_bal_sample_interval"))
|
||||
bal_sample_interval = g_conf().get_val<double>("mds_bal_sample_interval");
|
||||
if (changed.count("mds_bal_split_rd"))
|
||||
bal_split_rd = g_conf().get_val<double>("mds_bal_split_rd");
|
||||
if (changed.count("mds_bal_split_bits"))
|
||||
bal_split_bits = g_conf().get_val<int64_t>("mds_bal_split_bits");
|
||||
if (changed.count("mds_bal_split_size"))
|
||||
bal_split_size = g_conf().get_val<int64_t>("mds_bal_split_size");
|
||||
if (changed.count("mds_bal_split_wr"))
|
||||
bal_split_wr = g_conf().get_val<double>("mds_bal_split_wr");
|
||||
if (changed.count("mds_bal_unreplicate_threshold"))
|
||||
bal_unreplicate_threshold = g_conf().get_val<double>("mds_bal_unreplicate_threshold");
|
||||
if (changed.count("mds_bal_max"))
|
||||
num_bal_times = g_conf().get_val<int64_t>("mds_bal_max");
|
||||
}
|
||||
|
||||
bool MDBalancer::test_rank_mask(mds_rank_t rank)
|
||||
@ -229,19 +269,16 @@ void MDBalancer::handle_export_pins(void)
|
||||
|
||||
void MDBalancer::tick()
|
||||
{
|
||||
static int num_bal_times = g_conf()->mds_bal_max;
|
||||
bool balance_automate = mds->mdsmap->allows_balance_automate();
|
||||
auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
|
||||
auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
|
||||
time now = clock::now();
|
||||
|
||||
if (g_conf()->mds_bal_export_pin) {
|
||||
if (bal_export_pin) {
|
||||
handle_export_pins();
|
||||
}
|
||||
|
||||
// sample?
|
||||
if (chrono::duration<double>(now-last_sample).count() >
|
||||
g_conf()->mds_bal_sample_interval) {
|
||||
bal_sample_interval) {
|
||||
dout(15) << "tick last_sample now " << now << dendl;
|
||||
last_sample = now;
|
||||
}
|
||||
@ -275,9 +312,9 @@ public:
|
||||
};
|
||||
|
||||
|
||||
double mds_load_t::mds_load() const
|
||||
double mds_load_t::mds_load(int64_t bal_mode) const
|
||||
{
|
||||
switch(g_conf()->mds_bal_mode) {
|
||||
switch(bal_mode) {
|
||||
case 0:
|
||||
return
|
||||
.8 * auth.meta_load() +
|
||||
@ -397,7 +434,6 @@ int MDBalancer::localize_balancer()
|
||||
|
||||
/* timeout: if we waste half our time waiting for RADOS, then abort! */
|
||||
std::cv_status ret_t = [&] {
|
||||
auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
|
||||
std::unique_lock locker{lock};
|
||||
return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
|
||||
}();
|
||||
@ -434,7 +470,7 @@ void MDBalancer::send_heartbeat()
|
||||
|
||||
// my load
|
||||
mds_load_t load = get_load();
|
||||
mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
|
||||
mds->logger->set(l_mds_load_cent, 100 * load.mds_load(bal_mode));
|
||||
mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
|
||||
|
||||
auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
|
||||
@ -607,7 +643,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast)
|
||||
// Pass on to MDCache: note that the split might still not
|
||||
// happen if the checks in MDCache::can_fragment fail.
|
||||
dout(10) << _func_ << " splitting " << *dir << dendl;
|
||||
int bits = g_conf()->mds_bal_split_bits;
|
||||
int bits = bal_split_bits;
|
||||
if (dir->inode->is_ephemeral_dist()) {
|
||||
unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
|
||||
if (df.frag.bits() + bits < min_frag_bits)
|
||||
@ -741,9 +777,9 @@ void MDBalancer::prep_rebalance(int beat)
|
||||
// rescale! turn my mds_load back into meta_load units
|
||||
double load_fac = 1.0;
|
||||
map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
|
||||
if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
|
||||
if ((m != mds_load.end()) && (m->second.mds_load(bal_mode) > 0)) {
|
||||
double metald = m->second.auth.meta_load();
|
||||
double mdsld = m->second.mds_load();
|
||||
double mdsld = m->second.mds_load(bal_mode);
|
||||
load_fac = metald / mdsld;
|
||||
dout(7) << " load_fac is " << load_fac
|
||||
<< " <- " << m->second.auth << " " << metald
|
||||
@ -758,13 +794,13 @@ void MDBalancer::prep_rebalance(int beat)
|
||||
for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
|
||||
mds_load_t& load = mds_load.at(i);
|
||||
|
||||
double l = load.mds_load() * load_fac;
|
||||
double l = load.mds_load(bal_mode) * load_fac;
|
||||
mds_meta_load[i] = l;
|
||||
|
||||
if (whoami == 0)
|
||||
dout(7) << " mds." << i
|
||||
<< " " << load
|
||||
<< " = " << load.mds_load()
|
||||
<< " = " << load.mds_load(bal_mode)
|
||||
<< " ~ " << l << dendl;
|
||||
|
||||
if (whoami == i) my_load = l;
|
||||
@ -781,9 +817,9 @@ void MDBalancer::prep_rebalance(int beat)
|
||||
<< dendl;
|
||||
|
||||
// under or over?
|
||||
auto bal_min_rebalance = g_conf().get_val<double>("mds_bal_min_rebalance");
|
||||
for (const auto& [load, rank] : load_map) {
|
||||
if (test_rank_mask(rank) &&
|
||||
load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
|
||||
if (test_rank_mask(rank) && load < target_load * (1.0 + bal_min_rebalance)) {
|
||||
dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
|
||||
mds_last_epoch_under_map[rank] = beat_epoch;
|
||||
}
|
||||
@ -962,8 +998,9 @@ void MDBalancer::try_rebalance(balance_state_t& state)
|
||||
|
||||
mds_rank_t from = diri->authority().first;
|
||||
double pop = dir->pop_auth_subtree.meta_load();
|
||||
if (g_conf()->mds_bal_idle_threshold > 0 &&
|
||||
pop < g_conf()->mds_bal_idle_threshold &&
|
||||
const auto bal_idle_threshold = g_conf().get_val<double>("mds_bal_idle_threshold");
|
||||
if (bal_idle_threshold > 0 &&
|
||||
pop < bal_idle_threshold &&
|
||||
diri != mds->mdcache->get_root() &&
|
||||
from != mds->get_nodeid()) {
|
||||
dout(5) << " exporting idle (" << pop << ") import " << *dir
|
||||
@ -1125,13 +1162,14 @@ void MDBalancer::find_exports(CDir *dir,
|
||||
ceph_assert(dir->is_auth());
|
||||
|
||||
double need = amount - have;
|
||||
if (need < amount * g_conf()->mds_bal_min_start)
|
||||
const auto bal_min_start = g_conf().get_val<double>("mds_bal_min_start");
|
||||
if (need < amount * bal_min_start)
|
||||
return; // good enough!
|
||||
|
||||
double needmax = need * g_conf()->mds_bal_need_max;
|
||||
double needmin = need * g_conf()->mds_bal_need_min;
|
||||
double midchunk = need * g_conf()->mds_bal_midchunk;
|
||||
double minchunk = need * g_conf()->mds_bal_minchunk;
|
||||
double needmax = need * g_conf().get_val<double>("mds_bal_need_max");
|
||||
double needmin = need * g_conf().get_val<double>("mds_bal_need_min");
|
||||
double midchunk = need * g_conf().get_val<double>("mds_bal_midchunk");
|
||||
double minchunk = need * g_conf().get_val<double>("mds_bal_minchunk");
|
||||
|
||||
std::vector<CDir*> bigger_rep, bigger_unrep;
|
||||
multimap<double, CDir*> smaller;
|
||||
@ -1285,8 +1323,8 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
|
||||
// hit me
|
||||
double v = dir->pop_me.get(type).hit(amount);
|
||||
|
||||
const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
|
||||
(v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
|
||||
const bool hot = (v > bal_split_rd && type == META_POP_IRD) ||
|
||||
(v > bal_split_wr && type == META_POP_IWR);
|
||||
|
||||
dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
|
||||
<< " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
|
||||
@ -1303,7 +1341,7 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
|
||||
|
||||
dout(20) << type << " pop " << dir_pop << " spread in " << *dir << dendl;
|
||||
if (dir->is_auth() && !dir->is_ambiguous_auth() && dir->can_rep()) {
|
||||
if (dir_pop >= g_conf()->mds_bal_replicate_threshold) {
|
||||
if (dir_pop >= bal_replicate_threshold) {
|
||||
// replicate
|
||||
double rdp = dir->pop_me.get(META_POP_IRD).get();
|
||||
rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
|
||||
@ -1321,7 +1359,7 @@ void MDBalancer::hit_dir(CDir *dir, int type, double amount)
|
||||
|
||||
if (dir->ino() != 1 &&
|
||||
dir->is_rep() &&
|
||||
dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
|
||||
dir_pop < bal_unreplicate_threshold) {
|
||||
// unreplicate
|
||||
dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
|
||||
|
||||
@ -1488,12 +1526,12 @@ int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
|
||||
f->open_object_section("mds_load");
|
||||
{
|
||||
|
||||
auto dump_mds_load = [f](const mds_load_t& load) {
|
||||
auto dump_mds_load = [f](const mds_load_t& load, int64_t bal_mode) {
|
||||
f->dump_float("request_rate", load.req_rate);
|
||||
f->dump_float("cache_hit_rate", load.cache_hit_rate);
|
||||
f->dump_float("queue_length", load.queue_len);
|
||||
f->dump_float("cpu_load", load.cpu_load_avg);
|
||||
f->dump_float("mds_load", load.mds_load());
|
||||
f->dump_float("mds_load", load.mds_load(bal_mode));
|
||||
|
||||
f->open_object_section("auth_dirfrags");
|
||||
load.auth.dump(f);
|
||||
@ -1507,7 +1545,7 @@ int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
|
||||
CachedStackStringStream css;
|
||||
*css << "mds." << rank;
|
||||
f->open_object_section(css->strv());
|
||||
dump_mds_load(load);
|
||||
dump_mds_load(load, bal_mode);
|
||||
f->close_section();
|
||||
}
|
||||
}
|
||||
|
@ -76,6 +76,19 @@ public:
|
||||
|
||||
int dump_loads(Formatter *f, int64_t depth = -1) const;
|
||||
|
||||
bool get_bal_export_pin() const {
|
||||
return bal_export_pin;
|
||||
}
|
||||
int64_t get_bal_merge_size() const {
|
||||
return bal_merge_size;
|
||||
}
|
||||
int64_t get_bal_split_size() const {
|
||||
return bal_split_size;
|
||||
}
|
||||
double get_bal_fragment_fast_factor() const {
|
||||
return bal_fragment_fast_factor;
|
||||
}
|
||||
|
||||
private:
|
||||
typedef struct {
|
||||
std::map<mds_rank_t, double> targets;
|
||||
@ -83,6 +96,8 @@ private:
|
||||
std::map<mds_rank_t, double> exported;
|
||||
} balance_state_t;
|
||||
|
||||
static const unsigned int AUTH_TREES_THRESHOLD = 5;
|
||||
|
||||
//set up the rebalancing targets for export and do one if the
|
||||
//MDSMap is up to date
|
||||
void prep_rebalance(int beat);
|
||||
@ -121,7 +136,20 @@ private:
|
||||
|
||||
bool bal_fragment_dirs;
|
||||
int64_t bal_fragment_interval;
|
||||
static const unsigned int AUTH_TREES_THRESHOLD = 5;
|
||||
int64_t bal_interval;
|
||||
int64_t bal_max_until;
|
||||
int64_t bal_mode;
|
||||
bool bal_export_pin;
|
||||
double bal_sample_interval;
|
||||
double bal_split_rd;
|
||||
double bal_split_wr;
|
||||
double bal_replicate_threshold;
|
||||
double bal_unreplicate_threshold;
|
||||
double bal_fragment_fast_factor;
|
||||
int64_t bal_split_bits;
|
||||
int64_t bal_split_size;
|
||||
int64_t bal_merge_size;
|
||||
int64_t num_bal_times;
|
||||
|
||||
MDSRank *mds;
|
||||
Messenger *messenger;
|
||||
|
@ -673,9 +673,9 @@ void MDSRank::update_targets()
|
||||
|
||||
void MDSRank::hit_export_target(mds_rank_t rank, double amount)
|
||||
{
|
||||
double rate = g_conf()->mds_bal_target_decay;
|
||||
double rate = g_conf().get_val<double>("mds_bal_target_decay");
|
||||
if (amount < 0.0) {
|
||||
amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */
|
||||
amount = 100.0/rate; /* a good default for "i am trying to keep this export_target active" */
|
||||
}
|
||||
auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate)));
|
||||
auto &counter = em.first->second;
|
||||
@ -4042,9 +4042,23 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
|
||||
"fsid",
|
||||
"host",
|
||||
"mds_alternate_name_max",
|
||||
"mds_bal_export_pin",
|
||||
"mds_bal_fragment_dirs",
|
||||
"mds_bal_fragment_fast_factor",
|
||||
"mds_bal_fragment_interval",
|
||||
"mds_bal_fragment_size_max",
|
||||
"mds_bal_interval",
|
||||
"mds_bal_max",
|
||||
"mds_bal_max_until",
|
||||
"mds_bal_merge_size",
|
||||
"mds_bal_mode",
|
||||
"mds_bal_replicate_threshold",
|
||||
"mds_bal_sample_interval",
|
||||
"mds_bal_split_bits",
|
||||
"mds_bal_split_rd",
|
||||
"mds_bal_split_size",
|
||||
"mds_bal_split_wr",
|
||||
"mds_bal_unreplicate_threshold",
|
||||
"mds_cache_memory_limit",
|
||||
"mds_cache_mid",
|
||||
"mds_cache_reservation",
|
||||
|
@ -973,7 +973,7 @@ struct mds_load_t {
|
||||
|
||||
double cpu_load_avg = 0.0;
|
||||
|
||||
double mds_load() const; // defined in MDBalancer.cc
|
||||
double mds_load(int64_t bal_mode) const; // defined in MDBalancer.cc
|
||||
void encode(ceph::buffer::list& bl) const;
|
||||
void decode(ceph::buffer::list::const_iterator& bl);
|
||||
void dump(ceph::Formatter *f) const;
|
||||
|
Loading…
Reference in New Issue
Block a user