From 47342a8b3ce1ee998ac70687468253135e29f46b Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 23 Jun 2023 17:01:00 -0400 Subject: [PATCH] mds: add balance_automate fs setting To turn off the automatic ("default") balancer in multiple MDS clusters. The new default is "off" as the balancer is a constant source of problems and surprise for administrators trying multiple actives. Instead, it should be a deliberate decision to turn it on and usually with customization like the "bal_rank_mask" setting or pinning. Fixes: https://tracker.ceph.com/issues/61378 Signed-off-by: Patrick Donnelly --- PendingReleaseNotes | 4 ++++ src/include/ceph_fs.h | 1 + src/mds/MDBalancer.cc | 4 +++- src/mds/MDSMap.cc | 3 +++ src/mds/MDSMap.h | 12 +++++++++++- src/mon/FSCommands.cc | 15 +++++++++++++++ src/mon/MonCommands.h | 25 ++++++++++++++++++++----- 7 files changed, 57 insertions(+), 7 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index c44851a5eaf..8e344dba79b 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -95,6 +95,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config mirroring policies between RGW and AWS, you may wish to set "rgw policy reject invalid principals" to "false". This affects only newly set policies, not policies that are already in place. +* The CephFS automatic metadata load (sometimes called "default") balancer is + now disabled by default. The new file system flag `balance_automate` + can be used to toggle it on or off. It can be enabled or disabled via + `ceph fs set <fs_name> balance_automate <bool>`. * RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file. The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path` defaults to "/var/log/ceph/ops-log-$cluster-$name.log". 
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index f567a26f411..24542168026 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -292,6 +292,7 @@ struct ceph_mon_subscribe_ack { request */ #define CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS (1<<7) /* fs is forbidden to use standby for another fs */ +#define CEPH_MDSMAP_BALANCE_AUTOMATE (1<<8) /* automate metadata balancing */ #define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index cd2e7d9bd00..b48c59f2606 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void) void MDBalancer::tick() { static int num_bal_times = g_conf()->mds_bal_max; + bool balance_automate = mds->mdsmap->allows_balance_automate(); auto bal_interval = g_conf().get_val("mds_bal_interval"); auto bal_max_until = g_conf().get_val("mds_bal_max_until"); time now = clock::now(); @@ -248,7 +249,8 @@ void MDBalancer::tick() // We can use duration_cast below, although the result is an int, // because the values from g_conf are also integers. // balance? 
- if (mds->get_nodeid() == 0 + if (balance_automate + && mds->get_nodeid() == 0 && mds->is_active() && bal_interval > 0 && chrono::duration_cast(now - last_heartbeat).count() >= bal_interval diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 47c823bf763..699765ebe91 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -239,6 +239,7 @@ void MDSMap::dump_flags_state(Formatter *f) const f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay()); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)); + f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)); f->close_section(); } @@ -383,6 +384,8 @@ void MDSMap::print_flags(std::ostream& out) const { out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION); if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)) + out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE); } void MDSMap::get_health(list >& summary, diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 9057f05a8ce..746ae859715 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -244,6 +244,15 @@ public: bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; } + void set_balance_automate() { + set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); + ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + } + void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool allows_balance_automate() const { return 
test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; } + void set_multimds_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; @@ -676,7 +685,8 @@ private: {CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"}, {CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"}, {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"}, - {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"} + {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"}, + {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"} }; }; WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 2faba87f73d..26061996907 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -665,6 +665,21 @@ public: } }; fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); + } else if (var == "balance_automate") { + bool allow = false; + int r = parse_bool(val, &allow, ss); + if (r != 0) { + return r; + } + + auto f = [allow](auto&& fs) { + if (allow) { + fs.get_mds_map().set_balance_automate(); + } else { + fs.get_mds_map().clear_balance_automate(); + } + }; + fsmap.modify_filesystem(fsp->get_fscid(), std::move(f)); } else if (var == "min_compat_client") { auto vno = ceph_release_from_name(val.c_str()); if (!vno) { diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 71a8bec76ac..6f8f25e050a 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -374,11 +374,26 @@ COMMAND("fs get name=fs_name,type=CephString", "fs", "r") COMMAND("fs set " "name=fs_name,type=CephString " - "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" - "|standby_count_wanted|session_timeout|session_autoclose" - 
"|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask" - "|refuse_client_session|max_xattr_size|refuse_standby_for_another_fs " + "name=var,type=CephChoices,strings=max_mds" + "|allow_dirfrags" + "|allow_new_snaps" + "|allow_standby_replay" + "|bal_rank_mask" + "|balance_automate" + "|balancer" + "|cluster_down" + "|down" + "|inline_data" + "|joinable" + "|max_file_size" + "|max_xattr_size" + "|min_compat_client" + "|refuse_client_session" + "|refuse_standby_for_another_fs" + "|session_autoclose" + "|session_timeout" + "|standby_count_wanted" + " " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false " "name=yes_i_really_really_mean_it,type=CephBool,req=false",